# Entity linking with Spacy plugins

Use pip environment : cf. instructions.md


* EntityLinker
* Tapioca

In [1]:
import spacy
import psycopg2
import pandas as pd
from spacy.tokens import Span
from spacy import displacy
from time import strftime, gmtime

In [2]:
from itables import init_notebook_mode, show
import re
from importlib import reload

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import postgresql_functions as pgf
import settings as stt

In [5]:
# reload(pgf)

In [7]:
### connect to the local database
conn = psycopg2.connect(host="localhost", port = 5432, database="espace_intellectuel", 
                        user="postgres", password=stt.dbw)
#conn

In [5]:
#conn.close()

In [8]:
q1 = """
select pk_mathshistory, "name", url, dates, length(biography) as eff, coreferenced_txt, biography 
from mathshistory.mathshistory m 
where pk_mathshistory in (103, 117, 133, 159, 186);
"""

In [12]:
result = pgf.sql_explore(q1, conn)
# print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

In [13]:
textes = pd.DataFrame(result[0])
textes.columns = ['id', 'name', 'url', 'dates', 'length_bio', 'coref_texte', 'biog']

In [14]:
textes.head()

Unnamed: 0,id,name,url,dates,length_bio,coref_texte,biog
0,103,Christopher Clavius,https://mathshistory.st-andrews.ac.uk/Biograph...,1538-1612,10616,Christopher Clavius was born in a German regio...,Christopher Clavius was born in a German regio...
1,117,Michael Mästlin,https://mathshistory.st-andrews.ac.uk/Biograph...,1550-1631,10122,Michael Mästlin was born in Göppingen which wa...,Michael Mästlin was born in Göppingen which wa...
2,133,Giuseppe Biancani,https://mathshistory.st-andrews.ac.uk/Biograph...,1566-1624,10035,Giuseppe Biancani's name also appears in name ...,Giuseppe Biancani's name also appears in its L...
3,159,Wilhelm Schickard,https://mathshistory.st-andrews.ac.uk/Biograph...,1592-1635,10272,Wilhelm Schickard's name is sometimes written ...,Wilhelm Schickard's name is sometimes written ...
4,186,Johannes Hevelius,https://mathshistory.st-andrews.ac.uk/Biograph...,1611-1687,10856,The first problem that we have to address is t...,The first problem that we have to address is t...


In [22]:
### Choose one document
txt = textes.iloc[1].coref_texte #[:1600]
print(txt)

Michael Mästlin was born in Göppingen which was a village about 50 km east of Tübingen. Mästlin father, Jakob Mästlin, and Mästlin mother, Dorothea Simon, were both devout Lutherans and Michael was brought up in that faith and remained strongly committed to father throughout Mästlin life. Mästlin was the middle child of the family, having an older sister and a younger brother. Mästlin attended the monastic school in Königsbronn then, after Mästlin studies there, entered Tübingen University in 1568. [3]:-
As was the case with many young scholars including Kepler, Mästlin most famous student, [Mästlin ] did Mästlin undergraduate studies at a preparatory school and came to the university to take Mästlin final exams and pick up Mästlin baccalaureate degree.At Tübingen University Mästlin studied mathematics and astronomy for a Master's degree under Philipp Apian who was Peter Apian's son. In 1570, while a student, Mästlin purchased a copy of Copernicus's De revolutionibus from the widow of 

## EntityLinker

* https://spacy.io/api/entitylinker


In [41]:
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("entityLinker", last=True)

<spacy_entity_linker.EntityLinker.EntityLinker at 0x7f5b526a8f40>

In [23]:
doc = nlp(txt)

In [None]:
all_linked_entities = doc._.linkedEntities
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

## Tapioca

In [42]:
nlpt = spacy.blank('en')
nlpt.add_pipe('opentapioca')

<spacyopentapioca.entity_linker.EntityLinker at 0x7f5b15eda050>

In [43]:
doc = nlpt(txt)

In [44]:
for span in doc.ents:
    print((span.text, span.kb_id_, span.label_, span._.description, span._.score))

('Michael Mästlin', 'Q75797', 'PERSON', ['German astronomer and mathematician'], 0.2854680000713362)
('Tübingen', 'Q3806', 'LOC', ['town in central Baden-Württemberg, Germany'], 0.17486283790026913)
('Lutherans', 'Q75809', 'ORG', ['form of Protestantism commonly associated with the teachings of Martin Luther'], 0.5236875221224303)
('baccalaureate degree', 'Q163727', 'ORGLOC', ['undergraduate academic degree lasting from three to seven years'], 0.9245065136261129)
('Philipp Apian', 'Q68523', 'PERSON', ['German mathematician and cartographer'], 0.8226047751456572)
('who', 'Q7817', 'ORG', ['specialized agency of the United Nations that is concerned with international public health'], 0.29657195961276084)
('Peter Apian', 'Q58662', 'PERSON', ['German astronomer, mathematician and cartographer'], 0.4184618420675229)
('Copernicus', 'Q619', 'PERSON', ['Polish mathematician and astronomer (1473–1543)'], 0.27609939171053066)
('De', 'Q183', 'LOC', ['country in Central Europe'], 0.9377298576625565

* https://www.wikidata.org/wiki/Q544169


## Test on a list of named entities

In [31]:
q2 = """
select regexp_replace(e_text, 'the |The ', '') e_name, e_label as e_class, count(*) as eff
from mathshistory.entity e 
where TRUE
and e_label in ('ORG', 'GPE')
--and e_text ~ 'Society'
group by regexp_replace(e_text, 'the |The ', ''), e_label 
order by eff desc;
"""

In [32]:
result = pgf.sql_explore(q2, conn)
print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

Lines count: 28495, errors count: 0, 
First lines: [('Paris', 'GPE', 2593), ('United States', 'GPE', 1781), ('Germany', 'GPE', 1664), ('France', 'GPE', 1568), ('London', 'GPE', 1481)]


In [33]:
nam_en = pd.DataFrame(result[0])
nam_en.columns = ['e_name', 'e_class', 'eff']

In [34]:
nam_en.head()

Unnamed: 0,e_name,e_class,eff
0,Paris,GPE,2593
1,United States,GPE,1781
2,Germany,GPE,1664
3,France,GPE,1568
4,London,GPE,1481


In [35]:
nam_en.groupby(by='e_class').size().sort_values(ascending=False)

e_class
ORG    21125
GPE     7370
dtype: int64

In [36]:
# apply function on two columns 

def create_label(row):
    if row.e_class == 'GPE':
        label = row.e_name + ' is a geographical place'
    elif row.e_class == 'ORG':
        label = row.e_name + ' is an organization'
    else:
        label = row.e_name

    return label


In [37]:
nam_en['label'] = nam_en.apply(create_label, axis=1)
nam_en.head()

Unnamed: 0,e_name,e_class,eff,label
0,Paris,GPE,2593,Paris is a geographical place
1,United States,GPE,1781,United States is a geographical place
2,Germany,GPE,1664,Germany is a geographical place
3,France,GPE,1568,France is a geographical place
4,London,GPE,1481,London is a geographical place


### Test with Tapioca

In [45]:
doc = nlpt(', '.join(list(nam_en.label)[:30]))

In [46]:
entities = [span for span in doc.ents]

for e in entities:
    print((e.text, e.kb_id_, e.label_, e._.description, e._.score))

('Paris', 'Q90', 'LOC', ['capital and largest city of France'], 0.1214906373005522)
('United States', 'Q30', 'LOC', ['country in North America'], 1.0127116327362797)
('Germany', 'Q183', 'LOC', ['country in Central Europe'], 0.7112720552567512)
('France', 'Q142', 'LOC', ['country in Western Europe'], 1.3557242937411331)
('London', 'Q84', 'LOC', ['capital and largest city of the United Kingdom'], 1.2238206631655175)
('England', 'Q21', 'LOC', ['country in north-west Europe, part of the United Kingdom'], 0.7924021860505607)
('Cambridge', 'Q350', 'LOC', ['city in Cambridgeshire, England'], 0.4328834914830876)
('Berlin', 'Q64', 'LOC', ['federal state, capital and largest city of Germany'], 0.680059181539742)
('Cambridge', 'Q350', 'LOC', ['city in Cambridgeshire, England'], 0.4206936526319923)
('Italy', 'Q38', 'LOC', ['country in Southern Europe'], 0.5552404961659491)
('Rome', 'Q220', 'LOC', ['capital and largest city of Italy'], 0.8699414931328439)
('New York', 'Q60', 'LOC', ['most populous 

### Test with entityLinker

In [48]:
doc = nlp(', '.join(list(nam_en.label)[:30]))

In [49]:
all_linked_entities = doc._.linkedEntities
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

<EntityElement: https://www.wikidata.org/wiki/Q90 Paris                     capital and largest city of France                >
<EntityElement: https://www.wikidata.org/wiki/Q82794 geographic region         2D or 3D defined space, mainly in terrestrial and astrophysics sciences>
<EntityElement: https://www.wikidata.org/wiki/Q30 United States of America  federal republic in North America                 >
<EntityElement: https://www.wikidata.org/wiki/Q82794 geographic region         2D or 3D defined space, mainly in terrestrial and astrophysics sciences>
<EntityElement: https://www.wikidata.org/wiki/Q183 Germany                   federal parliamentary republic in central-western Europe>
<EntityElement: https://www.wikidata.org/wiki/Q82794 geographic region         2D or 3D defined space, mainly in terrestrial and astrophysics sciences>
<EntityElement: https://www.wikidata.org/wiki/Q142 France                    republic with mainland in Europe and numerous oversea territories>
<EntityEl

In [50]:
ent = doc._.linkedEntities[0]
ent

<EntityElement: https://www.wikidata.org/wiki/Q90 Paris                     capital and largest city of France                >

In [51]:
print(ent.get_span(), ent.get_label(), 'https://www.wikidata.org/wiki/Q' + str(ent.get_id()), ent.get_description())

Paris Paris https://www.wikidata.org/wiki/Q90 capital and largest city of France


In [52]:
en_linked = []
for ent in doc._.linkedEntities:
    en_linked.append([str(ent.get_span()), ent.get_label(), 'https://www.wikidata.org/wiki/Q' + str(ent.get_id()), ent.get_description()])
len(en_linked)    

60

In [53]:
df_en_linked = pd.DataFrame(en_linked)
df_en_linked. columns = ['label_loc', 'label_wkd', 'URI_Wkd', 'definition']
df_en_linked.head()

Unnamed: 0,label_loc,label_wkd,URI_Wkd,definition
0,Paris,Paris,https://www.wikidata.org/wiki/Q90,capital and largest city of France
1,place,geographic region,https://www.wikidata.org/wiki/Q82794,"2D or 3D defined space, mainly in terrestrial ..."
2,United States,United States of America,https://www.wikidata.org/wiki/Q30,federal republic in North America
3,place,geographic region,https://www.wikidata.org/wiki/Q82794,"2D or 3D defined space, mainly in terrestrial ..."
4,Germany,Germany,https://www.wikidata.org/wiki/Q183,federal parliamentary republic in central-west...


In [54]:
nam_en_linked = pd.merge(nam_en, df_en_linked, how="left", left_on="e_name", right_on="label_loc", copy=True)

In [56]:
nam_en_linked.head(10)

Unnamed: 0,e_name,e_class,eff,label,label_loc,label_wkd,URI_Wkd,definition
0,Paris,GPE,2593,Paris is a geographical place,Paris,Paris,https://www.wikidata.org/wiki/Q90,capital and largest city of France
1,United States,GPE,1781,United States is a geographical place,United States,United States of America,https://www.wikidata.org/wiki/Q30,federal republic in North America
2,Germany,GPE,1664,Germany is a geographical place,Germany,Germany,https://www.wikidata.org/wiki/Q183,federal parliamentary republic in central-west...
3,France,GPE,1568,France is a geographical place,France,France,https://www.wikidata.org/wiki/Q142,republic with mainland in Europe and numerous ...
4,London,GPE,1481,London is a geographical place,London,London,https://www.wikidata.org/wiki/Q84,capital and largest city of the United Kingdom
5,England,GPE,1309,England is a geographical place,England,England,https://www.wikidata.org/wiki/Q21,"country in north-west Europe, part of the Unit..."
6,Cambridge,GPE,1236,Cambridge is a geographical place,Cambridge,University of Cambridge,https://www.wikidata.org/wiki/Q35794,collegiate public research university in Cambr...
7,Cambridge,GPE,1236,Cambridge is a geographical place,Cambridge,University of Cambridge,https://www.wikidata.org/wiki/Q35794,collegiate public research university in Cambr...
8,Berlin,GPE,1182,Berlin is a geographical place,Berlin,Berlin,https://www.wikidata.org/wiki/Q64,capital and largest city of Germany
9,Cambridge,ORG,1051,Cambridge is an organization,Cambridge,University of Cambridge,https://www.wikidata.org/wiki/Q35794,collegiate public research university in Cambr...


## Use entities list

In [92]:
doc = nlp(', '.join(list(nam_en.label)[:30]))

In [93]:
en_linked = []
for span in doc.ents:
    en_linked.append([span.text, span.kb_id_, span.label_, span._.description, span._.score])
len(en_linked)    

20

In [94]:
df_en_linked = pd.DataFrame(en_linked)
df_en_linked. columns = ['label', 'Wiki_Q', 'class', 'definitions', 'matching_score']
df_en_linked.head()

Unnamed: 0,label,Wiki_Q,class,definitions,matching_score
0,Paris,Q90,LOC,[capital and largest city of France],0.121491
1,United States,Q30,LOC,[country in North America],1.012712
2,Germany,Q183,LOC,[country in Central Europe],0.711272
3,France,Q142,LOC,[country in Western Europe],1.355724
4,London,Q84,LOC,[capital and largest city of the United Kingdom],1.223821


In [95]:
nam_en_linked = pd.merge(nam_en, df_en_linked, how="left", left_on="e_name", right_on="label", copy=True)

In [96]:
nam_en_linked.head(50)

Unnamed: 0,e_name,e_class,eff,label_x,label_y,Wiki_Q,class,definitions,matching_score
0,Paris,GPE,2593,Paris is a geographical place,Paris,Q90,LOC,[capital and largest city of France],0.121491
1,United States,GPE,1781,United States is a geographical place,United States,Q30,LOC,[country in North America],1.012712
2,Germany,GPE,1664,Germany is a geographical place,Germany,Q183,LOC,[country in Central Europe],0.711272
3,France,GPE,1568,France is a geographical place,France,Q142,LOC,[country in Western Europe],1.355724
4,London,GPE,1481,London is a geographical place,London,Q84,LOC,[capital and largest city of the United Kingdom],1.223821
5,England,GPE,1309,England is a geographical place,England,Q21,LOC,"[country in north-west Europe, part of the Uni...",0.792402
6,Cambridge,GPE,1236,Cambridge is a geographical place,Cambridge,Q350,LOC,"[city in Cambridgeshire, England]",0.432883
7,Cambridge,GPE,1236,Cambridge is a geographical place,Cambridge,Q350,LOC,"[city in Cambridgeshire, England]",0.420694
8,Berlin,GPE,1182,Berlin is a geographical place,Berlin,Q64,LOC,"[federal state, capital and largest city of Ge...",0.680059
9,Cambridge,ORG,1051,Cambridge is an organization,Cambridge,Q350,LOC,"[city in Cambridgeshire, England]",0.432883
