# Link persons to wikidata URIs

* Find the URIs of the Wikidata resources corresponding to the persons mentioned in the biographies



In [115]:
import psycopg2
from psycopg2.extras import execute_batch

from importlib import reload
# from matplotlib import pyplot as plt

import spacy

import pandas as pd
import re
import csv
from itables import init_notebook_mode, show

In [2]:
### itables: optional activation, tables become interactive only through the show() function
init_notebook_mode(all_interactive=False)

<IPython.core.display.Javascript object>

In [3]:
import postgresql_functions as pgf
import settings as stt

In [5]:
# reload(pgf)

In [13]:
# Open the PostgreSQL connection used by every database cell below;
# the password is read from the local settings module.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="espace_intellectuel",
    user="postgres",
    password=stt.dbw,
)

In [12]:
# NOTE(review): the execution counts show this cell ran (In [12]) before the connect cell (In [13]).
# Run in document order it closes the connection that the queries below rely on, so run it only when finished.
conn.close()

### Get persons list from database

In [14]:
# Query returning, for every row of mathshistory.mathshistory, its key, its name and the
# first four characters of `dates` cast to an integer (aliased birth_date), ordered by that value.
# NOTE(review): presumably `dates` always starts with a four-digit year — the cast fails otherwise; confirm.
q2 = """
select pk_mathshistory, "name", substring(dates, 1, 4)::INT as birth_date
from mathshistory.mathshistory m 
order by birth_date
"""

In [15]:
### Run q2 and keep all returned rows (key, name, birth year) in `rs`
with conn.cursor() as cursor:
    cursor.execute(q2)
    rs = cursor.fetchall()

In [16]:
# Spot-check three fetched rows: (pk_mathshistory, name, birth_date)
rs[100:103]

[(101, 'Egnatio Danti', 1536),
 (102, 'Francesco Barozzi', 1537),
 (103, 'Christopher Clavius', 1538)]

In [47]:
# Build one [search text, primary key] pair per person; the search text is
# "<name> birthYear <year>" so the birth year is part of the text sent to the linker.
mats = [
    [name + ' birthYear ' + str(birth_year), pk]
    for pk, name, birth_year in rs
]
mats[101:111]

[['Francesco Barozzi birthYear 1537', 102],
 ['Christopher Clavius birthYear 1538', 103],
 ['François Viète birthYear 1540', 104],
 ['Ludolph Van Ceulen birthYear 1540', 105],
 ['Thomas Allen birthYear 1540', 106],
 ['Guidobaldo del Monte birthYear 1545', 107],
 ['Paul Wittich birthYear 1546', 108],
 ['Tycho Brahe birthYear 1546', 110],
 ['Thomas Digges birthYear 1546', 109],
 ["Baha' al-Din al-Amili birthYear 1547", 111]]

## Tapioca

In [25]:
# Blank English spaCy pipeline whose only added component is the 'opentapioca' entity linker
nlpt = spacy.blank('en')
nlpt.add_pipe('opentapioca')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/francesco/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/francesco/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


<spacyopentapioca.entity_linker.EntityLinker at 0x7feeeb48d360>

In [33]:
# Link a sample of ten persons in a single call.
# Each item of `mats` is a [search text, primary key] pair, so only the text part
# (index 0) is joined; joining the pairs themselves raises TypeError.
doc = nlpt('. '.join(m[0] for m in mats[101:111]))

In [44]:
# Show, for every linked entity: surface text, Wikidata id, entity type, description(s) and score
for ent in doc.ents:
    print((
        ent.text,
        ent.kb_id_,
        ent.label_,
        ent._.description,
        ent._.score,
    ))

('Christopher Clavius', 'Q76728', 'PERSON', ['German astronomer and mathematician (1538–1612)'], 0.2945102219102421)
('François Viète', 'Q188623', 'PERSON', ['French mathematician'], 0.44839082305086986)
('Ludolph Van Ceulen', 'Q310771', 'PERSON', ['German-Dutch mathematician'], 1.2828819202702326)
('Thomas Allen', 'Q953628', 'PERSON', ['English operatic baritone'], 0.040562816761049114)
('Guidobaldo del Monte', 'Q266017', 'PERSON', ['Italian mathematician, astronomer and philosopher'], 0.9464905171587287)
('Paul Wittich', 'Q88089', 'PERSON', ['German astronomer'], 0.21548868504353458)
('Tycho Brahe', 'Q36620', 'PERSON', ['Danish astronomer and alchemist, 1546–1601'], 0.619930193466644)
('Thomas Digges', 'Q531345', 'PERSON', ['English mathematician and astronomer (c.1546–1595)'], 0.06281734695954888)


In [51]:
# Link the same ten-person sample one person at a time, so that every entity
# found can be stored together with the primary key of the person it came from.
pers_list = []
for search_text, pk in mats[101:111]:
    doc = nlpt(search_text)
    pers_list.extend(
        [pk, ent.text, ent.kb_id_, ent.label_, ent._.description, ent._.score]
        for ent in doc.ents
    )

In [54]:
# Number of entities linked for the sample, plus the first two records
len(pers_list), pers_list[:2]

(6,
 [[103,
   'Christopher Clavius',
   'Q76728',
   'PERSON',
   ['German astronomer and mathematician (1538–1612)'],
   0.5445257934961686],
  [104,
   'François Viète',
   'Q188623',
   'PERSON',
   ['French mathematician'],
   0.5093603884317542]])

In [55]:
# Total number of persons to link
len(mats)

3010

In [57]:
# Link every person individually.
# pers_list collects one record per entity found:
#   [pk_mathshistory, matched text, Wikidata id, entity type, description(s), score]
# errors collects one [pk_mathshistory, exception] record per person whose lookup failed,
# so a single failure does not abort the whole run.
pers_list = []
errors = []
for p in mats:
    try:
        doc = nlpt(p[0])
        for span in doc.ents:
            pers_list.append([p[1], span.text, span.kb_id_, span.label_, span._.description, span._.score])
    except Exception as e:
        # list.append takes exactly one argument: store the key and the exception
        # together in one record (the former two-argument call raised TypeError
        # inside the handler and hid the original error).
        errors.append([p[1], e])



In [58]:
# Number of failed lookups and number of entities collected over the full list
len(errors), len(pers_list)

(0, 1687)

In [64]:
# Turn the collected records into a DataFrame.
# The column names are passed to the constructor so that an empty `pers_list`
# still yields a (zero-row) frame with the expected columns; assigning
# `.columns` afterwards fails with a length mismatch in that case.
df_pers_list = pd.DataFrame(
    pers_list,
    columns=['pk_mathshistory', 'name', 'id_wikidata', 'ner_type', 'notice', 'score'],
)
df_pers_list.head()

Unnamed: 0,pk_mathshistory,name,id_wikidata,ner_type,notice,score
0,3,Hermann of Reichenau,Q68490,PERSON,"[German 11th-century Benedictine monk, histori...",0.201611
1,7,Omar Khayyam,Q35900,PERSON,[Persian mathematician and poet (1048–1131)],0.283835
2,9,Abraham bar Hiyya,Q31439,PERSON,[mathematician and astronomer],1.011805
3,12,Ben Ezra,Q28643144,PERSON,,0.604074
4,13,Jabir ibn Aflah,Q288111,PERSON,[Al-Andalus mathematician and astronomer],1.305197


In [66]:
# How many linked entities fall under each entity type
df_pers_list.groupby('ner_type').size()

ner_type
LOC         27
ORG          3
ORGLOC       5
PERSON    1652
dtype: int64

In [70]:
# Inspect the entities typed ORGLOC, i.e. person names matched to something that is not a person
df_pers_list.loc[df_pers_list['ner_type'] == 'ORGLOC']

Unnamed: 0,pk_mathshistory,name,id_wikidata,ner_type,notice,score
163,320,La Condamine,Q99308636,ORGLOC,[former commune of the Principality of Monaco ...,0.700431
177,365,Paolo Frisi,Q52808196,ORGLOC,[high school in Milano in the province of Mila...,0.098529
368,786,Thorvald,Q97892379,ORGLOC,"[Cheese Makers in Upper Moutere, New Zealand]",0.29753
1022,1910,McVittie,Q115262287,ORGLOC,"[geographic township in Ontario, Canada]",0.120945
1480,2658,Spinadel,Q109572312,ORGLOC,"[former commune in Cantal, France]",0.12247


https://www.wikidata.org/entity/Q52808196

In [76]:
### Display options: https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
# Interactive table of the PERSON matches, ordered by person key
show(
    df_pers_list.loc[df_pers_list['ner_type'] == 'PERSON'].sort_values(by='pk_mathshistory'),
    classes="display",
    columnDefs=[{"className": "dt-left", "targets": "_all"}],
    scrollY="400px",
    scrollCollapse=True,
    paging=False,
    column_filters="footer",
    dom="lrtip",
)

Unnamed: 0,pk_mathshistory,name,id_wikidata,ner_type,notice,score
Loading... (need help?),,,,,,


In [73]:
# Relative path of the CSV export of the PERSON matches
d_file = "data/pers_wikidata.csv"

In [74]:
# Export only the rows typed PERSON; with the pandas defaults the DataFrame index
# is written as an unnamed first column.
df_pers_list[df_pers_list.ner_type == 'PERSON'].to_csv(d_file)

In [87]:
# Keep only the entities typed PERSON.
# An explicit copy is taken because new columns are added to df_pers in the
# following cells; assigning into a boolean-mask slice of df_pers_list would
# otherwise trigger pandas' SettingWithCopyWarning.
df_pers = df_pers_list[df_pers_list.ner_type == 'PERSON'].copy()

In [None]:
# Flatten the list of descriptions into one '; '-separated string (empty string when there is none)
df_pers['notice_agg'] = df_pers['notice'].apply(
    lambda descriptions: '; '.join(descriptions) if descriptions else ''
)

In [None]:
# Build the full Wikidata entity URI from the Q-identifier
df_pers['uri'] = df_pers['id_wikidata'].apply(
    lambda qid: 'https://www.wikidata.org/entity/' + qid
)

In [98]:
# Preview the PERSON rows
df_pers.head()

Unnamed: 0,pk_mathshistory,name,id_wikidata,ner_type,notice,score,notice_agg,uri
0,3,Hermann of Reichenau,Q68490,PERSON,"[German 11th-century Benedictine monk, histori...",0.201611,"German 11th-century Benedictine monk, historia...",https://www.wikidata.org/entity/Q68490
1,7,Omar Khayyam,Q35900,PERSON,[Persian mathematician and poet (1048–1131)],0.283835,Persian mathematician and poet (1048–1131),https://www.wikidata.org/entity/Q35900
2,9,Abraham bar Hiyya,Q31439,PERSON,[mathematician and astronomer],1.011805,mathematician and astronomer,https://www.wikidata.org/entity/Q31439
3,12,Ben Ezra,Q28643144,PERSON,,0.604074,,https://www.wikidata.org/entity/Q28643144
4,13,Jabir ibn Aflah,Q288111,PERSON,[Al-Andalus mathematician and astronomer],1.305197,Al-Andalus mathematician and astronomer,https://www.wikidata.org/entity/Q288111


In [122]:
# Number of PERSON matches per person key
cp = df_pers.groupby('pk_mathshistory').size()

In [125]:
### no duplicates: a min and max of 1 mean every person key has exactly one match
cp.describe()

count    1652.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
dtype: float64

## Import into database

### Create a new table

In [108]:
# DDL for the target table of the Wikidata matches; `if not exists` makes re-running it harmless
cq = """
create table if not exists mathshistory.wikidata (id integer, name varchar, uri varchar, notice text, score float);
"""

In [109]:
# Execute the CREATE TABLE statement (it still has to be committed)
with conn.cursor() as cursor:
    cursor.execute(cq)

In [110]:
# NOTE(review): run in document order, this rollback discards the uncommitted CREATE TABLE
# from the previous cell (assuming autocommit is off — it is not enabled in the visible cells).
# Run it only to recover from a failed statement.
conn.rollback()

In [79]:
# Commit the pending transaction so the table creation is persisted
conn.commit()

### Prepare and insert tuples

In [100]:
# Extract the rows to insert as plain Python lists, in the column order of the target table
lv = df_pers.loc[:, ['pk_mathshistory', 'name', 'uri', 'notice_agg', 'score']].values.tolist()
len(lv), lv[:3]

(1652,
 [[3,
   'Hermann of Reichenau',
   'https://www.wikidata.org/entity/Q68490',
   'German 11th-century Benedictine monk, historian, astronomer, mathematician, poet, and musical composer',
   0.20161108761658275],
  [7,
   'Omar Khayyam',
   'https://www.wikidata.org/entity/Q35900',
   'Persian mathematician and poet (1048–1131)',
   0.2838350995879192],
  [9,
   'Abraham bar Hiyya',
   'https://www.wikidata.org/entity/Q31439',
   'mathematician and astronomer',
   1.0118052171061187]])

In [111]:
# Convert each row to a tuple, the parameter shape handed to execute_batch below
tuples = list(map(tuple, lv))
len(tuples), tuples[:3]

(1652,
 [(3,
   'Hermann of Reichenau',
   'https://www.wikidata.org/entity/Q68490',
   'German 11th-century Benedictine monk, historian, astronomer, mathematician, poet, and musical composer',
   0.20161108761658275),
  (7,
   'Omar Khayyam',
   'https://www.wikidata.org/entity/Q35900',
   'Persian mathematician and poet (1048–1131)',
   0.2838350995879192),
  (9,
   'Abraham bar Hiyya',
   'https://www.wikidata.org/entity/Q31439',
   'mathematician and astronomer',
   1.0118052171061187)])

In [118]:
# Insert all matches with a parameterized statement (still has to be committed)
with conn.cursor() as cursor:
    execute_batch(
        cursor,
        """INSERT INTO mathshistory.wikidata (id, name, uri, notice, score) VALUES (%s, %s, %s, %s, %s)""",
        tuples,
    )

In [117]:
# NOTE(review): run in document order, this rollback discards the rows inserted by the previous
# cell before the commit below (assuming autocommit is off — it is not enabled in the visible cells).
# Run it only to recover from a failed insert.
conn.rollback()

In [119]:
# Commit the pending transaction so the inserted rows are persisted
conn.commit()

In [127]:
# Copy every match into mathshistory."statement": the person key becomes fk_subject_instance and
# the URI becomes text_value; fk_property, fk_graph and import_metadata are fixed literals.
# NOTE(review): the meaning of the literals 1 and 3011 is not visible here — confirm against the schema.
iq = """
insert into mathshistory."statement" (fk_subject_instance, text_value, fk_property, fk_graph, import_metadata)
select id, uri, 1, 3011, '20230627_2_wd'  from mathshistory.wikidata;
"""

In [128]:
# Insert the statements built by `iq`. The execute call is deliberately disabled
# so that re-running the notebook does not insert the rows a second time.
with conn.cursor() as cur:
    ### Commented to avoid disruption
    # cur.execute(iq)
    # A block that contains only comments is a syntax error; `pass` keeps the cell runnable.
    pass

In [129]:
# NOTE(review): the INSERT in the previous cell is commented out, so presumably nothing new
# is pending when this commit runs — confirm.
conn.commit()