# Link biographies with URIs

* Find URIs of resources in DBPedia etc. corresponding to the persons metioned in the biographies



In [78]:
### Importation de modules
#import requests
import psycopg2
from psycopg2.extras import execute_batch
# import lxml.html
# from lxml import etree
from importlib import reload
from matplotlib import pyplot as plt
import sqlite3 as sql
import settings as st
import pandas as pd
import re
import csv
from itables import init_notebook_mode, show

In [2]:
### itables : activation facultative avec la fonction show()
init_notebook_mode(all_interactive=False)

<IPython.core.display.Javascript object>

In [3]:
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML

In [4]:
import sparql_functions as spqf

In [5]:
reload(spqf)

<module 'sparql_functions' from '/home/francesco/shared_files/python_notebooks/Early-Modern-Astronomy/mathshistory/sparql_functions.py'>

## Explore mathematician's list

## PostgreSQL

## Documentation

* https://www.psycopg.org/docs/usage.html
* https://towardsdatascience.com/python-and-postgresql-how-to-access-a-postgresql-database-like-a-data-scientist-b5a9c5a0ea43
  * noter toutefois que la connection directe avec psychopg2 et pandas ne marche plus
  
__NB__ 
* soit on passe par du SQL pur, et alors on peut utiliser _psycopg2_ comme on utilise sqlite3
* soit on utilise Pandas, mais là il faut un objet SQLAlchemy pour accéder à la base de données  
  
  
  

In [21]:
conn = psycopg2.connect(host="localhost", port = 5432, database="espace_intellectuel", 
                        user="postgres", password=st.dbw)

In [19]:
conn.close()

astronomers.mathshistory :

pk_mathshistory serial PRIMARY KEY,<br/>
name varchar NOT NULL,<br/>
dates varchar,<br/>
url varchar,<br/>
short_name varchar,<br/>
page_html xml,<br/>
page_html_txt text,<br/>
notes text,<br/> 
ts_creation timestamp

## Find DBPedia's persons' URIs

### Get persons list from database

In [15]:
q2 = """
select pk_mathshistory, "name", substring(dates, 1, 4)::INT as birth_date
from mathshistory.mathshistory m 
order by birth_date
"""

In [22]:
### Get the list of persons with birth year
with conn.cursor() as curs:
    curs.execute(q2)
    rs = curs.fetchall()

In [23]:
rs[100:103]

[(101, 'Egnatio Danti', 1536),
 (102, 'Francesco Barozzi', 1537),
 (103, 'Christopher Clavius', 1538)]

### Prepare SPARQL query

#### DBPedia text search syntax

* https://docs.openlinksw.com/virtuoso/textexprsyntax/
* Search string here: https://dbpedia.org/fct/ (e.g. Mästlin or Hérigone)
  * https://dbpedia.org/fct/facet.vsp?cmd=text&sid=11757
  * Then click on View query as SPARQL


This syntax seems to work:

 * ?label bif:contains '("pierre" AND "Hérigone")'
 * the language is to restrictive, I commented the line
  

In [25]:
endpoint = "https://dbpedia.org/sparql"

In [26]:
q1 = """
SELECT DISTINCT ?s ?label ( substr(str(?date), 1, 4)  as ?birthYear) ?dct 
WHERE {
            ?s a dbo:Person;
                   rdfs:label ?label . 
     OPTIONAL {?s   dbo:birthDate|dbp:birthDate ?date.}
      OPTIONAL { ?s dcterms:subject ?dct.
      FILTER (contains(str(?dct), 'births'))
}
            FILTER (lang(?label) = 'en'). 
            ?label bif:contains "'Francesco Barozzi'"
}
"""

In [27]:
rs[:3]

[(2, 'Jia Xian', 1010),
 (1, 'al-Nasawi', 1010),
 (3, 'Hermann of Reichenau', 1013)]

In [28]:
### Execute the SPARQL query
qr = spqf.get_json_sparql_result(endpoint,q1)


In [29]:
r = [l for l in spqf.sparql_result_to_list(qr)]
print(len(r))
r[:50]

2


[['http://dbpedia.org/resource/Francesco_Barozzi',
  'Francesco Barozzi',
  '',
  'http://dbpedia.org/resource/Category:1537_births'],
 ['http://dbpedia.org/resource/Francesco_Barozzi_(bishop)',
  'Francesco Barozzi (bishop)',
  '',
  '']]

In [30]:
type(r[:50])

list

In [31]:
for t_pers in rs[:3]:
    
    p = list(t_pers)
    parts = p[1].replace("'", ' ')\
                .replace('-', ' ').replace('  ', ' ').split(' ')
    pp = [part for part in parts if len(part) > 2]
    ppt = '"'+ '" AND "'.join(pp) + '"'
    print(ppt)

"Jia" AND "Xian"
"Nasawi"
"Hermann" AND "Reichenau"


### Find mathematicians in DBPedia

This SPARQL query leverages the Virtuoso full text query syntax of DBPedia


In [33]:
l = ['pk_mathshistory', 'name', 'birthDate', 'error']
### Commented to avoid emptying logs
# with open("data/logs_1_bis.csv", "w") as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(l)

In [34]:
### This script retrieves the DBPedia persons using the name
q_result= []
for t_pers in rs:
    
    p = list(t_pers)
    
    parts = p[1].replace("'", ' ')\
                .replace('-', ' ').replace('  ', ' ').split(' ')
    pp = [part for part in parts if len(part) > 2]
    ppt = '"'+ '" AND "'.join(pp) + '"'
    
    q = """
        SELECT DISTINCT ?s ?label ( substr(str(?date), 1, 4)  as ?birthYear) ?dct 
        WHERE {
                    ?s a dbo:Person;
                           rdfs:label ?label . 
             OPTIONAL {?s   dbo:birthDate|dbp:birthDate ?date.}
              OPTIONAL { ?s dcterms:subject ?dct.
              FILTER (contains(str(?dct), 'births'))
        }
                    ### This filter was omitted in order to raise the recall
                    # FILTER (lang(?label) = 'en'). 
                    ?label bif:contains '(""" + ppt + """)'
        }
        """

    
    try:
        sparql = SPARQLWrapper(endpoint)
        sparql.setQuery(q)
        sparql.setReturnFormat(JSON)
        sparql.setMethod('POST')
        qr = sparql.queryAndConvert()
        r = [l for l in spqf.sparql_result_to_list(qr)]
        q_result.append(p + [r[:20]])
    except Exception as e:
        
        print(e)

        ttt = p + [str(e)]
        with open("data/logs_1.csv", "a") as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerow(ttt)


In [46]:
### All rows where treated without errors
len(q_result), q_result[:5]

(3010,
 [[2,
   'Jia Xian',
   1010,
   [['http://dbpedia.org/resource/Jia_Xian', 'Jia Xian', '', ''],
    ['http://dbpedia.org/resource/Jia_Xian', 'Jia Xian', '', ''],
    ['http://dbpedia.org/resource/Jia_Xian', 'Jia Xian', '', ''],
    ['http://dbpedia.org/resource/Jia_Xian', 'Jia Xian', '', ''],
    ['http://dbpedia.org/resource/Jia_Xian', 'Jia Xian', '', '']]],
  [1,
   'al-Nasawi',
   1010,
   [['http://dbpedia.org/resource/Alī_ibn_Ahmad_al-Nasawī',
     'Ali ibn Ahmad al-Nasawi',
     '',
     'http://dbpedia.org/resource/Category:1010s_births'],
    ['http://dbpedia.org/resource/Shihab_al-Din_Muhammad_al-Nasawi',
     'Shihab al-Din Muhammad al-Nasawi',
     '',
     '']]],
  [3,
   'Hermann of Reichenau',
   1013,
   [['http://dbpedia.org/resource/Hermann_of_Reichenau',
     'Hermann of Reichenau',
     '1013',
     'http://dbpedia.org/resource/Category:1013_births'],
    ['http://dbpedia.org/resource/Hermann_of_Reichenau',
     'Hermann de Reichenau',
     '1013',
     'http:

[27 June 2023] no errors in logs, 3010 results

### Identify persons using the year of birth

N.B. There are multiple rows in the SPARQL result because of the multiple languages. The 'en' language filter was omittet in order to raise recall

The next cell will reduce the redundant URIs

In [None]:
l_match = []
l_no_match_alternative = []
l_no_match = []
for r in q_result:  #[:20]:
    
    lm = []
    if len(r[3]) > 0:
        # for each element in the DBPedia answer
        for e in r[3]:
            
            # take the date if it exists
            if e[2]:
                d3 = e[2]
                try:
                    ## add a span of 2 years before or after
                    if ((r[2] <= int(d3) + 2 ) and (r[2] >= int(d3) - 2 )):
                        lm =[r[0],r[1],r[2], d3, e[1], e[0]]
                        l_match.append(lm)
                except Exception as e:
                    print(r[0],r[1],r[2],e)
            if len(lm) > 0:
                break

        if len(lm) == 0:
            l_no_match_alternative.append(r)
    else:
        l_no_match.append(r)

#### Matches

1761 matches

In [41]:
len(l_match), l_match[99:102]

(1761,
 [[244,
   'Gottfried Leibniz',
   1646,
   '1646',
   'Gottfried Wilhelm Leibniz',
   'http://dbpedia.org/resource/Gottfried_Wilhelm_Leibniz'],
  [246,
   'Denis Papin',
   1647,
   '1647',
   'Denis Papin',
   'http://dbpedia.org/resource/Denis_Papin'],
  [248,
   'Giovanni Ceva',
   1647,
   '1647',
   'Giovanni Ceva',
   'http://dbpedia.org/resource/Giovanni_Ceva']])

#### Persons with no query answer (no match in SPARQL query result)

558 without any match

In [43]:
len(l_no_match), l_no_match[30:35]

(558,
 [[188, 'Andrea Tacquet', 1612, []],
  [202, 'Nicolaus Mercator', 1620, []],
  [211, 'Stephano degli Angeli', 1623, []],
  [218, 'Pietro Mengoli', 1626, []],
  [221, 'Johann Hudde', 1628, []]])

In [39]:
df_no_match = pd.DataFrame([e[:3] for e in l_no_match])
df_no_match.columns = ['pk_mathshistory', 'name_year_mathshistory', 'birth_year_mathshistory']
df_no_match.head()

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory
0,6,Shen Kua,1031
1,12,Rabbi Ben Ezra,1092
2,15,Gherard,1114
3,17,Sharaf al-Din al-Tusi,1135
4,18,Reinher of Paderborn,1140


In [391]:
### Persons with no query answer
fp = "data/df_no_match_dbpedia.csv"
df_no_match.to_csv(fp)

#### Persons with query answer and match using birthy year

In [50]:
df_match = pd.DataFrame(l_match)
df_match.columns = ['pk_mathshistory', 'name_year_mathshistory', 
                    'birth_year_mathshistory', 'birth_year_dbpedia', 'name_year_dbpedia', 'uri' ]
df_match.head(3)

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_year_dbpedia,uri
0,3,Hermann of Reichenau,1013,1013,Hermann of Reichenau,http://dbpedia.org/resource/Hermann_of_Reichenau
1,5,al-Zarqali,1029,1029,Az-Zarqali,http://dbpedia.org/resource/Abū_Isḥāq_Ibrāhīm_...
2,7,Omar Khayyam,1048,1048,Omar Khayyam,http://dbpedia.org/resource/Omar_Khayyam


In [389]:
fp = "data/df_match_dbpedia.csv"
df_match.to_csv(fp)

### Persons with no match but additional information

In [52]:
len(l_no_match_alternative), l_no_match_alternative[10:12]

(691,
 [[22,
   'Li Zhi',
   1192,
   [['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi',
     'Cao Zhi',
     '--12',
     'http://dbpedia.org/resource/Category:192_births'],
    ['http://dbpedia.org/resource/Cao_Zhi_(politician)'

### New match using additional information

In [None]:
l_match_2 = []
l_no_match_alternative_2 = []
l_no_match_2 = []
for r in l_no_match_alternative:
    
    lm = []
    # for each element in the DBPedia answer
    for e in r[3]:

        # take the date if it exists
        if e[3] and 'birth' in e[3]:
            
            try:
                d3 = str(re.findall(r'\d{4}', e[3])[0])
                # print(d3)
                if (str(r[2])[:3] == str(d3)[:3]):
                    lm =[r[0],r[1],r[2], d3, e[1], e[0]]
                    l_match_2.append(lm)
            except Exception as e:
                print(r[0],r[1],r[2],e)
        if len(lm) > 0:
            break

    if len(lm) == 0:
        l_no_match_alternative_2.append(r)


In [54]:
len(l_match_2), l_match_2[:5]

(507,
 [[1,
   'al-Nasawi',
   1010,
   '1010',
   'Ali ibn Ahmad al-Nasawi',
   'http://dbpedia.org/resource/Alī_ibn_Ahmad_al-Nasawī'],
  [9,
   'Abraham bar Hiyya',
   1070,
   '1070',
   'Abraham bar Hiyya',
   'http://dbpedia.org/resource/Abraham_bar_Hiyya'],
  [14,
   'Bhaskara II',
   1114,
   '1110',
   'Bhaskara II',
   'http://dbpedia.org/resource/Bhāskara_II'],
  [19,
   'Robert Grosseteste',
   1168,
   '1168',
   'Robert Grosseteste',
   'http://dbpedia.org/resource/Robert_Grosseteste'],
  [20,
   'Fibonacci',
   1170,
   '1170',
   'Fibonacci',
   'http://dbpedia.org/resource/Fibonacci']])

### Merge matches

In [55]:
print(len(l_match))
l_match_1 = l_match + l_match_2; len(l_match_1), len(l_match_2)

1761


(2268, 507)

In [63]:
df_match = pd.DataFrame(sorted(l_match_1, key= lambda row: row[2]))
df_match.columns = ['pk_mathshistory', 'name_year_mathshistory', 
                    'birth_year_mathshistory', 'birth_year_dbpedia', 'name_dbpedia', 'uri' ]
len(df_match)

2268

In [64]:
df_match.iloc[200:203]

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_dbpedia,uri
200,279,Samuel Clarke,1675,1675,Samuel Clarke,http://dbpedia.org/resource/Samuel_Clarke
201,281,Jacopo Riccati,1676,1676,Jacopo Riccati,http://dbpedia.org/resource/Jacopo_Riccati
202,282,Jacques Cassini,1677,1677,Jacques Cassini,http://dbpedia.org/resource/Jacques_Cassini


In [65]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(df_match.sort_values(by='pk_mathshistory'), classes="display",

     columnDefs=[{"className": "dt-left", "targets": "_all"}],
     scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_dbpedia,uri
Loading... (need help?),,,,,,


In [58]:
### replace former file
fp = "data/df_match_dbpedia.csv"
df_match.to_csv(fp)

### Import into database

In [74]:
lv = df_match[['pk_mathshistory', 'uri']].values.tolist()
len(lv), lv[:3]

(2268,
 [[1, 'http://dbpedia.org/resource/Alī_ibn_Ahmad_al-Nasawī'],
  [3, 'http://dbpedia.org/resource/Hermann_of_Reichenau'],
  [5, 'http://dbpedia.org/resource/Abū_Isḥāq_Ibrāhīm_al-Zarqālī']])

In [82]:
tuples = [tuple(e) for e in lv] ; tuples[:3]

[(1, 'http://dbpedia.org/resource/Alī_ibn_Ahmad_al-Nasawī'),
 (3, 'http://dbpedia.org/resource/Hermann_of_Reichenau'),
 (5, 'http://dbpedia.org/resource/Abū_Isḥāq_Ibrāhīm_al-Zarqālī')]

In [83]:
cur = conn.cursor()

In [84]:
cq = """
create table mathshistory.dbpedia (id integer, uri varchar);
"""

In [85]:
cur.execute(cq)

In [86]:
execute_batch(cur, """INSERT INTO mathshistory.dbpedia (id, uri)
VALUES (%s, %s)""", tuples)

In [88]:
conn.commit()

In [89]:
iq = """
insert into mathshistory."statement" (fk_subject_instance, text_value, fk_property, fk_graph, import_metadata)
select id, uri, 1, 3011, '20230627_1'  from mathshistory.dbpedia;
"""

In [90]:
with conn.cursor() as cur:
    ### Commented to avoid disruption
    # cur.execute(iq)

In [91]:
conn.commit()

## Remainining non matched with additional information

In [61]:
len(l_no_match_alternative_2)  #, l_no_match_alternative_2[5:7]

184

In [440]:
df_no_match_alternative_2 = pd.DataFrame(sorted(l_no_match_alternative_2, key= lambda row: row[2]))
df_no_match_alternative_2.columns = ['pk_mathshistory', 'name_year_mathshistory', 
                    'birth_year_mathshistory', 'dbpedia_query result']
df_no_match_alternative_2.head()

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,dbpedia_query result
0,2,Jia Xian,1010,"[[http://dbpedia.org/resource/Jia_Xian, Jia Xi..."
1,4,Sripati,1019,"[[http://dbpedia.org/resource/Sripati_Mishra, ..."
2,11,Hemchandra,1089,[[http://dbpedia.org/resource/Hemchandra_Barua...
3,16,Al-Samawal,1130,[[http://dbpedia.org/resource/Samawal_Merghani...
4,22,Li Zhi,1192,"[[http://dbpedia.org/resource/Cao_Zhi, Cao Zhi..."


In [410]:
fp = "data/df_no_match_alternative_2.csv"
df_no_match_alternative_2.to_csv(fp)

### New match attempt

[27 June 2023]

* Given the results of this matching, and their uncertainty for the later period:
  * first inspect wikidata matching and overall matching analysis in the database
  * analyse and import manually




In [441]:
len(df_no_match), df_no_match.head()

(558,
    pk_mathshistory name_year_mathshistory  birth_year_mathshistory
 0                6               Shen Kua                     1031
 1               12         Rabbi Ben Ezra                     1092
 2               15                Gherard                     1114
 3               17  Sharaf al-Din al-Tusi                     1135
 4               18   Reinher of Paderborn                     1140)

In [442]:
len(df_no_match_alternative_2), df_no_match_alternative_2.head()

(177,
    pk_mathshistory name_year_mathshistory  birth_year_mathshistory  \
 0                2               Jia Xian                     1010   
 1                4                Sripati                     1019   
 2               11             Hemchandra                     1089   
 3               16             Al-Samawal                     1130   
 4               22                 Li Zhi                     1192   
 
                                 dbpedia_query result  
 0  [[http://dbpedia.org/resource/Jia_Xian, Jia Xi...  
 1  [[http://dbpedia.org/resource/Sripati_Mishra, ...  
 2  [[http://dbpedia.org/resource/Hemchandra_Barua...  
 3  [[http://dbpedia.org/resource/Samawal_Merghani...  
 4  [[http://dbpedia.org/resource/Cao_Zhi, Cao Zhi...  )

In [443]:
df_no_match_alternative_2.columns

Index(['pk_mathshistory', 'name_year_mathshistory', 'birth_year_mathshistory',
       'dbpedia_query result'],
      dtype='object')

In [444]:
df_to_find = pd.concat([df_no_match, df_no_match_alternative_2[['pk_mathshistory', 'name_year_mathshistory', 'birth_year_mathshistory']]],
                      names=['pk_mathshistory', 'name_year_mathshistory', 'birth_year_mathshistory',])
df_to_find.head()

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory
0,6,Shen Kua,1031
1,12,Rabbi Ben Ezra,1092
2,15,Gherard,1114
3,17,Sharaf al-Din al-Tusi,1135
4,18,Reinher of Paderborn,1140


In [445]:
len(df_to_find), df_to_find.tail()

(735,
      pk_mathshistory name_year_mathshistory  birth_year_mathshistory
 172             2773       Nicolas Bourbaki                     1935
 173             2768      Hitoshi Kumano-Go                     1935
 174             2892          John O'Connor                     1945
 175             2923              John Carr                     1948
 176             2987         Richard Taylor                     1962)

In [446]:
df_values = df_to_find.sort_values(by='birth_year_mathshistory').values.tolist()
list(df_values)

[[2, 'Jia Xian', 1010],
 [4, 'Sripati', 1019],
 [6, 'Shen Kua', 1031],
 [11, 'Hemchandra', 1089],
 [12, 'Rabbi Ben Ezra', 1092],
 [15, 'Gherard', 1114],
 [16, 'Al-Samawal', 1130],
 [17, 'Sharaf al-Din al-Tusi', 1135],
 [18, 'Reinher of Paderborn', 1140],
 [22, 'Li Zhi', 1192],
 [23, 'John Sacrobosco', 1195],
 [24, 'Albertus', 1200],
 [27, 'Roger Bacon', 1214],
 [28, 'Johannes Campanus', 1220],
 [29, "Muhyi l'din al-Maghribi", 1220],
 [30, 'Jordanus Nemorarius', 1225],
 [33, 'Jacob ben Tibbon', 1236],
 [35, 'Shams al-Din al-Samarqandi', 1250],
 [38, 'Zhu Shijie', 1260],
 [40, 'Levi ben Gerson', 1288],
 [42, 'Thomas Bradwardine', 1295],
 [44, 'Al-Khalili', 1320],
 [45, 'Nicholas Oresme', 1323],
 [46, 'Mahendra Suri', 1340],
 [47, 'Narayana', 1340],
 [48, 'Madhava', 1350],
 [50, 'Paramesvara', 1370],
 [52, 'al-Kashi', 1390],
 [54, 'Al-Umawi', 1400],
 [55, 'Nicholas Kryffs', 1401],
 [57, 'Leone Battista Alberti', 1404],
 [59, 'Piero della Francesca', 1420],
 [65, 'Nicolas Chuquet', 1445],


In [449]:
### Query on just last or long name 
q_result= []
for t_pers in df_values: #[30:40]:
    
    p = list(t_pers)
    
    parts = p[1].replace("'", ' ')\
                .replace('-', ' ').replace('  ', ' ').split(' ')
    pp = [part for part in parts if len(part) > 2][-1]
    # print(pp)
    ppt = '"'+pp+'"'
    
    q = """
        SELECT DISTINCT ?s ?label ( substr(str(?date), 1, 4)  as ?birthYear) ?dct 
        WHERE {
                    ?s a dbo:Person;
                           rdfs:label ?label . 
             OPTIONAL {?s   dbo:birthDate|dbp:birthDate ?date.}
              OPTIONAL { ?s dcterms:subject ?dct.
              FILTER (contains(str(?dct), 'births'))
        }
                   # FILTER (lang(?label) IN ('en', 'de', 'fr', 'it', 'es' )). 
                    ?label bif:contains '(""" + ppt + """)'
        }
        """

    
    try:
        sparql = SPARQLWrapper(endpoint)
        sparql.setQuery(q)
        sparql.setReturnFormat(JSON)
        sparql.setMethod('POST')
        qr = sparql.queryAndConvert()
        r = [l for l in spqf.sparql_result_to_list(qr)]
        q_result.append(p + [r[:20]])
    except Exception as e:
        
        print(e)

        ttt = p + [str(e)]
        with open("data/logs_1.csv", "a") as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerow(ttt)


In [450]:
len(q_result), q_result[90:100]

(735,
 [[357, 'Nicole-Reine Etable de Labrière', 1723, []],
  [380,
   'Monteiro da Rocha',
   1734,
   [['http://dbpedia.org/resource/Caio_Vidal',
     'Caio Vidal Rocha',
     '2000',
     'http://dbpedia.org/resource/Category:2000_births'],
    ['http://dbpedia.org/resource/Carey_Morgan',
     'José Cayetano Valadés Rocha',
     '1884',
     'http://dbpedia.org/resource/Category:1884_births'],
    ['http://dbpedia.org/resource/Carla_Salomé_Rocha',
     'Carla Salomé Rocha',
     '1990',
     'http://dbpedia.org/resource/Category:1990_births'],
    ['http://dbpedia.org/resource/Carla_Salomé_Rocha',
     'Carla Salomé Rocha',
     '1990',
     'http://dbpedia.org/resource/Category:1990_births'],
    ['http://dbpedia.org/resource/Carla_Salomé_Rocha',
     'Carla Salomé Rocha',
     '1990',
     'http://dbpedia.org/resource/Category:1990_births'],
    ['http://dbpedia.org/resource/Carles_Fontserè',
     'José Cayetano Valadés Rocha',
     '1916',
     'http://dbpedia.org/resource/Catego

### New identification attempt

In [470]:
l_match = []
l_no_match_alternative = []
l_no_match = []
for r in q_result:  #[:20]:
    
    lm = []
    if len(r[3]) > 0:
        # for each element in the DBPedia answer
        for e in r[3]:
            
            # take the date if it exists
            if e[2]:
                d3 = e[2]
                try:
                    ### Restrict time span for better precision ! 
                    if ((r[2] <= int(d3) + 0 ) and (r[2] >= int(d3) - 0 )):
                        lm =[r[0],r[1],r[2], d3, e[1], e[0]]
                        l_match.append(lm)
                except Exception as e:
                    print(r[0],r[1],r[2],e)
                    
            if len(lm) > 0:
                break

        if len(lm) == 0:
            l_no_match_alternative.append(r)
    else:
        l_no_match.append(r)

2 Jia Xian 1010 invalid literal for int() with base 10: '--04'
2 Jia Xian 1010 invalid literal for int() with base 10: '--04'
2 Jia Xian 1010 invalid literal for int() with base 10: '--04'
2 Jia Xian 1010 invalid literal for int() with base 10: '--04'
2 Jia Xian 1010 invalid literal for int() with base 10: '--03'
6 Shen Kua 1031 invalid literal for int() with base 10: 'Nove'
16 Al-Samawal 1130 invalid literal for int() with base 10: 'c. 1'
16 Al-Samawal 1130 invalid literal for int() with base 10: 'c. 1'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 1135 invalid literal for int() with base 10: 'http'
17 Sharaf al-Din al-Tusi 113

In [471]:
### Persons with no query answer
len(l_no_match), l_no_match[:5]

(91,
 [[15, 'Gherard', 1114, []],
  [50, 'Paramesvara', 1370, []],
  [55, 'Nicholas Kryffs', 1401, []],
  [73, 'Charles de Bouvelles', 1471, []],
  [82, 'Petrus Apianus', 1495, []]])

In [472]:
len(l_match), l_match[:5]

(117,
 [[6,
   'Shen Kua',
   1031,
   '1031',
   'Šen Kua',
   'http://dbpedia.org/resource/Shen_Kuo'],
  [94,
   'Peter Ramus',
   1515,
   '1515',
   'Petrus Ramus',
   'http://dbpedia.org/resource/Petrus_Ramus'],
  [221,
   'Johann Hudde',
   1628,
   '1628',
   'Johannes Hudde',
   'http://dbpedia.org/resource/Johannes_Hudde'],
  [382,
   'Antonio Mario Lorgna',
   1735,
   '1735',
   'Antonio Maria Lorgna',
   'http://dbpedia.org/resource/Antonio_Maria_Lorgna'],
  [430,
   'Nikolai Fuss',
   1755,
   '1755',
   'Nicolas Fuss',
   'http://dbpedia.org/resource/Nicolas_Fuss']])

In [473]:
df_match_3 = pd.DataFrame(l_match)
df_match_3.columns = ['pk_mathshistory', 'name_year_mathshistory', 
                    'birth_year_mathshistory', 'birth_year_dbpedia', 'name_year_dbpedia', 'uri' ]
df_match_3.head()

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_year_dbpedia,uri
0,6,Shen Kua,1031,1031,Šen Kua,http://dbpedia.org/resource/Shen_Kuo
1,94,Peter Ramus,1515,1515,Petrus Ramus,http://dbpedia.org/resource/Petrus_Ramus
2,221,Johann Hudde,1628,1628,Johannes Hudde,http://dbpedia.org/resource/Johannes_Hudde
3,382,Antonio Mario Lorgna,1735,1735,Antonio Maria Lorgna,http://dbpedia.org/resource/Antonio_Maria_Lorgna
4,430,Nikolai Fuss,1755,1755,Nicolas Fuss,http://dbpedia.org/resource/Nicolas_Fuss


In [474]:
## Explorer les données importées: on voit que les différences des noms sont importantes
# Configuration:  https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(df_match_3.sort_values(by='birth_year_mathshistory'), classes="display", scrollY="200px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_year_dbpedia,uri
Loading... (need help?),,,,,,


#### Again new match using additional information

In [None]:
l_match_2 = []
l_no_match_alternative_2 = []
l_no_match_2 = []
for r in l_no_match_alternative:
    
    lm = []
    # for each element in the DBPedia answer
    for e in r[3]:

        # take the date if it exists
        if e[3] and 'birth' in e[3]:
            
            try:
                d3 = str(re.findall(r'\d{4}', e[3])[0])
                # print(d3)
                if (str(r[2])[:3] == str(d3)[:3]):
                    lm =[r[0],r[1],r[2], d3, e[1], e[0]]
                    l_match_2.append(lm)
            except Exception as e:
                print(r[0],r[1],r[2],e)
        if len(lm) > 0:
            break

    if len(lm) == 0:
        l_no_match_alternative_2.append(r)


In [476]:
len(l_match_2), l_match_2[:5]

(127,
 [[23,
   'John Sacrobosco',
   1195,
   '1195',
   'Sacrobosco',
   'http://dbpedia.org/resource/Johannes_de_Sacrobosco'],
  [28,
   'Johannes Campanus',
   1220,
   '1220',
   'Campanus van Novara',
   'http://dbpedia.org/resource/Campanus_of_Novara'],
  [45,
   'Nicholas Oresme',
   1323,
   '1320',
   'Mikołaj z Oresme',
   'http://dbpedia.org/resource/Nicole_Oresme'],
  [88,
   'Frederico Commandino',
   1506,
   '1509',
   'Federico Commandino',
   'http://dbpedia.org/resource/Federico_Commandino'],
  [101,
   'Egnatio Danti',
   1536,
   '1530',
   'Vincenzo Danti',
   'http://dbpedia.org/resource/Vincenzo_Danti']])

In [477]:
df_match_4 = pd.DataFrame(l_match_2)
df_match_4.columns = ['pk_mathshistory', 'name_year_mathshistory', 
                    'birth_year_mathshistory', 'birth_year_dbpedia', 'name_year_dbpedia', 'uri' ]
df_match_4.head()

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_year_dbpedia,uri
0,23,John Sacrobosco,1195,1195,Sacrobosco,http://dbpedia.org/resource/Johannes_de_Sacrob...
1,28,Johannes Campanus,1220,1220,Campanus van Novara,http://dbpedia.org/resource/Campanus_of_Novara
2,45,Nicholas Oresme,1323,1320,Mikołaj z Oresme,http://dbpedia.org/resource/Nicole_Oresme
3,88,Frederico Commandino,1506,1509,Federico Commandino,http://dbpedia.org/resource/Federico_Commandino
4,101,Egnatio Danti,1536,1530,Vincenzo Danti,http://dbpedia.org/resource/Vincenzo_Danti


In [478]:
## Explorer les données importées: on voit que les différences des noms sont importantes
# Configuration:  https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(df_match_4.sort_values(by='birth_year_mathshistory'), classes="display", scrollY="200px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,pk_mathshistory,name_year_mathshistory,birth_year_mathshistory,birth_year_dbpedia,name_year_dbpedia,uri
Loading... (need help?),,,,,,


In [190]:
l_no_match

[]

In [None]:
l_match

In [94]:
lr = []
for r in q_result[:5]:
    # if len(r) > 1:
    print(len(r[1:]))


1
1
1
2
0


In [97]:
lr = pd.Series([len(r[1:]) for r in q_result]); len(lr)

3009

In [98]:
lr.describe()

count    3009.000000
mean        1.714191
std         3.360252
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        20.000000
dtype: float64

In [100]:
len(lr[lr==0]), len(lr[lr > 1])

(702, 534)

In [101]:
lr[lr > 1].describe()

count    534.000000
mean       6.338951
std        6.060305
min        2.000000
25%        2.000000
50%        3.000000
75%        8.000000
max       20.000000
dtype: float64

In [79]:
df_result = pd.DataFrame(q_result); df_result.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,"[31, Guo Shoujing, 1231]","[http://dbpedia.org/resource/Guo_Shoujing, Guo...",,,,,,
1,"[32, Ramon Llull, 1235]","[http://dbpedia.org/resource/Ramon_Llull, Ramo...",,,,,,
2,"[33, Jacob ben Tibbon, 1236]",,,,,,,
3,"[34, Yang Hui, 1238]","[http://dbpedia.org/resource/Yang_Hui, Yang Hu...","[http://dbpedia.org/resource/Yang_Gyeong-hui, ...","[http://dbpedia.org/resource/Yang_Hui-cheon, Y...","[http://dbpedia.org/resource/Yang_Hui-chun, Ya...",[http://dbpedia.org/resource/Yang_Hui_(figure_...,"[http://dbpedia.org/resource/Yang_Kyong-hui, Y...","[http://dbpedia.org/resource/Yang-Hui_He, Yang..."
4,"[35, Shams al-Din al-Samarqandi, 1250]",,,,,,,


## Wikidata

```SPARQL
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

select distinct ?item ?itemLabel ?birthDate ?year
where {
    ?item wdt:P31 wd:Q5.  # Any instance of a human.
    ?item wdt:P106 wd:Q170790;
          wdt:P569 ?birthDate.
#    ?birthPlace wdt:P31 ?birthPlaceType.
#   filter not exists {?birthPlace a wdno:P6}.

BIND(REPLACE(str(?birthDate), "(.*)([0-9]{4})(.*)", "$2") AS ?year)
# FILTER(?year > 1537 )
### Filtrage sur les années comme entiers si souhaité
FILTER(xsd:integer(?year) > ("1537"^^xsd:integer - 2)  && (xsd:integer(?year) < "1537"^^xsd:integer + 2))
SERVICE wikibase:label { bd:serviceParam wikibase:language "en,nl" }

}
ORDER BY ?year
LIMIT 50
```

```SPARQL
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

select ?item ?itemLabel ?birthDate ?year
where {
    {
select distinct ?item ?itemLabel ?birthDate ?year
where {
     ?item wdt:P31 wd:Q5.  # Any instance of a human.
    # ?item wdt:P106 wd:Q170790 OR wd:Q333);
     ?item     wdt:P569 ?birthDate.
#    ?birthPlace wdt:P31 ?birthPlaceType.
#   filter not exists {?birthPlace a wdno:P6}.

   BIND(REPLACE(str(?birthDate), "(.*)([0-9]{4})(.*)", "$2") AS ?year)
# FILTER(?year > 1537 )
### Filtrage sur les années comme entiers si souhaité

    SERVICE wikibase:label { bd:serviceParam wikibase:language "en,nl" }
       }
    }
FILTER(contains("Roberval" , ?itemLabel) && xsd:integer(?year) > ("1537"^^xsd:integer - 2)  && (xsd:integer(?year) < "1537"^^xsd:integer + 2))
}

ORDER BY ?year
LIMIT 50

```

```SPARQL
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT * WHERE {
  ?sub a foaf:Person .
  ?sub rdfs:label ?lbl ;
 dbp:birthDate ?birthDate.
  ?lbl bif:contains "'danti'".
?birthDate bif:contains "'1536'".
#   ?lbl bif:contains "'*gnat*' AND 'dant*'" .
 # FILTER (?lbl LIKE '%gill%roberva%')
#FILTER (CONTAINS(str(?birthDate), '1536'))
filter(langMatches(lang(?lbl), "en"))
} 
LIMIT 10
```

In [11]:
q2 = """
SELECT pk_mathshistory, "name", dates, url FROM astronomers.mathshistory WHERE xml_is_well_formed(page_html_txt) IS FALSE ;
"""

In [53]:
with conn.cursor() as curs:
    try:   
        ### Décommenter pour nouvelle exécution
        # curs.execute(qu)
        # conn.commit()
    except Exception as e:
        print(e)

In [52]:
qu = """UPDATE astronomers.mathshistory SET page_html = page_html_txt::XML
WHERE xml_is_well_formed(page_html_txt) IS TRUE ;"""

In [25]:
q3 = """
SELECT name, dates, 
(xpath('//dt[text()="Born"]/following-sibling::dd/a/text()', page_html))[1]::text as lieu_naiss,
(xpath('//dt[text()="Born"]/following-sibling::dd/a/@href', page_html))[1]::text as url_lieu_naiss
FROM astronomers.mathshistory WHERE substr(dates, 1, 4)::INTEGER BETWEEN 1601 and 1650;
"""

In [26]:
result = sql_explore(q3)
print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

NameError: name 'sql_explore' is not defined

In [67]:
conn.close()