In [2]:
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML
import pprint
import csv
# from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from collections import Counter
from operator import itemgetter
import pandas as pd
from sqlalchemy import create_engine

# Calling the nltk package to merge the data of people without existing VIAF URI in the two datasets 
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder

from nltk.metrics import BigramAssocMeasures
from nltk.metrics import TrigramAssocMeasures
import numpy as np


In [3]:
query = """
PREFIX  egr:  <http://rdvocab.info/ElementsGr2/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?name ?sName ?uri ?year ?bio
WHERE
  {   { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( ( ( regex(?bio, "juriste", "i") || regex(?bio, "professeur de droit", "i") ) || regex(?bio, "docteur en droit", "i") ) || regex(?bio, "avocat", "i") ) || regex(?bio, "juge", "i") ) || regex(?bio, "magistrat", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
    UNION
      { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( regex(?bio, "économiste") || regex(?bio, "Economiste") ) || regex(?bio, "professeur d'économie", "i") ) || regex(?bio, "docteur en économie", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
  }
ORDER BY DESC(?uri)

"""

In [4]:
sparql = SPARQLWrapper("https://data.bnf.fr/sparql") ##, returnFormat=RDFXML)  [LOCALHOST]

In [5]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [6]:
try:
    rc_bnf = sparql.queryAndConvert()
except Exception as e:
    print(e)

In [7]:
# Number of rows in the result
len(rc_bnf['results']['bindings'])

11100

In [8]:
# Inspect the first three rows
i = 0
for l in rc_bnf['results']['bindings']:
    if i < 100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb12981404c#about'}, 'name': {'type': 'literal', 'value': 'Léon Garnier'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99996033'}, 'year': {'type': 'literal', 'value': '1836'}, 'bio': {'type': 'literal', 'value': "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb13484444m#about'}, 'name': {'type': 'literal', 'value': 'Gaston de Pawlowski'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9999219'}, 'year': {'type': 'literal', 'value': '1874'}, 'bio': {'type': 'literal', 'value': 'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb134841632#about'}, 'name': {'type': 'literal', 'value': 'Jean-Michel Berton'}, 'uri': {'type': 'uri', 'value': 'http://viaf.o

In [9]:
result_bnf = []
for l in rc_bnf['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            sName = l['sName']['value']
        except Exception as e:
            sName = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            bio = l['bio']['value']
        except Exception as e:
            bio = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        result_bnf.append([l['s']['value'], uri, name, sName, year, bio])        
            
        

In [10]:
print(len(result_bnf))
result_bnf[:2]

11100


[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"']]

In [11]:
query_2= """
PREFIX  dbo:  <http://dbpedia.org/ontology/>
PREFIX  dbp:  <http://dbpedia.org/property/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?uri ?name (year(xsd:dateTime(?Birth_Date)) AS ?year) ?abstract
WHERE
  {   { ?s  a              dbo:Economist ;
          # "Economist" has a class function, he explains the use of the "a".
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
          # We use this filter to preserve only the persons born after 1800. 
          # Here, the method is a little different because we have a date and not a year. 
          # We convert above the date to a year to have the same format as BnF Data.
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
     # We use this filter to remove the data concerning "Samuel Bowles" 
     # because there is mistakes in data i.e he is related to wrong people -eg. William Turner (cf. https://dbpedia.org/page/Samuel_Bowles_(economist)).
     # We will can fix this problem later, by entering data about him manually.
      }
    UNION
      { ?s  ?propriety  dbr:Economist
      # Here, "Economist" is a instance, we have as triplet "subject predicate object". 
      # In DBpedia, "Economist" is define as a instance and at the same time as a class. So we use both.
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }    
      }
    UNION
      { ?s  ?p             dbr:Jurist ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
       # We use this filter to remove the data concerning "Cicero". 
       # He appears in results, certainly because his date of birth contains the sign "-" (cf. https://dbpedia.org/page/Cicero).
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
      }
    UNION
      { ?s  ?p             dbr:Lawyer ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
      }
    UNION
      { ?s  a              dbr:Professor ;
            dbp:birthDate  ?Birth_Date ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( regex(?abstract, "lawyer", "i") || regex(?abstract, "jurist", "i") ) || regex(?abstract, "juriste", "i") ) || regex(?abstract, "attorney", "i") ) || regex(?abstract, "legal professional", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
      }
  }
ORDER BY DESC(?uri)
"""

In this query, we have made the choice to aggregate, by a UNION clause, several queries to maximise the results' number. Also we request the "economists" and the "jurists" in only one query. 

Obviously, we chose classes and instances directly related to our population, but also the "professor" instance, because some "economists" or "jurists" are in this instance (we have tried with and without them, and there more result when we use them). 

Also, we excluse all classes because they don't add more result, except the "Economist" class (we keep it) 

For exemple, we exclude the resource "personFunction" and the resource "Jurists" because they add no more data. Additionally, we keep only the "Professor" instance for the jurists (it returns result only for the jurists).

In [12]:
sparql = SPARQLWrapper("https://dbpedia.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [13]:
sparql.setQuery(query_2)
sparql.setReturnFormat(JSON)

In [14]:
rc_db = sparql.queryAndConvert()

In [15]:
# Number of rows in the result
len(rc_db['results']['bindings'])

8545

In [16]:
# Inspect the first three rows
i = 0
for l in rc_db['results']['bindings']:
    if i <100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/António_de_Almeida_Santos'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99921066'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'António de Almeida Santos'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1926'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Anita_Augspurg'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9976800'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Anita Augspurg'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1857'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Mason_Gaffney'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9960617'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Mason Gaffney'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1923'}}
{'s'

In [17]:
# Create a list with URI, VIAF URI, name, year
result_dbpedia = []
for l in rc_db['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            uri = ''
        result_dbpedia.append([l['s']['value'], uri, name, year])

In [24]:
# Inspect the first three of the list
result_dbpedia[:10]

[['http://dbpedia.org/resource/António_de_Almeida_Santos',
  'http://viaf.org/viaf/99921066',
  'António de Almeida Santos',
  '1926'],
 ['http://dbpedia.org/resource/Anita_Augspurg',
  'http://viaf.org/viaf/9976800',
  'Anita Augspurg',
  '1857'],
 ['http://dbpedia.org/resource/Mason_Gaffney',
  'http://viaf.org/viaf/9960617',
  'Mason Gaffney',
  '1923'],
 ['http://dbpedia.org/resource/Hermann_Heinrich_Gossen',
  'http://viaf.org/viaf/9939728',
  'Hermann Heinrich Gossen',
  '1810'],
 ['http://dbpedia.org/resource/Gottfried_Haberler',
  'http://viaf.org/viaf/99257315',
  'Gottfried Haberler',
  '1900'],
 ['http://dbpedia.org/resource/Michael_C._Burda',
  'http://viaf.org/viaf/9922987',
  'Michael C. Burda',
  '1959'],
 ['http://dbpedia.org/resource/Xavier_Vives',
  'http://viaf.org/viaf/9920331',
  'Xavier Vives',
  '1955'],
 ['http://dbpedia.org/resource/Vittorio_Emanuele_Orlando',
  'http://viaf.org/viaf/9914155',
  'Vittorio Emanuele Orlando',
  '1860'],
 ['http://dbpedia.org/reso

In [25]:
engine = create_engine('sqlite:///database_4.sqlite', echo=False)

In [26]:
df_bnf = pd.DataFrame([f for f in result_bnf if len(f[1]) > 0], columns=['uri_bnf', 'viaf', 'name_bnf', 'sName', 'year_bnf', 'bio_bnf'])
print(len(df_bnf))
df_bnf.fillna('')

df_bnf.head()

9764


Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794,"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb11919308t#about,http://viaf.org/viaf/99952938,François Perroux,,1903,Économiste. - Professeur d'économie politique ...
4,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852,"Docteur en droit (Paris, 1873)"


In [27]:
df_bnf.to_sql('df_bnf', con=engine)

ValueError: Table 'df_bnf' already exists.

In [28]:
df_dbp = pd.DataFrame([p for p in result_dbpedia if len(p[1]) > 0], columns=['uri_dbp', 'viaf', 'name_dbp', 'date_dbp'])
print(len(df_dbp))
df_dbp.fillna('')
df_dbp[-10:]

1747


Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp
1737,http://dbpedia.org/resource/Paul_A._Baran,http://viaf.org/viaf/100304945,Paul Alexander Baran,1909
1738,http://dbpedia.org/resource/Menachem_Elon,http://viaf.org/viaf/100286138,Menachem Elon,1923
1739,http://dbpedia.org/resource/József_Antall,http://viaf.org/viaf/100281452,József Antall,1932
1740,http://dbpedia.org/resource/Luiz_Carlos_Bresse...,http://viaf.org/viaf/100278275,Luiz Carlos Bresser-Pereira,1934
1741,http://dbpedia.org/resource/Michael_Kaser,http://viaf.org/viaf/100274542,Michael Kaser,1926
1742,http://dbpedia.org/resource/Merton_Miller,http://viaf.org/viaf/100258394,Merton Miller,1923
1743,http://dbpedia.org/resource/Peter_J._Hammond_(...,http://viaf.org/viaf/100246974,Peter Hammond,1945
1744,http://dbpedia.org/resource/José_Hermano_Saraiva,http://viaf.org/viaf/100238197,José Hermano Saraiva,1919
1745,http://dbpedia.org/resource/Alf_Ross,http://viaf.org/viaf/100233686,Alf Ross,1899
1746,http://dbpedia.org/resource/Max_Weber,http://viaf.org/viaf/100180950,,1864


In [29]:
df_dbp.to_sql('df_dbp', con=engine)

ValueError: Table 'df_dbp' already exists.

In [30]:
merged_df = pd.merge( df_dbp, df_bnf, on='viaf', how='inner', sort='date_dbp')

In [31]:
print(len(merged_df))

147


In [32]:
merged_df

Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://dbpedia.org/resource/John_Peters_Humphrey,http://viaf.org/viaf/100966624,John Peters Humphrey,1905,http://data.bnf.fr/ark:/12148/cb122145877#about,John Humphrey,,1905,Juriste. - A été professeur de droit internati...
1,http://dbpedia.org/resource/Louis_Renault_(jur...,http://viaf.org/viaf/107536763,Louis Renault,1843,http://data.bnf.fr/ark:/12148/cb12327654n#about,Louis Renault,,1843,Juriste. - Professeur de droit international à...
2,http://dbpedia.org/resource/Ronald_Dworkin,http://viaf.org/viaf/108173876,,1931,http://data.bnf.fr/ark:/12148/cb122775427#about,Ronald Myles Dworkin,,1931,Juriste. - Professeur de jurisprudence à la Ya...
3,http://dbpedia.org/resource/Gordon_Tullock,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
4,http://dbpedia.org/resource/Alan_Walters,http://viaf.org/viaf/108202298,Alan Walters,1926,http://data.bnf.fr/ark:/12148/cb122784368#about,Alan Arthur Walters,,1926,Economiste. - Professeur en poste à John Hopki...
5,http://dbpedia.org/resource/Charles_P._Kindleb...,http://viaf.org/viaf/108254255,Charles Kindleberger,1910,http://data.bnf.fr/ark:/12148/cb11909760t#about,Charles Poor Kindleberger,,1910,"Professeur d'économie, Massachusetts institute..."
6,http://dbpedia.org/resource/Charles_P._Kindleb...,http://viaf.org/viaf/108254255,Charles P. Kindleberger,1910,http://data.bnf.fr/ark:/12148/cb11909760t#about,Charles Poor Kindleberger,,1910,"Professeur d'économie, Massachusetts institute..."
7,http://dbpedia.org/resource/Theodore_Schultz,http://viaf.org/viaf/108315605,Theodore Schultz,1902,http://data.bnf.fr/ark:/12148/cb12279484c#about,Theodore William Schultz,,1902,Economiste. - Directeur du département d'écono...
8,http://dbpedia.org/resource/Paul_A._Freund,http://viaf.org/viaf/108565309,Paul Abraham Freund,1908,http://data.bnf.fr/ark:/12148/cb120906270#about,Paul Abraham Freund,,1908,"Professeur de droit, ""Harvard Law School"""
9,http://dbpedia.org/resource/Alexis_Jacquemin,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...


In [63]:
# Append DBpedia and BnF Data to the merged Dataframe who don't have viaf.org values in common.

# To achieve this, I am served on these pages: 
# https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html (en)
# https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html (en)
# http://www.python-simple.com/python-pandas/concatenations-joins-dataframe.php (fr)

result = merged_df.append([df_dbp, df_bnf], sort=False)
print(len(result))
result[-150:]

11658


Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
9614,,http://viaf.org/viaf/101645394,,,http://data.bnf.fr/ark:/12148/cb10684224c#about,Eduard Strasburger,,1882,Economiste. - A été professeur à l'Université ...
9615,,http://viaf.org/viaf/101632152,,,http://data.bnf.fr/ark:/12148/cb11271806m#about,Michał Jan Rostworowski,,1864,"Juriste, professeur de droit international et ..."
9616,,http://viaf.org/viaf/10157246,,,http://data.bnf.fr/ark:/12148/cb15531380w#about,Alfred Bligny,,1833,Magistrat
9617,,http://viaf.org/viaf/10150030471110960105,,,http://data.bnf.fr/ark:/12148/cb17131176p#about,Pierre Douare,,1899,Avocat à la cour d'appel de Paris. - Membre de...
9618,,http://viaf.org/viaf/10149717566810952796,,,http://data.bnf.fr/ark:/12148/cb171280161#about,Henry Roger,,1890,"Sous-préfet de Meaux, Seine-et-Marne (en 1936)..."
9619,,http://viaf.org/viaf/10149717179710950433,,,http://data.bnf.fr/ark:/12148/cb171268320#about,Paul Rolland,,1870,Président de chambre honoraire à la Cour de ca...
9620,,http://viaf.org/viaf/10148120495894791073,,,http://data.bnf.fr/ark:/12148/cb13372910t#about,Nicolas Bodoy,,1850,"Docteur en droit, avocat à Tunis"
9621,,http://viaf.org/viaf/10147101,,,http://data.bnf.fr/ark:/12148/cb15126315r#about,E. T. Hooley,,1842,"Explorateur et pionnier, juge de paix, candida..."
9622,,http://viaf.org/viaf/10146574881738150893,,,http://data.bnf.fr/ark:/12148/cb170462019#about,Charles Pelleport-Burète,,1827,Avocat et homme politique. - Fut sous-préfet d...
9623,,http://viaf.org/viaf/10146029554435822367,,,http://data.bnf.fr/ark:/12148/cb13358951f#about,Édouard Delpech,,1794,Avocat. - Substitut du procureur général à la ...


In [34]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_test=result
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_dbp'],result['name_bnf'])
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_bnf'],result['name_dbp'])
result_test['year'] = np.where(result['date_dbp'].isnull(),result['year_bnf'],result['date_dbp'])
result_test['year'] = np.where(result['year_bnf'].isnull(),result['date_dbp'],result['year_bnf'])


result_test=result_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf']]
result_test

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf
0,http://viaf.org/viaf/100966624,John Peters Humphrey,1905,http://dbpedia.org/resource/John_Peters_Humphrey,http://data.bnf.fr/ark:/12148/cb122145877#about
1,http://viaf.org/viaf/107536763,Louis Renault,1843,http://dbpedia.org/resource/Louis_Renault_(jur...,http://data.bnf.fr/ark:/12148/cb12327654n#about
2,http://viaf.org/viaf/108173876,,1931,http://dbpedia.org/resource/Ronald_Dworkin,http://data.bnf.fr/ark:/12148/cb122775427#about
3,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,http://data.bnf.fr/ark:/12148/cb11927239j#about
4,http://viaf.org/viaf/108202298,Alan Walters,1926,http://dbpedia.org/resource/Alan_Walters,http://data.bnf.fr/ark:/12148/cb122784368#about
5,http://viaf.org/viaf/108254255,Charles Kindleberger,1910,http://dbpedia.org/resource/Charles_P._Kindleb...,http://data.bnf.fr/ark:/12148/cb11909760t#about
6,http://viaf.org/viaf/108254255,Charles P. Kindleberger,1910,http://dbpedia.org/resource/Charles_P._Kindleb...,http://data.bnf.fr/ark:/12148/cb11909760t#about
7,http://viaf.org/viaf/108315605,Theodore Schultz,1902,http://dbpedia.org/resource/Theodore_Schultz,http://data.bnf.fr/ark:/12148/cb12279484c#about
8,http://viaf.org/viaf/108565309,Paul Abraham Freund,1908,http://dbpedia.org/resource/Paul_A._Freund,http://data.bnf.fr/ark:/12148/cb120906270#about
9,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,http://data.bnf.fr/ark:/12148/cb119084288#about


In [35]:
# Extract only rows without VIAF uri from DBpeida
df_dbp_test = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp', 'date_dbp'])

df_dbp_test_mask=df_dbp_test['viaf']==''
filtered_df_dbp_test = df_dbp_test[df_dbp_test_mask]
filtered_df_dbp_test

Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp
1747,http://dbpedia.org/resource/Luc-Normand_Tellier,,Luc-Normand Tellier,1944
1748,http://dbpedia.org/resource/Madhu_Verma,,Madhu Verma,1961
1749,http://dbpedia.org/resource/Magda_Kandil,,Magda ElSayed Kandil,1958
1750,http://dbpedia.org/resource/Magnus_Johannesson,,Magnus Johannesson,1964
1751,http://dbpedia.org/resource/Mahendra_P._Lama,,Mahendra P. Lama,1961
1752,http://dbpedia.org/resource/Mainul_Islam,,Mainul Islam,1950
1753,http://dbpedia.org/resource/Urs_Meisterhans,,Urs Meisterhans,1960
1754,http://dbpedia.org/resource/Rosalind_Blauer,,Rosalind Blauer,1943
1755,http://dbpedia.org/resource/Makoto_Yano,,Makoto Yano,1952
1756,http://dbpedia.org/resource/Krzysztof_Zamasz,,Krzysztof Zamasz,1974


In [67]:
# Extract only rows without VIAF uri from BnF Data
df_bnf_test = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'sName','year_bnf','bio_bnf'])

df_bnf_test_mask=df_bnf_test['viaf']==''
filtered_df_bnf_test = df_bnf_test[df_bnf_test_mask]
len(filtered_df_bnf_test)

1336

In [68]:
# merge the both dateframes 
filtered_dbp_bnf_test= filtered_df_dbp_test.append(filtered_df_bnf_test, sort=True)
len(filtered_dbp_bnf_test)

8134

In [69]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_f_test=filtered_dbp_bnf_test
result_f_test['name'] = np.where(filtered_dbp_bnf_test['name_bnf'].isnull(),filtered_dbp_bnf_test['name_dbp'],filtered_dbp_bnf_test['name_bnf'])
result_f_test['name'] = np.where(filtered_dbp_bnf_test['name_dbp'].isnull(),filtered_dbp_bnf_test['name_bnf'],filtered_dbp_bnf_test['name_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_test['date_dbp'].isnull(),filtered_dbp_bnf_test['year_bnf'],filtered_dbp_bnf_test['date_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_test['year_bnf'].isnull(),filtered_dbp_bnf_test['date_dbp'],filtered_dbp_bnf_test['year_bnf'])


result_f_test=result_f_test.loc[:,['name','year','uri_dbp','uri_bnf']]
sort_rft=result_f_test.sort_values(by='name', ascending=False)
sort_rft[700:750]

Unnamed: 0,name,year,uri_dbp,uri_bnf
6649,Vigneswaran Sanasee,1965,http://dbpedia.org/resource/Vigneswaran_Sanasee,
6647,Victorino de la Plaza,1840,http://dbpedia.org/resource/Victorino_de_la_Plaza,
6646,Victoria Neave,1980,http://dbpedia.org/resource/Victoria_Neave,
2147,Victor Său,1971,http://dbpedia.org/resource/Victor_Său,
6645,Victor San Andres Ziga,1945,http://dbpedia.org/resource/Victor_Ziga,
6644,Victor S. Johnson Jr.,1916,http://dbpedia.org/resource/Victor_S._Johnson_Jr.,
10119,Victor Resal,1807,,http://data.bnf.fr/ark:/12148/cb105558741#about
6643,Victor Perlo,1912,http://dbpedia.org/resource/Victor_Perlo,
6642,Victor Owusu,1923,http://dbpedia.org/resource/Victor_Owusu,
6641,Victor Ndoma-Egba,1956,http://dbpedia.org/resource/Victor_Ndoma-Egba,


In [74]:
# Test to find similarities between the names BnF Data and DBpedia with collocation 


from nltk.corpus import stopwords
stopset = stopwords.words('english')
stopset = stopwords.words('french')
stopset = stopwords.words('russian')
nltk.word_tokenize(result_f_test["name"][0])

result_f_test['tokenized_sents'] = result_f_test.apply(lambda row: nltk.word_tokenize(row['name']), axis=1)

bcf = TrigramCollocationFinder.from_documents(result_f_test['tokenized_sents'])
filter_stops = lambda w: w in stopset
bcf.apply_word_filter(filter_stops)
f= bcf.nbest(TrigramAssocMeasures.likelihood_ratio, 50)
print(f)

TypeError: expected string or bytes-like object

In [40]:
# attempt to change the 'NaN' names from DBpedia to name from BnF and the same for the reverse 
# (cf.https://stackoverflow.com/questions/29177498/python-pandas-replace-nan-in-one-column-with-value-from-corresponding-row-of-sec)
import pandas as pd
import numpy as np

def fx(x):
    if np.isnan(x['date_dbp']):
        return x['year_bnf ']
    else:
        return x['date_dbp']
result_test=result.apply(lambda x : fx(x),axis=1)

TypeError: ("ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''", 'occurred at index 0')

In [41]:
# Another method for trying to do it 
result.name_dbp.fillna(result.name_bnf, inplace=True)
del result['name_bnf']
result.columns = 'uri_dbp viaf name_dbp date_dbp uri_bnf sName year_bnf bio_bnf'.split()


ValueError: Length mismatch: Expected axis has 10 elements, new values have 8 elements

In [42]:
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })

In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

In [11]:
df3_merged = pd.merge(df1, df2)
df3_merged 

Unnamed: 0,user_id,first_name,last_name,email,image_url
0,id001,Rivi,Valti,rvalti0@example.com,http://example.com/img/id001.png
1,id002,Wynnie,McMurty,wmcmurty1@example.com,http://example.com/img/id002.jpg
2,id003,Kristos,Ivanets,kivanets2@example.com,http://example.com/img/id003.bmp
3,id004,Madalyn,Max,mmax3@example.com,http://example.com/img/id004.jpg
4,id005,Tobe,Riddich,triddich4@example.com,http://example.com/img/id005.png
