In [39]:
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML
import pprint
import csv
# from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from collections import Counter
from operator import itemgetter
import pandas as pd
from sqlalchemy import create_engine

# Calling the nltk package to merge the data of people without existing VIAF URI in the two datasets 
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder

from nltk.metrics import BigramAssocMeasures
from nltk.metrics import TrigramAssocMeasures
import numpy as np


In [40]:
query = """
PREFIX  egr:  <http://rdvocab.info/ElementsGr2/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?name ?sName ?uri ?year ?bio
WHERE
  {   { ?s  egr:biographicalInformation  ?bio ; 
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year) # the bind is used to keep only the date inside the URL.
        FILTER ( ?year > "1800" ) # We use this filter to preserve only the persons born after 1800.
        FILTER ( ( regex(?bio, "juriste", "i") || regex(?bio, "professeur de droit", "i") ) || regex(?bio, "docteur en droit", "i") )
        # This filter is necessary to have the desired population. the "||" sign is used as a "and/or".
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
            # This filter is used to merge data from BnF Data and DBpedia by the VIAF URI.
          }
      }
    UNION
      { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1800" )
        FILTER ( ( ( regex(?bio, "économiste") || regex(?bio, "Economiste") ) || regex(?bio, "professeur d'économie", "i") ) || regex(?bio, "docteur en économie", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
  }
ORDER BY DESC(?uri)

"""

In [41]:
sparql = SPARQLWrapper("https://data.bnf.fr/sparql") ##, returnFormat=RDFXML)  [LOCALHOST]

In [42]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [43]:
try:
    rc_bnf = sparql.queryAndConvert()
except Exception as e:
    print(e)

In [44]:
# Number of rows in the result
len(rc_bnf['results']['bindings'])

5867

In [45]:
# Inspect the first three rows
i = 0
for l in rc_bnf['results']['bindings']:
    if i < 100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb12981404c#about'}, 'name': {'type': 'literal', 'value': 'Léon Garnier'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99996033'}, 'year': {'type': 'literal', 'value': '1836'}, 'bio': {'type': 'literal', 'value': "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb13484444m#about'}, 'name': {'type': 'literal', 'value': 'Gaston de Pawlowski'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9999219'}, 'year': {'type': 'literal', 'value': '1874'}, 'bio': {'type': 'literal', 'value': 'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb11919308t#about'}, 'name': {'type': 'literal', 'value': 'François Perroux'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org

In [46]:
result_bnf = []
for l in rc_bnf['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            sName = l['sName']['value']
        except Exception as e:
            sName = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            bio = l['bio']['value']
        except Exception as e:
            bio = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        result_bnf.append([l['s']['value'], uri, name, sName, year, bio])        
            
        

In [47]:
print(len(result_bnf))
result_bnf[:2]

5867


[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"']]

In [57]:
query_2= """
PREFIX  dbo:  <http://dbpedia.org/ontology/>
PREFIX  dbp:  <http://dbpedia.org/property/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?uri ?name (year(xsd:dateTime(?Birth_Date)) AS ?year) ?abstract
WHERE
  {   { ?s  a              dbo:Economist ;
          # "Economist" has a class function, he explains the use of the "a".
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
          # We use this filter to preserve only the persons born after 1800. 
          # Here, the method is a little different because we have a date and not a year. 
          # We convert above the date to a year to have the same format as BnF Data.
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
     # We use this filter to remove the data concerning "Samuel Bowles" 
     # because there is mistakes in data i.e he is related to wrong people -eg. William Turner (cf. https://dbpedia.org/page/Samuel_Bowles_(economist)).
     # We will can fix this problem later, by entering data about him manually.
      }
    UNION
      { ?s  ?propriety  dbr:Economist
      # Here, "Economist" is a instance, we have as triplet "subject predicate object". 
      # In DBpedia, "Economist" is define as a instance and at the same time as a class. So we use both.
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
            OPTIONAL
              { ?s  dbp:name  ?name }
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
      }
    UNION
      { ?s  a              dbo:Professor ;
            dbp:birthDate  ?Birth_Date ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( ( regex(?abstract, "Economist", "i") || regex(?abstract, "économiste", "i") ) || regex(?abstract, "professeur d'économie", "i") ) || regex(?abstract, "economics professor", "i") ) || regex(?abstract, "docteur en économie", "i") ) || regex(?abstract, "PhD in Economics", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
      }
    UNION
      { ?s  ?p             dbr:Jurist ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
       # We use this filter to remove the data concerning "Cicero". 
       # He appears in results, certainly because his date of birth contains the sign "-" (cf. https://dbpedia.org/page/Cicero).
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
      }
    UNION
      { ?s  a              dbo:Lawyer ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
      }
    UNION
      { ?s  a              dbo:Professor ;
            dbp:birthDate  ?Birth_Date ;
            owl:sameAs     ?sameAs ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( regex(?abstract, "lawyer", "i") || regex(?abstract, "jurist", "i") ) || regex(?abstract, "juriste", "i") ) || regex(?abstract, "attorney", "i") ) || regex(?abstract, "legal professional", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        FILTER ( xsd:date(?Birth_Date) > "1800-01-01"^^xsd:date )
      }
  }
ORDER BY DESC(?uri)
"""

In this query, we have made the choice to aggregate, by a UNION clause, several queries to maximise the results' number. Also we request the "economists" and the "jurists" in only one query. 

Obviously, we chose classes and instances directly related to our population, but also the "professor" classe, because some "economists" or "jurists" are in this class (we have tried with and without them, and there more result when we use them). 

For exemple, we exclude the class "PersonFunction" and the resource "personFunction" because they add no more data.

In [58]:
sparql = SPARQLWrapper("https://dbpedia.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [59]:
sparql.setQuery(query_2)
sparql.setReturnFormat(JSON)

In [60]:
rc_db = sparql.queryAndConvert()

In [61]:
# Number of rows in the result
len(rc_db['results']['bindings'])

2218

In [62]:
# Inspect the first three rows
i = 0
for l in rc_db['results']['bindings']:
    if i <100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Mason_Gaffney'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9960617'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Mason Gaffney'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1923'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Hermann_Heinrich_Gossen'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9939728'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Hermann Heinrich Gossen'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1810'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Gottfried_Haberler'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99257315'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Gottfried Haberler'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1900'}}


In [63]:
# Create a list with URI, VIAF URI, name, year
result_dbpedia = []
for l in rc_db['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            uri = ''
        result_dbpedia.append([l['s']['value'], uri, name, year])

In [64]:
# Inspect the first three of the list
result_dbpedia[-10:]

[['http://dbpedia.org/resource/Max_Weber', '', '', '1864'],
 ['http://dbpedia.org/resource/Marie_Toussaint', '', '', '1987'],
 ['http://dbpedia.org/resource/Mark_D._Siljander', '', '', '1951'],
 ['http://dbpedia.org/resource/Marshall_F._McComb', '', '', '1894'],
 ['http://dbpedia.org/resource/Theo_Titus', '', '', '1920'],
 ['http://dbpedia.org/resource/Theodor_von_Guérard', '', '', '1863'],
 ['http://dbpedia.org/resource/Bektas_Beknazarov',
  '',
  'Bektas Beknazarov',
  '1956'],
 ['http://dbpedia.org/resource/Bektas_Beknazarov',
  '',
  'Бектас Бекназаров',
  '1956'],
 ['http://dbpedia.org/resource/Mustafa_Kamal_(judge)',
  '',
  'Mustafa Kamal',
  '1933'],
 ['http://dbpedia.org/resource/Nora_Ananieva', '', 'Nora Ananieva', '1938']]

In [69]:
# Join data of BnF Data and DBpedia
joined_data = []
for bnf in result_bnf:
    db_out = []
    for db in result_dbpedia:
        if bnf[1] == db[1] :
            db_out = db
    joined_data.append(bnf + db_out)

In [70]:
result_bnf[:10]

[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'],
 ['http://data.bnf.fr/ark:/12148/cb11919308t#about',
  'http://viaf.org/viaf/99952938',
  'François Perroux',
  '',
  '1903',
  "Économiste. - Professeur d'économie politique à la Faculté de droit de Lyon (1928-1937), puis à Paris (1937-1955). - Professeur d'analyse des faits économiques et sociaux au Collège de France (1955-1974). - Fondateur (en 1944), puis président de l'Institut de science économique appliquée"],
 ['http://data.bnf.fr/ark:/12148/cb13379520q#about',
  'http://viaf.org/viaf/9995247',
  '

In [71]:
print(len(joined_data))

5867


In [72]:
# A problem with the join appears because the people from BnF Data without VIAF URI are joined to a person DBpedia
joined_data[-10]

['http://data.bnf.fr/ark:/12148/cb177975050#about',
 '',
 'Lev Pavlovič Garkunov',
 '',
 '1901',
 "Joueur d'échecs et arbitre international (1956). - Ingénieur économiste",
 'http://dbpedia.org/resource/Nora_Ananieva',
 '',
 'Nora Ananieva',
 '1938']

In [73]:
final_result = [i for i in joined_data if 'dbpedia' in i[-4]]
print(len(final_result))
pprint.pprint(final_result)

658
[['http://data.bnf.fr/ark:/12148/cb108125478#about',
  'http://viaf.org/viaf/9838425',
  'Luigi Amoroso',
  '',
  '1886',
  "Mathématicien, professeur d'économie politique à l'Université de Rome "
  "(1926-1956). - Membre de l'Académie des Lincei (Rome)",
  'http://dbpedia.org/resource/Luigi_Amoroso',
  'http://viaf.org/viaf/9838425',
  'Luigi Amoroso',
  '1886'],
 ['http://data.bnf.fr/ark:/12148/cb119168550#about',
  'http://viaf.org/viaf/98356107',
  'Oskar Morgenstern',
  '',
  '1902',
  "Professeur d'économie, Universität Wien (1935-1938) ; Princeton university, "
  'N.J.(1938-1970)',
  'http://dbpedia.org/resource/Oskar_Morgenstern',
  'http://viaf.org/viaf/98356107',
  'Oskar Morgenstern',
  '1902'],
 ['http://data.bnf.fr/ark:/12148/cb119074897#about',
  'http://viaf.org/viaf/89700672',
  'John Hicks',
  '',
  '1904',
  'Economiste. - Prix Nobel de sciences économiques (1972)',
  'http://dbpedia.org/resource/John_Hicks',
  'http://viaf.org/viaf/89700672',
  'Sir John Hicks',


In [74]:
# I expect than a person without a VIAF URI are in any case joined.
final_result[-10:]

[['http://data.bnf.fr/ark:/12148/cb177975050#about',
  '',
  'Lev Pavlovič Garkunov',
  '',
  '1901',
  "Joueur d'échecs et arbitre international (1956). - Ingénieur économiste",
  'http://dbpedia.org/resource/Nora_Ananieva',
  '',
  'Nora Ananieva',
  '1938'],
 ['http://data.bnf.fr/ark:/12148/cb11557012w#about',
  '',
  'Bolesław Koskowski',
  '',
  '1870',
  'Economiste. - Sénateur',
  'http://dbpedia.org/resource/Nora_Ananieva',
  '',
  'Nora Ananieva',
  '1938'],
 ['http://data.bnf.fr/ark:/12148/cb112424331#about',
  '',
  "Jacques L'Huillier",
  '',
  '1917',
  "Professeur d'économie politique à l'université de Genève. - Diplômé de l'École libre de sciences politiques de Paris",
  'http://dbpedia.org/resource/Nora_Ananieva',
  '',
  'Nora Ananieva',
  '1938'],
 ['http://data.bnf.fr/ark:/12148/cb10596494m#about',
  '',
  'François Villegardelle',
  '',
  '1810',
  'Economiste. - Fouriériste, puis communiste',
  'http://dbpedia.org/resource/Nora_Ananieva',
  '',
  'Nora Ananieva',
 

In [75]:
engine = create_engine('sqlite:///database.sqlite', echo=False)

In [76]:
df_bnf = pd.DataFrame([f for f in result_bnf if len(f[1]) > 0], columns=['uri_bnf', 'viaf', 'name_bnf', 'sName', 'year_bnf', 'bio_bnf'])
print(len(df_bnf))
df_bnf.fillna('')

df_bnf.head()

5264


Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb11919308t#about,http://viaf.org/viaf/99952938,François Perroux,,1903,Économiste. - Professeur d'économie politique ...
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,Juriste. - Bibliophile


In [77]:
df_bnf.to_sql('df_bnf', con=engine)

ValueError: Table 'df_bnf' already exists.

In [78]:
df_dbp = pd.DataFrame([p for p in result_dbpedia if len(p[1]) > 0], columns=['uri_dbp', 'viaf', 'name_dbp', 'date_dbp'])
print(len(df_dbp))
df_dbp.fillna('')
df_dbp[-10:]

1011


Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp
1001,http://dbpedia.org/resource/Robert_Triffin,http://viaf.org/viaf/102376887,Robert Triffin,1911
1002,http://dbpedia.org/resource/Ludwik_Maurycy_Landau,http://viaf.org/viaf/101991628,Ludwik Maurycy Landau,1902
1003,http://dbpedia.org/resource/Fritz_Machlup,http://viaf.org/viaf/101860744,Fritz Machlup,1902
1004,http://dbpedia.org/resource/Thomas_Woods,http://viaf.org/viaf/10127341,Thomas Woods,1972
1005,http://dbpedia.org/resource/Max_Hirsch_(econom...,http://viaf.org/viaf/100595201,Max Hirsch,1852
1006,http://dbpedia.org/resource/Paul_A._Baran,http://viaf.org/viaf/100304945,Paul Alexander Baran,1909
1007,http://dbpedia.org/resource/Luiz_Carlos_Bresse...,http://viaf.org/viaf/100278275,Luiz Carlos Bresser-Pereira,1934
1008,http://dbpedia.org/resource/Michael_Kaser,http://viaf.org/viaf/100274542,Michael Kaser,1926
1009,http://dbpedia.org/resource/Merton_Miller,http://viaf.org/viaf/100258394,Merton Miller,1923
1010,http://dbpedia.org/resource/Peter_J._Hammond_(...,http://viaf.org/viaf/100246974,Peter Hammond,1945


In [79]:
df_dbp.to_sql('df_dbp', con=engine)

ValueError: Table 'df_dbp' already exists.

In [80]:
merged_df = pd.merge( df_dbp, df_bnf, on='viaf', how='inner', sort='date_dbp')

In [81]:
print(len(merged_df))

56


In [82]:
merged_df[-50:]

Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
6,http://dbpedia.org/resource/Erik_Lindahl,http://viaf.org/viaf/110766422,Erik Lindahl,1891,http://data.bnf.fr/ark:/12148/cb122779519#about,Erik Robert Lindahl,,1891,Economiste. - Spécialiste de la théorie économ...
7,http://dbpedia.org/resource/Tony_Atkinson,http://viaf.org/viaf/12353979,Tony Atkinson,1944,http://data.bnf.fr/ark:/12148/cb121949138#about,Anthony Barnes Atkinson,,1944,Économiste. - Professeur d'économie politique ...
8,http://dbpedia.org/resource/Thorold_Rogers,http://viaf.org/viaf/12659238,Thorold Rogers,1823,http://data.bnf.fr/ark:/12148/cb12347168c#about,James Edwin Thorold Rogers,,1823,Economiste. - A été professeur d'économie poli...
9,http://dbpedia.org/resource/Thomas_Nixon_Carver,http://viaf.org/viaf/12668196,Thomas Nixon Carver,1865,http://data.bnf.fr/ark:/12148/cb111973554#about,Thomas-Nixon Carver,,1865,Professeur d'économie politique à l'Université...
10,http://dbpedia.org/resource/Gérard_Debreu,http://viaf.org/viaf/14832766,Gérard Debreu,1921,http://data.bnf.fr/ark:/12148/cb122801806#about,Gerard Debreu,,1921,Professeur d'économie et de mathématiques à l'...
11,http://dbpedia.org/resource/Ragnar_Nurkse,http://viaf.org/viaf/14894400,Ragnar Nurkse,1907,http://data.bnf.fr/ark:/12148/cb12680064c#about,Ragnar Nurkse,,1907,Né en Estonie. - Etudes à Edimbourg et Vienne....
12,http://dbpedia.org/resource/William_Phillips_(...,http://viaf.org/viaf/193761900,William Phillips,1914,http://data.bnf.fr/ark:/12148/cb14476846t#about,Alban William Housego Phillips,,1914,Economiste
13,http://dbpedia.org/resource/John_Bates_Clark,http://viaf.org/viaf/19739027,John Bates Clark,1847,http://data.bnf.fr/ark:/12148/cb12278491d#about,John Bates Clark,,1847,Economiste. - A été professeur de sciences his...
14,"http://dbpedia.org/resource/Richard_Kahn,_Baro...",http://viaf.org/viaf/20699428,Richard Ferdinand Kahn,1905,http://data.bnf.fr/ark:/12148/cb122778155#about,Richard Ferdinand Kahn,,1905,Economiste. - Devient professeur à l'Universit...
15,http://dbpedia.org/resource/Friedrich_Hayek,http://viaf.org/viaf/2471646,Friedrich Hayek,1899,http://data.bnf.fr/ark:/12148/cb119070534#about,Friedrich August Hayek,,1899,Économiste. - Prix Nobel d'économie (1974) ave...


In [83]:
# Append DBpedia and BnF Data to the merged Dataframe who don't have viaf.org values in common.

# To achieve this, I am served on these pages: 
# https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html (en)
# https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html (en)
# http://www.python-simple.com/python-pandas/concatenations-joins-dataframe.php (fr)

result = merged_df.append([df_dbp, df_bnf], sort=False)
print(len(result))
result[:10]

6331


Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://dbpedia.org/resource/Gordon_Tullock,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
1,http://dbpedia.org/resource/Alan_Walters,http://viaf.org/viaf/108202298,Alan Walters,1926,http://data.bnf.fr/ark:/12148/cb122784368#about,Alan Arthur Walters,,1926,Economiste. - Professeur en poste à John Hopki...
2,http://dbpedia.org/resource/Charles_P._Kindleb...,http://viaf.org/viaf/108254255,Charles Kindleberger,1910,http://data.bnf.fr/ark:/12148/cb11909760t#about,Charles Poor Kindleberger,,1910,"Professeur d'économie, Massachusetts institute..."
3,http://dbpedia.org/resource/Charles_P._Kindleb...,http://viaf.org/viaf/108254255,Charles P. Kindleberger,1910,http://data.bnf.fr/ark:/12148/cb11909760t#about,Charles Poor Kindleberger,,1910,"Professeur d'économie, Massachusetts institute..."
4,http://dbpedia.org/resource/Theodore_Schultz,http://viaf.org/viaf/108315605,Theodore Schultz,1902,http://data.bnf.fr/ark:/12148/cb12279484c#about,Theodore William Schultz,,1902,Economiste. - Directeur du département d'écono...
5,http://dbpedia.org/resource/Alexis_Jacquemin,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
6,http://dbpedia.org/resource/Erik_Lindahl,http://viaf.org/viaf/110766422,Erik Lindahl,1891,http://data.bnf.fr/ark:/12148/cb122779519#about,Erik Robert Lindahl,,1891,Economiste. - Spécialiste de la théorie économ...
7,http://dbpedia.org/resource/Tony_Atkinson,http://viaf.org/viaf/12353979,Tony Atkinson,1944,http://data.bnf.fr/ark:/12148/cb121949138#about,Anthony Barnes Atkinson,,1944,Économiste. - Professeur d'économie politique ...
8,http://dbpedia.org/resource/Thorold_Rogers,http://viaf.org/viaf/12659238,Thorold Rogers,1823,http://data.bnf.fr/ark:/12148/cb12347168c#about,James Edwin Thorold Rogers,,1823,Economiste. - A été professeur d'économie poli...
9,http://dbpedia.org/resource/Thomas_Nixon_Carver,http://viaf.org/viaf/12668196,Thomas Nixon Carver,1865,http://data.bnf.fr/ark:/12148/cb111973554#about,Thomas-Nixon Carver,,1865,Professeur d'économie politique à l'Université...


In [84]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_test=result
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_dbp'],result['name_bnf'])
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_bnf'],result['name_dbp'])
result_test['year'] = np.where(result['date_dbp'].isnull(),result['year_bnf'],result['date_dbp'])
result_test['year'] = np.where(result['year_bnf'].isnull(),result['date_dbp'],result['year_bnf'])


result_test=result_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf']]
result_test

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf
0,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,http://data.bnf.fr/ark:/12148/cb11927239j#about
1,http://viaf.org/viaf/108202298,Alan Walters,1926,http://dbpedia.org/resource/Alan_Walters,http://data.bnf.fr/ark:/12148/cb122784368#about
2,http://viaf.org/viaf/108254255,Charles Kindleberger,1910,http://dbpedia.org/resource/Charles_P._Kindleb...,http://data.bnf.fr/ark:/12148/cb11909760t#about
3,http://viaf.org/viaf/108254255,Charles P. Kindleberger,1910,http://dbpedia.org/resource/Charles_P._Kindleb...,http://data.bnf.fr/ark:/12148/cb11909760t#about
4,http://viaf.org/viaf/108315605,Theodore Schultz,1902,http://dbpedia.org/resource/Theodore_Schultz,http://data.bnf.fr/ark:/12148/cb12279484c#about
5,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,http://data.bnf.fr/ark:/12148/cb119084288#about
6,http://viaf.org/viaf/110766422,Erik Lindahl,1891,http://dbpedia.org/resource/Erik_Lindahl,http://data.bnf.fr/ark:/12148/cb122779519#about
7,http://viaf.org/viaf/12353979,Tony Atkinson,1944,http://dbpedia.org/resource/Tony_Atkinson,http://data.bnf.fr/ark:/12148/cb121949138#about
8,http://viaf.org/viaf/12659238,Thorold Rogers,1823,http://dbpedia.org/resource/Thorold_Rogers,http://data.bnf.fr/ark:/12148/cb12347168c#about
9,http://viaf.org/viaf/12668196,Thomas Nixon Carver,1865,http://dbpedia.org/resource/Thomas_Nixon_Carver,http://data.bnf.fr/ark:/12148/cb111973554#about


In [85]:
# Extract only rows without VIAF uri from DBpeida
df_dbp_test = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp', 'date_dbp'])

df_dbp_test_mask=df_dbp_test['viaf']==''
filtered_df_dbp_test = df_dbp_test[df_dbp_test_mask]
filtered_df_dbp_test

Unnamed: 0,uri_dbp,viaf,name_dbp,date_dbp
1011,http://dbpedia.org/resource/Luc-Normand_Tellier,,Luc-Normand Tellier,1944
1012,http://dbpedia.org/resource/Madhu_Verma,,Madhu Verma,1961
1013,http://dbpedia.org/resource/Magda_Kandil,,Magda ElSayed Kandil,1958
1014,http://dbpedia.org/resource/Magnus_Johannesson,,Magnus Johannesson,1964
1015,http://dbpedia.org/resource/Mahendra_P._Lama,,Mahendra P. Lama,1961
1016,http://dbpedia.org/resource/Mainul_Islam,,Mainul Islam,1950
1017,http://dbpedia.org/resource/Urs_Meisterhans,,Urs Meisterhans,1960
1018,http://dbpedia.org/resource/Rosalind_Blauer,,Rosalind Blauer,1943
1019,http://dbpedia.org/resource/Makoto_Yano,,Makoto Yano,1952
1020,http://dbpedia.org/resource/Krzysztof_Zamasz,,Krzysztof Zamasz,1974


In [86]:
# Extract only rows without VIAF uri from BnF Data
df_bnf_test = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'sName','year_bnf','bio_bnf'])

df_bnf_test_mask=df_bnf_test['viaf']==''
filtered_df_bnf_test = df_bnf_test[df_bnf_test_mask]
filtered_df_bnf_test 

Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf
5264,http://data.bnf.fr/ark:/12148/cb105931168#about,,Lucien de Sainte-Croix,,1861,Avocat. - Secrétaire-rédacteur au Sénat (en 18...
5265,http://data.bnf.fr/ark:/12148/cb11154681z#about,,Ljubo Leontić,,1887,Juriste et homme politique
5266,http://data.bnf.fr/ark:/12148/cb111816241#about,,Sava Kosanović,,1894,"Juriste, homme politique et diplomate"
5267,http://data.bnf.fr/ark:/12148/cb11182130t#about,,Roger Labrousse,,1908,Professeur de philosophie politique à l'univer...
5268,http://data.bnf.fr/ark:/12148/cb11057110m#about,,James Donnadieu,,1899,Journaliste et homme de lettres. - Licencié ès...
5269,http://data.bnf.fr/ark:/12148/cb11058379c#about,,Agustín Edwards McClure,,1878,"Juriste, diplomate, homme d'affaires et homme ..."
5270,http://data.bnf.fr/ark:/12148/cb10970864m#about,,Gaston Moreau,,1878,Avoué et homme politique. - Député de Maine-et...
5271,http://data.bnf.fr/ark:/12148/cb107177993#about,,Paul Lemarcis,,1827,Docteur en droit. - Avocat à la cour de Rouen
5272,http://data.bnf.fr/ark:/12148/cb11533718h#about,,Antonio Gazzoletti,,1813,Juriste. - Poète. - Journaliste
5273,http://data.bnf.fr/ark:/12148/cb11349248c#about,,,,1903,Nom en religion de Robert Bocquet. - Docteur e...


In [87]:
# merge the both dateframes 
filtered_dbp_bnf_test= filtered_df_dbp_test.append(filtered_df_bnf_test, sort=True)
filtered_dbp_bnf_test[:10]

Unnamed: 0,bio_bnf,date_dbp,name_bnf,name_dbp,sName,uri_bnf,uri_dbp,viaf,year_bnf
1011,,1944,,Luc-Normand Tellier,,,http://dbpedia.org/resource/Luc-Normand_Tellier,,
1012,,1961,,Madhu Verma,,,http://dbpedia.org/resource/Madhu_Verma,,
1013,,1958,,Magda ElSayed Kandil,,,http://dbpedia.org/resource/Magda_Kandil,,
1014,,1964,,Magnus Johannesson,,,http://dbpedia.org/resource/Magnus_Johannesson,,
1015,,1961,,Mahendra P. Lama,,,http://dbpedia.org/resource/Mahendra_P._Lama,,
1016,,1950,,Mainul Islam,,,http://dbpedia.org/resource/Mainul_Islam,,
1017,,1960,,Urs Meisterhans,,,http://dbpedia.org/resource/Urs_Meisterhans,,
1018,,1943,,Rosalind Blauer,,,http://dbpedia.org/resource/Rosalind_Blauer,,
1019,,1952,,Makoto Yano,,,http://dbpedia.org/resource/Makoto_Yano,,
1020,,1974,,Krzysztof Zamasz,,,http://dbpedia.org/resource/Krzysztof_Zamasz,,


In [88]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_f_test=filtered_dbp_bnf_test
result_f_test['name'] = np.where(filtered_dbp_bnf_test['name_bnf'].isnull(),filtered_dbp_bnf_test['name_dbp'],filtered_dbp_bnf_test['name_bnf'])
result_f_test['name'] = np.where(filtered_dbp_bnf_test['name_dbp'].isnull(),filtered_dbp_bnf_test['name_bnf'],filtered_dbp_bnf_test['name_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_test['date_dbp'].isnull(),filtered_dbp_bnf_test['year_bnf'],filtered_dbp_bnf_test['date_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_test['year_bnf'].isnull(),filtered_dbp_bnf_test['date_dbp'],filtered_dbp_bnf_test['year_bnf'])


result_f_test=result_f_test.loc[:,['name','year','uri_dbp','uri_bnf']]
sort_rft=result_f_test.sort_values(by='name', ascending=False)
sort_rft[700:750]

Unnamed: 0,name,year,uri_dbp,uri_bnf
1247,Louis Westerfield,1949,http://dbpedia.org/resource/Louis_Westerfield,
1246,Louis Renault,1843,http://dbpedia.org/resource/Louis_Renault_(jur...,
5361,Louis Quesnot,1867,,http://data.bnf.fr/ark:/12148/cb127957568#about
5278,Louis Marie Alfred Dubois de Jancigny,1824,,http://data.bnf.fr/ark:/12148/cb10708630w#about
1035,Louis Lévy-Garboua,1945,http://dbpedia.org/resource/Louis_Lévy-Garboua,
5427,Louis Falletti,1899,,http://data.bnf.fr/ark:/12148/cb110057322#about
5571,Louis Dupré La Tour,1902,,http://data.bnf.fr/ark:/12148/cb111999767#about
5770,Louis Colin,1835,,http://data.bnf.fr/ark:/12148/cb170640459#about
5528,Louis Cluzel,1879,,http://data.bnf.fr/ark:/12148/cb10890440r#about
1245,Louis Beel,1902,http://dbpedia.org/resource/Louis_Beel,


In [93]:
# Test to find similarities between the names BnF Data and DBpedia with collocation 

# cf. https://stackoverflow.com/questions/56950291/extracting-and-counting-trigrams-from-dataframe/56950405

from itertools import chain

result_f_test['name'] = result_f_test['name'].astype(str)

trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_documents(result_f_test['name'])
finder.nbest(trigram_measures.pmi, 50)

s = pd.Series(result_f_test['name'])
trigram_f_test = s.apply(lambda x: Counter(ngrams(x.split(), 3)).most_common())

trigram_f_test

1011                                    []
1012                                    []
1013       [((Magda, ElSayed, Kandil), 1)]
1014                                    []
1015           [((Mahendra, P., Lama), 1)]
                       ...                
5862                                    []
5863                                    []
5864                                    []
5865                                    []
5866    [((Ricardo, Martinez, Vargas), 1)]
Name: name, Length: 1810, dtype: object

In [91]:
# attempt to change the 'NaN' names from DBpedia to name from BnF and the same for the reverse 
# (cf.https://stackoverflow.com/questions/29177498/python-pandas-replace-nan-in-one-column-with-value-from-corresponding-row-of-sec)
import pandas as pd
import numpy as np

def fx(x):
    if np.isnan(x['date_dbp']):
        return x['year_bnf ']
    else:
        return x['date_dbp']
result_test=result.apply(lambda x : fx(x),axis=1)

TypeError: ("ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''", 'occurred at index 0')

In [41]:
# Another method for trying to do it 
result.name_dbp.fillna(result.name_bnf, inplace=True)
del result['name_bnf']
result.columns = 'uri_dbp viaf name_dbp date_dbp uri_bnf sName year_bnf bio_bnf'.split()


ValueError: Length mismatch: Expected axis has 10 elements, new values have 8 elements

In [42]:
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })

In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

In [11]:
df3_merged = pd.merge(df1, df2)
df3_merged 

Unnamed: 0,user_id,first_name,last_name,email,image_url
0,id001,Rivi,Valti,rvalti0@example.com,http://example.com/img/id001.png
1,id002,Wynnie,McMurty,wmcmurty1@example.com,http://example.com/img/id002.jpg
2,id003,Kristos,Ivanets,kivanets2@example.com,http://example.com/img/id003.bmp
3,id004,Madalyn,Max,mmax3@example.com,http://example.com/img/id004.jpg
4,id005,Tobe,Riddich,triddich4@example.com,http://example.com/img/id005.png
