# Merge BnF DATA, DBpedia and Wikidata

In this notebook, we apply a method to merge three datasets (BnF, DBpedia and Wikidata)

* First, we drop duplicates of each datasets. 

* Secondly, we merge the three datasets  by removing duplicate data. To realise that, we will use the Linkage toolkit who calculate the proximity (by giving a score) between to string from two dataframes.

* Previously, we have to collect data about economists with SPARQL queries.

In [2]:
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML
import pprint
import csv
# from bs4 import BeautifulSoup

from collections import Counter
from operator import itemgetter
import pandas as pd
from sqlalchemy import create_engine

# Calling the nltk package to merge the data of people without existing VIAF URI in the two datasets 

In [288]:
query = """
PREFIX  egr:  <http://rdvocab.info/ElementsGr2/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?name ?sName ?uri ?year ?placeOfBirth ?bio
WHERE
  {   { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( ( ( regex(?bio, "juriste", "i") || regex(?bio, "professeur de droit", "i") ) || regex(?bio, "docteur en droit", "i") ) || regex(?bio, "avocat", "i") ) || regex(?bio, "juge", "i") ) || regex(?bio, "magistrat", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
    UNION
      { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( regex(?bio, "économiste") || regex(?bio, "Economiste") ) || regex(?bio, "professeur d'économie", "i") ) || regex(?bio, "docteur en économie", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri1
            FILTER regex(?uri1, "viaf.org", "i")
            BIND(strbefore(str(?uri1), "http://viaf.org/viaf/") AS ?uri)
          }
      }
  }
ORDER BY DESC(?uri)


"""

In [289]:
sparql = SPARQLWrapper("https://data.bnf.fr/sparql") ##, returnFormat=RDFXML)  [LOCALHOST]

In [290]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [291]:
try:
    rc_bnf = sparql.queryAndConvert()
except Exception as e:
    print(e)

In [292]:
# Number of rows in the result
len(rc_bnf['results']['bindings'])

11201

In [293]:
# Inspect the first three rows
i = 0
for l in rc_bnf['results']['bindings']:
    if i < 100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb12981404c#about'}, 'name': {'type': 'literal', 'value': 'Léon Garnier'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99996033'}, 'year': {'type': 'literal', 'value': '1836'}, 'bio': {'type': 'literal', 'value': "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb13484444m#about'}, 'name': {'type': 'literal', 'value': 'Gaston de Pawlowski'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9999219'}, 'year': {'type': 'literal', 'value': '1874'}, 'placeOfBirth': {'type': 'literal', 'value': 'Joigny (Yonne)'}, 'bio': {'type': 'literal', 'value': 'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb134841632#about'}, 'name': {'type': 'literal', 'value': 'Jean

In [294]:
result_bnf = []
for l in rc_bnf['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            sName = l['sName']['value']
        except Exception as e:
            sName = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            bio = l['bio']['value']
        except Exception as e:
            bio = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        try: 
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        result_bnf.append([l['s']['value'], uri, name, sName, year,placeOfBirth, bio])        
            
        

In [295]:
print(len(result_bnf))
result_bnf[:10]

11201


[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836',
  '',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874',
  'Joigny (Yonne)',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'],
 ['http://data.bnf.fr/ark:/12148/cb134841632#about',
  'http://viaf.org/viaf/9999131',
  'Jean-Michel Berton',
  '',
  '1794',
  'Cahors (Lot)',
  'Écrivain politique, avocat à la Cour de cassation. - Fut fondateur et directeur de la "Revue poétique française et étrangère"'],
 ['http://data.bnf.fr/ark:/12148/cb13379520q#about',
  'http://viaf.org/viaf/9995247',
  'Emmanuel Mathieu',
  '',
  '1852',
  '',
  'Docteur en droit (Paris, 1873)'],
 ['http://data.bnf.fr/ark:/12148/cb13338

In [296]:
query_2= """
PREFIX  dbo:  <http://dbpedia.org/ontology/>
PREFIX  dbp:  <http://dbpedia.org/property/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?uri ?name (year(xsd:dateTime(?Birth_Date)) AS ?year) ?abstract ?placeOfBirth ?nationality
WHERE
  {   { ?s  a              dbo:Economist ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  ?propriety  dbr:Economist
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  ?p             dbr:Jurist ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  ?p             dbr:Lawyer ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  a              dbr:Professor ;
            dbp:birthDate  ?Birth_Date ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( regex(?abstract, "lawyer", "i") || regex(?abstract, "jurist", "i") ) || regex(?abstract, "juriste", "i") ) || regex(?abstract, "attorney", "i") ) || regex(?abstract, "legal professional", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    BIND(strafter(str(?bp), "http://dbpedia.org/resource/") AS ?bp1)
    BIND(replace(str(?bp1), "[_]", " ") AS ?placeOfBirth)
    BIND(strafter(str(?nationality1), "http://dbpedia.org/resource/") AS ?nationality2)
    BIND(replace(str(?nationality2), "[_]", " ") AS ?nationality)
  }
ORDER BY DESC(?uri)

"""

In this query, we have made the choice to aggregate, by a UNION clause, several queries to maximise the results' number. Also we request the "economists" and the "jurists" in only one query. 

Obviously, we chose classes and instances directly related to our population, but also the "professor" instance, because some "economists" or "jurists" are in this instance (we have tried with and without them, and there more result when we use them). 

Also, we exclude all classes because they don't add more result, except the "Economist" class (we keep it) 

For exemple, we exclude the resource "personFunction" and the resource "Jurists" because they add no more data. Additionally, we keep only the "Professor" instance for the jurists (it returns result only for the jurists).

In [11]:
sparql = SPARQLWrapper("https://dbpedia.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [12]:
sparql.setQuery(query_2)
sparql.setReturnFormat(JSON)

In [13]:
rc_db = sparql.queryAndConvert()

In [14]:
# Number of rows in the result
len(rc_db['results']['bindings'])

10000

In [15]:
# Inspect the first three rows
i = 0
for l in rc_db['results']['bindings']:
    if i <100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/António_de_Almeida_Santos'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99921066'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'António de Almeida Santos'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1926'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Anita_Augspurg'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9976800'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Anita Augspurg'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1857'}, 'placeOfBirth': {'type': 'literal', 'value': ''}, 'nationality': {'type': 'literal', 'value': ''}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Mason_Gaffney'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9960617'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Mason Gaffney'}, 'year': {'typ

In [16]:
# Create a list with URI, VIAF URI, name, year
result_dbpedia = []
for l in rc_db['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        try:
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        try:
            nationality = l['nationality']['value']
        except Exception as e:
            nationality = ''
        result_dbpedia.append([l['s']['value'], uri, name, year, placeOfBirth, nationality])

In [17]:
# Inspect the first three of the list
result_dbpedia[-10:]

[['http://dbpedia.org/resource/Peter_Ala_Adjetey',
  '',
  'Peter Ala Adjetey',
  '1931',
  'Accra',
  ''],
 ['http://dbpedia.org/resource/Peter_Angelos',
  '',
  'Peter Angelos',
  '1929',
  'United States',
  ''],
 ['http://dbpedia.org/resource/Peter_Angelos',
  '',
  'Peter Angelos',
  '1929',
  'Baltimore',
  ''],
 ['http://dbpedia.org/resource/Peter_Breck',
  '',
  'Peter Breck',
  '1929',
  'Rochester, New York',
  ''],
 ['http://dbpedia.org/resource/Peter_Bynoe',
  '',
  'Peter Bynoe',
  '1951',
  'Boston',
  ''],
 ['http://dbpedia.org/resource/Peter_F._Leuch',
  '',
  'Peter F. Leuch',
  '1883',
  '',
  ''],
 ['http://dbpedia.org/resource/Peter_Groff',
  '',
  'Peter Groff',
  '1963',
  '',
  ''],
 ['http://dbpedia.org/resource/Peter_Hall_(sailor)',
  '',
  'Peter Hall',
  '1949',
  '',
  ''],
 ['http://dbpedia.org/resource/Peter_J._Hamilton',
  '',
  'Peter J. Hamilton',
  '1859',
  'Alabama',
  ''],
 ['http://dbpedia.org/resource/Peter_J._Hamilton',
  '',
  'Peter J. Hamilton

In [18]:
query_3= """
PREFIX  rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX  wd:   <http://www.wikidata.org/entity/>
PREFIX  wdt:  <http://www.wikidata.org/prop/direct/>

SELECT DISTINCT  ?s ?uri ?name ?yearBirth ?yearDeath ?nationality
WHERE
  {   { ?s  wdt:P106  wd:Q188094 } # Economists
      UNION
      { ?s  wdt:P106  wd:Q185351 }  # Jurists
    OPTIONAL
      { ?s  wdt:P569  ?dob } # date of birth
    OPTIONAL
      { ?s  wdt:P570  ?dod } # P570 is the "date of death" propriety.
       OPTIONAL
      { ?s  wdt:P27 ?nationality }
    OPTIONAL
      { ?s  wdt:P214  ?oldURI
        BIND(uri(concat("http://viaf.org/viaf/", strafter(str(?oldURI), ""))) AS ?uri) # It's useful to have the URI VIAF in the same and merge data.
      }
    BIND(year(?dob) AS ?yearBirth)
    BIND(year(?dod) AS ?yearDeath)
    FILTER ( ?yearBirth > 1770 )
    OPTIONAL
      { ?s  rdfs:label  ?name
        FILTER ( lang(?name) = "en" )
      }
  }
"""

In [19]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [20]:
sparql.setQuery(query_3)
sparql.setReturnFormat(JSON)

In [21]:
rc_wk = sparql.queryAndConvert()

In [22]:
# Number of rows in the result
len(rc_wk['results']['bindings'])

60139

In [23]:
# Create a list with URI, VIAF URI, name, year
result_wikidata = []
for l in rc_wk['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            yearBirth = l['yearBirth']['value']
        except Exception as e:
            yearBirth = ''
        try: 
            yearDeath = l['yearDeath']['value']
        except Exception as e:
            yearDeath = ''
        try:
            nationality = l['nationality']['value']
        except Exception as e:
            nationality = ''
        result_wikidata.append([l['s']['value'], uri, name, yearBirth,yearDeath, nationality])

In [24]:
result_wikidata[8000:8010]

[['http://www.wikidata.org/entity/Q1715330',
  'http://viaf.org/viaf/54677541',
  'Jörg Goldberg',
  '1943',
  '',
  'http://www.wikidata.org/entity/Q183'],
 ['http://www.wikidata.org/entity/Q1715304',
  '',
  'Jörg Erpenbach',
  '1966',
  '',
  'http://www.wikidata.org/entity/Q183'],
 ['http://www.wikidata.org/entity/Q1437151',
  'http://viaf.org/viaf/80375041',
  'Wolfram Timm',
  '1949',
  '',
  'http://www.wikidata.org/entity/Q183'],
 ['http://www.wikidata.org/entity/Q1437156',
  'http://viaf.org/viaf/1735147270637235700001',
  'Heinz-Jürgen Koloczek',
  '1943',
  '',
  'http://www.wikidata.org/entity/Q183'],
 ['http://www.wikidata.org/entity/Q1436687',
  'http://viaf.org/viaf/870109',
  'Franz Joseph von Bretfeld-Chlumczansky',
  '1777',
  '1839',
  'http://www.wikidata.org/entity/Q131964'],
 ['http://www.wikidata.org/entity/Q1436596',
  '',
  'Joseph Charlier',
  '1816',
  '1896',
  'http://www.wikidata.org/entity/Q31'],
 ['http://www.wikidata.org/entity/Q1722184',
  'http://viaf

In [25]:
engine = create_engine('sqlite:///database.sqlite_2', echo=False)

NameError: name 'viaf' is not defined

In [297]:
df_bnf = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf_bnf', 'name_bnf', 'sName', 'year_bnf', 'placeOfBirth_bnf','bio_bnf'])
print(len(df_bnf))
df_bnf.fillna('')

df_bnf[:10]

11201


Unnamed: 0,uri_bnf,viaf_bnf,name_bnf,sName,year_bnf,placeOfBirth_bnf,bio_bnf
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836,,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874,Joigny (Yonne),Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794,Cahors (Lot),"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852,,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,,Juriste. - Bibliophile
5,http://data.bnf.fr/ark:/12148/cb13322315v#about,http://viaf.org/viaf/9991357,Paul Pic,,1862,Alger,Juriste. - Professeur de droit à la Faculté de...
6,http://data.bnf.fr/ark:/12148/cb13193319k#about,http://viaf.org/viaf/9989230,Gaston Ravisse,,1877,Calais (Pas-de-Calais),Avocat. - Spécialiste du monde de l'entreprise...
7,http://data.bnf.fr/ark:/12148/cb15042710d#about,http://viaf.org/viaf/99857689,Cândido Jucá Filho,,1900,,Avocat
8,http://data.bnf.fr/ark:/12148/cb13169620f#about,http://viaf.org/viaf/9985289,Joseph de Trémaudan,,1846,,Juge à Paimboeuf. - Historien local
9,http://data.bnf.fr/ark:/12148/cb13075767f#about,http://viaf.org/viaf/9982622,Achille Villey-Desmeserets,,1878,Caen (Calvados),Avocat. - Préfet


In [298]:
df_bnf.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(df_bnf))

11100


In [299]:
df_dbp = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf_dbp', 'name_dbp', 'year_dbp',"placeOfBirth_dbp", "nationality_dbp"])
print(len(df_dbp))
df_dbp.fillna('')
df_dbp.head()

NameError: name 'result_dbpedia' is not defined

In [29]:
df_dbp.drop_duplicates(subset ="uri_dbp", keep = 'first', inplace=True)
print(len(df_dbp))
df_dbp.head()

6123


Unnamed: 0,uri_dbp,viaf_dbp,name_dbp,year_dbp,placeOfBirth_dbp,nationality_dbp
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926,,
1,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857,,
2,http://dbpedia.org/resource/Mason_Gaffney,http://viaf.org/viaf/9960617,Mason Gaffney,1923,,United States
3,http://dbpedia.org/resource/Hermann_Heinrich_G...,http://viaf.org/viaf/9939728,Hermann Heinrich Gossen,1810,Düren,
4,http://dbpedia.org/resource/Gottfried_Haberler,http://viaf.org/viaf/99257315,Gottfried Haberler,1900,Purkersdorf,


In [30]:
df_wk= pd.DataFrame(result_wikidata, columns=['uri_wk', 'viaf_wk', 'name_wk', 'year_wk', "yearDeath_wk", "nationality_wk"])
print(len(df_wk))
df_wk.fillna('')
df_wk.head()

60139


Unnamed: 0,uri_wk,viaf_wk,name_wk,year_wk,yearDeath_wk,nationality_wk
0,http://www.wikidata.org/entity/Q65539,http://viaf.org/viaf/4137633,Peter Altmaier,1958,,http://www.wikidata.org/entity/Q183
1,http://www.wikidata.org/entity/Q72628,http://viaf.org/viaf/54958174,Alfred von Kiderlen-Waechter,1852,1912.0,http://www.wikidata.org/entity/Q183
2,http://www.wikidata.org/entity/Q65561,http://viaf.org/viaf/232142151,Hans Apel,1932,2011.0,http://www.wikidata.org/entity/Q183
3,http://www.wikidata.org/entity/Q72535,http://viaf.org/viaf/15698197,Rainer Rupp,1945,,http://www.wikidata.org/entity/Q183
4,http://www.wikidata.org/entity/Q72553,http://viaf.org/viaf/62342475,Heinrich von Bülow,1792,1846.0,http://www.wikidata.org/entity/Q183


In [31]:
df_wk.drop_duplicates(subset ='uri_wk', keep = 'first', inplace=True)
df_dbp.fillna('')
print(len(df_wk))
df_wk.head()

53200


Unnamed: 0,uri_wk,viaf_wk,name_wk,year_wk,yearDeath_wk,nationality_wk
0,http://www.wikidata.org/entity/Q65539,http://viaf.org/viaf/4137633,Peter Altmaier,1958,,http://www.wikidata.org/entity/Q183
1,http://www.wikidata.org/entity/Q72628,http://viaf.org/viaf/54958174,Alfred von Kiderlen-Waechter,1852,1912.0,http://www.wikidata.org/entity/Q183
2,http://www.wikidata.org/entity/Q65561,http://viaf.org/viaf/232142151,Hans Apel,1932,2011.0,http://www.wikidata.org/entity/Q183
3,http://www.wikidata.org/entity/Q72535,http://viaf.org/viaf/15698197,Rainer Rupp,1945,,http://www.wikidata.org/entity/Q183
4,http://www.wikidata.org/entity/Q72553,http://viaf.org/viaf/62342475,Heinrich von Bülow,1792,1846.0,http://www.wikidata.org/entity/Q183


## RecordLinked

It permits to calculate a match score between to strings. Here, we use the "fuzzymatcher" library.

This article explains  very well how uses it. cf. https://pbpython.com/record-linking.html

There is also a documentation but it is very light. cf. https://fuzzymatcher.readthedocs.io/en/latest/

It seems work well only for the strings, because I guess it manages integers as strings. 

In [3]:
from pathlib import Path
import fuzzymatcher

In [301]:
df_bnf.to_csv("df_bnf.csv")


In [102]:
BnF_Data = pd.read_csv('df_bnf.csv')
Wikidata = pd.read_csv('df_wk.csv')
DBpedia = pd.read_csv('df_dbp.csv')

In [35]:
left_on=["name_bnf", "placeOfBirth_bnf", "year_bnf"]
right_on=["name_dbp", "placeOfBirth_dbp", "year_dbp"]

In [36]:
matched_results = fuzzymatcher.fuzzy_left_join(BnF_Data,
                                            DBpedia,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_bnf',
                                            right_id_col='uri_dbp')

In [37]:
cols_bnf_dbp= ["best_match_score","uri_bnf","viaf_bnf", "name_bnf", "year_bnf","placeOfBirth_bnf", "bio_bnf", "uri_dbp", "viaf_dbp", "name_dbp", "placeOfBirth_dbp","year_dbp"]

In [39]:
best_match_bnf_dbp=matched_results[cols_bnf_dbp].sort_values(by=["best_match_score"], ascending=False).head(10)
best_match_bnf_dbp

Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,year_bnf,placeOfBirth_bnf,bio_bnf,uri_dbp,viaf_dbp,name_dbp,placeOfBirth_dbp,year_dbp
331174,1.991006,http://data.bnf.fr/ark:/12148/cb11298933w#about,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882,Antony (Hauts-de-Seine),Homme politique. - Avocat près la cour d'appel...,http://dbpedia.org/resource/Auguste_Champetier...,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,"Antony, Hauts-de-Seine",1882.0
617136,1.326509,http://data.bnf.fr/ark:/12148/cb12029534q#about,http://viaf.org/viaf/110689328,Kurt Georg Kiesinger,1904,"Ebingen, Allemagne","Avocat, membre du parti chrétien-démocrate (CD...",http://dbpedia.org/resource/Kurt_Georg_Kiesinger,,Kurt Georg Kiesinger,Ebingen,1904.0
89801,1.312447,http://data.bnf.fr/ark:/12148/cb11926733m#about,http://viaf.org/viaf/73860740,Jean-Louis Tixier-Vignancour,1907,,Avocat. - Ancien député. - Candidat à l'électi...,http://dbpedia.org/resource/Jean-Louis_Tixier-...,,Jean-Louis Tixier-Vignancour,,1907.0
125224,1.25093,http://data.bnf.fr/ark:/12148/cb12599024b#about,http://viaf.org/viaf/66585190,Michel Crépeau,1930,Fontenay-le-Comte (Vendée),Avocat. - Homme politique. - Co-fondateur puis...,http://dbpedia.org/resource/Michel_Crépeau,,Michel Crépeau,Fontenay-le-Comte,1930.0
116583,1.1956,http://data.bnf.fr/ark:/12148/cb12091668t#about,http://viaf.org/viaf/68956010,Adam von Trott zu Solz,1909,,"Juriste allemand, exécuté après l'attentat du ...",http://dbpedia.org/resource/Adam_von_Trott_zu_...,,Adam von Trott zu Solz,,1909.0
752898,1.046239,http://data.bnf.fr/ark:/12148/cb10114251z#about,,Clément Charles Sabrevois de Bleury,1798,Sorel-Tracy (Canada),Avocat et député. - Cofondateur du journal can...,http://dbpedia.org/resource/Clément-Charles_Sa...,,Clément-Charles Sabrevois de Bleury,Lower Canada,1798.0
384322,1.045153,http://data.bnf.fr/ark:/12148/cb12308925q#about,http://viaf.org/viaf/29600629,Morris Raphael Cohen,1880,Minsk,Juriste. - Spécialiste de philosophie du droit,http://dbpedia.org/resource/Morris_Raphael_Cohen,http://viaf.org/viaf/29600629,Morris Raphael Cohen,Minsk,1880.0
116834,1.029082,http://data.bnf.fr/ark:/12148/cb12054658d#about,http://viaf.org/viaf/68949059,Helmuth James von Moltke,1907,,Comte. - Juriste. - Militant actif de la résis...,http://dbpedia.org/resource/Helmuth_James_von_...,http://viaf.org/viaf/68949059,Helmuth James Graf von Moltke,,1907.0
67683,0.998654,http://data.bnf.fr/ark:/12148/cb12278447x#about,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,1779,,Juriste. - Spécialiste de droit romain. - Mini...,http://dbpedia.org/resource/Friedrich_Carl_von...,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,Frankfurt,1779.0
319731,0.992649,http://data.bnf.fr/ark:/12148/cb123779775#about,http://viaf.org/viaf/33055799,Francisco García Calderón,1834,Arequipa (Pérou),Juriste et homme politique. - Président de la ...,http://dbpedia.org/resource/Francisco_García_C...,,Francisco García Calderón,Arequipa,1834.0


In [182]:
maReSo=matched_results[cols_bnf_dbp].sort_values(by=["best_match_score"], ascending=True).head(10)
maReSo

Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,year_bnf,placeOfBirth_bnf,bio_bnf,uri_dbp,viaf_dbp,name_dbp,placeOfBirth_dbp,year_dbp
608860,-2.037571,http://data.bnf.fr/ark:/12148/cb17791066c#about,,Sayf ibn Ḥamad ibn Šayẖān ibn Muḥammad ibn ...,1892,"Sima, Azki - Oman",Poète. - Juge,http://dbpedia.org/resource/Zu'bi_M.F._Al-Zu'bi,,Zu'bi M. F. Al-Zu'bi,Amman,1977
479613,-1.783377,http://data.bnf.fr/ark:/12148/cb12023012k#about,http://viaf.org/viaf/109768106,Ljubomir Tadić,1925,"Smriječno près de Plužine (Royaume des Serbes,...","Juriste, philosophe et homme politique. - Prof...",http://dbpedia.org/resource/Wade_H._McCree,,Wade H. McCree,"Des Moines, Iowa",1920
43336,-1.53876,http://data.bnf.fr/ark:/12148/cb121974159#about,http://viaf.org/viaf/78772873,Franc Miklošič,1813,"Radomerščak dans Ljutomer, Styrie (Empire ausr...",Docteur en philosophie en 1838 (de l'Universit...,http://dbpedia.org/resource/Mitrofan_Grodzitsky,,,Russian Empire,1861
469623,-1.503089,http://data.bnf.fr/ark:/12148/cb16761971p#about,http://viaf.org/viaf/116498531,Veljko Guberina,1925,"Né à Gvozd (Royaume des Serbes, Croates et Slo...",Avocat,http://dbpedia.org/resource/Wade_H._McCree,,Wade H. McCree,"Des Moines, Iowa",1920
162204,-1.464797,http://data.bnf.fr/ark:/12148/cb10272244m#about,http://viaf.org/viaf/51587696,Yiṣḥaq Zelig Gronemann,1843,"Flötenstein (Prusse-Occidentale), aujourd'hui ...","Rabbin. - Père de Sammy Gronemann (1875-1952),...",http://dbpedia.org/resource/James_Mitchell_(Ca...,,James Mitchell,,1843
34768,-1.418407,http://data.bnf.fr/ark:/12148/cb12965410v#about,http://viaf.org/viaf/82542537,Yiṣḥaq Qorn,1911,"Kišinëv (Bessarabie, Russie), aujourd'hui Chi...",Militant et dirigeant sioniste. - Homme politi...,http://dbpedia.org/resource/William_Greene_(ec...,http://viaf.org/viaf/111714803,William H. Greene,,1951
169748,-1.4171,http://data.bnf.fr/ark:/12148/cb12163529p#about,http://viaf.org/viaf/4970558,Gabriel Baudry-Lacantinerie,1837,"Saint-Sauveur-de-Nuaillé (Charente-Maritime, d...",Professeur de droit. - Doyen de la faculté de ...,http://dbpedia.org/resource/Gabriel_Mann,,Gabriel Mann,,1972
453220,-1.343622,http://data.bnf.fr/ark:/12148/cb16658417t#about,http://viaf.org/viaf/1280009,Nikolaj Vasilʹevič Vitruk,1937,"Žarovka, rayon de Pervomaiski, oblast de Tomsk...",Juriste. - Écrivit aussi des ouvrages sur la c...,http://dbpedia.org/resource/Charles_Foti,,,,1937
242831,-1.335902,http://data.bnf.fr/ark:/12148/cb126789606#about,http://viaf.org/viaf/32792663,Paul Zifferer,1879,"Bistritz am Hostein, Moravie (Bystřice pod Hos...",Romancier et journaliste. - Docteur en droit e...,http://dbpedia.org/resource/Paul_Brown_(Georgi...,,Paul Brown,United States,1880
195532,-1.329589,http://data.bnf.fr/ark:/12148/cb12213838m#about,http://viaf.org/viaf/44347928,Mehmed Bégovitch,1904,Lastva près de Trebinje (Empire austro-hongro...,Docteur en droit (en 1930),http://dbpedia.org/resource/Kurt_Schmitt,http://viaf.org/viaf/49986213,,German Empire,1886


In [44]:
matched_bnf_dbp=matched_results[cols_bnf_dbp].query("best_match_score >= .281651").sort_values(
    by=['best_match_score'], ascending=False)
print(len(matched_bnf_dbp))
matched_bnf_dbp[:10]

168


Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,year_bnf,placeOfBirth_bnf,bio_bnf,uri_dbp,viaf_dbp,name_dbp,placeOfBirth_dbp,year_dbp
331174,1.991006,http://data.bnf.fr/ark:/12148/cb11298933w#about,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882,Antony (Hauts-de-Seine),Homme politique. - Avocat près la cour d'appel...,http://dbpedia.org/resource/Auguste_Champetier...,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,"Antony, Hauts-de-Seine",1882.0
617136,1.326509,http://data.bnf.fr/ark:/12148/cb12029534q#about,http://viaf.org/viaf/110689328,Kurt Georg Kiesinger,1904,"Ebingen, Allemagne","Avocat, membre du parti chrétien-démocrate (CD...",http://dbpedia.org/resource/Kurt_Georg_Kiesinger,,Kurt Georg Kiesinger,Ebingen,1904.0
89801,1.312447,http://data.bnf.fr/ark:/12148/cb11926733m#about,http://viaf.org/viaf/73860740,Jean-Louis Tixier-Vignancour,1907,,Avocat. - Ancien député. - Candidat à l'électi...,http://dbpedia.org/resource/Jean-Louis_Tixier-...,,Jean-Louis Tixier-Vignancour,,1907.0
125224,1.25093,http://data.bnf.fr/ark:/12148/cb12599024b#about,http://viaf.org/viaf/66585190,Michel Crépeau,1930,Fontenay-le-Comte (Vendée),Avocat. - Homme politique. - Co-fondateur puis...,http://dbpedia.org/resource/Michel_Crépeau,,Michel Crépeau,Fontenay-le-Comte,1930.0
116583,1.1956,http://data.bnf.fr/ark:/12148/cb12091668t#about,http://viaf.org/viaf/68956010,Adam von Trott zu Solz,1909,,"Juriste allemand, exécuté après l'attentat du ...",http://dbpedia.org/resource/Adam_von_Trott_zu_...,,Adam von Trott zu Solz,,1909.0
752898,1.046239,http://data.bnf.fr/ark:/12148/cb10114251z#about,,Clément Charles Sabrevois de Bleury,1798,Sorel-Tracy (Canada),Avocat et député. - Cofondateur du journal can...,http://dbpedia.org/resource/Clément-Charles_Sa...,,Clément-Charles Sabrevois de Bleury,Lower Canada,1798.0
384322,1.045153,http://data.bnf.fr/ark:/12148/cb12308925q#about,http://viaf.org/viaf/29600629,Morris Raphael Cohen,1880,Minsk,Juriste. - Spécialiste de philosophie du droit,http://dbpedia.org/resource/Morris_Raphael_Cohen,http://viaf.org/viaf/29600629,Morris Raphael Cohen,Minsk,1880.0
116834,1.029082,http://data.bnf.fr/ark:/12148/cb12054658d#about,http://viaf.org/viaf/68949059,Helmuth James von Moltke,1907,,Comte. - Juriste. - Militant actif de la résis...,http://dbpedia.org/resource/Helmuth_James_von_...,http://viaf.org/viaf/68949059,Helmuth James Graf von Moltke,,1907.0
67683,0.998654,http://data.bnf.fr/ark:/12148/cb12278447x#about,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,1779,,Juriste. - Spécialiste de droit romain. - Mini...,http://dbpedia.org/resource/Friedrich_Carl_von...,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,Frankfurt,1779.0
319731,0.992649,http://data.bnf.fr/ark:/12148/cb123779775#about,http://viaf.org/viaf/33055799,Francisco García Calderón,1834,Arequipa (Pérou),Juriste et homme politique. - Président de la ...,http://dbpedia.org/resource/Francisco_García_C...,,Francisco García Calderón,Arequipa,1834.0


In [184]:
bnf_dbp=pd.DataFrame(matched_bnf_dbp, columns=['uri_bnf','uri_dbp','viaf_bnf','name_bnf', 'year_bnf','bio_bnf'])


Unnamed: 0,uri_bnf,uri_dbp,viaf_bnf,name_bnf,year_bnf,bio_bnf
62919,http://data.bnf.fr/ark:/12148/cb121108528#about,http://dbpedia.org/resource/Jacques_Mairesse_(...,http://viaf.org/viaf/7421350,Jacques Mercier,1921,Avocat et homme politique. - Avocat à la cour ...
429849,http://data.bnf.fr/ark:/12148/cb14316906f#about,http://dbpedia.org/resource/Béla_Szászy,http://viaf.org/viaf/166146883,Béla Szász,1868,Écrivain. - Juge. - Traducteur. - A fait des é...
160756,http://data.bnf.fr/ark:/12148/cb11906728m#about,http://dbpedia.org/resource/Gisèle_Halimi,http://viaf.org/viaf/51690665,Gisèle Halimi,1927,"Avocate. - Femme politique, députée de l'Isère..."
194723,http://data.bnf.fr/ark:/12148/cb12400819r#about,http://dbpedia.org/resource/Claude_Goasguen,http://viaf.org/viaf/44383712,Claude Goasguen,1945,Avocat. - Inspecteur général de l'éducation na...
531196,http://data.bnf.fr/ark:/12148/cb11907611v#about,http://dbpedia.org/resource/Thomas_Hodgskin,,Thomas Hodgskin,1787,"Jounaliste, économiste. - Un des pionniers du ..."
...,...,...,...,...,...,...
162204,http://data.bnf.fr/ark:/12148/cb10272244m#about,http://dbpedia.org/resource/James_Mitchell_(Ca...,http://viaf.org/viaf/51587696,Yiṣḥaq Zelig Gronemann,1843,"Rabbin. - Père de Sammy Gronemann (1875-1952),..."
469623,http://data.bnf.fr/ark:/12148/cb16761971p#about,http://dbpedia.org/resource/Wade_H._McCree,http://viaf.org/viaf/116498531,Veljko Guberina,1925,Avocat
43336,http://data.bnf.fr/ark:/12148/cb121974159#about,http://dbpedia.org/resource/Mitrofan_Grodzitsky,http://viaf.org/viaf/78772873,Franc Miklošič,1813,Docteur en philosophie en 1838 (de l'Universit...
479613,http://data.bnf.fr/ark:/12148/cb12023012k#about,http://dbpedia.org/resource/Wade_H._McCree,http://viaf.org/viaf/109768106,Ljubomir Tadić,1925,"Juriste, philosophe et homme politique. - Prof..."


matched_results=fuzzymatcher.fuzzy_left_join(df_wk, df_dbp, left_on = "name_wk", right_on = "name_dbp")

 ### Second Method: Recordlinkage

In [126]:
import recordlinkage

In [366]:
BnF_Data = pd.read_csv('df_bnf.csv')
Wikidata = pd.read_csv('df_wk.csv')
DBpedia = pd.read_csv('df_dbp.csv')

In [367]:
# Create an id to DBpedia dataframe
DBpedia["id_dbp"] = DBpedia.index + 0
DBpedia= pd.DataFrame(DBpedia, columns=['uri_dbp','viaf_dbp', 'name_dbp', 'year_dbp', 'placeOfBirth_dbp', 'nationality_dbp'
],index=DBpedia["id_dbp"])
print(len(DBpedia))
DBpedia[:30]

6123


Unnamed: 0_level_0,uri_dbp,viaf_dbp,name_dbp,year_dbp,placeOfBirth_dbp,nationality_dbp
id_dbp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926,,
1,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857,,
2,http://dbpedia.org/resource/Mason_Gaffney,http://viaf.org/viaf/9960617,Mason Gaffney,1923,,United States
3,http://dbpedia.org/resource/Hermann_Heinrich_G...,http://viaf.org/viaf/9939728,Hermann Heinrich Gossen,1810,Düren,
4,http://dbpedia.org/resource/Gottfried_Haberler,http://viaf.org/viaf/99257315,Gottfried Haberler,1900,Purkersdorf,
5,http://dbpedia.org/resource/Michael_C._Burda,http://viaf.org/viaf/9922987,Michael C. Burda,1959,,United States
6,http://dbpedia.org/resource/Xavier_Vives,http://viaf.org/viaf/9920331,Xavier Vives,1955,,
7,http://dbpedia.org/resource/Vittorio_Emanuele_...,http://viaf.org/viaf/9914155,Vittorio Emanuele Orlando,1860,Kingdom of the Two Sicilies,Italian nationality law
8,http://dbpedia.org/resource/James_M._Poterba,http://viaf.org/viaf/9910825,James M. Poterba,1958,,
9,http://dbpedia.org/resource/David_Laibman,http://viaf.org/viaf/9910613,David Laibman,1942,,


In [346]:
# Create an BnF Data id dataframe
inc_BnF=len(DBpedia)+1
BnF_Data["id_bnf"] = BnF_Data.index + 0
BnF_Data= pd.DataFrame(BnF_Data, columns=['uri_bnf','viaf_bnf', 'name_bnf', 'year_bnf', 'placeOfBirth_bnf', 'bio_bnf'],index=BnF_Data["id_bnf"])
print(len(BnF_Data))
BnF_Data

11100


Unnamed: 0_level_0,uri_bnf,viaf_bnf,name_bnf,year_bnf,placeOfBirth_bnf,bio_bnf
id_bnf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,1836,,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,1874,Joigny (Yonne),Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,1794,Cahors (Lot),"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,1852,,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,1843,,Juriste. - Bibliophile
...,...,...,...,...,...,...
11095,http://data.bnf.fr/ark:/12148/cb11475627b#about,,Joan Mitchell,1920,,Économiste. - Professeur d'économie de l'unive...
11096,http://data.bnf.fr/ark:/12148/cb10562770v#about,,Kazimierz Zimmermann,1874,Trzemeszno (Pologne),Chanoine. - Economiste. - Recteur de l'Univers...
11097,http://data.bnf.fr/ark:/12148/cb17701366b#about,,ʿUmar ʿAzīz,1949,,Chercheur et professeur d'économie. - Militant...
11098,http://data.bnf.fr/ark:/12148/cb17877820g#about,,John Davenport,1904,"Philadelphie (Pennsylvanie, États-Unis)","Journaliste économiste. - Journaliste à : ""For..."


In [347]:
# Create an id to Wikidata dataframe
Wikidata["id_wk"] = Wikidata.index + 0
Wikidata= pd.DataFrame(Wikidata, columns=['uri_wk','viaf_wk', 'name_wk', 'year_wk', 'yearDeath_wk', 'nationality_wk'],index=Wikidata["id_wk"])
print(len(Wikidata))
Wikidata[-20:]

53200


Unnamed: 0_level_0,uri_wk,viaf_wk,name_wk,year_wk,yearDeath_wk,nationality_wk
id_wk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
53180,http://www.wikidata.org/entity/Q106158833,,,1945,,http://www.wikidata.org/entity/Q17
53181,http://www.wikidata.org/entity/Q106203537,,,1961,2013.0,
53182,http://www.wikidata.org/entity/Q106072014,,,1981,,
53183,http://www.wikidata.org/entity/Q106072057,,,1977,,
53184,http://www.wikidata.org/entity/Q106079947,,,1979,,
53185,http://www.wikidata.org/entity/Q106085959,,,1965,,
53186,http://www.wikidata.org/entity/Q106095610,,,1977,,
53187,http://www.wikidata.org/entity/Q106107894,,,1963,,
53188,http://www.wikidata.org/entity/Q106223503,,,1966,,
53189,http://www.wikidata.org/entity/Q106267248,,,1989,,


#### Match between BnF Data and DBpedia

indexer = recordlinkage.Index()
indexer.full()

In [348]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='name_bnf', right_on='name_dbp')
candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

5017


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [349]:
compare = recordlinkage.Compare()
compare.string('name_bnf',
            'name_dbp',
            threshold=0.85,
            label='name_bnf_dbp')
compare.numeric('year_bnf',
            'year_dbp',
            label='year_bnf_dbp')
compare.string('placeOfBirth_bnf',
            'placeOfBirth_dbp',
            threshold=0.85,
            label='placeOfBirth_bnf_dbp')
features = compare.compute(candidates, BnF_Data, DBpedia)

In [350]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

3.0       5
2.0     159
1.5       1
1.0      69
0.5      53
0.0    4730
dtype: int64

In [351]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,name_bnf_dbp,year_bnf_dbp,placeOfBirth_bnf_dbp
id_bnf,id_dbp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,1722,0.0,0.0,0.0
19,5443,0.0,0.0,0.0
20,5165,0.0,0.0,0.0
21,145,0.0,0.0,0.0
25,1558,0.0,0.0,0.0
...,...,...,...,...
11087,2341,0.0,0.0,0.0
11090,4695,0.0,0.0,0.0
11091,2150,0.0,0.0,0.0
11095,3120,0.0,0.0,0.0


In [352]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_bnf_dbp':'placeOfBirth_bnf_dbp'].sum(axis=1)
potential_matches

Unnamed: 0,id_bnf,id_dbp,name_bnf_dbp,year_bnf_dbp,placeOfBirth_bnf_dbp,Score
0,7013,3091,1.0,1.0,0.0,2.0
1,8876,3026,1.0,1.0,0.0,2.0
2,20,1,1.0,1.0,0.0,2.0
3,85,7,1.0,1.0,0.0,2.0
4,105,13,1.0,1.0,0.0,2.0
...,...,...,...,...,...,...
160,10910,4580,1.0,1.0,0.0,2.0
161,1031,5297,1.0,1.0,0.0,2.0
162,6734,1219,1.0,1.0,0.0,2.0
163,9552,1187,1.0,1.0,0.0,2.0


In [353]:
BnF_Data.loc[7013,:]

uri_bnf               http://data.bnf.fr/ark:/12148/cb14872534q#about
viaf_bnf                               http://viaf.org/viaf/208210933
name_bnf                                             Jean-Paul Barety
year_bnf                                                         1928
placeOfBirth_bnf                                                 Nice
bio_bnf             Avocat. - Député du Rassemblement pour la Répu...
Name: 7013, dtype: object

In [354]:
DBpedia.loc[3091,:]

uri_dbp             http://dbpedia.org/resource/Jean-Paul_Baréty
viaf_dbp                                                     NaN
name_dbp                                        Jean-Paul Baréty
year_dbp                                                    1928
placeOfBirth_dbp                                          France
nationality_dbp                                           France
Name: 3091, dtype: object

In [355]:
BnF_Data['name_bnf']=BnF_Data['name_bnf'].astype(str)
DBpedia['name_dbp']=DBpedia['name_dbp'].astype(str)

BnF_Data['viaf_bnf']=BnF_Data['viaf_bnf'].astype(str)
DBpedia['viaf_dbp']=DBpedia['viaf_dbp'].astype(str)

BnF_Data['uri_bnf']=BnF_Data['uri_bnf'].astype(str)
DBpedia['uri_dbp']=DBpedia['uri_dbp'].astype(str)

BnF_Data['placeOfBirth_bnf']=BnF_Data['placeOfBirth_bnf'].astype(str)
DBpedia['placeOfBirth_dbp']=DBpedia['placeOfBirth_dbp'].astype(str)

BnF_Data['year_bnf']=BnF_Data['year_bnf'].astype(str)
DBpedia['year_dbp']=DBpedia['year_dbp'].astype(str)

BnF_Data['bio_bnf']=BnF_Data['bio_bnf'].astype(str)
DBpedia['nationality_dbp']=DBpedia['nationality_dbp'].astype(str)

In [356]:
BnF_Data['BnF_Name_Lookup'] = BnF_Data[[
   'name_bnf', 'year_bnf' ,'placeOfBirth_bnf' ,'bio_bnf' 
]].apply(lambda x: '|'.join(x), axis=1)

DBpedia['dbp_Name_Lookup'] = DBpedia[[
   'name_dbp', 'year_dbp', 'placeOfBirth_dbp', 'nationality_dbp'
]].apply(lambda x: '|'.join(x), axis=1)

BnF_Data_lookup = BnF_Data[['BnF_Name_Lookup']].reset_index()
DBpedia_lookup = DBpedia[['dbp_Name_Lookup']].reset_index()


In [357]:
BnF_Data_lookup

Unnamed: 0,id_bnf,BnF_Name_Lookup
0,0,Léon Garnier|1836|nan|Juriste. - Administrateu...
1,1,Gaston de Pawlowski|1874|Joigny (Yonne)|Docteu...
2,2,Jean-Michel Berton|1794|Cahors (Lot)|Écrivain ...
3,3,Emmanuel Mathieu|1852|nan|Docteur en droit (Pa...
4,4,Josiah Henry Benton|1843|nan|Juriste. - Biblio...
...,...,...
11095,11095,Joan Mitchell|1920|nan|Économiste. - Professeu...
11096,11096,Kazimierz Zimmermann|1874|Trzemeszno (Pologne)...
11097,11097,ʿUmar ʿAzīz|1949|nan|Chercheur et professeur d...
11098,11098,John Davenport|1904|Philadelphie (Pennsylvanie...


In [358]:
BnF_merge = potential_matches.merge(BnF_Data_lookup, how='left')

In [359]:
final_merge = BnF_merge.merge(DBpedia_lookup, how='left')

In [360]:
cols = ['id_bnf', 'id_dbp', 'Score',
        'BnF_Name_Lookup', 'dbp_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
final[:50]

Unnamed: 0,id_bnf,id_dbp,Score,BnF_Name_Lookup,dbp_Name_Lookup
88,5114,5304,1.5,"Panagis Tsaldaris|1867|Kamari (Corinthe, Grèce...",Panagis Tsaldaris|1868|Greece|nan
0,7013,3091,2.0,Jean-Paul Barety|1928|Nice|Avocat. - Député du...,Jean-Paul Baréty|1928|France|France
106,7059,1256,2.0,"Alf Ross|1899|nan|Juriste, spécialisé en philo...",Alf Ross|1899|Denmark|nan
107,9209,1221,2.0,Friedrich List|1789|nan|Professeur d'économie ...,Friedrich List|1789|Duchy of Württemberg|nan
108,7269,1271,2.0,"Tobias Barreto|1839|nan|Philosophe, juriste. -...",Tobias Barreto|1839|Brazil|nan
109,7693,5340,2.0,René Cassin|1887|Bayonne (Pyrénées-Atlantiques...,René Cassin|1887|French Basque Country|nan
110,7916,2225,2.0,"Govind Ballabh Pant|1887|Khoont-Dhaamas, Almor...",Govind Ballabh Pant|1887|Presidencies and prov...
111,8168,5573,2.0,Daniele Manin|1804|Venise (Italie)|Homme polit...,Daniele Manin|1804|Venice|Italians
112,8485,2600,2.0,Alfons Goppel|1905|Ratisbonne (Allemagne)|Avoc...,Alfons Goppel|1905|Kingdom of Bavaria|nan
113,8574,1472,2.0,"John Paul Stevens|1920|Chicago (Illinois, État...",John Paul Stevens|1920|nan|nan


#### Match between DBpedia and BnF Data

In [213]:
BnF_Data = pd.read_csv('df_bnf.csv')
Wikidata = pd.read_csv('df_wk.csv')
DBpedia = pd.read_csv('df_dbp.csv')

In [361]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='name_bnf', right_on='name_dbp')
candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

22345


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [363]:
compare = recordlinkage.Compare()
compare.string('name_bnf',
            'name_dbp',
            threshold=0.85,
            label='name_bnf_dbp')
compare.numeric('year_bnf',
            'year_dbp',
            label='year_bnf_dbp')
compare.string('placeOfBirth_bnf',
            'placeOfBirth_dbp',
            threshold=0.85,
            label='placeOfBirth_bnf_dbp')
features = compare.compute(candidates, BnF_Data, DBpedia)

RecursionError: maximum recursion depth exceeded while calling a Python object

In [None]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
features

In [162]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_bnf_dbp':'placeOfBirth_bnf_dbp'].sum(axis=1)
potential_matches

Unnamed: 0,id_bnf,id_dbp,name_bnf_dbp,year_bnf_dbp,placeOfBirth_bnf_dbp,Score
0,7013,3091,1.0,1.0,0.0,2.0
1,7476,1298,1.0,1.0,0.0,2.0
2,8876,3026,1.0,1.0,0.0,2.0
3,6136,1151,1.0,1.0,0.0,2.0
4,6154,1157,1.0,1.0,0.0,2.0
...,...,...,...,...,...,...
66,10612,390,1.0,1.0,0.0,2.0
67,10910,4580,1.0,1.0,0.0,2.0
68,6734,1219,1.0,1.0,0.0,2.0
69,9552,1187,1.0,1.0,0.0,2.0


In [163]:
BnF_Data.loc[7013,:]

uri_bnf               http://data.bnf.fr/ark:/12148/cb14872534q#about
viaf_bnf                               http://viaf.org/viaf/208210933
name_bnf                                             Jean-Paul Barety
year_bnf                                                       1928.0
placeOfBirth_bnf                                                 Nice
bio_bnf             Avocat. - Député du Rassemblement pour la Répu...
BnF_Name_Lookup     Jean-Paul Barety|1928.0|Nice|Avocat. - Député ...
Name: 7013, dtype: object

In [164]:
DBpedia.loc[3091,:]

uri_dbp             http://dbpedia.org/resource/Jean-Paul_Baréty
viaf_dbp                                                     nan
name_dbp                                        Jean-Paul Baréty
year_dbp                                                  1928.0
placeOfBirth_dbp                                          France
nationality_dbp                                           France
dbp_Name_Lookup            Jean-Paul Baréty|1928.0|France|France
Name: 3091, dtype: object

In [165]:
BnF_Data['name_bnf']=BnF_Data['name_bnf'].astype(str)
DBpedia['name_dbp']=DBpedia['name_dbp'].astype(str)

BnF_Data['viaf_bnf']=BnF_Data['viaf_bnf'].astype(str)
DBpedia['viaf_dbp']=DBpedia['viaf_dbp'].astype(str)

BnF_Data['uri_bnf']=BnF_Data['uri_bnf'].astype(str)
DBpedia['uri_dbp']=DBpedia['uri_dbp'].astype(str)

BnF_Data['placeOfBirth_bnf']=BnF_Data['placeOfBirth_bnf'].astype(str)
DBpedia['placeOfBirth_dbp']=DBpedia['placeOfBirth_dbp'].astype(str)

BnF_Data['year_bnf']=BnF_Data['year_bnf'].astype(str)
DBpedia['year_dbp']=DBpedia['year_dbp'].astype(str)

BnF_Data['bio_bnf']=BnF_Data['bio_bnf'].astype(str)
DBpedia['nationality_dbp']=DBpedia['nationality_dbp'].astype(str)

In [166]:
BnF_Data['BnF_Name_Lookup'] = BnF_Data[[
   'name_bnf', 'year_bnf' ,'placeOfBirth_bnf' ,'bio_bnf' 
]].apply(lambda x: '|'.join(x), axis=1)

DBpedia['dbp_Name_Lookup'] = DBpedia[[
   'name_dbp', 'year_dbp', 'placeOfBirth_dbp', 'nationality_dbp'
]].apply(lambda x: '|'.join(x), axis=1)

BnF_Data_lookup = BnF_Data[['BnF_Name_Lookup']].reset_index()
DBpedia_lookup = DBpedia[['dbp_Name_Lookup']].reset_index()


In [167]:
BnF_Data_lookup

Unnamed: 0,id_bnf,BnF_Name_Lookup
0,6124,Marc Aucuy|1881.0|Mauvières (Indre)|Docteur en...
1,6125,Roger Duveau|1907.0|Hortes (Haute-Marne)|Avoca...
2,6126,Marcel Moye|1873.0|Cherbourg (Manche)|Juriste....
3,6127,John Miller|1861.0|nan|Journaliste. - Romancie...
4,6128,"Maurice De Wulf|1867.0|Poperinge, Belgique|Pro..."
...,...,...
11095,17219,nan|nan|nan|nan
11096,17220,nan|nan|nan|nan
11097,17221,nan|nan|nan|nan
11098,17222,nan|nan|nan|nan


In [168]:
BnF_merge = potential_matches.merge(BnF_Data_lookup, how='left')

In [169]:
final_merge = BnF_merge.merge(DBpedia_lookup, how='left')

In [170]:
cols = ['id_bnf', 'id_dbp', 'Score',
        'BnF_Name_Lookup', 'dbp_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
final[:50]

Unnamed: 0,id_bnf,id_dbp,Score,BnF_Name_Lookup,dbp_Name_Lookup
0,7013,3091,2.0,Jean-Paul Barety|1928.0|Nice|Avocat. - Député ...,Jean-Paul Baréty|1928.0|France|France
36,9293,240,2.0,Michał Kalecki|1899.0|nan|Economiste,Michał Kalecki|1899.0|Łódź|Poles
37,9296,230,2.0,Franz Oppenheimer|1864.0|Berlin (Allemagne)|So...,Franz Oppenheimer|1864.0|nan|nan
38,9324,723,2.0,Emil Lederer|1882.0|nan|Economiste. - A été pr...,Emil Lederer|1882.0|Kingdom of Bohemia|German ...
39,9327,1369,2.0,Ragnar Nurkse|1907.0|nan|Né en Estonie. - Etud...,Ragnar Nurkse|1907.0|nan|Estonians
40,9382,716,2.0,Theodor Hertzka|1845.0|Pest|Journaliste et éco...,Theodor Hertzka|1845.0|Kingdom of Hungary|Aust...
41,9397,20,2.0,Oskar Morgenstern|1902.0|Görlitz (Allemagne)|P...,Oskar Morgenstern|1902.0|German Empire|nan
43,9450,530,2.0,"Joan Robinson|1903.0|Camberley (Surrey, Royaum...",Joan Robinson|1903.0|nan|nan
44,9460,1269,2.0,John Bates Clark|1847.0|nan|Economiste. - A ét...,"John Bates Clark|1847.0|Providence, Rhode Isla..."
45,9464,918,2.0,"David S. Landes|1924.0|New York (NY, Etats-Uni...",David S. Landes|1924.0|New York City|United St...


--------------------
#### Matched BnF Data and Wikidata

In [234]:
BnF_Data = pd.read_csv('df_bnf.csv')
Wikidata = pd.read_csv('df_wk.csv')
DBpedia = pd.read_csv('df_dbp.csv')

In [364]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='name_bnf', right_on='name_wk')
candidates = indexer.index(BnF_Data, Wikidata)
print(len(candidates))

14277


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [365]:
compare = recordlinkage.Compare()
compare.string('name_bnf',
            'name_wk',
            threshold=0.85,
            label='name_bnf_wk')
compare.numeric('year_bnf',
            'year_wk',
            label='year_bnf_wk')
features = compare.compute(candidates, BnF_Data, Wikidata)

TypeError: unsupported operand type(s) for -: 'int64' and 'object'

In [274]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0     236
1.5       1
1.0     123
0.5      87
0.0    7955
dtype: int64

In [275]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,name_bnf_wk,year_bnf_wk
Unnamed: 0_level_1,id_wk,Unnamed: 2_level_1,Unnamed: 3_level_1
2,42738,0.0,0.0
4,44730,0.0,0.0
7,34789,0.0,0.0
8,38417,0.0,0.0
18,39672,0.0,0.0
...,...,...,...
11090,38099,0.0,0.0
11091,41656,0.0,0.0
11093,41965,0.0,0.0
11095,38961,0.0,0.0


In [267]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_bnf_wk':'year_bnf_wk'].sum(axis=1)
potential_matches

Unnamed: 0,level_0,id_wk,name_bnf_wk,year_bnf_wk,Score
0,3782,45808,1.0,1.0,2.0
1,4909,40208,1.0,1.0,2.0
2,7337,43376,1.0,1.0,2.0
3,7913,50545,1.0,1.0,2.0
4,9171,48253,1.0,1.0,2.0
...,...,...,...,...,...
232,4178,43462,1.0,1.0,2.0
233,9474,50068,1.0,1.0,2.0
234,10053,38175,1.0,1.0,2.0
235,11072,42398,1.0,1.0,2.0


In [269]:
Wikidata.loc[45808,:]

uri_wk            http://www.wikidata.org/entity/Q94917668
viaf_wk                     http://viaf.org/viaf/296855014
name_wk                                   Andreas Häussler
year_wk                                             1834.0
yearDeath_wk                                        1921.0
nationality_wk                                         NaN
Name: 45808, dtype: object

In [242]:
BnF_Data.loc[529,:]

Unnamed: 0                                                        529
uri_bnf               http://data.bnf.fr/ark:/12148/cb112856721#about
viaf_bnf                                http://viaf.org/viaf/87818753
name_bnf                                    Raimundo Fernández Cuesta
sName                                                             NaN
year_bnf                                                         1897
placeOfBirth_bnf                                               Madrid
bio_bnf             Homme politique. - Ministre de l'agriculture d...
Name: 529, dtype: object

In [243]:
Wikidata['name_wk']=Wikidata['name_wk'].astype(str)
BnF_Data['name_bnf']=BnF_Data['name_bnf'].astype(str)

Wikidata['viaf_wk']=Wikidata['viaf_wk'].astype(str)
BnF_Data['viaf_bnf']=BnF_Data['viaf_bnf'].astype(str)

Wikidata['uri_wk']=Wikidata['uri_wk'].astype(str)
BnF_Data['uri_bnf']=BnF_Data['uri_bnf'].astype(str)

BnF_Data['placeOfBirth_bnf']=BnF_Data['placeOfBirth_bnf'].astype(str)

Wikidata['year_wk']=Wikidata['year_wk'].astype(str)
Wikidata['yearDeath_wk']=Wikidata['yearDeath_wk'].astype(str)

BnF_Data['year_bnf']=BnF_Data['year_bnf'].astype(str)

Wikidata['nationality_wk']=Wikidata['nationality_wk'].astype(str)
BnF_Data['bio_bnf']=BnF_Data['bio_bnf'].astype(str)


In [244]:
Wikidata['wk_Name_Lookup'] = Wikidata[[
   'name_wk', 'year_wk' ,'yearDeath_wk'
]].apply(lambda x: '|'.join(x), axis=1)

BnF_Data['bnf_Name_Lookup'] = BnF_Data[[
   'name_bnf', 'year_bnf', 'placeOfBirth_bnf', 'bio_bnf'
]].apply(lambda x: '|'.join(x), axis=1)

Wikidata_lookup = Wikidata[['wk_Name_Lookup']].reset_index()
BnF_Data_lookup = BnF_Data[['bnf_Name_Lookup']].reset_index()

In [245]:
Wikidata_lookup

Unnamed: 0,index,wk_Name_Lookup
0,0,Peter Altmaier|1958|nan
1,1,Alfred von Kiderlen-Waechter|1852|1912.0
2,2,Hans Apel|1932|2011.0
3,3,Rainer Rupp|1945|nan
4,4,Heinrich von Bülow|1792|1846.0
...,...,...
53195,53195,nan|1939|nan
53196,53196,nan|1942|nan
53197,53197,nan|1946|nan
53198,53198,nan|1932|2018.0


In [246]:
Wikidata_merge = potential_matches.merge(Wikidata_lookup, how='left')

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [247]:
final_merge = BnF_merge.merge(DBpedia_lookup, how='left')

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [170]:
cols = ['id_bnf', 'id_dbp', 'Score',
        'BnF_Name_Lookup', 'dbp_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
final[:50]

Unnamed: 0,id_bnf,id_dbp,Score,BnF_Name_Lookup,dbp_Name_Lookup
0,7013,3091,2.0,Jean-Paul Barety|1928.0|Nice|Avocat. - Député ...,Jean-Paul Baréty|1928.0|France|France
36,9293,240,2.0,Michał Kalecki|1899.0|nan|Economiste,Michał Kalecki|1899.0|Łódź|Poles
37,9296,230,2.0,Franz Oppenheimer|1864.0|Berlin (Allemagne)|So...,Franz Oppenheimer|1864.0|nan|nan
38,9324,723,2.0,Emil Lederer|1882.0|nan|Economiste. - A été pr...,Emil Lederer|1882.0|Kingdom of Bohemia|German ...
39,9327,1369,2.0,Ragnar Nurkse|1907.0|nan|Né en Estonie. - Etud...,Ragnar Nurkse|1907.0|nan|Estonians
40,9382,716,2.0,Theodor Hertzka|1845.0|Pest|Journaliste et éco...,Theodor Hertzka|1845.0|Kingdom of Hungary|Aust...
41,9397,20,2.0,Oskar Morgenstern|1902.0|Görlitz (Allemagne)|P...,Oskar Morgenstern|1902.0|German Empire|nan
43,9450,530,2.0,"Joan Robinson|1903.0|Camberley (Surrey, Royaum...",Joan Robinson|1903.0|nan|nan
44,9460,1269,2.0,John Bates Clark|1847.0|nan|Economiste. - A ét...,"John Bates Clark|1847.0|Providence, Rhode Isla..."
45,9464,918,2.0,"David S. Landes|1924.0|New York (NY, Etats-Uni...",David S. Landes|1924.0|New York City|United St...


### Between Wikidata and DBpedia

In [171]:
merged_df_1 = pd.merge( df_wk, df_dbp , on='viaf', how='inner', sort='viaf')
merged_df_1[:10]

NameError: name 'df_wk' is not defined

In [37]:
print("the number of merged data from DBpedia and Wikidata is ",len(merged_df_1), "rows.")
print("")
print("The proportion of the number of merged data from DBpedia with Wikidata is ",((len(merged_df_1))/(len(df_dbp))*100),"%")
print("")
print("proportion of the number of merged data from Wikidata with DBpedia is ",((len(merged_df_1))/(len(df_wk))*100),"%")

the number of merged data from DBpedia and Wikidata is  898 rows.

The proportion of the number of merged data from DBpedia with Wikidata is  54.42424242424242 %

proportion of the number of merged data from Wikidata with DBpedia is  4.152985247190491 %


### Between Wikidata and BnF Data

In [38]:
merged_df_2 = pd.merge( df_wk, df_bnf , on='viaf', how='inner', sort='viaf')
print(len(merged_df_2))
merged_df_2[:10]

112


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q540253,http://viaf.org/viaf/100277874,Clemens Maria Franz von Bönninghausen,1785,http://data.bnf.fr/ark:/12148/cb165146162#about,Clemens Maria Franz von Bönninghausen,,1785,"Avocat, agriculteur, botaniste, homéopathe. - ..."
1,http://www.wikidata.org/entity/Q11724800,http://viaf.org/viaf/101647977,Jerzy Karol Kurnatowski,1874,http://data.bnf.fr/ark:/12148/cb10528392p#about,Jerzy Kurnatowski,,1874,"Publiciste, juriste et économiste"
2,http://www.wikidata.org/entity/Q11738367,http://viaf.org/viaf/101863288,Kazimierz Studentowicz,1903,http://data.bnf.fr/ark:/12148/cb11261387v#about,Kazimierz Studentowicz,,1903,"Juriste, homme politique"
3,http://www.wikidata.org/entity/Q104820757,http://viaf.org/viaf/107036313,Félix Garcin,1879,http://data.bnf.fr/ark:/12148/cb13073916m#about,Félix Garcin,,1879,"Journaliste, directeur de ""Nouvelliste"", Lyon...."
4,http://www.wikidata.org/entity/Q1680590,http://viaf.org/viaf/107145857823423020439,Josef Redlich,1869,http://data.bnf.fr/ark:/12148/cb12818732h#about,Josef Redlich,,1869,Professeur de droit public et d'administration...
5,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
6,http://www.wikidata.org/entity/Q88911,http://viaf.org/viaf/108482851,Otto Nathan,1893,http://data.bnf.fr/ark:/12148/cb12874301d#about,Otto Nathan,,1893,Économiste. - Avocat. - Exécuteur testamentair...
7,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
8,http://www.wikidata.org/entity/Q87110924,http://viaf.org/viaf/110494713,Henri Denis,1913,http://data.bnf.fr/ark:/12148/cb12103181f#about,Henri Denis,,1913,"Docteur en droit (Paris, 1938). - Professeur d..."
9,http://www.wikidata.org/entity/Q1345621,http://viaf.org/viaf/111314151,Marco Biagi,1950,http://data.bnf.fr/ark:/12148/cb150888041#about,Marco Biagi,,1950,Juriste


In [39]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_2), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with Wikidata is ",((len(merged_df_2))/(len(df_bnf))*100),"%")

print("")

print("The proportion of the number of merged data from Wikidata with Wikidata is ",((len(merged_df_2))/(len(df_wk))*100),"%")

The number of merged data from BnF Data and Wikidata is  112 rows.

The proportion of the number of merged data from BnF Data with Wikidata is  1.224445173280857 %

The proportion of the number of merged data from Wikidata with Wikidata is  0.5179669796050502 %


### Between DBpedia and BnF Data

In [40]:
merged_df_3 = pd.merge( df_bnf, df_dbp , on='viaf', how='inner', sort='viaf')
print(len(merged_df_3))
merged_df_3[:10]

88


Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf,uri_dbp,name_dbp,year_dbp
0,http://data.bnf.fr/ark:/12148/cb122145877#about,http://viaf.org/viaf/100966624,John Humphrey,,1905,Juriste. - A été professeur de droit internati...,http://dbpedia.org/resource/John_Peters_Humphrey,John Peters Humphrey,1905
1,http://data.bnf.fr/ark:/12148/cb12327654n#about,http://viaf.org/viaf/107536763,Louis Renault,,1843,Juriste. - Professeur de droit international à...,http://dbpedia.org/resource/Louis_Renault_(jur...,Louis Renault,1843
2,http://data.bnf.fr/ark:/12148/cb122775427#about,http://viaf.org/viaf/108173876,Ronald Myles Dworkin,,1931,Juriste. - Professeur de jurisprudence à la Ya...,http://dbpedia.org/resource/Ronald_Dworkin,,1931
3,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://viaf.org/viaf/108188941,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,...",http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922
4,http://data.bnf.fr/ark:/12148/cb120906270#about,http://viaf.org/viaf/108565309,Paul Abraham Freund,,1908,"Professeur de droit, ""Harvard Law School""",http://dbpedia.org/resource/Paul_A._Freund,Paul Abraham Freund,1908
5,http://data.bnf.fr/ark:/12148/cb119084288#about,http://viaf.org/viaf/108587991,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938
6,http://data.bnf.fr/ark:/12148/cb128832222#about,http://viaf.org/viaf/108624624,Muḥammad Ẓafr Allāh H̱ān,,1893,"Juriste, diplomate et homme politique",http://dbpedia.org/resource/Muhammad_Zafarulla...,CH Muhammad Zafarullah Khan,1893
7,http://data.bnf.fr/ark:/12148/cb12299375j#about,http://viaf.org/viaf/108794549,Karl Engisch,,1899,Juriste. - Spécialiste de philosophie du droit...,http://dbpedia.org/resource/Karl_Engisch,Karl Engisch,1899
8,http://data.bnf.fr/ark:/12148/cb118935370#about,http://viaf.org/viaf/111389197,Georges Bousquet,,1846,Avocat au Barreau de Paris (en 1866). - Engagé...,http://dbpedia.org/resource/Georges_Hilaire_Bo...,Georges Hilaire Bousquet,1845
9,http://data.bnf.fr/ark:/12148/cb12328362p#about,http://viaf.org/viaf/11396531,John Paul Stevens,,1920,Juriste américain,http://dbpedia.org/resource/John_Paul_Stevens,John Paul Stevens,1920


In [41]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_3), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with DBpedia is ",((len(merged_df_3))/(len(df_dbp))*100),"%")

print("")

print("The proportion of the number of merged data from DBpedia with BnF Data is ",((len(merged_df_3))/(len(df_bnf))*100),"%")

The number of merged data from BnF Data and Wikidata is  88 rows.

The proportion of the number of merged data from BnF Data with DBpedia is  5.333333333333334 %

The proportion of the number of merged data from DBpedia with BnF Data is  0.9620640647206734 %


### Between Wikidata, BnF Data and DBpedia

In [42]:
merged_df = pd.merge( merged_df_1, df_bnf , on='viaf', how='inner', sort='viaf')
merged_df[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809,Juriste. - Spécialiste de droit international


In [43]:
print("The number of merged data from DBpedia, Wikidata and BnF Data is",len(merged_df),"rows.")
print("")
print("The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is ",(len(merged_df))/(len(df_bnf))*100,"%" )

The number of merged data from DBpedia, Wikidata and BnF Data is 8 rows.

The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is  0.08746036952006123 %


### Append DBpedia, BnF Data and Wikidata to the merged Dataframe who don't have viaf.org values in common

In [44]:
# To achieve this, I am served on these pages: 
# https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html (en)
# https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html (en)
# http://www.python-simple.com/python-pandas/concatenations-joins-dataframe.php (fr)

result = merged_df.append([merged_df_1, merged_df_2, merged_df_3, df_bnf, df_wk, df_dbp], sort=False)
print(len(result))
result[:10]

33526


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922.0,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938.0,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873.0,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815.0,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891.0,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808.0,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910.0,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809.0,Juriste. - Spécialiste de droit international
0,http://www.wikidata.org/entity/Q9387,http://viaf.org/viaf/100180950,Max Weber,1864,http://dbpedia.org/resource/Max_Weber,,1864,,,,,
1,http://www.wikidata.org/entity/Q15999850,http://viaf.org/viaf/100246974,Peter J. Hammond,1945,http://dbpedia.org/resource/Peter_J._Hammond_(...,Peter Hammond,1945,,,,,


In [45]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section
result_test=result

# Replace null value name

## BnF Data
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_dbp'],result['name_bnf'])
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_wk'],result['name_bnf'])
## DBpedia
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_bnf'],result['name_dbp'])
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_wk'],result['name_dbp'])
## Wikidata
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_bnf'],result['name_wk'])
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_dbp'],result['name_wk'])

# Replace null value year

## BnF Data
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_dbp'],result['year_bnf'])
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_wk'],result['year_bnf'])
## DBpedia
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_bnf'],result['year_dbp'])
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_wk'],result['year_dbp'])
## Wikidata
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_bnf'],result['year_wk'])
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_dbp'],result['year_wk'])

result_test=result_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
result_test[:5]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
0,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://www.wikidata.org/entity/Q518859
1,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,http://data.bnf.fr/ark:/12148/cb119084288#about,http://www.wikidata.org/entity/Q652154
2,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,http://data.bnf.fr/ark:/12148/cb12301152q#about,http://www.wikidata.org/entity/Q3085838
3,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,http://data.bnf.fr/ark:/12148/cb12001622n#about,http://www.wikidata.org/entity/Q61956
4,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,http://data.bnf.fr/ark:/12148/cb12126992f#about,http://www.wikidata.org/entity/Q231690


In [46]:
# Extract only rows without VIAF uri from DBpeida
df_dbp_test = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp', 'year_dbp'])

df_dbp_test_mask=df_dbp_test['viaf']==''
filtered_df_dbp_test = df_dbp_test[df_dbp_test_mask]
print(len(filtered_df_dbp_test))
filtered_df_dbp_test[:10]

6798


Unnamed: 0,uri_dbp,viaf,name_dbp,year_dbp
1747,http://dbpedia.org/resource/Luc-Normand_Tellier,,Luc-Normand Tellier,1944
1748,http://dbpedia.org/resource/Madhu_Verma,,Madhu Verma,1961
1749,http://dbpedia.org/resource/Magda_Kandil,,Magda ElSayed Kandil,1958
1750,http://dbpedia.org/resource/Magnus_Johannesson,,Magnus Johannesson,1964
1751,http://dbpedia.org/resource/Mahendra_P._Lama,,Mahendra P. Lama,1961
1752,http://dbpedia.org/resource/Mainul_Islam,,Mainul Islam,1950
1753,http://dbpedia.org/resource/Urs_Meisterhans,,Urs Meisterhans,1960
1754,http://dbpedia.org/resource/Rosalind_Blauer,,Rosalind Blauer,1943
1755,http://dbpedia.org/resource/Makoto_Yano,,Makoto Yano,1952
1756,http://dbpedia.org/resource/Krzysztof_Zamasz,,Krzysztof Zamasz,1974


In [47]:
# drop duplicates in DBpedia dataframe
filtered_df_dbp_test.drop_duplicates(subset ="uri_dbp", keep = False, inplace=True)
print(len(filtered_df_dbp_test))

6320


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
# Extract only rows without VIAF uri from BnF Data
df_bnf_test = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'sName','year_bnf','bio_bnf'])

df_bnf_test_mask=df_bnf_test['viaf']==''
filtered_df_bnf_test = df_bnf_test[df_bnf_test_mask]
len(filtered_df_bnf_test)

2054

In [49]:
# drop duplicates in BnF Data dataframe

filtered_df_bnf_test.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(filtered_df_bnf_test))

2054


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
##### Extract only rows without VIAF uri from BnF Data
df_wk_test = pd.DataFrame(result_wikidata, columns=['uri_wk', 'viaf', 'name_wk','year_wk'])

df_wk_test_mask=df_wk_test['viaf']==''
filtered_df_wk_test = df_wk_test[df_wk_test_mask]
len(filtered_df_wk_test)
filtered_df_wk_test[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk
22,http://www.wikidata.org/entity/Q116500,,Hans Kaufmann,1948
42,http://www.wikidata.org/entity/Q116475,,Hannes Germann,1956
83,http://www.wikidata.org/entity/Q92134,,Uta Nickel,1941
134,http://www.wikidata.org/entity/Q117426,,Peter Briner,1943
158,http://www.wikidata.org/entity/Q71778,,"Donatus, Landgrave of Hesse",1966
167,http://www.wikidata.org/entity/Q74023,,Liudmyla Denisova,1960
176,http://www.wikidata.org/entity/Q119987,,Martin Baltisser,1969
183,http://www.wikidata.org/entity/Q120799,,Arthur Loepfe,1942
355,http://www.wikidata.org/entity/Q75582,,Shkëlqim Cani,1956
364,http://www.wikidata.org/entity/Q123964,,Werner Hennig,1928


In [51]:
# drop duplicates in Wikidata dataframe

filtered_df_wk_test.drop_duplicates(subset ="uri_wk", keep = 'first', inplace=True)
print(len(filtered_df_wk_test))

7418


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# merge the three dateframes 
filtered_dbp_bnf_test= filtered_df_dbp_test.append(filtered_df_bnf_test, sort=True)
len(filtered_dbp_bnf_test)
filtered_dbp_bnf_wk_test=filtered_dbp_bnf_test.append(filtered_df_wk_test, sort=True)
print(len(filtered_dbp_bnf_wk_test))
filtered_dbp_bnf_wk_test[-1500:]

15792


Unnamed: 0,bio_bnf,name_bnf,name_dbp,name_wk,sName,uri_bnf,uri_dbp,uri_wk,viaf,year_bnf,year_dbp,year_wk
27003,,,,Marek Matejun,,,,http://www.wikidata.org/entity/Q66974612,,,,1977
27008,,,,Lluís Mosella i Ximenez,,,,http://www.wikidata.org/entity/Q66978783,,,,1975
27009,,,,Fawzi Al-Qaisi,,,,http://www.wikidata.org/entity/Q67031593,,,,1926
27010,,,,Hasan Al-Ameri,,,,http://www.wikidata.org/entity/Q66828293,,,,1938
27013,,,,Isidre Sala Queralt,,,,http://www.wikidata.org/entity/Q67123989,,,,1973
27017,,,,Örs Farkas,,,,http://www.wikidata.org/entity/Q105079853,,,,1988
27018,,,,Zsófia Lakatos,,,,http://www.wikidata.org/entity/Q105098380,,,,1975
27019,,,,Nora Grisáková,,,,http://www.wikidata.org/entity/Q105178097,,,,1978
27020,,,,Darko Asomaning Nicholas,,,,http://www.wikidata.org/entity/Q105181487,,,,1939
27021,,,,Philipp Schmidt-Dengler,,,,http://www.wikidata.org/entity/Q105187538,,,,1974


In [53]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_f_test=filtered_dbp_bnf_wk_test

# Replace null value name

## BnF Data
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_bnf'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_bnf'])
## DBpedia
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_dbp'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_dbp'])
## Wikidata
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_wk'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_wk'])

# Replace null value year

## BnF Data
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_bnf'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_bnf'])
## DBpedia
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_dbp'])
## Wikidata
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_wk'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_wk'])

result_f_test=result_f_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
sort_rft=result_f_test.sort_values(by='name', ascending=False)
sort_rft[:10]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
22398,,Απέργης Νικόλαος,1962,,,http://www.wikidata.org/entity/Q38597547
26495,,ʻAbd al-Ḥusayn Waddāy al-ʻAṭīyah,1929,,,http://www.wikidata.org/entity/Q66428907
682,,Əvəz Ələkbərov,1952,,,http://www.wikidata.org/entity/Q1099741
16808,,Željko Topić,1959,,,http://www.wikidata.org/entity/Q17402923
4114,,Štefan Tiso,1897,http://dbpedia.org/resource/Štefan_Tiso,,
4113,,Štefan Osuský,1889,http://dbpedia.org/resource/Štefan_Osuský,,
24512,,Štefan Bukovec,1929,,,http://www.wikidata.org/entity/Q59851859
14999,,Şəfa Əliyev,1959,,,http://www.wikidata.org/entity/Q12849692
22039,,Şükrü Kızılot,1958,,,http://www.wikidata.org/entity/Q38170668


In [54]:
result_final = sort_rft.append([result_test], sort=False)
print(len(result_final))
result_final[200:250]

49318


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
5745,,Yuriy Kolobov,1973,,,http://www.wikidata.org/entity/Q4228078
24521,,Yuriy Dzhygyr,1975,,,http://www.wikidata.org/entity/Q64141038
8881,,Yuriy Bazhal,1950,,,http://www.wikidata.org/entity/Q4075070
12092,,Yurii Boiarskyi,1960,,,http://www.wikidata.org/entity/Q12084492
3426,,Yuri Poluneev,1956,,,http://www.wikidata.org/entity/Q800050
23457,,Yuri Movsisyan,1929,,,http://www.wikidata.org/entity/Q62605638
6397,,Yuri Matochkin,1931,,,http://www.wikidata.org/entity/Q4284901
12274,,Yuri Lohush,1945,,,http://www.wikidata.org/entity/Q12118265
6082,,Yuri Lastochkin,1965,,,http://www.wikidata.org/entity/Q4254837
22169,,Yun Hee-suk,1970,,,http://www.wikidata.org/entity/Q55732976


In [55]:
result_final["name"]=result_final["name"].astype(str)

In [58]:
# Drop duplicates while preserving NaN values
# cf. https://stackoverflow.com/questions/23512339/drop-duplicates-while-preserving-nans-in-pandas

## DBpedia
result_final=result_final[result_final['uri_dbp'].isnull() | ~result_final.duplicated(subset='uri_dbp',keep='first')]
## Wikidata
result_final=result_final[result_final['uri_wk'].isnull() | ~result_final.duplicated(subset='uri_wk',keep='first')]
## BnF_Data
result_final=result_final[result_final['uri_bnf'].isnull() | ~result_final.duplicated(subset='uri_bnf',keep='last')]

result_final=result_final.sort_values(by='name', ascending=False)
print(len(result_final))
result_final[:10]

46926


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
1512,http://viaf.org/viaf/122083064,松本烝治,1877,http://dbpedia.org/resource/Jōji_Matsumoto,,
310,http://viaf.org/viaf/72766671,周鲠生,1889,http://dbpedia.org/resource/Zhou_Gengsheng,,
21523,http://viaf.org/viaf/311774563,برنارد جريتش,1953,,,http://www.wikidata.org/entity/Q57620
514,http://viaf.org/viaf/6088149844962902960006,Тамерлан Кимович Агузар,1963,http://dbpedia.org/resource/Tamerlan_Aguzarov,,
1171,http://viaf.org/viaf/26641927,Мақсұт Нәрікбаев,1940,http://dbpedia.org/resource/Maksut_Narikbaev,,
1507,http://viaf.org/viaf/122252130,Велко Вълканов,1927,http://dbpedia.org/resource/Velko_Valkanov,,
608,http://viaf.org/viaf/56155284772987061505,Андрей Милёхин,1964,http://dbpedia.org/resource/Andrey_Milekhin,,
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
10078,http://viaf.org/viaf/10743147,Аleksandr Rusov,1847,,,http://www.wikidata.org/entity/Q12149410
215,http://viaf.org/viaf/778940,Γεώργιος Χρηστάκης-Ζωγράφος,1863,http://dbpedia.org/resource/Georgios_Christaki...,,


In [63]:
# Test to find similarities between the names BnF Data, DBpedia and Wikidata with collocation

## cf. https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame
## https://www.nltk.org/howto/collocations.html
from nltk.corpus import stopwords
stopset = stopwords.words('english')
stopset = stopwords.words('french')

result_test['tokenized_sents'] = result_test.apply(lambda row: nltk.word_tokenize(row['name']), axis=1)

bcf = TrigramCollocationFinder.from_documents(result_test['name'])
filter_stops = lambda w: w in stopset
bcf.apply_word_filter(filter_stops)
f= bcf.nbest(TrigramAssocMeasures, 100)
print(f)

TypeError: ('expected string or bytes-like object', 'occurred at index 0')

In [42]:
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })

In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

In [11]:
df3_merged = pd.merge(df1, df2)
df3_merged 

Unnamed: 0,user_id,first_name,last_name,email,image_url
0,id001,Rivi,Valti,rvalti0@example.com,http://example.com/img/id001.png
1,id002,Wynnie,McMurty,wmcmurty1@example.com,http://example.com/img/id002.jpg
2,id003,Kristos,Ivanets,kivanets2@example.com,http://example.com/img/id003.bmp
3,id004,Madalyn,Max,mmax3@example.com,http://example.com/img/id004.jpg
4,id005,Tobe,Riddich,triddich4@example.com,http://example.com/img/id005.png
