# Merge BnF DATA, DBpedia and Wikidata

In this notebook, we apply a method to merge three datasets (BnF, DBpedia and Wikidata)

* First, we drop duplicates of each datasets. 

* Secondly, we merge the three datasets  by removing duplicate data. To realise that, we will use the Linkage toolkit who calculate the proximity (by giving a score) between to string from three dataframes.

* Previously, we have to collect data about economists with SPARQL queries.

In [1]:
# Import libraries usefull

from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML
import pprint
import csv
# from bs4 import BeautifulSoup

from collections import Counter
from operator import itemgetter
import pandas as pd


# Query economists and jurists on BnF Data

First step, we need datas about economists and jurists with theirs proprieties on 'BnF Data' so we realise a SPARQL query. We need proprieties to realise the merge between three datasets:
  * Birth date
  
  * Date of death
  
  * Place of Birth
  
  * Place of Death

In addition, we add the biographie to filter the population we need. 

In [23]:
query = """
PREFIX bio: <http://vocab.org/bio/0.1/>
PREFIX  egr:  <http://rdvocab.info/ElementsGr2/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?name ?sName ?uri ?birthDate ?deathDate ?placeOfBirth ?placeOfDeath ?bio
WHERE
  {   { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?bd1)
        FILTER ( ( ( ( ( regex(?bio, "juriste", "i") || regex(?bio, "professeur de droit", "i") ) || regex(?bio, "docteur en droit", "i") ) || regex(?bio, "avocat", "i") ) || regex(?bio, "juge", "i") ) || regex(?bio, "magistrat", "i") )
        OPTIONAL
          { ?s  bio:birth ?birthDate }
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  bio:death       ?deathDate
          }
        OPTIONAL
          { ?s  egr:placeOfDeath ?placeOfDeath}
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
    UNION
      { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?bd1)
        FILTER ( ?birthDate > "1770" )
        FILTER ( ( ( regex(?bio, "économiste") || regex(?bio, "Economiste") ) || regex(?bio, "professeur d'économie", "i") ) || regex(?bio, "docteur en économie", "i") )
        OPTIONAL
          { ?s  bio:birth ?birthDate }
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  egr:placeOfDeath ?placeOfDeath}
        OPTIONAL
          { ?s  bio:death       ?deathDate
          }
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri1
            FILTER regex(?uri1, "viaf.org", "i")
            BIND(strbefore(str(?uri1), "http://viaf.org/viaf/") AS ?uri)
          }
      }
        FILTER ( ?bd1 > "1770" )
  }
ORDER BY DESC(?uri)
"""

In [24]:
sparql = SPARQLWrapper("https://data.bnf.fr/sparql") ##, returnFormat=RDFXML)  [LOCALHOST]

In [25]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [26]:
try:
    rc_bnf = sparql.queryAndConvert()
except Exception as e:
    print(e)

In [27]:
# Number of rows in the result
len(rc_bnf['results']['bindings'])

11132

In [28]:
# Inspect the first hundred rows
i = 0
for l in rc_bnf['results']['bindings']:
    if i < 100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb12981404c#about'}, 'name': {'type': 'literal', 'value': 'Léon Garnier'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99996033'}, 'birthDate': {'type': 'literal', 'value': '1836-11-10'}, 'deathDate': {'type': 'literal', 'value': '1901-05-06'}, 'bio': {'type': 'literal', 'value': "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb13484444m#about'}, 'name': {'type': 'literal', 'value': 'Gaston de Pawlowski'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9999219'}, 'birthDate': {'type': 'literal', 'value': '1874-06-14'}, 'deathDate': {'type': 'literal', 'value': '1933-02-02'}, 'placeOfBirth': {'type': 'literal', 'value': 'Joigny (Yonne)'}, 'placeOfDeath': {'type': 'literal', 'value': 'Paris'}, 'bio': {'type': 'literal', 'value': 'Docteur en droit. - Crit

In [29]:
result_bnf = []
for l in rc_bnf['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            sName = l['sName']['value']
        except Exception as e:
            sName = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            bio = l['bio']['value']
        except Exception as e:
            bio = ''
        try: 
            birthDate = l['birthDate']['value']
        except Exception as e:
            birthDate = ''
        try: 
            deathDate = l['deathDate']['value']
        except Exception as e:
            deathDate = ''
        try: 
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        try: 
            placeOfDeath = l['placeOfDeath']['value']
        except Exception as e:
            placeOfDeath = ''
        result_bnf.append([l['s']['value'], uri, name, sName, birthDate, deathDate, placeOfBirth,placeOfDeath, bio])        
# It creates a list, where it puts proprieties with their value.

In [30]:
print(len(result_bnf))
result_bnf[:5]

11132


[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836-11-10',
  '1901-05-06',
  '',
  '',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874-06-14',
  '1933-02-02',
  'Joigny (Yonne)',
  'Paris',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'],
 ['http://data.bnf.fr/ark:/12148/cb134841632#about',
  'http://viaf.org/viaf/9999131',
  'Jean-Michel Berton',
  '',
  '1794-07-03',
  '1845-10-20',
  'Cahors (Lot)',
  '',
  'Écrivain politique, avocat à la Cour de cassation. - Fut fondateur et directeur de la "Revue poétique française et étrangère"'],
 ['http://data.bnf.fr/ark:/12148/cb13379520q#about',
  'http://viaf.org/viaf/9995247',
  'Emmanuel Mathieu',
  '',
  '1

# Query economists and jurists on DBpedia

The query is the same as BnF Data, but we add the nationalities, who they don't on BnF Data.

In [18]:
query_2= """
PREFIX  dbo:  <http://dbpedia.org/ontology/>
PREFIX  dbp:  <http://dbpedia.org/property/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?uri ?name ?birthDate ?deathDate ?abstract ?placeOfBirth ?placeOfDeath ?nationality
WHERE
  {   { ?s  a              dbo:Economist ;
            dbp:birthDate  ?birthDate
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:deathPlace  ?dp }
        OPTIONAL
          { ?s  dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
          
      }
    UNION
      { ?s  ?propriety  dbr:Economist ;
            dbp:birthDate  ?birthDate

        FILTER ( xsd:date(?birthDate) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:deathPlace  ?dp }
        OPTIONAL
          { ?s dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  ?p             dbr:Jurist ;
            dbp:birthDate  ?birthDate
        FILTER ( xsd:date(?birthDate) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:deathPlace  ?dp }
        OPTIONAL
          { ?s dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  ?p             dbr:Lawyer ;
            dbp:birthDate  ?birthDate
        FILTER ( xsd:date(?birthDate) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:deathPlace  ?dp }
        OPTIONAL
          { ?s dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    UNION
      { ?s  a              dbr:Professor ;
            dbp:birthDate  ?birthDate ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( regex(?abstract, "lawyer", "i") || regex(?abstract, "jurist", "i") ) || regex(?abstract, "juriste", "i") ) || regex(?abstract, "attorney", "i") ) || regex(?abstract, "legal professional", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace  ?bp }
        OPTIONAL
          { ?s  dbp:deathPlace  ?dp }
        OPTIONAL
          { ?s dbp:deathDate  ?deathDate }
        OPTIONAL
          { ?s  dbp:nationality  ?nationality1 }
      }
    BIND(strafter(str(?bp), "http://dbpedia.org/resource/") AS ?bp1)
    BIND(replace(str(?bp1), "[_]", " ") AS ?placeOfBirth)

    BIND(strafter(str(?dp), "http://dbpedia.org/resource/") AS ?dp1)
    BIND(replace(str(?dp1), "[_]", " ") AS ?placeOfDeath)

    BIND(strafter(str(?nationality1), "http://dbpedia.org/resource/") AS ?nationality2)
    BIND(replace(str(?nationality2), "[_]", " ") AS ?nationality)
  }
ORDER BY DESC(?uri)

"""

In this query, we have made the choice to aggregate, by a UNION clause, several queries to maximise the results' number. Also we request the "economists" and the "jurists" in only one query. 

Obviously, we chose classes and instances directly related to our population, but also the "professor" instance, because some "economists" or "jurists" are in this instance (we have tried with and without them, and there more result when we use them). 

Also, we exclude all classes because they don't add more result, except the "Economist" class (we keep it) 

For exemple, we exclude the resource "personFunction" and the resource "Jurists" because they add no more data. Additionally, we keep only the "Professor" instance for the jurists (it returns result only for the jurists).

In [19]:
sparql = SPARQLWrapper("https://dbpedia.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [20]:
sparql.setQuery(query_2)
sparql.setReturnFormat(JSON)

In [21]:
rc_db = sparql.queryAndConvert()

In [22]:
# Number of rows in the result
len(rc_db['results']['bindings'])

10000

In [15]:
# Inspect the first three rows
i = 0
for l in rc_db['results']['bindings']:
    if i <100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/António_de_Almeida_Santos'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99921066'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'António de Almeida Santos'}, 'birthDate': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#date', 'value': '1926-02-15'}, 'deathDate': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#date', 'value': '2016-01-18'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Carlos_Carvalhas'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99826658'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Carlos Carvalhas'}, 'birthDate': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#date', 'value': '1941-11-09'}, 'placeOfBirth': {'type': 'literal', 'value': 'São Pedro do Sul, Portugal'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Anita_Augspurg'}, 'uri': {'type': 'uri', 'value': 'htt

In [16]:
# Create a list with URI, VIAF URI, name, year
result_dbpedia = []
for l in rc_db['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            birthDate = l['birthDate']['value']
        except Exception as e:
            birthDate = ''
        try: 
            deathDate = l['deathDate']['value']
        except Exception as e:
            deathDate = ''
        try:
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        try:
            placeOfDeath = l['placeOfDeath']['value']
        except Exception as e:
            placeOfDeath = ''
        try:
            nationality = l['nationality']['value']
        except Exception as e:
            nationality = ''
        result_dbpedia.append([l['s']['value'], uri, name, birthDate,deathDate, placeOfBirth, placeOfDeath])

In [20]:
# Inspect the first three of the list
result_dbpedia[:10]

[['http://dbpedia.org/resource/António_de_Almeida_Santos',
  'http://viaf.org/viaf/99921066',
  'António de Almeida Santos',
  '1926-02-15',
  '2016-01-18',
  '',
  ''],
 ['http://dbpedia.org/resource/Carlos_Carvalhas',
  'http://viaf.org/viaf/99826658',
  'Carlos Carvalhas',
  '1941-11-09',
  '',
  'São Pedro do Sul, Portugal',
  ''],
 ['http://dbpedia.org/resource/Anita_Augspurg',
  'http://viaf.org/viaf/9976800',
  'Anita Augspurg',
  '1857-09-22',
  '1943-12-20',
  '',
  ''],
 ['http://dbpedia.org/resource/Paulo_Portas',
  'http://viaf.org/viaf/99455673',
  'Paulo Portas',
  '1962-09-12',
  '',
  'Lisbon',
  ''],
 ['http://dbpedia.org/resource/Paulo_Portas',
  'http://viaf.org/viaf/99455673',
  'Paulo Portas',
  '1962-09-12',
  '',
  'Portugal',
  ''],
 ['http://dbpedia.org/resource/Pedro_Aspe',
  'http://viaf.org/viaf/9928165',
  '',
  '1950-07-07',
  '',
  'Mexico City',
  ''],
 ['http://dbpedia.org/resource/Pedro_Aspe',
  'http://viaf.org/viaf/9928165',
  '',
  '1950-07-07',
  '

# Query economists and jurists on Wikidata

In [147]:
query_3= """
PREFIX  bd:   <http://www.bigdata.com/rdf#>
PREFIX  wdt:  <http://www.wikidata.org/prop/direct/>
PREFIX  wikibase: <http://wikiba.se/ontology#>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX  wd:   <http://www.wikidata.org/entity/>

SELECT DISTINCT  ?s ?uri ?name ?birthDate ?deathDate ?birthPlaceLabel ?deathPlaceLabel
WHERE
  {   { ?s  wdt:P106  wd:Q188094 } # Economists
    UNION
      { ?s  wdt:P106  wd:Q185351 } # Jurists
    OPTIONAL
      { ?s  wdt:P569  ?birthDate } # P569 is the 'date of birth' propriety
    OPTIONAL
      { ?s  wdt:P570  ?deathDate } # P570 is the 'date of death' propriety
    OPTIONAL
      { ?s  wdt:P19  ?birthPlace # P19 is the 'place of birth' propriety
        SERVICE wikibase:label # The use of the SERVICE clause is very important to display the property label. Also, in the SELECT, It must have "Label" used to work.
          { bd:serviceParam
                      wikibase:language  "en"
          }
      }
    OPTIONAL
      { ?s  wdt:P20  ?deathPlace
        SERVICE wikibase:label
          { bd:serviceParam
                      wikibase:language  "en"
          }
      }
    OPTIONAL
      { ?s  wdt:P214  ?oldURI
        BIND(uri(concat("http://viaf.org/viaf/", strafter(str(?oldURI), ""))) AS ?uri)
      }
    FILTER ( ?birthDate >= "1770-01-01"^^xsd:dateTime)
    OPTIONAL
      { ?s  rdfs:label  ?name
        FILTER ( lang(?name) = "en" )
      }
  }
"""

In [148]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [149]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [150]:
rc_wk = sparql.queryAndConvert()

HTTPError: HTTP Error 403: Forbidden

In [42]:
# Number of rows in the result
len(rc_wk['results']['bindings'])

56301

In [44]:
# Create a list with URI, VIAF URI, name, year
result_wikidata = []
for l in rc_wk['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            birthDate = l['birthDate']['value']
        except Exception as e:
            birthDate = ''
        try: 
            deathDate = l['deathDate']['value']
        except Exception as e:
            deathDate = ''
        try:
            birthPlace = l['birthPlaceLabel']['value']
        except Exception as e:
            birthPlace = ''
        try:
            deathPlace = l['deathPlaceLabel']['value']
        except Exception as e:
            deathPlace = ''
        try:
            uri_bnf = l['uri_bnf']['value']
        except Exception as e:
            uri_bnf = ''
        result_wikidata.append([l['s']['value'], name, birthDate,deathDate, birthPlace, deathPlace, uri, uri_bnf])

In [45]:

result_wikidata['birthDate'] = result_wikidata['birthDate']str.rstrip('T00:00:00Z')
result_wikidata[:5]

[['http://www.wikidata.org/entity/Q77390',
  'http://viaf.org/viaf/171463495',
  'Christoph Ahlhaus',
  '1969-08-28T00:00:00Z',
  '',
  'Heidelberg',
  ''],
 ['http://www.wikidata.org/entity/Q77341',
  'http://viaf.org/viaf/54939901',
  'Hans Globke',
  '1898-09-10T00:00:00Z',
  '1973-02-13T00:00:00Z',
  'Düsseldorf',
  'Bad Godesberg'],
 ['http://www.wikidata.org/entity/Q77404',
  'http://viaf.org/viaf/91748910',
  'Ingeborg Schwenzer',
  '1951-10-25T00:00:00Z',
  '',
  'Stuttgart',
  ''],
 ['http://www.wikidata.org/entity/Q72628',
  'http://viaf.org/viaf/54958174',
  'Alfred von Kiderlen-Waechter',
  '1852-07-10T00:00:00Z',
  '1912-12-30T00:00:00Z',
  'Stuttgart',
  'Stuttgart'],
 ['http://www.wikidata.org/entity/Q72553',
  'http://viaf.org/viaf/62342475',
  'Heinrich von Bülow',
  '1792-09-16T00:00:00Z',
  '1846-02-06T00:00:00Z',
  'Schwerin',
  'Berlin']]

In [46]:
engine = create_engine('sqlite:///database.sqlite_2', echo=False)

# Dataframes

The script below serves to change lists into a dataframe.

In [68]:
df_bnf = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'Sname','dateBirth_bnf','dateDeath_bnf' , 'placeOfBirth_bnf','placeOfDeath_bnf','bio_bnf'])
print(len(df_bnf))
df_bnf.fillna('')

df_bnf[:10]

11132


Unnamed: 0,uri_bnf,viaf,name_bnf,Sname,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836-11-10,1901-05-06,,,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874-06-14,1933-02-02,Joigny (Yonne),Paris,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794-07-03,1845-10-20,Cahors (Lot),,"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852-07-19,19..,,,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,1917,,,Juriste. - Bibliophile
5,http://data.bnf.fr/ark:/12148/cb13322315v#about,http://viaf.org/viaf/9991357,Paul Pic,,1862-05-31,1944-08-16,Alger,"Hartheim, Autriche",Juriste. - Professeur de droit à la Faculté de...
6,http://data.bnf.fr/ark:/12148/cb13193319k#about,http://viaf.org/viaf/9989230,Gaston Ravisse,,1877-05-15,1935-06-25,Calais (Pas-de-Calais),Paris (France),Avocat. - Spécialiste du monde de l'entreprise...
7,http://data.bnf.fr/ark:/12148/cb15042710d#about,http://viaf.org/viaf/99857689,Cândido Jucá Filho,,1900,1982,,,Avocat
8,http://data.bnf.fr/ark:/12148/cb13169620f#about,http://viaf.org/viaf/9985289,Joseph de Trémaudan,,1846,19..,,,Juge à Paimboeuf. - Historien local
9,http://data.bnf.fr/ark:/12148/cb13075767f#about,http://viaf.org/viaf/9982622,Achille Villey-Desmeserets,,1878-11-06,1955-12-08,Caen (Calvados),Paris (France),Avocat. - Préfet


In [69]:
# Drop duplicates lines and keep first
df_bnf.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(df_bnf))

11039


In [70]:
# Create an id to Bnf Data dataframe
df_bnf["id_bnf"] = df_bnf.index + 0
df_bnf= pd.DataFrame(df_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'Sname','dateBirth_bnf','dateDeath_bnf' , 'placeOfBirth_bnf','placeOfDeath_bnf','bio_bnf'],index=df_bnf["id_bnf"])
df_bnf[-10:]

Unnamed: 0_level_0,uri_bnf,viaf,name_bnf,Sname,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf
id_bnf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11122,http://data.bnf.fr/ark:/12148/cb114919984#about,,Kazimierz Oryl,,1930,1983,,,Economiste
11123,http://data.bnf.fr/ark:/12148/cb17711209f#about,,Ismāʿīl Ṣabrī ʿAbd Allāh,,1925,2006-11-06,,,Économiste. - Docteur ès sciences économiques....
11124,http://data.bnf.fr/ark:/12148/cb177975050#about,,Lev Pavlovič Garkunov,,1901-03-16,1963-11-20,,,Joueur d'échecs et arbitre international (1956...
11125,http://data.bnf.fr/ark:/12148/cb112424331#about,,Jacques L'Huillier,,1917-07-20,2012-03-17,Genève (Suisse),Thônex (Suisse),Professeur d'économie politique à l'université...
11126,http://data.bnf.fr/ark:/12148/cb10596494m#about,,François Villegardelle,,1810-10-02,1856-12-23,Miremont (Lot-et-Garonne),,"Economiste. - Fouriériste, puis communiste"
11127,http://data.bnf.fr/ark:/12148/cb11475627b#about,,Joan Mitchell,,1920-03-15,2014-02-13,,,Économiste. - Professeur d'économie de l'unive...
11128,http://data.bnf.fr/ark:/12148/cb10562770v#about,,Kazimierz Zimmermann,,1874,1925,Trzemeszno (Pologne),Cracovie (Pologne),Chanoine. - Economiste. - Recteur de l'Univers...
11129,http://data.bnf.fr/ark:/12148/cb17701366b#about,,ʿUmar ʿAzīz,,1949-02-18,2013-02-16,,,Chercheur et professeur d'économie. - Militant...
11130,http://data.bnf.fr/ark:/12148/cb17877820g#about,,John Davenport,,1904-09-11,1987-06-08,"Philadelphie (Pennsylvanie, États-Unis)","Red Bank (New Jersey, États-Unis)","Journaliste économiste. - Journaliste à : ""For..."
11131,http://data.bnf.fr/ark:/12148/cb17877063j#about,,Ricardo Martinez Vargas,,1885-03-13,19..,,,"Diplomate, économiste"


In [51]:
df_dbp = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp','birthDate_dbp','deathDate_dbp' , 'placeOfBirth_dbp','placeOfDeath_dbp'])
print(len(df_dbp))
df_dbp.fillna('')
df_dbp.head(20)

10000


Unnamed: 0,uri_dbp,viaf,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926-02-15,2016-01-18,,
1,http://dbpedia.org/resource/Carlos_Carvalhas,http://viaf.org/viaf/99826658,Carlos Carvalhas,1941-11-09,,"São Pedro do Sul, Portugal",
2,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857-09-22,1943-12-20,,
3,http://dbpedia.org/resource/Paulo_Portas,http://viaf.org/viaf/99455673,Paulo Portas,1962-09-12,,Lisbon,
4,http://dbpedia.org/resource/Paulo_Portas,http://viaf.org/viaf/99455673,Paulo Portas,1962-09-12,,Portugal,
5,http://dbpedia.org/resource/Pedro_Aspe,http://viaf.org/viaf/9928165,,1950-07-07,,Mexico City,
6,http://dbpedia.org/resource/Pedro_Aspe,http://viaf.org/viaf/9928165,,1950-07-07,,Mexico,
7,http://dbpedia.org/resource/Fernando_Teixeira_...,http://viaf.org/viaf/99275725,Fernando Teixeira dos Santos,1951-09-13,,"Maia, Portugal",
8,http://dbpedia.org/resource/Fernando_Teixeira_...,http://viaf.org/viaf/99275725,Fernando Teixeira dos Santos,1951-09-13,,Portugal,
9,http://dbpedia.org/resource/Xavier_Vives,http://viaf.org/viaf/9920331,Xavier Vives,1955-01-23,,,


In [53]:
# Drop duplicates lines and keep first
df_dbp.drop_duplicates(subset =["uri_dbp"], keep = 'first', inplace=True)
print(len(df_dbp))
df_dbp.head()

4383


Unnamed: 0,uri_dbp,viaf,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926-02-15,2016-01-18,,
1,http://dbpedia.org/resource/Carlos_Carvalhas,http://viaf.org/viaf/99826658,Carlos Carvalhas,1941-11-09,,"São Pedro do Sul, Portugal",
2,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857-09-22,1943-12-20,,
3,http://dbpedia.org/resource/Paulo_Portas,http://viaf.org/viaf/99455673,Paulo Portas,1962-09-12,,Lisbon,
5,http://dbpedia.org/resource/Pedro_Aspe,http://viaf.org/viaf/9928165,,1950-07-07,,Mexico City,


In [157]:
# Create an id to DBpedia dataframe
df_dbp["id_dbp"] = df_dbp.index + 0
df_dbp= pd.DataFrame(df_dbp, columns=['uri_dbp', 'viaf', 'name_dbp','birthDate_dbp','deathDate_dbp' , 'placeOfBirth_dbp','placeOfDeath_dbp'],index=df_dbp["id_dbp"])
print(len(df_dbp))
df_dbp[-20:]

4383


Unnamed: 0_level_0,uri_dbp,viaf,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
id_dbp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9968,http://dbpedia.org/resource/David_Kaushansky,,David Moiseevich Kaushansky,1893-03-15,after 1950,Chișinău,
9969,http://dbpedia.org/resource/Benjamin_Cudworth_...,,Benjamin Cudworth Yancey Jr,1817-04-27,--10-24,United States,
9971,http://dbpedia.org/resource/Leo_Koretz,,Leo Koretz,1879-07-30,1925-01-08,Bohemia,
9973,http://dbpedia.org/resource/Leopoldo_Bravo,,Leopoldo Bravo,1919-03-15,2006-08-04,"San Juan, Argentina",
9974,http://dbpedia.org/resource/Terry_Nugent,,"Terence James ""Terry"" Nugent",1920-12-09,2006-04-13,,
9975,http://dbpedia.org/resource/Henry_C._Slemp,,Henry C. Slemp,1881-03-26,1901-01-11,"Seminary, Virginia",
9977,http://dbpedia.org/resource/Mitchell_Cary_Alford,,Mitchell Cary Alford,1855-07-10,1914-12-09,"Fayette County, Kentucky",
9978,http://dbpedia.org/resource/Terry_Haskins,,,1955-01-31,2000-10-24,"Oakland County, Michigan",
9981,http://dbpedia.org/resource/Bernard_M._L._Ernst,,Bernard M. L. Ernst,1879-03-17,--11-28,,
9982,http://dbpedia.org/resource/Miha_Krek,,Miha Krek,1897-09-28,1969-11-18,,


In [158]:
df_wk= pd.DataFrame(result_wikidata, columns=['uri_wk', 'name_wk', 'dateBirth_wk', "dateDeath_wk", "placeOfBirth_wk", "placeOfDeath_wk", "uri_bnf" ,'viaf' ])
print(len(df_wk))
df_wk.fillna('')
df_wk.head()

56299


Unnamed: 0,uri_wk,viaf,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
0,http://www.wikidata.org/entity/Q85816,http://viaf.org/viaf/10779712,Walter von Keudell,1884-07-17T00:00:00Z,1973-05-07T00:00:00Z,Castellammare di Stabia,Bonn
1,http://www.wikidata.org/entity/Q85791,http://viaf.org/viaf/110676965,Peter Schulz,1930-04-25T00:00:00Z,2013-05-17T00:00:00Z,Rostock,Hamburg
2,http://www.wikidata.org/entity/Q85825,http://viaf.org/viaf/1437149198284974940006,Werner Hoyer,1951-11-17T00:00:00Z,,Wuppertal,
3,http://www.wikidata.org/entity/Q85842,http://viaf.org/viaf/54138061,Leopold August Warnkönig,1794-08-01T00:00:00Z,1866-08-19T00:00:00Z,Bruchsal,Stuttgart
4,http://www.wikidata.org/entity/Q85844,http://viaf.org/viaf/8275505,Wolfgang Bötsch,1938-09-08T00:00:00Z,2017-10-14T00:00:00Z,Bad Kreuznach,Würzburg


In [153]:
# Drop duplicates lines and keep first
df_wk.drop_duplicates(subset ='uri_wk', keep = 'first', inplace=True)
df_wk.fillna('')
print(len(df_wk))
df_wk.head()

53262


Unnamed: 0,uri_wk,viaf,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
0,http://www.wikidata.org/entity/Q85816,http://viaf.org/viaf/10779712,Walter von Keudell,1884-07-17T00:00:00Z,1973-05-07T00:00:00Z,Castellammare di Stabia,Bonn
1,http://www.wikidata.org/entity/Q85791,http://viaf.org/viaf/110676965,Peter Schulz,1930-04-25T00:00:00Z,2013-05-17T00:00:00Z,Rostock,Hamburg
2,http://www.wikidata.org/entity/Q85825,http://viaf.org/viaf/1437149198284974940006,Werner Hoyer,1951-11-17T00:00:00Z,,Wuppertal,
3,http://www.wikidata.org/entity/Q85842,http://viaf.org/viaf/54138061,Leopold August Warnkönig,1794-08-01T00:00:00Z,1866-08-19T00:00:00Z,Bruchsal,Stuttgart
4,http://www.wikidata.org/entity/Q85844,http://viaf.org/viaf/8275505,Wolfgang Bötsch,1938-09-08T00:00:00Z,2017-10-14T00:00:00Z,Bad Kreuznach,Würzburg


In [154]:
# Create an id to Wikidata dataframe
df_wk["id_wk"] = df_wk.index + 0
df_wk= pd.DataFrame(df_wk, columns=["uri_wk", "viaf", "name_wk","dateBirth_wk","dateDeath_wk","placeOfBirth_wk","placeOfDeath_wk"],index=df_wk["id_wk"])
df_wk[-15:]

Unnamed: 0_level_0,uri_wk,viaf,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
id_wk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
56283,http://www.wikidata.org/entity/Q106095610,,,1977-06-01T00:00:00Z,,,
56284,http://www.wikidata.org/entity/Q106079947,,,1979-08-10T00:00:00Z,,,
56285,http://www.wikidata.org/entity/Q106470635,,,1921-04-12T00:00:00Z,1993-06-04T00:00:00Z,Askarovo,
56286,http://www.wikidata.org/entity/Q106488913,,,1985-08-07T00:00:00Z,,,
56287,http://www.wikidata.org/entity/Q106448050,,,1939-01-01T00:00:00Z,,Bethlehem,
56288,http://www.wikidata.org/entity/Q106488932,,,1988-09-17T00:00:00Z,,,
56289,http://www.wikidata.org/entity/Q104970360,,,1839-08-08T00:00:00Z,1909-02-16T00:00:00Z,Madaras,Budapest District V
56290,http://www.wikidata.org/entity/Q106267248,,,1989-03-19T00:00:00Z,,,
56291,http://www.wikidata.org/entity/Q106203537,,,1961-12-07T00:00:00Z,2013-11-03T00:00:00Z,,
56292,http://www.wikidata.org/entity/Q104834824,http://viaf.org/viaf/1468846,,1946-09-19T00:00:00Z,2020-12-19T00:00:00Z,Gniezno,


deathDate_dbp# RecordLinkage

It permits to calculate a match score between to strings. Here, we use the "fuzzymatcher" library.

This article explains  very well how uses it. cf. https://pbpython.com/record-linking.html

There is also a documentation but it is very light. cf. https://fuzzymatcher.readthedocs.io/en/latest/

It seems work well only for the strings, because I guess it manages integers as strings. 

In [83]:
from pathlib import Path
import fuzzymatcher

In [77]:
df_bnf.to_csv("df_bnf.csv")


In [78]:
BnF_Data = pd.read_csv('df_bnf.csv')
Wikidata = pd.read_csv('df_wk.csv')
DBpedia = pd.read_csv('df_dbp.csv')

In [79]:
BnF_Data

Unnamed: 0,id_bnf,uri_bnf,viaf,name_bnf,Sname,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf
0,0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836-11-10,1901-05-06,,,Juriste. - Administrateur et homme de lettres....
1,1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874-06-14,1933-02-02,Joigny (Yonne),Paris,Docteur en droit. - Critique littéraire et thé...
2,2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794-07-03,1845-10-20,Cahors (Lot),,"Écrivain politique, avocat à la Cour de cassat..."
3,3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852-07-19,19..,,,"Docteur en droit (Paris, 1873)"
4,4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,1917,,,Juriste. - Bibliophile
...,...,...,...,...,...,...,...,...,...,...
11034,11127,http://data.bnf.fr/ark:/12148/cb11475627b#about,,Joan Mitchell,,1920-03-15,2014-02-13,,,Économiste. - Professeur d'économie de l'unive...
11035,11128,http://data.bnf.fr/ark:/12148/cb10562770v#about,,Kazimierz Zimmermann,,1874,1925,Trzemeszno (Pologne),Cracovie (Pologne),Chanoine. - Economiste. - Recteur de l'Univers...
11036,11129,http://data.bnf.fr/ark:/12148/cb17701366b#about,,ʿUmar ʿAzīz,,1949-02-18,2013-02-16,,,Chercheur et professeur d'économie. - Militant...
11037,11130,http://data.bnf.fr/ark:/12148/cb17877820g#about,,John Davenport,,1904-09-11,1987-06-08,"Philadelphie (Pennsylvanie, États-Unis)","Red Bank (New Jersey, États-Unis)","Journaliste économiste. - Journaliste à : ""For..."


## Recordlinkage between BnF Data and DBpedia

In [94]:
left_on=["name_bnf", "placeOfBirth_bnf","placeOfDeath_bnf"]
right_on=["name_dbp", "placeOfBirth_dbp","placeOfDeath_dbp"]

In [95]:
matched_results = fuzzymatcher.fuzzy_left_join(BnF_Data,
                                            DBpedia,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_bnf',
                                            right_id_col='uri_dbp')

In [106]:
cols_bnf_dbp= ["best_match_score","id_bnf","uri_bnf","viaf_bnf", "name_bnf", "dateBirth_bnf", "dateDeath_bnf","placeOfBirth_bnf","placeOfDeath_bnf", "bio_bnf", "id_dbp","uri_dbp", "viaf_dbp", "name_dbp","birthDate_dbp","deathDate_dbp", "placeOfBirth_dbp","placeOfDeath_dbp"]

In [107]:
best_match_bnf_dbp=matched_results[cols_bnf_dbp].sort_values(by=["best_match_score"], ascending=False).head(10)
best_match_bnf_dbp

Unnamed: 0,best_match_score,id_bnf,uri_bnf,viaf_bnf,name_bnf,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf,id_dbp,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
249451,1.752024,4716,http://data.bnf.fr/ark:/12148/cb11298933w#about,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882,1947.0,Antony (Hauts-de-Seine),Paris,Homme politique. - Avocat près la cour d'appel...,2521.0,http://dbpedia.org/resource/Auguste_Champetier...,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882-07-30,1947-03-06,"Antony, Hauts-de-Seine",
545878,1.213143,9991,http://data.bnf.fr/ark:/12148/cb12649301n#about,,Anísio Teixeira,1900,1971.0,"Caetité, Brésil","Rio de Janeiro, Brésil",Juriste. - Éducateur,5875.0,http://dbpedia.org/resource/Anísio_Teixeira,,Anísio Teixeira,1900-07-11,1971-03-11,Brazil,Rio de Janeiro
66943,1.201671,1257,http://data.bnf.fr/ark:/12148/cb11926733m#about,http://viaf.org/viaf/73860740,Jean-Louis Tixier-Vignancour,1907,1989.0,,,Avocat. - Ancien député. - Candidat à l'électi...,8325.0,http://dbpedia.org/resource/Jean-Louis_Tixier-...,,Jean-Louis Tixier-Vignancour,1907-10-12,1989-09-29,,
522148,1.043139,9539,http://data.bnf.fr/ark:/12148/cb108125478#about,,Luigi Amoroso,1886,1965.0,Naples,Rome,"Mathématicien, professeur d'économie politique...",35.0,http://dbpedia.org/resource/Luigi_Amoroso,http://viaf.org/viaf/9838425,Luigi Amoroso,1886-03-26,1965-10-28,Naples,Rome
94788,0.978183,1779,http://data.bnf.fr/ark:/12148/cb12599024b#about,http://viaf.org/viaf/66585190,Michel Crépeau,1930,1999.0,Fontenay-le-Comte (Vendée),Paris,Avocat. - Homme politique. - Co-fondateur puis...,9544.0,http://dbpedia.org/resource/Michel_Crépeau,,Michel Crépeau,1930-10-30,1999-03-30,Fontenay-le-Comte,
227365,0.950051,4283,http://data.bnf.fr/ark:/12148/cb120792656#about,http://viaf.org/viaf/36942970,Gustav Radbruch,1878,1949.0,Lübeck (Allemagne),Heidelberg (Allemagne),Homme politique. - Philosophe du droit. - Juriste,2340.0,http://dbpedia.org/resource/Gustav_Radbruch,http://viaf.org/viaf/36942970,Gustav Radbruch,1878-11-21,1949-11-23,Lübeck,Heidelberg
137886,0.882604,2607,http://data.bnf.fr/ark:/12148/cb123419831#about,http://viaf.org/viaf/56683086,William Martin Geldart,1870,1922.0,,,"Membre de Trinity College, Oxford, GB. - Juriste",1493.0,http://dbpedia.org/resource/William_Martin_Gel...,http://viaf.org/viaf/56683086,William Martin Geldart,1870-06-07,--02-12,,
520575,0.869083,9499,http://data.bnf.fr/ark:/12148/cb12278698c#about,,Gustav Cassel,1866,1945.0,Stockholm,Joenkoping (Suède),"Docteur ès sciences de l'Université d'Uppsala,...",1808.0,http://dbpedia.org/resource/Gustav_Cassel,http://viaf.org/viaf/49285903,Gustav Cassel,1866-10-20,1945-01-14,Stockholm,Jönköping
366919,0.825417,6738,http://data.bnf.fr/ark:/12148/cb12343132d#about,http://viaf.org/viaf/22214944,Benjamin Nathan Cardozo,1870,1938.0,,,Juriste. - A été attaché à la Cour Suprême des...,3372.0,http://dbpedia.org/resource/Benjamin_N._Cardozo,http://viaf.org/viaf/22214944,Benjamin Nathan Cardozo,1870-05-24,1938-07-09,,
186169,0.818944,3539,http://data.bnf.fr/ark:/12148/cb11047680z#about,http://viaf.org/viaf/46753389,Camille Blaisot,1881,1945.0,Valognes (Manche),Dachau (Allemagne),Avocat à la Cour d'appel de Caen. - A été dépu...,8182.0,http://dbpedia.org/resource/Camille_Blaisot,,Camille Blaisot,1881-01-19,1945-01-24,Valognes,"Dachau, Bavaria"


In [108]:
maReSo=matched_results[cols_bnf_dbp].sort_values(by=["best_match_score"], ascending=True).head(10)
maReSo

Unnamed: 0,best_match_score,id_bnf,uri_bnf,viaf_bnf,name_bnf,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf,id_dbp,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
160996,-2.591975,3062,http://data.bnf.fr/ark:/12148/cb136243015#about,http://viaf.org/viaf/5109006,Menaḥem Zevi Kaddari,1925,2011.0,Mezőkövesd (Hongrie),Ramat-Gan (Israël),"Linguiste. - Professeur émérite, Department of...",1908.0,http://dbpedia.org/resource/Carl_Ernst_Fürst_F...,http://viaf.org/viaf/47524071,Carl Ernst Fürst Fugger von Glött,1859-07-02,1940-04-25,Oberndorf am Lech,Kirchheim in Schwaben
605865,-2.460501,10982,http://data.bnf.fr/ark:/12148/cb17791066c#about,,Sayf ibn Ḥamad ibn Šayẖān ibn Muḥammad ibn ...,1892,1961.0,"Sima, Azki - Oman","Matrah, Mascate - Oman",Poète. - Juge,6057.0,http://dbpedia.org/resource/Ahmed_Ali_Sheikh,,Ahmed Ali M Sheikh,1961-10-03,,,
189849,-2.3339,3606,http://data.bnf.fr/ark:/12148/cb16322024d#about,http://viaf.org/viaf/44752143,Maximilian Brantl,1881,1951.0,Munich (Allemagne),Prien am Chiemsee (Allemagne),"Écrivain, poète et parolier. - A aussi utilisé...",1908.0,http://dbpedia.org/resource/Carl_Ernst_Fürst_F...,http://viaf.org/viaf/47524071,Carl Ernst Fürst Fugger von Glött,1859-07-02,1940-04-25,Oberndorf am Lech,Kirchheim in Schwaben
41512,-2.079271,790,http://data.bnf.fr/ark:/12148/cb121974159#about,http://viaf.org/viaf/78772873,Franc Miklošič,1813,1891.0,"Radomerščak dans Ljutomer, Styrie (Empire ausr...","Vienne (Empire ausrto-hongrois, aujourd'hui Au...",Docteur en philosophie en 1838 (de l'Universit...,3919.0,http://dbpedia.org/resource/Julius_von_Kirchmann,http://viaf.org/viaf/13104809,,1802-11-05,1884-10-20,,German Empire
198924,-2.042366,3763,http://data.bnf.fr/ark:/12148/cb15984663k#about,http://viaf.org/viaf/4322733,Sándor Boschan,1891,1942.0,"Ada, Comitat Bacs-Bodrog, Hongrie","Csurog, [avant 1920 dans le comitat de Bacs-Bo...",Écrivain. - Avocat à Csurog. - Tué lors de la ...,1547.0,http://dbpedia.org/resource/Sándor_Wekerle,http://viaf.org/viaf/54951146,Sándor Wekerle,1848-11-14,1921-08-26,Kingdom of Hungary,Kingdom of Hungary (1920–1946)
434551,-1.837949,7890,http://data.bnf.fr/ark:/12148/cb11229242q#about,http://viaf.org/viaf/160351646,Milivoje Perović,1912,1975.0,Mala Braina près de Medveđa (Royaume de Serbie...,"Belgrade (Yougoslavie, aujourd'hui Serbie)",Docteur en droit. - Écrivain,2169.0,http://dbpedia.org/resource/Smilja_Avramov,http://viaf.org/viaf/41910880,Smilja Avramov,1918-02-15,2018-10-02,Austria-Hungary,Belgrade
602073,-1.785585,10919,http://data.bnf.fr/ark:/12148/cb17708334d#about,,Nicasio Idar,1955,1914.0,"Point Isabel (Tx., États-Unis)","Laredo (Tx., État-Unis)",Journaliste et avocat. Défenseur des droits ci...,7846.0,http://dbpedia.org/resource/Charles_Wingender,,http://dbpedia.org/resource/Denver_Pioneers_fo...,1884-09-20,1943,"Mineral Point, Wisconsin",
32323,-1.782468,596,http://data.bnf.fr/ark:/12148/cb16184017h#about,http://viaf.org/viaf/84288151,Mehmed Spaho,1883,1939.0,"Sarajevo (Empire austro-hongrois, aujourd'hui ...","Sarajevo (Royaume de Yougoslavie, aujourd'hui ...",Docteur en droit. - Homme politique et avocat....,7885.0,http://dbpedia.org/resource/Alen_Hujić,,Alen Hujić,1967-05-11,,Sarajevo,
273005,-1.767561,5109,http://data.bnf.fr/ark:/12148/cb10818445r#about,http://viaf.org/viaf/310510743,Etbin Henrik Costa,1832,1875.0,"Novo mesto (Empire austro-hongrois, aujourd'hu...","Ljubljana (Empire austro-hongrois, aujourd'hui...","Bibliographe, avocat et homme politique",5031.0,http://dbpedia.org/resource/Rui_Costa_(politic...,,Rui Costa,1963-01-18,,,
296239,-1.764684,5509,http://data.bnf.fr/ark:/12148/cb12025353g#about,http://viaf.org/viaf/29547542,Neoklī́s Kazázīs,1849,1936.0,"Pétra Lésvou (Mytilène/Lesbos, Îles Égéennes s...","Athènes (Attique, Grèce)",Juriste. - Professeur d'économie politique (18...,1540.0,http://dbpedia.org/resource/Henry_Farber,http://viaf.org/viaf/55088948,Henry S. Farber,1951-01-29,,,


In [115]:
matched_bnf_dbp=matched_results[cols_bnf_dbp].query("best_match_score <= .05").sort_values(
    by=['best_match_score'], ascending=False)
print(len(matched_bnf_dbp))
matched_bnf_dbp[:10]

10336


Unnamed: 0,best_match_score,id_bnf,uri_bnf,viaf_bnf,name_bnf,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf,id_dbp,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
261401,0.049958,4914,http://data.bnf.fr/ark:/12148/cb13543089k#about,http://viaf.org/viaf/316439034,José Cafranga,1780,1854.0,,,Juriste et homme politique espagnol,6240.0,http://dbpedia.org/resource/José_Fragelli,,José Fragelli,1915-12-31,2010-04-30,,
564947,0.049958,10292,http://data.bnf.fr/ark:/12148/cb12958606p#about,,José Castillejo,1877,1945.0,,,Philosophe et juriste,6240.0,http://dbpedia.org/resource/José_Fragelli,,José Fragelli,1915-12-31,2010-04-30,,
359415,0.049958,6605,http://data.bnf.fr/ark:/12148/cb13328185x#about,http://viaf.org/viaf/226077615,José Balarello,1926,2015.0,,,Avocat et homme politique. - Sénateur des Alpe...,6240.0,http://dbpedia.org/resource/José_Fragelli,,José Fragelli,1915-12-31,2010-04-30,,
188418,0.04961,3582,http://data.bnf.fr/ark:/12148/cb15918325q#about,http://viaf.org/viaf/45402214,Carl Meyer,1873,1947.0,,,Juriste et homme politique du canton d'Appenze...,153.0,http://dbpedia.org/resource/Laurence_Meyer,http://viaf.org/viaf/91708741,Laurence Meyer,1944-03-08,,,
109760,0.049027,2080,http://data.bnf.fr/ark:/12148/cb10551494k#about,http://viaf.org/viaf/63874811,John Frederick Lewis,1860,1932.0,,,Avocat né à Philadelphie. A légué sa collectio...,5763.0,http://dbpedia.org/resource/John_Feerick,,John D. Feerick,1936-07-12,,,
535731,0.048658,9827,http://data.bnf.fr/ark:/12148/cb165642447#about,,David Anisi,1949,2008.0,,,Economiste,9696.0,http://dbpedia.org/resource/David_Jones_(Cardi...,,David Jones,1883-10-29,1966-07-28,"Johnstown, Pennsylvania",
453765,0.048011,8237,http://data.bnf.fr/ark:/12148/cb12392644k#about,http://viaf.org/viaf/126145857808723020035,William Winthrop,1831,1899.0,,,Colonnel. - Juge militaire dans l'armée améric...,4443.0,http://dbpedia.org/resource/William_Davis_(jou...,http://viaf.org/viaf/105869848,William Davis,1933-03-06,2019-02-02,,
158653,0.047238,3023,http://data.bnf.fr/ark:/12148/cb10938104x#about,http://viaf.org/viaf/51681643,Georges Boyer,1896,1960.0,,,Juriste,7243.0,http://dbpedia.org/resource/Blair_Boyer,,Blair Boyer,1981-03-30,,,
31245,0.046638,574,http://data.bnf.fr/ark:/12148/cb10725608d#about,http://viaf.org/viaf/85931863,Joseph-Georges Demangeat,1787,1866.0,,,"Procureur du roi à Nantes, puis avocat général...",1768.0,http://dbpedia.org/resource/Joseph_George_Rose...,http://viaf.org/viaf/49981696,Joseph George Rosengarten,1835-07-14,1921-01-14,,
499983,0.046638,9074,http://data.bnf.fr/ark:/12148/cb12413087z#about,http://viaf.org/viaf/100292131,Joseph Chitty,1776,1841.0,,,Juriste,4290.0,http://dbpedia.org/resource/Joseph_Luns,http://viaf.org/viaf/109095225,Joseph Luns,1911-08-28,2002-07-17,,


In [184]:
bnf_dbp=pd.DataFrame(matched_bnf_dbp, columns=['uri_bnf','uri_dbp','viaf_bnf','name_bnf', 'year_bnf','bio_bnf'])


Unnamed: 0,uri_bnf,uri_dbp,viaf_bnf,name_bnf,year_bnf,bio_bnf
62919,http://data.bnf.fr/ark:/12148/cb121108528#about,http://dbpedia.org/resource/Jacques_Mairesse_(...,http://viaf.org/viaf/7421350,Jacques Mercier,1921,Avocat et homme politique. - Avocat à la cour ...
429849,http://data.bnf.fr/ark:/12148/cb14316906f#about,http://dbpedia.org/resource/Béla_Szászy,http://viaf.org/viaf/166146883,Béla Szász,1868,Écrivain. - Juge. - Traducteur. - A fait des é...
160756,http://data.bnf.fr/ark:/12148/cb11906728m#about,http://dbpedia.org/resource/Gisèle_Halimi,http://viaf.org/viaf/51690665,Gisèle Halimi,1927,"Avocate. - Femme politique, députée de l'Isère..."
194723,http://data.bnf.fr/ark:/12148/cb12400819r#about,http://dbpedia.org/resource/Claude_Goasguen,http://viaf.org/viaf/44383712,Claude Goasguen,1945,Avocat. - Inspecteur général de l'éducation na...
531196,http://data.bnf.fr/ark:/12148/cb11907611v#about,http://dbpedia.org/resource/Thomas_Hodgskin,,Thomas Hodgskin,1787,"Jounaliste, économiste. - Un des pionniers du ..."
...,...,...,...,...,...,...
162204,http://data.bnf.fr/ark:/12148/cb10272244m#about,http://dbpedia.org/resource/James_Mitchell_(Ca...,http://viaf.org/viaf/51587696,Yiṣḥaq Zelig Gronemann,1843,"Rabbin. - Père de Sammy Gronemann (1875-1952),..."
469623,http://data.bnf.fr/ark:/12148/cb16761971p#about,http://dbpedia.org/resource/Wade_H._McCree,http://viaf.org/viaf/116498531,Veljko Guberina,1925,Avocat
43336,http://data.bnf.fr/ark:/12148/cb121974159#about,http://dbpedia.org/resource/Mitrofan_Grodzitsky,http://viaf.org/viaf/78772873,Franc Miklošič,1813,Docteur en philosophie en 1838 (de l'Universit...
479613,http://data.bnf.fr/ark:/12148/cb12023012k#about,http://dbpedia.org/resource/Wade_H._McCree,http://viaf.org/viaf/109768106,Ljubomir Tadić,1925,"Juriste, philosophe et homme politique. - Prof..."


## Recordlinkage between BnF Data and Wikidata

In [125]:
left_on=["name_bnf", "placeOfBirth_bnf", "placeOfDeath_bnf"]
right_on=["name_wk", "placeOfBirth_wk", "placeOfDeath_wk"]

In [126]:
matched_results = fuzzymatcher.fuzzy_left_join(BnF_Data,
                                            Wikidata,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_bnf',
                                            right_id_col='uri_wk')

In [134]:
cols_bnf_wk= ["best_match_score","id_bnf","uri_bnf","viaf_bnf", "name_bnf", "dateBirth_bnf", "dateDeath_bnf","placeOfBirth_bnf","placeOfDeath_bnf", "bio_bnf", 'id_wk','uri_wk', 'viaf_wk', 'name_wk', 'dateBirth_wk', "dateDeath_wk", "placeOfBirth_wk", "placeOfDeath_wk"]

In [74]:
best_match_bnf_wk=matched_results[cols_bnf_wk].sort_values(by=["best_match_score"], ascending=False).head(10)
best_match_bnf_wk[:1]

NameError: name 'matched_results' is not defined

In [73]:
worse_bnf_wk=matched_results[cols_bnf_wk].sort_values(by=["best_match_score"], ascending=True).head(10)
worse_bnf_wk[:1]

NameError: name 'matched_results' is not defined

In [140]:
matched_bnf_wk=matched_results[cols_bnf_wk].query("best_match_score >= .35").sort_values(
    by=['best_match_score'], ascending=False)
print(len(matched_bnf_wk))
matched_bnf_wk[-10:]

1098


Unnamed: 0,best_match_score,id_bnf,uri_bnf,viaf_bnf,name_bnf,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf,id_wk,uri_wk,viaf_wk,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
1017942,0.358156,9855,http://data.bnf.fr/ark:/12148/cb12415111g#about,,Hajime Kawakami,1879,1946.0,,,"Economiste et essayiste marxiste, propagateur ...",1695.0,http://www.wikidata.org/entity/Q1135505,http://viaf.org/viaf/22933447,Hajime Kawakami,1879-10-20T00:00:00Z,1946-01-30T00:00:00Z,Yamaguchi Prefecture,
998868,0.358036,9592,http://data.bnf.fr/ark:/12148/cb104536635#about,,Antonio Scialoja,1817,1877.0,,,"Homme politique, économiste. - Professeur d'éc...",13223.0,http://www.wikidata.org/entity/Q2857453,http://viaf.org/viaf/68918836,Antonio Scialoja,1817-08-01T00:00:00Z,1877-10-13T00:00:00Z,Naples,Procida
130344,0.357927,1264,http://data.bnf.fr/ark:/12148/cb119088558#about,http://viaf.org/viaf/73855183,Adolphe Joanne,1813,1881.0,Dijon,Paris,"Voyageur, créateur de collections de guides gé...",1024.0,http://www.wikidata.org/entity/Q481421,http://viaf.org/viaf/34456999,Jean Gaudemet,1908-09-10T00:00:00Z,2001-05-17T00:00:00Z,Dijon,Paris
279611,0.357148,2786,http://data.bnf.fr/ark:/12148/cb12342406f#about,http://viaf.org/viaf/54220961,Riccardo Monaco,1909,2000.0,"Gênes, Italie",Rome,Juriste. - Ancien professeur de droit internat...,42107.0,http://www.wikidata.org/entity/Q61272919,http://viaf.org/viaf/54220961,Riccardo Monaco,1909-01-01T00:00:00Z,2000-01-18T00:00:00Z,Genoa,Rome
1121140,0.35567,10686,http://data.bnf.fr/ark:/12148/cb10333378s#about,,Fredrik Stang,1867,1941.0,,,Professeur de droit. - Homme politique,18550.0,http://www.wikidata.org/entity/Q5499526,,Fredrik Stang Heffermehl,1913-03-22T00:00:00Z,1993-02-27T00:00:00Z,,
688231,0.355487,6568,http://data.bnf.fr/ark:/12148/cb123451337#about,http://viaf.org/viaf/22884434,August Becher,1816,1890.0,,,Avocat et homme politique allemand. - Député a...,6144.0,http://www.wikidata.org/entity/Q760865,http://viaf.org/viaf/22884434,August Becher,1816-02-21T00:00:00Z,1890-08-11T00:00:00Z,Stuttgart,Stuttgart
663158,0.354356,6335,http://data.bnf.fr/ark:/12148/cb10991687c#about,http://viaf.org/viaf/2462419,Armand Bernard,1868,1935.0,Montbéliard (Doubs),Monte-Carlo,Licencié en droit. - Haut-fonctionnaire. - Avo...,39850.0,http://www.wikidata.org/entity/Q47136452,http://viaf.org/viaf/2462419,Armand Bernard,1868-03-08T00:00:00Z,1935-08-01T00:00:00Z,Montbéliard,
387016,0.35289,3832,http://data.bnf.fr/ark:/12148/cb13036547p#about,http://viaf.org/viaf/41975189,Edmond-Eugène Thaller,1851,1918.0,Husseren-Wesserling (Haut-Rhin),Paris,Professeur de droit commercial à l'Université ...,51267.0,http://www.wikidata.org/entity/Q101995798,http://viaf.org/viaf/41975189,Edmond-Eugène Thaller,1851-01-01T00:00:00Z,1918-01-01T00:00:00Z,,
238123,0.352629,2357,http://data.bnf.fr/ark:/12148/cb13005078q#about,http://viaf.org/viaf/59217659,Frédéric Béchard,1824,1898.0,Nîmes,Neuilly-sur-Seine,Romancier et auteur dramatique. - Avocat (1846...,6149.0,http://www.wikidata.org/entity/Q180409,http://viaf.org/viaf/24606197,Frédéric Passy,1822-05-20T00:00:00Z,1912-06-12T00:00:00Z,Paris,Neuilly-sur-Seine
1005801,0.350575,9681,http://data.bnf.fr/ark:/12148/cb12246037d#about,,Stefan Starzyński,1893,1939.0,Varsovie,Varsovie,Homme politique et économiste. - Maire de la v...,17341.0,http://www.wikidata.org/entity/Q3849484,http://viaf.org/viaf/27872662,Stefan Starzyński,1893-08-19T00:00:00Z,1943-01-01T00:00:00Z,Warsaw,Dachau concentration camp


## Recordlinkage between DBpedia and Wikidata

In [9]:
left_on=["name_dbp", "placeOfBirth_dbp", "placeOfDeath_dbp", "birthDate_dbp" ]
right_on=["name_wk", "placeOfBirth_wk", "placeOfDeath_wk", "dateBirth_wk"]

In [10]:
matched_results = fuzzymatcher.fuzzy_left_join(DBpedia,
                                            Wikidata,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_dbp',
                                            right_id_col='uri_wk')

In [11]:
cols_dbp_wk= ["best_match_score","uri_dbp","viaf_dbp", "name_dbp", "birthDate_dbp", "deathDate_dbp","placeOfBirth_dbp","placeOfDeath_dbp", "uri_wk", "viaf_wk", "name_wk","dateBirth_wk","dateDeath_wk","placeOfBirth_wk","placeOfDeath_wk"]

In [12]:
best_match_dbp_wk=matched_results[cols_dbp_wk].sort_values(by=["best_match_score"], ascending=False).head(10)
best_match_dbp_wk

Unnamed: 0,best_match_score,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp,uri_wk,viaf_wk,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
100825,3.217544,http://dbpedia.org/resource/Carl_Ernst_Fürst_F...,http://viaf.org/viaf/47524071,Carl Ernst Fürst Fugger von Glött,1859-07-02,1940-04-25,Oberndorf am Lech,Kirchheim in Schwaben,http://www.wikidata.org/entity/Q18130550,http://viaf.org/viaf/47524071,Carl Ernst Fürst Fugger von Glött,1859-07-02T00:00:00Z,1940-04-25T00:00:00Z,Oberndorf am Lech,Kirchheim in Schwaben
83247,1.963506,http://dbpedia.org/resource/Pieter_Cort_van_de...,http://viaf.org/viaf/55359941,Pieter Cort van der Linden,1846-05-14,1935-07-15,The Hague,The Hague,http://www.wikidata.org/entity/Q1397415,http://viaf.org/viaf/55359941,Pieter Cort van der Linden,1846-05-14T00:00:00Z,1935-07-15T00:00:00Z,The Hague,The Hague
221067,1.844043,http://dbpedia.org/resource/K._N._Raj,http://viaf.org/viaf/110206928,K. N. Raj,1924-05-13,2010-02-10,Thrissur district,Thiruvananthapuram,http://www.wikidata.org/entity/Q6323660,http://viaf.org/viaf/110206928,K. N. Raj,1924-05-13T00:00:00Z,2010-02-10T00:00:00Z,Thrissur district,Thiruvananthapuram
248885,1.744106,http://dbpedia.org/resource/Bento_de_Jesus_Caraça,,Bento de Jesus Caraça,1901-04-18,--06-25,Vila Viçosa,Lisbon,http://www.wikidata.org/entity/Q4890520,http://viaf.org/viaf/32124313,Bento de Jesus Caraça,1901-04-18T00:00:00Z,1948-06-25T00:00:00Z,Vila Viçosa,Lisbon
39107,1.699398,http://dbpedia.org/resource/Peter_S._Albin,http://viaf.org/viaf/73451176,Peter S. Albin,1934-12-20,2008-02-20,New York City,New York City,http://www.wikidata.org/entity/Q4060600,http://viaf.org/viaf/73451176,Peter S. Albin,1934-12-20T00:00:00Z,2008-02-20T00:00:00Z,New York City,New York City
247808,1.699032,http://dbpedia.org/resource/Bagrat_Asatryan,,Bagrat Asatryan,1956-02-02,,Armenian Soviet Socialist Republic,,http://www.wikidata.org/entity/Q4071171,,Bagrat Asatryan,1956-02-02T00:00:00Z,,Armenian Soviet Socialist Republic,
146525,1.678884,http://dbpedia.org/resource/Wouter_Koolmees,http://viaf.org/viaf/290887085,Wouter Koolmees,1977-03-20,,Capelle aan den IJssel,,http://www.wikidata.org/entity/Q2595414,http://viaf.org/viaf/290887085,Wouter Koolmees,1977-03-20T00:00:00Z,,Capelle aan den IJssel,
63114,1.656931,http://dbpedia.org/resource/Oscar_Espinosa_Chepe,http://viaf.org/viaf/64254939,Oscar Espinosa Chepe,1940-11-29,2013-09-23,Cienfuegos,Cercedilla,http://www.wikidata.org/entity/Q7105959,http://viaf.org/viaf/64254939,Oscar Espinosa Chepe,1940-11-29T00:00:00Z,2013-09-23T00:00:00Z,Cienfuegos,Cercedilla
359358,1.611539,http://dbpedia.org/resource/Mariano_Rajoy_Sobredo,,Mariano Rajoy Sobredo,1921-08-28,2018-11-01,Santiago de Compostela,Madrid,http://www.wikidata.org/entity/Q22122581,,Mariano Rajoy Sobredo,1921-08-28T00:00:00Z,2018-11-01T00:00:00Z,Santiago de Compostela,Madrid
110938,1.600508,http://dbpedia.org/resource/Justin_Wolfers,http://viaf.org/viaf/42892681,Justin Wolfers,1972-12-11,,Papua New Guinea,,http://www.wikidata.org/entity/Q1714242,http://viaf.org/viaf/42892681,Justin Wolfers,1972-12-11T00:00:00Z,,Papua New Guinea,


In [13]:
worse_dbp_wk=matched_results[cols_dbp_wk].sort_values(by=["best_match_score"], ascending=True).head(10)
worse_dbp_wk

Unnamed: 0,best_match_score,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp,uri_wk,viaf_wk,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
559316,-1.729896,http://dbpedia.org/resource/Gilberto_Concepció...,,Gilberto Concepción de Gracia,1909-07-09,1968-03-16,"Vega Alta, Puerto Rico","Santurce, San Juan, Puerto Rico",http://www.wikidata.org/entity/Q5561403,,"Gilberto Teodoro, Sr.",1927-05-07T00:00:00Z,2008-02-22T00:00:00Z,,
154237,-1.704865,http://dbpedia.org/resource/C._Hayavadana_Rao,http://viaf.org/viaf/273110043,http://dbpedia.org/resource/Rao_Bahadur,1865-07-10,1946-01-27,Presidencies and provinces of British India,Kingdom of Mysore,http://www.wikidata.org/entity/Q4890148,http://viaf.org/viaf/95158137,Benoy Kumar Sarkar,1887-12-26T00:00:00Z,1949-11-24T00:00:00Z,Provinces of India,"Washington, D.C."
390291,-1.656593,http://dbpedia.org/resource/Vernon_Geddy,,http://dbpedia.org/resource/William_&_Mary_Tri...,1897-11-11,1952-10-18,"Williamsburg, Virginia","James City County, Virginia",http://www.wikidata.org/entity/Q25483624,,,1880-11-14T00:00:00Z,1932-01-01T00:00:00Z,,Granite City
323713,-1.637836,http://dbpedia.org/resource/Byron_B._Harlan,,Byron Berry Harlan,1886-10-22,1949-11-11,"Greenville, Ohio","Cogan House Township, Lycoming County, Pennsyl...",http://www.wikidata.org/entity/Q5075578,http://viaf.org/viaf/25845506,Charles Berry,1930-01-01T00:00:00Z,2007-01-01T00:00:00Z,,
697509,-1.499284,http://dbpedia.org/resource/Kinahan_Cornwallis...,,Kinahan Cornwallis [Albert MacKenzie Russell K...,1837-12-24,--08-15,United Kingdom of Great Britain and Ireland,,http://www.wikidata.org/entity/Q7381844,http://viaf.org/viaf/85295959,Russell W. Cooper,1955-01-01T00:00:00Z,,United States of America,
176915,-1.492256,http://dbpedia.org/resource/Asta_Hampe,http://viaf.org/viaf/213397360,Prof. Dr. rer.pol. und Dipl.-Ing. Asta Hampe,1907-05-24,2003-10-22,Helmstedt,Hamburg,http://www.wikidata.org/entity/Q96178,http://viaf.org/viaf/30485602,Emil Puhl,1889-08-28T00:00:00Z,1962-03-30T00:00:00Z,Berlin,Hamburg
555251,-1.484274,http://dbpedia.org/resource/Clément-Charles_Sa...,,Clément-Charles Sabrevois de Bleury,1798-10-28,1862-09-15,Lower Canada,"Saint-Vincent-de-Paul, Quebec",http://www.wikidata.org/entity/Q2958324,http://viaf.org/viaf/194159830,Charles-Victor de Bavay,1801-01-01T00:00:00Z,1875-11-28T00:00:00Z,,
231567,-1.479503,http://dbpedia.org/resource/Giuseppe_Toniolo,http://viaf.org/viaf/10642260,http://dbpedia.org/resource/Beatification,1845-03-07,1918-10-07,Kingdom of Lombardy–Venetia,Kingdom of Italy,http://www.wikidata.org/entity/Q12360110,,Avo Org,1952-03-23T00:00:00Z,,,
463185,-1.414708,http://dbpedia.org/resource/Charles_A._Eldredge,,Charles A. Eldredge,1820-02-27,1896-10-26,"Bridport, Vermont","Fond du Lac, Wisconsin",http://www.wikidata.org/entity/Q5234631,http://viaf.org/viaf/31429379,David Hale,1951-11-22T00:00:00Z,2015-10-19T00:00:00Z,Vermont,
549970,-1.414587,http://dbpedia.org/resource/Caleb_Dorsey,,Caleb Dorsey,1825-05-25,1885-03-29,"Anne Arundel County, Maryland","Sonora, California",http://www.wikidata.org/entity/Q5109543,,Christian Dorsey,1971-01-01T00:00:00Z,,Atlantic City,


In [22]:
matched_dbp_wk=matched_results[cols_dbp_wk].query("best_match_score >= .15").sort_values(
    by=['best_match_score'], ascending=False)
print(len(matched_dbp_wk))
matched_dbp_wk[-10:]

1255


Unnamed: 0,best_match_score,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp,uri_wk,viaf_wk,name_wk,dateBirth_wk,dateDeath_wk,placeOfBirth_wk,placeOfDeath_wk
253927,0.15406,http://dbpedia.org/resource/Ken_Ofori-Atta,,Kenneth Ofori-Atta,1958-11-07,,"Kibi, Ghana",,http://www.wikidata.org/entity/Q20022251,,Ken Ofori-Atta,1959-01-01T00:00:00Z,,Kibi,
281870,0.154006,http://dbpedia.org/resource/Maria_Kiwanuka,,Maria Kiwanuka,1955-05-12,,Uganda,,http://www.wikidata.org/entity/Q6761316,,Maria Kiwanuka,1955-05-12T00:00:00Z,,Kampala,
224324,0.15392,http://dbpedia.org/resource/Heinz_Arndt,http://viaf.org/viaf/108960473,Heinz Arndt,1915-02-26,2002-04-06,,,http://www.wikidata.org/entity/Q5700560,http://viaf.org/viaf/108960473,Heinz Arndt,1915-02-26T00:00:00Z,2002-05-06T00:00:00Z,Wrocław,Canberra
276498,0.153733,http://dbpedia.org/resource/Pilar_Nores_de_García,,Pilar Nores,1949-03-11,,Argentina,,http://www.wikidata.org/entity/Q9059638,,Pilar Nores de García,1949-03-11T00:00:00Z,,Córdoba,
10116,0.153416,http://dbpedia.org/resource/Laurence_Meyer,http://viaf.org/viaf/91708741,Laurence Meyer,1944-03-08,,,,http://www.wikidata.org/entity/Q6500730,http://viaf.org/viaf/91708741,Laurence Meyer,1944-03-08T00:00:00Z,,The Bronx,
139403,0.153068,http://dbpedia.org/resource/Martín_Guzmán,http://viaf.org/viaf/306384144,Martín Guzmán,1982-10-12,,Argentina,,http://www.wikidata.org/entity/Q77605455,http://viaf.org/viaf/306384144,Martín Guzmán,1982-10-12T00:00:00Z,,La Plata,
107028,0.15255,http://dbpedia.org/resource/Laura_Tyson,http://viaf.org/viaf/44397217,Laura Tyson,1947-06-28,,United States,,http://www.wikidata.org/entity/Q460911,http://viaf.org/viaf/44397217,Laura Tyson,1947-06-28T00:00:00Z,,Bayonne,
280497,0.151896,http://dbpedia.org/resource/Edward_Sandoyan,,Edward Sandoyan,1961-06-04,,Soviet Union,,http://www.wikidata.org/entity/Q4407546,,Edward Sandoyan,1961-06-04T00:00:00Z,,Yerevan,
15852,0.150671,http://dbpedia.org/resource/Manuel_H._Johnson,http://viaf.org/viaf/8678224,Manley Johnson,1949-02-10,,,,http://www.wikidata.org/entity/Q3286848,http://viaf.org/viaf/8678224,Manuel H. Johnson,1949-02-10T00:00:00Z,,Troy,
43492,0.150061,http://dbpedia.org/resource/Thorstein_Veblen,http://viaf.org/viaf/71451539,,1857-07-30,1929-08-03,,,http://www.wikidata.org/entity/Q4105702,,,1857-07-22T00:00:00Z,1927-03-16T00:00:00Z,,


In [21]:
left_on=["name_bnf"]
right_on=["name_wk"]

In [22]:
matched_results = fuzzymatcher.fuzzy_left_join(BnF_Data,
                                            Wikidata,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_bnf',
                                            right_id_col='uri_wk')

In [34]:
cols_bnf_wk= ["best_match_score","uri_bnf","viaf_bnf", "name_bnf", "yearBirth_bnf", "yearDeath_bnf","placeOfBirth_bnf","placeOfDeath_bnf","bio_bnf", "uri_wk", "viaf_wk", "name_wk","yearBirth_wk","yearDeath_wk","placeOfBirth_wk","placeOfDeath_wk"]

In [49]:
best_match_bnf_wk=matched_results[cols_bnf_wk].sort_values(by=["best_match_score"], ascending=True).head(10)
best_match_bnf_wk

Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,yearBirth_bnf,yearDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf,uri_wk,viaf_wk,name_wk,yearBirth_wk,yearDeath_wk,placeOfBirth_wk,placeOfDeath_wk
429712,-1.702404,http://data.bnf.fr/ark:/12148/cb17791066c#about,,Sayf ibn Ḥamad ibn Šayẖān ibn Muḥammad ibn ...,1892,1961.0,"Sima, Azki - Oman","Matrah, Mascate - Oman",Poète. - Juge,http://www.wikidata.org/entity/Q60576073,http://viaf.org/viaf/47994464,Muhammad al-Imadi,1930.0,,Damascus,
365152,-0.824534,http://data.bnf.fr/ark:/12148/cb12090818k#about,,Federigo Melis,1914,1973.0,"Florence, Italie","Florence, Italie",Historien et économiste. - Professeur d'histoi...,http://www.wikidata.org/entity/Q15879146,,Marie Jacob Hendrik de Bruyn van Melis- en Mar...,1891.0,1964.0,,
152012,-0.690152,http://data.bnf.fr/ark:/12148/cb12536971c#about,http://viaf.org/viaf/39489194,ʿAbd al-Hādī ibn Muḥammad Bū-Ṭālib,1923,2009.0,,,Homme politique. - Professeur de droit constit...,http://www.wikidata.org/entity/Q770901,http://viaf.org/viaf/34529103,Richard T. Ely,1854.0,1943.0,Ripley,Old Lyme
114667,-0.631786,http://data.bnf.fr/ark:/12148/cb10272244m#about,http://viaf.org/viaf/51587696,Yiṣḥaq Zelig Gronemann,1843,1918.0,"Flötenstein (Prusse-Occidentale), aujourd'hui ...",Hannover (Allemagne),"Rabbin. - Père de Sammy Gronemann (1875-1952),...",http://www.wikidata.org/entity/Q2157418,http://viaf.org/viaf/86448277,Robert H. Frank,1945.0,,,
274785,-0.622651,http://data.bnf.fr/ark:/12148/cb16732284q#about,http://viaf.org/viaf/21154368,Yūsuf Hammām Āṣāf,1859,1938.0,,,Historie et homme de lettres. - A été juriste ...,http://www.wikidata.org/entity/Q100989335,,Nils af Björksten,1931.0,,,
220666,-0.62139,http://data.bnf.fr/ark:/12148/cb10099705v#about,http://viaf.org/viaf/286365153,ʿAbd al-qādir ibn Muṣṭafā al- Maġribī,1867,1956.0,Lattaquié,,Juriste. - Journaliste. - A enseigné la langue...,http://www.wikidata.org/entity/Q770901,http://viaf.org/viaf/34529103,Richard T. Ely,1854.0,1943.0,Ripley,Old Lyme
317240,-0.586223,http://data.bnf.fr/ark:/12148/cb14552345w#about,http://viaf.org/viaf/15012801,ʿAzīz Muḥammad Abāẓaẗ,1899,1973.0,Muhafazat al-Charqiyya (Egypte),,Poète et dramaturge. - Avocat. - Député,http://www.wikidata.org/entity/Q4225600,http://viaf.org/viaf/62135397,M. M. Kovalev,1947.0,,Dokšycy District,
115758,-0.585637,http://data.bnf.fr/ark:/12148/cb136243015#about,http://viaf.org/viaf/5109006,Menaḥem Zevi Kaddari,1925,2011.0,Mezőkövesd (Hongrie),Ramat-Gan (Israël),"Linguiste. - Professeur émérite, Department of...",http://www.wikidata.org/entity/Q94878988,http://viaf.org/viaf/57363085,Ernst Am Ende,1804.0,1876.0,,
192591,-0.569673,http://data.bnf.fr/ark:/12148/cb10372967c#about,http://viaf.org/viaf/314851273,Jeanin Roustan,1846,1886.0,Gordes (Vaucluse),Nice (Alpes-Maritimes),Avocat à la Cour d'appel de Lyon. - Prénoms co...,http://www.wikidata.org/entity/Q42257717,http://viaf.org/viaf/94355869,Ann Friedlaender,1938.0,1992.0,Philadelphia,Boston
348242,-0.569062,http://data.bnf.fr/ark:/12148/cb128832222#about,http://viaf.org/viaf/108624624,Muḥammad Ẓafr Allāh H̱ān,1893,1985.0,Sialkot (Pakistan),Lahore (Pakistan),"Juriste, diplomate et homme politique",http://www.wikidata.org/entity/Q60576073,http://viaf.org/viaf/47994464,Muhammad al-Imadi,1930.0,,Damascus,


 ### Second Method: Recordlinkage

In [35]:
import recordlinkage

In [87]:
BnF_Data = pd.read_csv('df_bnf.csv', index_col='id_bnf')
Wikidata = pd.read_csv('df_wk.csv', index_col='id_wk')
DBpedia = pd.read_csv('df_dbp.csv', index_col='id_dbp')

In [88]:
BnF_Data

Unnamed: 0_level_0,uri_bnf,viaf,name_bnf,Sname,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf
id_bnf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836-11-10,1901-05-06,,,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874-06-14,1933-02-02,Joigny (Yonne),Paris,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794-07-03,1845-10-20,Cahors (Lot),,"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852-07-19,19..,,,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,1917,,,Juriste. - Bibliophile
...,...,...,...,...,...,...,...,...,...
11127,http://data.bnf.fr/ark:/12148/cb11475627b#about,,Joan Mitchell,,1920-03-15,2014-02-13,,,Économiste. - Professeur d'économie de l'unive...
11128,http://data.bnf.fr/ark:/12148/cb10562770v#about,,Kazimierz Zimmermann,,1874,1925,Trzemeszno (Pologne),Cracovie (Pologne),Chanoine. - Economiste. - Recteur de l'Univers...
11129,http://data.bnf.fr/ark:/12148/cb17701366b#about,,ʿUmar ʿAzīz,,1949-02-18,2013-02-16,,,Chercheur et professeur d'économie. - Militant...
11130,http://data.bnf.fr/ark:/12148/cb17877820g#about,,John Davenport,,1904-09-11,1987-06-08,"Philadelphie (Pennsylvanie, États-Unis)","Red Bank (New Jersey, États-Unis)","Journaliste économiste. - Journaliste à : ""For..."


In [89]:
DBpedia

Unnamed: 0_level_0,uri_dbp,viaf_dbp,name_dbp,birthDate_dbp,deathDate_dbp,placeOfBirth_dbp,placeOfDeath_dbp
id_dbp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926-02-15,2016-01-18,,
1,http://dbpedia.org/resource/Carlos_Carvalhas,http://viaf.org/viaf/99826658,Carlos Carvalhas,1941-11-09,,"São Pedro do Sul, Portugal",
2,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857-09-22,1943-12-20,,
3,http://dbpedia.org/resource/Paulo_Portas,http://viaf.org/viaf/99455673,Paulo Portas,1962-09-12,,Lisbon,
5,http://dbpedia.org/resource/Pedro_Aspe,http://viaf.org/viaf/9928165,,1950-07-07,,Mexico City,
...,...,...,...,...,...,...,...
9989,http://dbpedia.org/resource/Beatriz_Corredor,,Beatriz Corredor,1968-07-01,,Madrid,
9991,http://dbpedia.org/resource/Beau_Correll,,Beau Correll,1982-07-20,,,
9992,http://dbpedia.org/resource/Bekir_Bozdağ,,Bekir Bozdağ,1965-04-01,,Akdağmadeni,
9997,http://dbpedia.org/resource/Betty_Korir,,Betty Korir,1977-07-01,,Kenya,


In [67]:
BnF_Data

Unnamed: 0_level_0,uri_bnf,viaf_bnf,name_bnf,Sname,dateBirth_bnf,dateDeath_bnf,placeOfBirth_bnf,placeOfDeath_bnf,bio_bnf
id_bnf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836,1901.0,,,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874,1933.0,Joigny (Yonne),Paris,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794,1845.0,Cahors (Lot),,"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852,,,,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,1917.0,,,Juriste. - Bibliophile
...,...,...,...,...,...,...,...,...,...
11196,http://data.bnf.fr/ark:/12148/cb11475627b#about,,Joan Mitchell,,1920,2014.0,,,Économiste. - Professeur d'économie de l'unive...
11197,http://data.bnf.fr/ark:/12148/cb10562770v#about,,Kazimierz Zimmermann,,1874,1925.0,Trzemeszno (Pologne),Cracovie (Pologne),Chanoine. - Economiste. - Recteur de l'Univers...
11198,http://data.bnf.fr/ark:/12148/cb17701366b#about,,ʿUmar ʿAzīz,,1949,2013.0,,,Chercheur et professeur d'économie. - Militant...
11199,http://data.bnf.fr/ark:/12148/cb17877820g#about,,John Davenport,,1904,1987.0,"Philadelphie (Pennsylvanie, États-Unis)","Red Bank (New Jersey, États-Unis)","Journaliste économiste. - Journaliste à : ""For..."


#### Match between BnF Data and DBpedia

indexer = recordlinkage.Index()
indexer.full()

In [90]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='name_bnf', right_on='name_dbp')
candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

4266


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [91]:
compare = recordlinkage.Compare()
compare.string('name_bnf',
            'name_dbp',
            method='jarowinkler',
            threshold=0.85,
            label='name_bnf_dbp')
compare.exact('dateBirth_bnf',
            'birthDate_dbp',
            label='birthDate_bnf_dbp')
compare.exact('dateDeath_bnf',
            'deathDate_dbp',
            label='deathDate_bnf_dbp')
features = compare.compute(candidates, BnF_Data, DBpedia)

In [92]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,name_bnf_dbp,birthDate_bnf_dbp,deathDate_bnf_dbp
id_bnf,id_dbp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,6120,0.0,0,0
20,6013,0.0,0,0
21,307,0.0,0,0
25,4316,1.0,0,0
26,4575,0.0,0,0
...,...,...,...,...
11119,6209,0.0,0,0
11120,7073,0.0,0,0
11124,6264,0.0,0,0
11127,9115,0.0,0,0


In [93]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

3.0      93
2.0      24
1.0    1172
0.0    2977
dtype: int64

In [94]:
features[features.sum(axis=1) > 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,name_bnf_dbp,birthDate_bnf_dbp,deathDate_bnf_dbp
id_bnf,id_dbp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8876,8224,1.0,1,1
9459,210,1.0,1,1
9597,994,1.0,1,1
9946,9555,1.0,1,1
20,2,1.0,1,1
...,...,...,...,...
10593,4503,1.0,1,1
1643,869,1.0,1,1
6734,3361,1.0,1,1
7257,9838,1.0,1,1


In [95]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_bnf_dbp':'deathDate_bnf_dbp'].sum(axis=1)
potential_matches

Unnamed: 0,id_bnf,id_dbp,name_bnf_dbp,birthDate_bnf_dbp,deathDate_bnf_dbp,Score
0,2110,9566,1.0,0,1,2.0
1,5555,8405,1.0,1,0,2.0
2,8876,8224,1.0,1,1,3.0
3,9459,210,1.0,1,1,3.0
4,9597,994,1.0,1,1,3.0
...,...,...,...,...,...,...
112,10651,943,1.0,1,0,2.0
113,1643,869,1.0,1,1,3.0
114,6734,3361,1.0,1,1,3.0
115,7257,9838,1.0,1,1,3.0


In [99]:
BnF_Data.loc[8876,:]

uri_bnf               http://data.bnf.fr/ark:/12148/cb119433785#about
viaf                                    http://viaf.org/viaf/10654890
name_bnf                                            Chitta Ranjan Das
Sname                                                             NaN
dateBirth_bnf                                              1870-11-05
dateDeath_bnf                                              1925-06-16
placeOfBirth_bnf          Bikrampur, Dhaka (maintenant au Bangladesh)
placeOfDeath_bnf                                           Darjeeling
bio_bnf             Poète et homme politique. - Avocat nationalist...
Name: 8876, dtype: object

In [100]:
DBpedia.loc[8224,:]

uri_dbp             http://dbpedia.org/resource/Chittaranjan_Das
viaf_dbp                                                     NaN
name_dbp                                        Chittaranjan Das
birthDate_dbp                                         1870-11-05
deathDate_dbp                                         1925-06-16
placeOfBirth_dbp     Presidencies and provinces of British India
placeOfDeath_dbp                                             NaN
Name: 8224, dtype: object

In [102]:

BnF_Data['name_bnf']=BnF_Data['name_bnf'].astype(str)
DBpedia['name_dbp']=DBpedia['name_dbp'].astype(str)

BnF_Data['viaf']=BnF_Data['viaf'].astype(str)
DBpedia['viaf_dbp']=DBpedia['viaf_dbp'].astype(str)

BnF_Data['uri_bnf']=BnF_Data['uri_bnf'].astype(str)
DBpedia['uri_dbp']=DBpedia['uri_dbp'].astype(str)

BnF_Data['placeOfBirth_bnf']=BnF_Data['placeOfBirth_bnf'].astype(str)
DBpedia['placeOfBirth_dbp']=DBpedia['placeOfBirth_dbp'].astype(str)


BnF_Data['placeOfDeath_bnf']=BnF_Data['placeOfDeath_bnf'].astype(str)
DBpedia['placeOfDeath_dbp']=DBpedia['placeOfDeath_dbp'].astype(str)

BnF_Data['dateBirth_bnf']=BnF_Data['dateBirth_bnf'].astype(str)
DBpedia['birthDate_dbp']=DBpedia['birthDate_dbp'].astype(str)

BnF_Data['dateDeath_bnf']=BnF_Data['dateDeath_bnf'].astype(str)
DBpedia['deathDate_dbp']=DBpedia['deathDate_dbp'].astype(str)

BnF_Data['bio_bnf']=BnF_Data['bio_bnf'].astype(str)

In [107]:
BnF_Data['BnF_Name_Lookup'] = BnF_Data[[
   'name_bnf', 'dateBirth_bnf','dateDeath_bnf','placeOfBirth_bnf','placeOfDeath_bnf','bio_bnf' 
]].apply(lambda x: '|'.join(x), axis=1)

DBpedia['dbp_Name_Lookup'] = DBpedia[[
   'name_dbp', 'birthDate_dbp','deathDate_dbp','placeOfBirth_dbp','placeOfDeath_dbp'
]].apply(lambda x: '|'.join(x), axis=1)

BnF_Data_lookup = BnF_Data[['BnF_Name_Lookup']].reset_index()
DBpedia_lookup = DBpedia[['dbp_Name_Lookup']].reset_index()


In [108]:
BnF_merge = potential_matches.merge(BnF_Data_lookup, how='left')

In [109]:
final_merge = BnF_merge.merge(DBpedia_lookup, how='left')

In [110]:
cols = ['id_bnf', 'id_dbp', 'Score',
        'BnF_Name_Lookup', 'dbp_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
print(len(final))
final

117


Unnamed: 0,id_bnf,id_dbp,Score,BnF_Name_Lookup,dbp_Name_Lookup
0,2110,9566,2.0,Mihai A. Antonescu|1907-11-18|1946-06-01|Nucet...,Mihai Antonescu|1904-11-18|1946-06-01|Kingdom ...
84,9181,3201,2.0,Vilfredo Pareto|1848-07-15|1923-08-20|Paris|Cé...,Vilfredo Pareto|1848-07-15|1923-08-19|nan|nan
34,3014,8313,2.0,Gisèle Halimi|1927-07-27|2020-07-28|La Goulett...,Gisèle Halimi|1927-07-28|2020-07-28|French pro...
33,2607,1493,2.0,William Martin Geldart|1870-06-07|1922-02-12|n...,William Martin Geldart|1870-06-07|--02-12|nan|nan
44,3539,8182,2.0,Camille Blaisot|1881-01-19|1945|Valognes (Manc...,Camille Blaisot|1881-01-19|1945-01-24|Valognes...
...,...,...,...,...,...
32,2585,1483,3.0,Heinrich Lammasch|1853-05-21|1920-01-06|Seiten...,Heinrich Lammasch|1853-05-21|1920-01-06|Lower ...
31,2506,1417,3.0,Hubert Pierlot|1883-12-23|1963-12-13|Cugnon (B...,Hubert Pierlot|1883-12-23|1963-12-13|nan|nan
30,2363,8691,3.0,Jean Drapeau|1916-02-18|1999-08-12|nan|nan|Avo...,Jean Drapeau|1916-02-18|1999-08-12|nan|nan
63,5475,2728,3.0,Morris Raphael Cohen|1880-07-25|1947-01-28|Min...,Morris Raphael Cohen|1880-07-25|1947-01-28|Min...


#### Match between DBpedia and Wikidata

In [125]:
Wikidata

Wikidata['dateBirth_wk'] = Wikidata['dateBirth_wk'].str.rstrip('T00:00:00Z')
Wikidata['dateDeath_wk'] = Wikidata['dateDeath_wk'].str.rstrip('T00:00:00Z')


In [5]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='dateDeath_wk', right_on='deathDate_dbp')
candidates = indexer.index(Wikidata, DBpedia)
print(len(candidates))

8485


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [8]:
compare = recordlinkage.Compare()
compare.string('name_wk',
            'name_dbp',
            threshold=0.85,
            label='name_wk_dbp')
compare.exact('dateDeath_wk',
            'deathDate_dbp',
            label='deathDate_wk_dbp')
compare.exact('dateBirth_wk',
            'birthDate_dbp',
            label='birthDate_wk_dbp')
features = compare.compute(candidates, Wikidata, DBpedia)
# !!!! see if use https://recordlinkage.readthedocs.io/en/latest/ref-compare.html#recordlinkage.Compare.add works to add method like compare 'exact' on 'name'

In [9]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

3.0     414
2.0     229
1.0    1117
0.0    6725
dtype: int64

In [10]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,name_wk_dbp,deathDate_wk_dbp,birthDate_wk_dbp
id_wk,id_dbp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,7331,0.0,0,0
36,9434,0.0,0,0
50,6438,0.0,0,0
55,1465,0.0,0,0
56,2168,1.0,0,1
...,...,...,...,...
56128,9738,0.0,0,0
56168,687,0.0,0,0
56200,7415,0.0,0,0
56236,7486,0.0,0,0


In [14]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_wk_dbp':'birthDate_wk_dbp'].sum(axis=1)
potential_matches

Unnamed: 0,id_wk,id_dbp,name_wk_dbp,deathDate_wk_dbp,birthDate_wk_dbp,Score
0,56,2168,1.0,0,1,2.0
1,57,1218,1.0,0,1,2.0
2,260,1800,1.0,0,1,2.0
3,280,1671,1.0,0,1,2.0
4,289,4514,1.0,0,1,2.0
...,...,...,...,...,...,...
638,4690,362,1.0,0,1,2.0
639,6331,1001,1.0,0,1,2.0
640,7085,328,1.0,0,1,2.0
641,16470,4170,1.0,0,1,2.0


In [15]:
Wikidata.loc[2255,:]

uri_wk             http://www.wikidata.org/entity/Q893668
viaf_wk                    http://viaf.org/viaf/112668786
name_wk                                    Boris Fyodorov
dateBirth_wk                                   1958-02-13
dateDeath_wk                                   2008-11-19
placeOfBirth_wk                                    Moscow
placeOfDeath_wk                                    London
Name: 2255, dtype: object

In [16]:
DBpedia.loc[156,:]

uri_dbp             http://dbpedia.org/resource/Boris_Fyodorov
viaf_dbp                         http://viaf.org/viaf/91429025
name_dbp                                        Boris Fyodorov
birthDate_dbp                                       1958-02-13
deathDate_dbp                                       2008-11-20
placeOfBirth_dbp                                  Soviet Union
placeOfDeath_dbp                                        London
Name: 156, dtype: object

In [17]:
Wikidata['name_wk']=Wikidata['name_wk'].astype(str)
DBpedia['name_dbp']=DBpedia['name_dbp'].astype(str)

Wikidata['viaf_wk']=Wikidata['viaf_wk'].astype(str)
DBpedia['viaf_dbp']=DBpedia['viaf_dbp'].astype(str)

Wikidata['uri_wk']=Wikidata['uri_wk'].astype(str)
DBpedia['uri_dbp']=DBpedia['uri_dbp'].astype(str)

Wikidata['placeOfBirth_wk']=Wikidata['placeOfBirth_wk'].astype(str)
DBpedia['placeOfBirth_dbp']=DBpedia['placeOfBirth_dbp'].astype(str)

Wikidata['placeOfDeath_wk']=Wikidata['placeOfDeath_wk'].astype(str)
DBpedia['placeOfDeath_dbp']=DBpedia['placeOfDeath_dbp'].astype(str)

Wikidata['dateDeath_wk']=Wikidata['dateDeath_wk'].astype(str)
DBpedia['deathDate_dbp']=DBpedia['deathDate_dbp'].astype(str)

Wikidata['dateBirth_wk']=Wikidata['dateBirth_wk'].astype(str)
DBpedia['birthDate_dbp']=DBpedia['birthDate_dbp'].astype(str)

In [18]:
Wikidata['Wikidata_Name_Lookup'] = Wikidata[[
   'name_wk', 'dateBirth_wk','dateDeath_wk','placeOfBirth_wk' ,'placeOfDeath_wk' 
]].apply(lambda x: '|'.join(x), axis=1)

DBpedia['dbp_Name_Lookup'] = DBpedia[[
   'name_dbp', 'birthDate_dbp', 'deathDate_dbp','placeOfBirth_dbp', 'placeOfDeath_dbp'
]].apply(lambda x: '|'.join(x), axis=1)

Wikidata_lookup = Wikidata[['Wikidata_Name_Lookup']].reset_index()
DBpedia_lookup = DBpedia[['dbp_Name_Lookup']].reset_index()


In [19]:
Wikidata_merge = potential_matches.merge(Wikidata_lookup, how='left')

In [20]:
final_merge = Wikidata_merge.merge(DBpedia_lookup, how='left')

In [21]:
cols = ['id_wk', 'id_dbp', 'Score',
        'Wikidata_Name_Lookup', 'dbp_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
print(len(final))
final[:20]

643


Unnamed: 0,id_wk,id_dbp,Score,Wikidata_Name_Lookup,dbp_Name_Lookup
0,56,2168,2.0,Alexander Rüstow|1885-04-08|1963-06-3|Wiesbade...,Alexander Rüstow|1885-04-08|1963-06-30|Wiesbad...
353,7496,3530,2.0,Vasily Garbuzov|1911-07-03|1985-11-12|Belgorod...,Vasily Garbuzov|1911-06-20|1985-11-12|Belgorod...
340,6984,4250,2.0,Marinus van der Goes van Naters|1900-12-21|200...,Goes van Naters|1900-12-21|2005-02-12|Nijmegen...
336,20209,4045,2.0,Imre Oltványi|1893-02-2|1963-01-13|Bácsalmás|B...,Imre Oltványi|1893-02-20|1963-01-13|Austria-Hu...
335,6773,54,2.0,Kenneth Arrow|1921-08-23|2017-02-21|New York C...,nan|1921-08-23|2017-02-21|nan|nan
330,6611,3268,2.0,Simon Kuznets|1901-04-3|1985-07-08|Pinsk|Cambr...,Simon Kuznets|1901-04-30|1985-07-08|Russian Em...
329,6594,2505,2.0,Paul Douglas|1892-03-26|1976-09-24|Salem|Washi...,nan|1892-03-26|1976-09-24|nan|nan
326,6497,4527,2.0,John Peters Humphrey|1905-04-3|1995-03-14|New ...,John Peters Humphrey|1905-04-30|1995-03-14|nan...
641,16470,4170,2.0,Tord Palander|1902-10-06|1972-01-01|Stockholm|nan,Tord Palander|1902-10-06|1972|Sweden|nan
317,19177,3433,2.0,Owen Woodhouse|1916-07-18|2014-04-15|Napier|Au...,Sir Owen Woodhouse|1916-07-18|2014-04-15|nan|nan


--------------------
#### Matched BnF Data and Wikidata

In [373]:
BnF_Data = pd.read_csv('df_bnf.csv', index_col="id_bnf")
Wikidata = pd.read_csv('df_wk.csv', index_col="id_wk")
DBpedia = pd.read_csv('df_dbp.csv', index_col="id_dbp")

In [126]:
indexer = recordlinkage.Index()
indexer.sortedneighbourhood(left_on='name_bnf', right_on='name_wk')
candidates = indexer.index(BnF_Data, Wikidata)
print(len(candidates))

14176


indexer = recordlinkage.Index()
indexer.block(left_on=['name_bnf', 'uri_bnf'],
              right_on=['name_dbp', 'uri_dbp'])
pairs = indexer.index(df_bnf, df_dbp)


candidates = indexer.index(BnF_Data, DBpedia)
print(len(candidates))

In [127]:
compare = recordlinkage.Compare()
compare.string('name_bnf',
            'name_wk',
            method='jarowinkler',
            threshold=0.85,
            label='name_bnf_wk')
compare.exact('dateBirth_bnf',
            'dateBirth_wk',
            label='birthDate_bnf_wk')
compare.exact('dateDeath_bnf',
            'dateDeath_wk',
            label='deathDate_bnf_wk')
features = compare.compute(candidates, BnF_Data, Wikidata)

In [128]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

3.0     603
2.0     408
1.0    6765
0.0    6400
dtype: int64

In [129]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,name_bnf_wk,birthDate_bnf_wk,deathDate_bnf_wk
id_bnf,id_wk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,13374,1.0,0,0
4,17051,0.0,0,0
6,3824,1.0,0,0
7,31467,0.0,0,0
8,14068,0.0,0,0
...,...,...,...,...
11125,44371,1.0,0,0
11127,44913,0.0,0,0
11128,37796,1.0,0,0
11130,35596,0.0,0,0


In [130]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'name_bnf_wk':'deathDate_bnf_wk'].sum(axis=1)
potential_matches

Unnamed: 0,id_bnf,id_wk,name_bnf_wk,birthDate_bnf_wk,deathDate_bnf_wk,Score
0,274,6828,1.0,1,1,3.0
1,529,12224,1.0,1,0,2.0
2,696,931,1.0,1,1,3.0
3,758,15631,1.0,1,1,3.0
4,1741,4339,1.0,1,1,3.0
...,...,...,...,...,...,...
1006,10220,12980,1.0,1,0,2.0
1007,10373,42722,1.0,0,1,2.0
1008,11083,18753,1.0,1,1,3.0
1009,11116,53041,1.0,1,1,3.0


In [136]:
Wikidata.loc[6828,:]

uri_wk                   http://www.wikidata.org/entity/Q200913
viaf_wk                           http://viaf.org/viaf/92914741
name_wk            Barbara Wootton, Baroness Wootton of Abinger
dateBirth_wk                                         1897-04-14
dateDeath_wk                                         1988-07-11
placeOfBirth_wk                                       Cambridge
placeOfDeath_wk                                          Surrey
Name: 6828, dtype: object

In [137]:
BnF_Data.loc[274,:]

uri_bnf               http://data.bnf.fr/ark:/12148/cb12332384r#about
viaf                                    http://viaf.org/viaf/92914741
name_bnf                                              Barbara Wootton
Sname                                                             NaN
dateBirth_bnf                                              1897-04-14
dateDeath_bnf                                              1988-07-11
placeOfBirth_bnf                                                  nan
placeOfDeath_bnf                                                  nan
bio_bnf             Juriste. - Professeur de sciences sociales à l...
BnF_Name_Lookup     Barbara Wootton|1897-04-14|1988-07-11|nan|nan|...
Name: 274, dtype: object

In [139]:
Wikidata['name_wk']=Wikidata['name_wk'].astype(str)
BnF_Data['name_bnf']=BnF_Data['name_bnf'].astype(str)

Wikidata['viaf_wk']=Wikidata['viaf_wk'].astype(str)
BnF_Data['viaf']=BnF_Data['viaf'].astype(str)

Wikidata['uri_wk']=Wikidata['uri_wk'].astype(str)
BnF_Data['uri_bnf']=BnF_Data['uri_bnf'].astype(str)

Wikidata['placeOfBirth_wk']=Wikidata['placeOfBirth_wk'].astype(str)
BnF_Data['placeOfBirth_bnf']=BnF_Data['placeOfBirth_bnf'].astype(str)

Wikidata['placeOfDeath_wk']=Wikidata['placeOfDeath_wk'].astype(str)
BnF_Data['placeOfDeath_bnf']=BnF_Data['placeOfDeath_bnf'].astype(str)

BnF_Data['dateDeath_bnf']=BnF_Data['dateDeath_bnf'].astype(str)
Wikidata['dateDeath_wk']=Wikidata['dateDeath_wk'].astype(str)

BnF_Data['dateBirth_bnf']=BnF_Data['dateBirth_bnf'].astype(str)
Wikidata['dateBirth_wk']=Wikidata['dateBirth_wk'].astype(str)

BnF_Data['bio_bnf']=BnF_Data['bio_bnf'].astype(str)


In [140]:
Wikidata['Wikidata_Name_Lookup'] = Wikidata[[
   'name_wk', 'dateBirth_wk','dateDeath_wk','placeOfBirth_wk' ,'placeOfDeath_wk' 
]].apply(lambda x: '|'.join(x), axis=1)

BnF_Data['bnf_Name_Lookup'] = BnF_Data[[
   'name_bnf','dateBirth_bnf','dateDeath_bnf','placeOfBirth_bnf','placeOfDeath_bnf','bio_bnf' 
]].apply(lambda x: '|'.join(x), axis=1)

Wikidata_lookup = Wikidata[['Wikidata_Name_Lookup']].reset_index()
BnF_Data_lookup = BnF_Data[['bnf_Name_Lookup']].reset_index()

In [141]:
Wikidata_lookup

Unnamed: 0,id_wk,Wikidata_Name_Lookup
0,0,Walter von Keudell|1884-07-17|1973-05-07|Caste...
1,1,Peter Schulz|1930-04-25|2013-05-17|Rostock|Ham...
2,2,Werner Hoyer|1951-11-17|nan|Wuppertal|nan
3,3,Leopold August Warnkönig|1794-08-01|1866-08-19...
4,4,Wolfgang Bötsch|1938-09-08|2017-10-14|Bad Kreu...
...,...,...
53257,56293,nan|1838-05-04|1924-03-09|Madaras|Budapest
53258,56294,nan|1866-01-31|1953-10-13|Jičín|Prague
53259,56295,nan|1935-03-2|nan|Zagaj pri Ponikvi|nan
53260,56297,nan|1903-01-01|1993-01-01|nan|nan


In [142]:
Wikidata_merge = potential_matches.merge(Wikidata_lookup, how='left')

In [143]:
final_merge = Wikidata_merge.merge(BnF_Data_lookup, how='left')

In [146]:
cols = ['id_bnf', 'id_wk', 'Score',
        'bnf_Name_Lookup', 'Wikidata_Name_Lookup']
final=final_merge[cols].sort_values(by=[ 'Score'], ascending=True)
print(len(final))
final[:50]

1011


Unnamed: 0,id_bnf,id_wk,Score,bnf_Name_Lookup,Wikidata_Name_Lookup
505,5492,28339,2.0,Yvon Linant de Bellefonds|1904-08-27|1994-12-2...,Yvon Linant de Bellefonds|1904-01-01|1994-12-2...
517,5730,33371,2.0,István Csekey|1889-02-02|1963-08-07|Szolnok (H...,István Csekey|1889-02-02|1963-01-01|Szolnok|Pécs
519,5818,27749,2.0,Federico Patellani|1911-12-01|1977-02-10|Monza...,Federico Patellani|1911-12-01|1977-01-01|Monza...
520,5881,29965,2.0,Maurice Colin|1859-01-19|1920-09-09|Lyon (Rhôn...,Maurice Colin|1859-01-11|1920-09-09|Lyon|Lyon
521,5887,30171,2.0,Charles Lyon-Caen|1843-12-25|1935-09-17|Paris ...,Charles Lyon-Caen|1843-12-25|1935-12-17|Paris|...
522,5889,5616,2.0,Charles-Frédéric Rau|1803-08-03|1877-04-10|Sav...,Charles-Frédéric Rau|1803-08-03|1877-04-1|Boux...
524,5897,12501,2.0,Robert Walter|1931-01-30|2010-12-25|nan|nan|Ju...,Robert Walter|1931-01-3|2010-12-25|Vienna|Vienna
515,5715,5403,2.0,Arturo Capdevila|1889-03-14|1967-12-20|Cordoba...,"Arturo Capdevila|1889-03-14|1967-12-2|El Pepe,..."
525,5899,27689,2.0,Salvatore Riccobono|1864-01-31|1958-04-05|San ...,Salvatore Riccobono|1864-01-31|1958-04-12|San ...
527,5905,1166,2.0,Ottó Bihari|1921-01-11|1983-01-04|Timis̡oara (...,Ottó Bihari|1921-01-13|1983-01-04|Timișoara|Pécs


In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

Wikidata_name = Wikidata['name_wk'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(Wikidata_name)

BnF_Data_name = BnF_Data['name_bnf'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(BnF_Data_name)


In [72]:
#BnF_Data.head()

In [141]:
import re
def ngrams(string, n=3):
    string = string.encode("UTF-8", errors="ignore").decode() 
    string = string.lower()
    chars_to_remove = [')', '(', '.', '|', '[', ']', '{', '}', "'" ]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) # remove the list of chars defined above
    string = string.replace('&', 'and')
    string = string.replace(',', ' ').replace('-', ' ')
    string = string.title() # Capital at start of each word
    string = re.sub(' +',' ',string).strip() # combine whitespace
    string = ' ' + string + ' ' # pad
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [142]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [143]:
BnF_Data_clean = BnF_Data["name_bnf"].unique()

print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

tfidf = vectorizer.fit_transform(BnF_Data_clean)
print('Vectorizing completed...')

from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

name_column = 'name_wk' #column to match against in the messy data
unique_name = set(Wikidata[name_column].values) # set used for increased performance

Vectorizing the data - this could take a few minutes for large datasets...
Vectorizing completed...


In [144]:
###matching query:
def getNearestN(query):
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices

import time
t1 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(unique_name)
t = time.time()-t1
print("COMPLETED IN:", t)

unique_name = list(unique_name) #need to convert back to a list
print('finding matches...')
matches = []
for i,j in enumerate(indices):
    temp = [round(distances[i][0],2), BnF_Data.values[j][0][2],unique_name[i]]
    matches.append(temp)

print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['Match confidence (lower is better)','Matched name','Original name'])
print('Done')

getting nearest n...


KeyboardInterrupt: 

In [151]:
# Match string with TF_IDF method
# https://bergvca.github.io/2017/10/14/super-fast-string-matching.html
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
import sparse_dot_topn.sparse_dot_topn as ct
from sparse_dot_topn import awesome_cossim_topn

N = 10
a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')

# Use standard implementation

c = awesome_cossim_topn(a, b, N, 0.01)

# Use parallel implementation with 4 threads

d = awesome_cossim_topn(a, b, N, 0.01, use_threads=True, n_jobs=4)

In [152]:
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 0.3138604164123535


In [153]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                            'right_side': right_side,
                              'similairity': similairity})
        


In [156]:
matches_df = get_matches_df(matches, BnF_Data_name, top=11171)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
matches_df.sample(20)


Unnamed: 0,left_side,right_side,similairity
9358,Michel Morineau,Michel Morin,0.820119
6672,Jean Schmidt,Jean Charles Schmidt,0.806333
9999,Jean Charles Schmidt,Jean Schmidt,0.806333
1860,Georges-Henri Bousquet,Georges Bousquet,0.820439
3491,ʿAlī ʿAlāʾ al-Dīn al- ʿĀlūsī,Ṣalāḥ al-Dīn al- Nāhī,0.804265
6364,Édouard Grar,Édouard Gérard,0.921296
5807,Raymond de Geouffre de La Pradelle,Paul de Geouffre de La Pradelle,0.836728
6209,Charles Houyvet,Henri Charles Houyvet,0.89024
1463,Hippolyte Bérard Des Glajeux,Étienne-Hippolyte-Paul Bérard des Glajeux,0.850011
7435,Charles-Jules Giraud,Charles Giraud,0.867012


In [135]:
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
4268,Joseph-Marie de Kersauson de Pennendreff,Joseph-Marc-Marie de Kersauson de Pennendreff,0.948466
5851,Joseph-Marc-Marie de Kersauson de Pennendreff,Joseph-Marie de Kersauson de Pennendreff,0.948466
6809,Rodolphe Dareste de La Chavanne,Pierre-Rodolphe Dareste de La Chavanne,0.937893
1387,Pierre-Rodolphe Dareste de La Chavanne,Rodolphe Dareste de La Chavanne,0.937893
6053,Charles Merveilleux du Vignaux,François-Charles Merveilleux du Vignaux,0.935526
5884,François-Charles Merveilleux du Vignaux,Charles Merveilleux du Vignaux,0.935526
4995,Jean Gilbert Villeneuve,Gilbert Villeneuve,0.925843
910,Gilbert Villeneuve,Jean Gilbert Villeneuve,0.925843
7108,Édouard Gérard,Édouard Grar,0.921296
6364,Édouard Grar,Édouard Gérard,0.921296


In [86]:
matches.head(10)

Unnamed: 0,Match confidence (lower is better),Matched name,Original name
0,1.08,Sándor Boschan,Ernst Geiger
1,1.03,Armand Dorville,Robert Basmann
2,1.1,Jesse James,Paul Javor
3,1.0,Jean-Claude Colliard,Luis Bates
4,1.04,Gian Domenico Pisapia,Max Ludwig Boeckh
5,1.12,Charles Libman,Yann Moulier-Boutang
6,1.15,Frank Eaton,Haldur Grüner
7,1.2,Gabriele Criscuoli,Mikalai Zaichanka
8,0.99,József Katona,Bonifaz Sander
9,1.13,Nicolás Tenorio y Cerero,Angel Agache


### Between Wikidata and DBpedia

In [4]:
merged_df_wk_dbp = pd.merge( Wikidata, DBpedia , on='viaf', how='inner', sort='viaf')
merged_df_wk_dbp[:10]

KeyError: 'viaf'

In [37]:
print("the number of merged data from DBpedia and Wikidata is ",len(merged_df_1), "rows.")
print("")
print("The proportion of the number of merged data from DBpedia with Wikidata is ",((len(merged_df_1))/(len(df_dbp))*100),"%")
print("")
print("proportion of the number of merged data from Wikidata with DBpedia is ",((len(merged_df_1))/(len(df_wk))*100),"%")

the number of merged data from DBpedia and Wikidata is  898 rows.

The proportion of the number of merged data from DBpedia with Wikidata is  54.42424242424242 %

proportion of the number of merged data from Wikidata with DBpedia is  4.152985247190491 %


### Between Wikidata and BnF Data

In [38]:
merged_df_2 = pd.merge( df_wk, df_bnf , on='viaf', how='inner', sort='viaf')
print(len(merged_df_2))
merged_df_2[:10]

112


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q540253,http://viaf.org/viaf/100277874,Clemens Maria Franz von Bönninghausen,1785,http://data.bnf.fr/ark:/12148/cb165146162#about,Clemens Maria Franz von Bönninghausen,,1785,"Avocat, agriculteur, botaniste, homéopathe. - ..."
1,http://www.wikidata.org/entity/Q11724800,http://viaf.org/viaf/101647977,Jerzy Karol Kurnatowski,1874,http://data.bnf.fr/ark:/12148/cb10528392p#about,Jerzy Kurnatowski,,1874,"Publiciste, juriste et économiste"
2,http://www.wikidata.org/entity/Q11738367,http://viaf.org/viaf/101863288,Kazimierz Studentowicz,1903,http://data.bnf.fr/ark:/12148/cb11261387v#about,Kazimierz Studentowicz,,1903,"Juriste, homme politique"
3,http://www.wikidata.org/entity/Q104820757,http://viaf.org/viaf/107036313,Félix Garcin,1879,http://data.bnf.fr/ark:/12148/cb13073916m#about,Félix Garcin,,1879,"Journaliste, directeur de ""Nouvelliste"", Lyon...."
4,http://www.wikidata.org/entity/Q1680590,http://viaf.org/viaf/107145857823423020439,Josef Redlich,1869,http://data.bnf.fr/ark:/12148/cb12818732h#about,Josef Redlich,,1869,Professeur de droit public et d'administration...
5,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
6,http://www.wikidata.org/entity/Q88911,http://viaf.org/viaf/108482851,Otto Nathan,1893,http://data.bnf.fr/ark:/12148/cb12874301d#about,Otto Nathan,,1893,Économiste. - Avocat. - Exécuteur testamentair...
7,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
8,http://www.wikidata.org/entity/Q87110924,http://viaf.org/viaf/110494713,Henri Denis,1913,http://data.bnf.fr/ark:/12148/cb12103181f#about,Henri Denis,,1913,"Docteur en droit (Paris, 1938). - Professeur d..."
9,http://www.wikidata.org/entity/Q1345621,http://viaf.org/viaf/111314151,Marco Biagi,1950,http://data.bnf.fr/ark:/12148/cb150888041#about,Marco Biagi,,1950,Juriste


In [39]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_2), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with Wikidata is ",((len(merged_df_2))/(len(df_bnf))*100),"%")

print("")

print("The proportion of the number of merged data from Wikidata with Wikidata is ",((len(merged_df_2))/(len(df_wk))*100),"%")

The number of merged data from BnF Data and Wikidata is  112 rows.

The proportion of the number of merged data from BnF Data with Wikidata is  1.224445173280857 %

The proportion of the number of merged data from Wikidata with Wikidata is  0.5179669796050502 %


### Between DBpedia and BnF Data

In [40]:
merged_df_3 = pd.merge( df_bnf, df_dbp , on='viaf', how='inner', sort='viaf')
print(len(merged_df_3))
merged_df_3[:10]

88


Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf,uri_dbp,name_dbp,year_dbp
0,http://data.bnf.fr/ark:/12148/cb122145877#about,http://viaf.org/viaf/100966624,John Humphrey,,1905,Juriste. - A été professeur de droit internati...,http://dbpedia.org/resource/John_Peters_Humphrey,John Peters Humphrey,1905
1,http://data.bnf.fr/ark:/12148/cb12327654n#about,http://viaf.org/viaf/107536763,Louis Renault,,1843,Juriste. - Professeur de droit international à...,http://dbpedia.org/resource/Louis_Renault_(jur...,Louis Renault,1843
2,http://data.bnf.fr/ark:/12148/cb122775427#about,http://viaf.org/viaf/108173876,Ronald Myles Dworkin,,1931,Juriste. - Professeur de jurisprudence à la Ya...,http://dbpedia.org/resource/Ronald_Dworkin,,1931
3,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://viaf.org/viaf/108188941,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,...",http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922
4,http://data.bnf.fr/ark:/12148/cb120906270#about,http://viaf.org/viaf/108565309,Paul Abraham Freund,,1908,"Professeur de droit, ""Harvard Law School""",http://dbpedia.org/resource/Paul_A._Freund,Paul Abraham Freund,1908
5,http://data.bnf.fr/ark:/12148/cb119084288#about,http://viaf.org/viaf/108587991,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938
6,http://data.bnf.fr/ark:/12148/cb128832222#about,http://viaf.org/viaf/108624624,Muḥammad Ẓafr Allāh H̱ān,,1893,"Juriste, diplomate et homme politique",http://dbpedia.org/resource/Muhammad_Zafarulla...,CH Muhammad Zafarullah Khan,1893
7,http://data.bnf.fr/ark:/12148/cb12299375j#about,http://viaf.org/viaf/108794549,Karl Engisch,,1899,Juriste. - Spécialiste de philosophie du droit...,http://dbpedia.org/resource/Karl_Engisch,Karl Engisch,1899
8,http://data.bnf.fr/ark:/12148/cb118935370#about,http://viaf.org/viaf/111389197,Georges Bousquet,,1846,Avocat au Barreau de Paris (en 1866). - Engagé...,http://dbpedia.org/resource/Georges_Hilaire_Bo...,Georges Hilaire Bousquet,1845
9,http://data.bnf.fr/ark:/12148/cb12328362p#about,http://viaf.org/viaf/11396531,John Paul Stevens,,1920,Juriste américain,http://dbpedia.org/resource/John_Paul_Stevens,John Paul Stevens,1920


In [41]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_3), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with DBpedia is ",((len(merged_df_3))/(len(df_dbp))*100),"%")

print("")

print("The proportion of the number of merged data from DBpedia with BnF Data is ",((len(merged_df_3))/(len(df_bnf))*100),"%")

The number of merged data from BnF Data and Wikidata is  88 rows.

The proportion of the number of merged data from BnF Data with DBpedia is  5.333333333333334 %

The proportion of the number of merged data from DBpedia with BnF Data is  0.9620640647206734 %


### Between Wikidata, BnF Data and DBpedia

In [42]:
merged_df = pd.merge( merged_df_1, df_bnf , on='viaf', how='inner', sort='viaf')
merged_df[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809,Juriste. - Spécialiste de droit international


In [43]:
print("The number of merged data from DBpedia, Wikidata and BnF Data is",len(merged_df),"rows.")
print("")
print("The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is ",(len(merged_df))/(len(df_bnf))*100,"%" )

The number of merged data from DBpedia, Wikidata and BnF Data is 8 rows.

The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is  0.08746036952006123 %


### Append DBpedia, BnF Data and Wikidata to the merged Dataframe who don't have viaf.org values in common

In [44]:
# To achieve this, I am served on these pages: 
# https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html (en)
# https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html (en)
# http://www.python-simple.com/python-pandas/concatenations-joins-dataframe.php (fr)

result = merged_df.append([merged_df_1, merged_df_2, merged_df_3, df_bnf, df_wk, df_dbp], sort=False)
print(len(result))
result[:10]

33526


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922.0,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938.0,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873.0,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815.0,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891.0,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808.0,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910.0,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809.0,Juriste. - Spécialiste de droit international
0,http://www.wikidata.org/entity/Q9387,http://viaf.org/viaf/100180950,Max Weber,1864,http://dbpedia.org/resource/Max_Weber,,1864,,,,,
1,http://www.wikidata.org/entity/Q15999850,http://viaf.org/viaf/100246974,Peter J. Hammond,1945,http://dbpedia.org/resource/Peter_J._Hammond_(...,Peter Hammond,1945,,,,,


In [45]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section
result_test=result

# Replace null value name

## BnF Data
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_dbp'],result['name_bnf'])
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_wk'],result['name_bnf'])
## DBpedia
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_bnf'],result['name_dbp'])
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_wk'],result['name_dbp'])
## Wikidata
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_bnf'],result['name_wk'])
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_dbp'],result['name_wk'])

# Replace null value year

## BnF Data
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_dbp'],result['year_bnf'])
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_wk'],result['year_bnf'])
## DBpedia
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_bnf'],result['year_dbp'])
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_wk'],result['year_dbp'])
## Wikidata
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_bnf'],result['year_wk'])
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_dbp'],result['year_wk'])

result_test=result_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
result_test[:5]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
0,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://www.wikidata.org/entity/Q518859
1,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,http://data.bnf.fr/ark:/12148/cb119084288#about,http://www.wikidata.org/entity/Q652154
2,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,http://data.bnf.fr/ark:/12148/cb12301152q#about,http://www.wikidata.org/entity/Q3085838
3,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,http://data.bnf.fr/ark:/12148/cb12001622n#about,http://www.wikidata.org/entity/Q61956
4,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,http://data.bnf.fr/ark:/12148/cb12126992f#about,http://www.wikidata.org/entity/Q231690


In [46]:
# Extract only rows without VIAF uri from DBpeida
df_dbp_test = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp', 'year_dbp'])

df_dbp_test_mask=df_dbp_test['viaf']==''
filtered_df_dbp_test = df_dbp_test[df_dbp_test_mask]
print(len(filtered_df_dbp_test))
filtered_df_dbp_test[:10]

6798


Unnamed: 0,uri_dbp,viaf,name_dbp,year_dbp
1747,http://dbpedia.org/resource/Luc-Normand_Tellier,,Luc-Normand Tellier,1944
1748,http://dbpedia.org/resource/Madhu_Verma,,Madhu Verma,1961
1749,http://dbpedia.org/resource/Magda_Kandil,,Magda ElSayed Kandil,1958
1750,http://dbpedia.org/resource/Magnus_Johannesson,,Magnus Johannesson,1964
1751,http://dbpedia.org/resource/Mahendra_P._Lama,,Mahendra P. Lama,1961
1752,http://dbpedia.org/resource/Mainul_Islam,,Mainul Islam,1950
1753,http://dbpedia.org/resource/Urs_Meisterhans,,Urs Meisterhans,1960
1754,http://dbpedia.org/resource/Rosalind_Blauer,,Rosalind Blauer,1943
1755,http://dbpedia.org/resource/Makoto_Yano,,Makoto Yano,1952
1756,http://dbpedia.org/resource/Krzysztof_Zamasz,,Krzysztof Zamasz,1974


In [47]:
# drop duplicates in DBpedia dataframe
filtered_df_dbp_test.drop_duplicates(subset ="uri_dbp", keep = False, inplace=True)
print(len(filtered_df_dbp_test))

6320


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
# Extract only rows without VIAF uri from BnF Data
df_bnf_test = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'sName','year_bnf','bio_bnf'])

df_bnf_test_mask=df_bnf_test['viaf']==''
filtered_df_bnf_test = df_bnf_test[df_bnf_test_mask]
len(filtered_df_bnf_test)

2054

In [49]:
# drop duplicates in BnF Data dataframe

filtered_df_bnf_test.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(filtered_df_bnf_test))

2054


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
##### Extract only rows without VIAF uri from BnF Data
df_wk_test = pd.DataFrame(result_wikidata, columns=['uri_wk', 'viaf', 'name_wk','year_wk'])

df_wk_test_mask=df_wk_test['viaf']==''
filtered_df_wk_test = df_wk_test[df_wk_test_mask]
len(filtered_df_wk_test)
filtered_df_wk_test[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk
22,http://www.wikidata.org/entity/Q116500,,Hans Kaufmann,1948
42,http://www.wikidata.org/entity/Q116475,,Hannes Germann,1956
83,http://www.wikidata.org/entity/Q92134,,Uta Nickel,1941
134,http://www.wikidata.org/entity/Q117426,,Peter Briner,1943
158,http://www.wikidata.org/entity/Q71778,,"Donatus, Landgrave of Hesse",1966
167,http://www.wikidata.org/entity/Q74023,,Liudmyla Denisova,1960
176,http://www.wikidata.org/entity/Q119987,,Martin Baltisser,1969
183,http://www.wikidata.org/entity/Q120799,,Arthur Loepfe,1942
355,http://www.wikidata.org/entity/Q75582,,Shkëlqim Cani,1956
364,http://www.wikidata.org/entity/Q123964,,Werner Hennig,1928


In [51]:
# drop duplicates in Wikidata dataframe

filtered_df_wk_test.drop_duplicates(subset ="uri_wk", keep = 'first', inplace=True)
print(len(filtered_df_wk_test))

7418


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# merge the three dateframes 
filtered_dbp_bnf_test= filtered_df_dbp_test.append(filtered_df_bnf_test, sort=True)
len(filtered_dbp_bnf_test)
filtered_dbp_bnf_wk_test=filtered_dbp_bnf_test.append(filtered_df_wk_test, sort=True)
print(len(filtered_dbp_bnf_wk_test))
filtered_dbp_bnf_wk_test[-1500:]

15792


Unnamed: 0,bio_bnf,name_bnf,name_dbp,name_wk,sName,uri_bnf,uri_dbp,uri_wk,viaf,year_bnf,year_dbp,year_wk
27003,,,,Marek Matejun,,,,http://www.wikidata.org/entity/Q66974612,,,,1977
27008,,,,Lluís Mosella i Ximenez,,,,http://www.wikidata.org/entity/Q66978783,,,,1975
27009,,,,Fawzi Al-Qaisi,,,,http://www.wikidata.org/entity/Q67031593,,,,1926
27010,,,,Hasan Al-Ameri,,,,http://www.wikidata.org/entity/Q66828293,,,,1938
27013,,,,Isidre Sala Queralt,,,,http://www.wikidata.org/entity/Q67123989,,,,1973
27017,,,,Örs Farkas,,,,http://www.wikidata.org/entity/Q105079853,,,,1988
27018,,,,Zsófia Lakatos,,,,http://www.wikidata.org/entity/Q105098380,,,,1975
27019,,,,Nora Grisáková,,,,http://www.wikidata.org/entity/Q105178097,,,,1978
27020,,,,Darko Asomaning Nicholas,,,,http://www.wikidata.org/entity/Q105181487,,,,1939
27021,,,,Philipp Schmidt-Dengler,,,,http://www.wikidata.org/entity/Q105187538,,,,1974


In [53]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_f_test=filtered_dbp_bnf_wk_test

# Replace null value name

## BnF Data
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_bnf'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_bnf'])
## DBpedia
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_dbp'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_dbp'])
## Wikidata
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_wk'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_wk'])

# Replace null value year

## BnF Data
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_bnf'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_bnf'])
## DBpedia
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_dbp'])
## Wikidata
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_wk'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_wk'])

result_f_test=result_f_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
sort_rft=result_f_test.sort_values(by='name', ascending=False)
sort_rft[:10]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
22398,,Απέργης Νικόλαος,1962,,,http://www.wikidata.org/entity/Q38597547
26495,,ʻAbd al-Ḥusayn Waddāy al-ʻAṭīyah,1929,,,http://www.wikidata.org/entity/Q66428907
682,,Əvəz Ələkbərov,1952,,,http://www.wikidata.org/entity/Q1099741
16808,,Željko Topić,1959,,,http://www.wikidata.org/entity/Q17402923
4114,,Štefan Tiso,1897,http://dbpedia.org/resource/Štefan_Tiso,,
4113,,Štefan Osuský,1889,http://dbpedia.org/resource/Štefan_Osuský,,
24512,,Štefan Bukovec,1929,,,http://www.wikidata.org/entity/Q59851859
14999,,Şəfa Əliyev,1959,,,http://www.wikidata.org/entity/Q12849692
22039,,Şükrü Kızılot,1958,,,http://www.wikidata.org/entity/Q38170668


In [54]:
result_final = sort_rft.append([result_test], sort=False)
print(len(result_final))
result_final[200:250]

49318


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
5745,,Yuriy Kolobov,1973,,,http://www.wikidata.org/entity/Q4228078
24521,,Yuriy Dzhygyr,1975,,,http://www.wikidata.org/entity/Q64141038
8881,,Yuriy Bazhal,1950,,,http://www.wikidata.org/entity/Q4075070
12092,,Yurii Boiarskyi,1960,,,http://www.wikidata.org/entity/Q12084492
3426,,Yuri Poluneev,1956,,,http://www.wikidata.org/entity/Q800050
23457,,Yuri Movsisyan,1929,,,http://www.wikidata.org/entity/Q62605638
6397,,Yuri Matochkin,1931,,,http://www.wikidata.org/entity/Q4284901
12274,,Yuri Lohush,1945,,,http://www.wikidata.org/entity/Q12118265
6082,,Yuri Lastochkin,1965,,,http://www.wikidata.org/entity/Q4254837
22169,,Yun Hee-suk,1970,,,http://www.wikidata.org/entity/Q55732976


In [55]:
result_final["name"]=result_final["name"].astype(str)

In [58]:
# Drop duplicates while preserving NaN values
# cf. https://stackoverflow.com/questions/23512339/drop-duplicates-while-preserving-nans-in-pandas

## DBpedia
result_final=result_final[result_final['uri_dbp'].isnull() | ~result_final.duplicated(subset='uri_dbp',keep='first')]
## Wikidata
result_final=result_final[result_final['uri_wk'].isnull() | ~result_final.duplicated(subset='uri_wk',keep='first')]
## BnF_Data
result_final=result_final[result_final['uri_bnf'].isnull() | ~result_final.duplicated(subset='uri_bnf',keep='last')]

result_final=result_final.sort_values(by='name', ascending=False)
print(len(result_final))
result_final[:10]

46926


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
1512,http://viaf.org/viaf/122083064,松本烝治,1877,http://dbpedia.org/resource/Jōji_Matsumoto,,
310,http://viaf.org/viaf/72766671,周鲠生,1889,http://dbpedia.org/resource/Zhou_Gengsheng,,
21523,http://viaf.org/viaf/311774563,برنارد جريتش,1953,,,http://www.wikidata.org/entity/Q57620
514,http://viaf.org/viaf/6088149844962902960006,Тамерлан Кимович Агузар,1963,http://dbpedia.org/resource/Tamerlan_Aguzarov,,
1171,http://viaf.org/viaf/26641927,Мақсұт Нәрікбаев,1940,http://dbpedia.org/resource/Maksut_Narikbaev,,
1507,http://viaf.org/viaf/122252130,Велко Вълканов,1927,http://dbpedia.org/resource/Velko_Valkanov,,
608,http://viaf.org/viaf/56155284772987061505,Андрей Милёхин,1964,http://dbpedia.org/resource/Andrey_Milekhin,,
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
10078,http://viaf.org/viaf/10743147,Аleksandr Rusov,1847,,,http://www.wikidata.org/entity/Q12149410
215,http://viaf.org/viaf/778940,Γεώργιος Χρηστάκης-Ζωγράφος,1863,http://dbpedia.org/resource/Georgios_Christaki...,,


In [63]:
# Test to find similarities between the names BnF Data, DBpedia and Wikidata with collocation

## cf. https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame
## https://www.nltk.org/howto/collocations.html
from nltk.corpus import stopwords
stopset = stopwords.words('english')
stopset = stopwords.words('french')

result_test['tokenized_sents'] = result_test.apply(lambda row: nltk.word_tokenize(row['name']), axis=1)

bcf = TrigramCollocationFinder.from_documents(result_test['name'])
filter_stops = lambda w: w in stopset
bcf.apply_word_filter(filter_stops)
f= bcf.nbest(TrigramAssocMeasures, 100)
print(f)

TypeError: ('expected string or bytes-like object', 'occurred at index 0')

In [42]:
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })

In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

In [11]:
df3_merged = pd.merge(df1, df2)
df3_merged 

Unnamed: 0,user_id,first_name,last_name,email,image_url
0,id001,Rivi,Valti,rvalti0@example.com,http://example.com/img/id001.png
1,id002,Wynnie,McMurty,wmcmurty1@example.com,http://example.com/img/id002.jpg
2,id003,Kristos,Ivanets,kivanets2@example.com,http://example.com/img/id003.bmp
3,id004,Madalyn,Max,mmax3@example.com,http://example.com/img/id004.jpg
4,id005,Tobe,Riddich,triddich4@example.com,http://example.com/img/id005.png
