# Merge BnF DATA, DBpedia and Wikidata

In this notebook, we apply a method to merge three datasets (BnF, DBpedia and Wikidata)

* First, we drop duplicates of each datasets. 

* Secondly, we merge the three datasets  by removing duplicate data. To realise that, we will use the Linkage toolkit who calculate the proximity (by giving a score) between to string from two dataframes.

* Previously, we have to collect data about economists with SPARQL queries.

In [3]:
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, TURTLE, XML, RDFXML
import pprint
import csv
# from bs4 import BeautifulSoup

from collections import Counter
from operator import itemgetter
import pandas as pd
from sqlalchemy import create_engine

# Calling the nltk package to merge the data of people without existing VIAF URI in the two datasets 

In [166]:
query = """
PREFIX  egr:  <http://rdvocab.info/ElementsGr2/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?name ?sName ?uri ?year ?placeOfBirth ?bio
WHERE
  {   { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( ( ( regex(?bio, "juriste", "i") || regex(?bio, "professeur de droit", "i") ) || regex(?bio, "docteur en droit", "i") ) || regex(?bio, "avocat", "i") ) || regex(?bio, "juge", "i") ) || regex(?bio, "magistrat", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf.org", "i")
          }
      }
    UNION
      { ?s  egr:biographicalInformation  ?bio ;
            egr:dateOfBirth       ?bd
        BIND(strbefore(strafter(str(?bd), "http://data.bnf.fr/date/"), "/") AS ?year)
        FILTER ( ?year > "1770" )
        FILTER ( ( ( regex(?bio, "économiste") || regex(?bio, "Economiste") ) || regex(?bio, "professeur d'économie", "i") ) || regex(?bio, "docteur en économie", "i") )
        OPTIONAL
          { ?s  foaf:name  ?name }
        OPTIONAL
          { ?s  egr:placeOfBirth ?placeOfBirth}
        OPTIONAL
          { ?s  skos:prefLabel  ?sName }
        OPTIONAL
          { ?s  owl:sameAs  ?uri1
            FILTER regex(?uri1, "viaf.org", "i")
            BIND(strbefore(str(?uri1), "http://viaf.org/viaf/") AS ?uri)
          }
      }
  }
ORDER BY DESC(?uri)


"""

In [167]:
sparql = SPARQLWrapper("https://data.bnf.fr/sparql") ##, returnFormat=RDFXML)  [LOCALHOST]

In [168]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

In [169]:
try:
    rc_bnf = sparql.queryAndConvert()
except Exception as e:
    print(e)

In [76]:
# Number of rows in the result
len(rc_bnf['results']['bindings'])

11201

In [77]:
# Inspect the first three rows
i = 0
for l in rc_bnf['results']['bindings']:
    if i < 100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb12981404c#about'}, 'name': {'type': 'literal', 'value': 'Léon Garnier'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99996033'}, 'year': {'type': 'literal', 'value': '1836'}, 'bio': {'type': 'literal', 'value': "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb13484444m#about'}, 'name': {'type': 'literal', 'value': 'Gaston de Pawlowski'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9999219'}, 'year': {'type': 'literal', 'value': '1874'}, 'placeOfBirth': {'type': 'literal', 'value': 'Joigny (Yonne)'}, 'bio': {'type': 'literal', 'value': 'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'}}
{'s': {'type': 'uri', 'value': 'http://data.bnf.fr/ark:/12148/cb134841632#about'}, 'name': {'type': 'literal', 'value': 'Jean

In [78]:
result_bnf = []
for l in rc_bnf['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            sName = l['sName']['value']
        except Exception as e:
            sName = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            bio = l['bio']['value']
        except Exception as e:
            bio = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        try: 
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        result_bnf.append([l['s']['value'], uri, name, sName, year,placeOfBirth, bio])        
            
        

In [79]:
print(len(result_bnf))
result_bnf[:10]

11201


[['http://data.bnf.fr/ark:/12148/cb12981404c#about',
  'http://viaf.org/viaf/99996033',
  'Léon Garnier',
  '',
  '1836',
  '',
  "Juriste. - Administrateur et homme de lettres. - En poste à la Préfecture de la Seine. - Frère de l'explorateur Francis Garnier (1839-1873)"],
 ['http://data.bnf.fr/ark:/12148/cb13484444m#about',
  'http://viaf.org/viaf/9999219',
  'Gaston de Pawlowski',
  '',
  '1874',
  'Joigny (Yonne)',
  'Docteur en droit. - Critique littéraire et théâtral. - Rédacteur en chef de "Comoedia"'],
 ['http://data.bnf.fr/ark:/12148/cb134841632#about',
  'http://viaf.org/viaf/9999131',
  'Jean-Michel Berton',
  '',
  '1794',
  'Cahors (Lot)',
  'Écrivain politique, avocat à la Cour de cassation. - Fut fondateur et directeur de la "Revue poétique française et étrangère"'],
 ['http://data.bnf.fr/ark:/12148/cb13379520q#about',
  'http://viaf.org/viaf/9995247',
  'Emmanuel Mathieu',
  '',
  '1852',
  '',
  'Docteur en droit (Paris, 1873)'],
 ['http://data.bnf.fr/ark:/12148/cb13338

In [170]:
query_2= """
PREFIX  dbo:  <http://dbpedia.org/ontology/>
PREFIX  dbp:  <http://dbpedia.org/property/>
PREFIX  owl:  <http://www.w3.org/2002/07/owl#>
PREFIX  dbr:  <http://dbpedia.org/resource/>
PREFIX  xsd:  <http://www.w3.org/2001/XMLSchema#>
PREFIX  foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT  ?s ?uri ?name (year(xsd:dateTime(?Birth_Date)) AS ?year) ?abstract ?placeOfBirth
WHERE
  {   { ?s  a              dbo:Economist ;
          # "Economist" has a class function, he explains the use of the "a".
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
          # We use this filter to preserve only the persons born after 1800. 
          # Here, the method is a little different because we have a date and not a year. 
          # We convert above the date to a year to have the same format as BnF Data.
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name }
        FILTER ( xsd:string(?name) != "Samuel Bowles" )
        OPTIONAL
          { ?s  dbp:birthPlace ?bp}
     # We use this filter to remove the data concerning "Samuel Bowles" 
     # because there is mistakes in data i.e he is related to wrong people -eg. William Turner (cf. https://dbpedia.org/page/Samuel_Bowles_(economist)).
     # We will can fix this problem later, by entering data about him manually.
      }
    UNION
      { ?s  ?propriety  dbr:Economist
      # Here, "Economist" is a instance, we have as triplet "subject predicate object". 
      # In DBpedia, "Economist" is define as a instance and at the same time as a class. So we use both.
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  dbp:birthPlace ?bp}
      }
    UNION
      { ?s  ?p             dbr:Jurist ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
       # We use this filter to remove the data concerning "Cicero". 
       # He appears in results, certainly because his date of birth contains the sign "-" (cf. https://dbpedia.org/page/Cicero).
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace ?bp}
      }
    UNION
      { ?s  ?p             dbr:Lawyer ;
            dbp:birthDate  ?Birth_Date
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
         OPTIONAL
          { ?s  dbp:birthPlace ?bp}
      }
    UNION
      { ?s  a              dbr:Professor ;
            dbp:birthDate  ?Birth_Date ;
            dbo:abstract   ?abstract
        FILTER ( ( ( ( regex(?abstract, "lawyer", "i") || regex(?abstract, "jurist", "i") ) || regex(?abstract, "juriste", "i") ) || regex(?abstract, "attorney", "i") ) || regex(?abstract, "legal professional", "i") )
        OPTIONAL
          { ?s  dbp:name  ?name
            FILTER ( xsd:string(?name) != "Marcus Tullius Cicero" )
            FILTER ( xsd:string(?name) != "Samuel Bowles" )
          }
        FILTER ( xsd:date(?Birth_Date) > "1770-01-01"^^xsd:date )
        OPTIONAL
          { ?s  owl:sameAs  ?uri
            FILTER regex(?uri, "viaf", "i")
          }
        OPTIONAL
          { ?s  dbp:birthPlace ?bp}
      }
    BIND(strafter(str(?bp), "http://dbpedia.org/resource/") AS ?placeOfBirth)
  }
ORDER BY DESC(?uri)
"""

In this query, we have made the choice to aggregate, by a UNION clause, several queries to maximise the results' number. Also we request the "economists" and the "jurists" in only one query. 

Obviously, we chose classes and instances directly related to our population, but also the "professor" instance, because some "economists" or "jurists" are in this instance (we have tried with and without them, and there more result when we use them). 

Also, we exclude all classes because they don't add more result, except the "Economist" class (we keep it) 

For exemple, we exclude the resource "personFunction" and the resource "Jurists" because they add no more data. Additionally, we keep only the "Professor" instance for the jurists (it returns result only for the jurists).

In [171]:
sparql = SPARQLWrapper("https://dbpedia.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [172]:
sparql.setQuery(query_2)
sparql.setReturnFormat(JSON)

In [173]:
rc_db = sparql.queryAndConvert()

In [174]:
# Number of rows in the result
len(rc_db['results']['bindings'])

10000

In [180]:
# Inspect the first three rows
i = 0
for l in rc_db['results']['bindings']:
    if i <100:
        print(l)
        i += 1

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/António_de_Almeida_Santos'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/99921066'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'António de Almeida Santos'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1926'}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Anita_Augspurg'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9976800'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Anita Augspurg'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'value': '1857'}, 'placeOfBirth': {'type': 'literal', 'value': ''}}
{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Mason_Gaffney'}, 'uri': {'type': 'uri', 'value': 'http://viaf.org/viaf/9960617'}, 'name': {'type': 'literal', 'xml:lang': 'en', 'value': 'Mason Gaffney'}, 'year': {'type': 'typed-literal', 'datatype': 'http://www.w3.o

In [181]:
# Create a list with URI, VIAF URI, name, year
result_dbpedia = []
for l in rc_db['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        try:
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        result_dbpedia.append([l['s']['value'], uri, name, year, placeOfBirth])

In [182]:
# Inspect the first three of the list
result_dbpedia[:10]

[['http://dbpedia.org/resource/António_de_Almeida_Santos',
  'http://viaf.org/viaf/99921066',
  'António de Almeida Santos',
  '1926',
  ''],
 ['http://dbpedia.org/resource/Anita_Augspurg',
  'http://viaf.org/viaf/9976800',
  'Anita Augspurg',
  '1857',
  ''],
 ['http://dbpedia.org/resource/Mason_Gaffney',
  'http://viaf.org/viaf/9960617',
  'Mason Gaffney',
  '1923',
  ''],
 ['http://dbpedia.org/resource/Hermann_Heinrich_Gossen',
  'http://viaf.org/viaf/9939728',
  'Hermann Heinrich Gossen',
  '1810',
  'Düren'],
 ['http://dbpedia.org/resource/Gottfried_Haberler',
  'http://viaf.org/viaf/99257315',
  'Gottfried Haberler',
  '1900',
  'Purkersdorf'],
 ['http://dbpedia.org/resource/Gottfried_Haberler',
  'http://viaf.org/viaf/99257315',
  'Gottfried Haberler',
  '1900',
  'Austria-Hungary'],
 ['http://dbpedia.org/resource/Michael_C._Burda',
  'http://viaf.org/viaf/9922987',
  'Michael C. Burda',
  '1959',
  ''],
 ['http://dbpedia.org/resource/Xavier_Vives',
  'http://viaf.org/viaf/99203

In [188]:
query_3= """
PREFIX  rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX  wd:   <http://www.wikidata.org/entity/>
PREFIX  wdt:  <http://www.wikidata.org/prop/direct/>

SELECT DISTINCT  ?s ?uri ?name ?year ?placeOfBirth
WHERE
  {   { ?s  wdt:P106  wd:Q188094 } # Economists
    OPTIONAL
      { ?s  wdt:P569  wd:Q1322263 } # date of birth
       OPTIONAL
      { ?s  wdt:P19 ?placeOfBirth1.
       ?placeOfBirth1 wdt:P276 ?placeOfBirth}
    OPTIONAL
      { ?s  wdt:P214  ?oldURI
        BIND(uri(concat("http://viaf.org/viaf/", strafter(str(?oldURI), ""))) AS ?uri) # It's useful to have the URI VIAF in the same and merge data.
      }
    BIND(year(?dob) AS ?year)
    FILTER ( ?year > 1770 )
    OPTIONAL
      { ?s  rdfs:label  ?name
        FILTER ( lang(?name) = "en" )
      }
  }
"""

In [189]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")  ## returnFormat=RDFXML)  [LOCALHOST]

In [190]:
sparql.setQuery(query_3)
sparql.setReturnFormat(JSON)

In [191]:
rc_wk = sparql.queryAndConvert()

In [192]:
# Number of rows in the result
len(rc_wk['results']['bindings'])

0

In [193]:
# Create a list with URI, VIAF URI, name, year
result_wikidata = []
for l in rc_wk['results']['bindings']:
        try: 
            name = l['name']['value']
        except Exception as e:
            name = ''
        try: 
            uri = l['uri']['value']
        except Exception as e:
            uri = ''
        try: 
            year = l['year']['value']
        except Exception as e:
            year = ''
        try:
            placeOfBirth = l['placeOfBirth']['value']
        except Exception as e:
            placeOfBirth = ''
        result_wikidata.append([l['s']['value'], uri, name, year, placeOfBirth])

In [194]:
result_wikidata[:10]

[]

In [29]:
engine = create_engine('sqlite:///database.sqlite_2', echo=False)

In [30]:
df_bnf = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf_bnf', 'name_bnf', 'sName', 'year_bnf', 'bio_bnf'])
print(len(df_bnf))
df_bnf.fillna('')

df_bnf[:10]

11201


Unnamed: 0,uri_bnf,viaf_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://data.bnf.fr/ark:/12148/cb12981404c#about,http://viaf.org/viaf/99996033,Léon Garnier,,1836,Juriste. - Administrateur et homme de lettres....
1,http://data.bnf.fr/ark:/12148/cb13484444m#about,http://viaf.org/viaf/9999219,Gaston de Pawlowski,,1874,Docteur en droit. - Critique littéraire et thé...
2,http://data.bnf.fr/ark:/12148/cb134841632#about,http://viaf.org/viaf/9999131,Jean-Michel Berton,,1794,"Écrivain politique, avocat à la Cour de cassat..."
3,http://data.bnf.fr/ark:/12148/cb13379520q#about,http://viaf.org/viaf/9995247,Emmanuel Mathieu,,1852,"Docteur en droit (Paris, 1873)"
4,http://data.bnf.fr/ark:/12148/cb13338312g#about,http://viaf.org/viaf/9994322,Josiah Henry Benton,,1843,Juriste. - Bibliophile
5,http://data.bnf.fr/ark:/12148/cb13322315v#about,http://viaf.org/viaf/9991357,Paul Pic,,1862,Juriste. - Professeur de droit à la Faculté de...
6,http://data.bnf.fr/ark:/12148/cb13193319k#about,http://viaf.org/viaf/9989230,Gaston Ravisse,,1877,Avocat. - Spécialiste du monde de l'entreprise...
7,http://data.bnf.fr/ark:/12148/cb15042710d#about,http://viaf.org/viaf/99857689,Cândido Jucá Filho,,1900,Avocat
8,http://data.bnf.fr/ark:/12148/cb13169620f#about,http://viaf.org/viaf/9985289,Joseph de Trémaudan,,1846,Juge à Paimboeuf. - Historien local
9,http://data.bnf.fr/ark:/12148/cb13075767f#about,http://viaf.org/viaf/9982622,Achille Villey-Desmeserets,,1878,Avocat. - Préfet


In [31]:
df_bnf.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(df_bnf))

11100


In [32]:
df_dbp = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf_dbp', 'name_dbp', 'year_dbp'])
print(len(df_dbp))
df_dbp.fillna('')
df_dbp.head()

8545


Unnamed: 0,uri_dbp,viaf_dbp,name_dbp,year_dbp
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926
1,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857
2,http://dbpedia.org/resource/Mason_Gaffney,http://viaf.org/viaf/9960617,Mason Gaffney,1923
3,http://dbpedia.org/resource/Hermann_Heinrich_G...,http://viaf.org/viaf/9939728,Hermann Heinrich Gossen,1810
4,http://dbpedia.org/resource/Gottfried_Haberler,http://viaf.org/viaf/99257315,Gottfried Haberler,1900


In [33]:
df_dbp.drop_duplicates(subset ="uri_dbp", keep = 'first', inplace=True)
print(len(df_dbp))
df_dbp.head()

8008


Unnamed: 0,uri_dbp,viaf_dbp,name_dbp,year_dbp
0,http://dbpedia.org/resource/António_de_Almeida...,http://viaf.org/viaf/99921066,António de Almeida Santos,1926
1,http://dbpedia.org/resource/Anita_Augspurg,http://viaf.org/viaf/9976800,Anita Augspurg,1857
2,http://dbpedia.org/resource/Mason_Gaffney,http://viaf.org/viaf/9960617,Mason Gaffney,1923
3,http://dbpedia.org/resource/Hermann_Heinrich_G...,http://viaf.org/viaf/9939728,Hermann Heinrich Gossen,1810
4,http://dbpedia.org/resource/Gottfried_Haberler,http://viaf.org/viaf/99257315,Gottfried Haberler,1900


In [34]:
df_wk= pd.DataFrame(result_wikidata, columns=['uri_wk', 'viaf_wk', 'name_wk', 'year_wk'])
print(len(df_wk))
df_wk.fillna('')
df_wk.head()

29424


Unnamed: 0,uri_wk,viaf_wk,name_wk,year_wk
0,http://www.wikidata.org/entity/Q65561,http://viaf.org/viaf/232142151,Hans Apel,1932
1,http://www.wikidata.org/entity/Q65589,http://viaf.org/viaf/41917922,Alexander Rüstow,1885
2,http://www.wikidata.org/entity/Q114668,http://viaf.org/viaf/111986511,Nicholas Johannsen,1844
3,http://www.wikidata.org/entity/Q114753,http://viaf.org/viaf/45055809,Adolf Ficker,1816
4,http://www.wikidata.org/entity/Q114498,http://viaf.org/viaf/64075666,Kurt W. Rothschild,1914


In [35]:
df_wk.drop_duplicates(subset ='uri_wk', keep = 'first', inplace=True)
df_dbp.fillna('')
print(len(df_wk))
df_wk.head()

29058


Unnamed: 0,uri_wk,viaf_wk,name_wk,year_wk
0,http://www.wikidata.org/entity/Q65561,http://viaf.org/viaf/232142151,Hans Apel,1932
1,http://www.wikidata.org/entity/Q65589,http://viaf.org/viaf/41917922,Alexander Rüstow,1885
2,http://www.wikidata.org/entity/Q114668,http://viaf.org/viaf/111986511,Nicholas Johannsen,1844
3,http://www.wikidata.org/entity/Q114753,http://viaf.org/viaf/45055809,Adolf Ficker,1816
4,http://www.wikidata.org/entity/Q114498,http://viaf.org/viaf/64075666,Kurt W. Rothschild,1914


## RecordLinked

It permits to calculate a match score between to strings. Here, we use the "fuzzymatcher" library.

This article explains  very well how uses it. cf. https://pbpython.com/record-linking.html

There is also a documentation but it is very light. cf. https://fuzzymatcher.readthedocs.io/en/latest/

It seems work well only for the strings.

In [36]:
from pathlib import Path
import fuzzymatcher

In [37]:
left_on=["name_bnf", "year_bnf"]
right_on=["name_dbp", "year_dbp"]

In [38]:
matched_results = fuzzymatcher.fuzzy_left_join(df_bnf,
                                            df_dbp,
                                            left_on,
                                            right_on,
                                            left_id_col='uri_bnf',
                                            right_id_col='uri_dbp')

In [39]:
cols_bnf_dbp= ["best_match_score","uri_bnf","viaf_bnf", "name_bnf", "year_bnf", "bio_bnf", "uri_dbp", "viaf_dbp", "name_dbp", "year_dbp"]

In [40]:
maReSo=matched_results[cols_bnf_dbp].sort_values(by=["best_match_score"], ascending=False).head(10)
maReSo

Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,year_bnf,bio_bnf,uri_dbp,viaf_dbp,name_dbp,year_dbp
568510,1.321915,http://data.bnf.fr/ark:/12148/cb10114251z#about,,Clément Charles Sabrevois de Bleury,1798,Avocat et député. - Cofondateur du journal can...,http://dbpedia.org/resource/Clément-Charles_Sa...,,Clément-Charles Sabrevois de Bleury,1798
68276,1.298857,http://data.bnf.fr/ark:/12148/cb11926733m#about,http://viaf.org/viaf/73860740,Jean-Louis Tixier-Vignancour,1907,Avocat. - Ancien député. - Candidat à l'électi...,http://dbpedia.org/resource/Jean-Louis_Tixier-...,,Jean-Louis Tixier-Vignancour,1907
99846,1.190376,http://data.bnf.fr/ark:/12148/cb10011983m#about,http://viaf.org/viaf/65994069,Oswald Kerchove de Denterghem,1844,Avocat. - A été sénateur provincial libéral du...,http://dbpedia.org/resource/Oswald_de_Kerchove...,,Oswald de Kerchove de Denterghem,1844
249233,1.179464,http://data.bnf.fr/ark:/12148/cb11298933w#about,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882,Homme politique. - Avocat près la cour d'appel...,http://dbpedia.org/resource/Auguste_Champetier...,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882
88091,1.17446,http://data.bnf.fr/ark:/12148/cb12091668t#about,http://viaf.org/viaf/68956010,Adam von Trott zu Solz,1909,"Juriste allemand, exécuté après l'attentat du ...",http://dbpedia.org/resource/Adam_von_Trott_zu_...,,Adam von Trott zu Solz,1909
52091,1.158476,http://data.bnf.fr/ark:/12148/cb12278447x#about,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,1779,Juriste. - Spécialiste de droit romain. - Mini...,http://dbpedia.org/resource/Friedrich_Carl_von...,http://viaf.org/viaf/76377829,Friedrich Carl von Savigny,1779
548892,1.032706,http://data.bnf.fr/ark:/12148/cb10668371q#about,,Stephan Kekulé von Stradonitz,1863,Juriste. - Héraldiste et généalogiste,http://dbpedia.org/resource/Stephan_Kekulé_von...,,Stephan Kekulé von Stradonitz,1863
155865,1.023191,http://data.bnf.fr/ark:/12148/cb130896095#about,http://viaf.org/viaf/51826740,Gustave Rolin-Jaequemyns,1835,Juriste et homme politique. - Conseiller du ro...,http://dbpedia.org/resource/Gustave_Rolin-Jaeq...,,Gustave Rolin-Jaequemyns,1835
88267,1.021999,http://data.bnf.fr/ark:/12148/cb12054658d#about,http://viaf.org/viaf/68949059,Helmuth James von Moltke,1907,Comte. - Juriste. - Militant actif de la résis...,http://dbpedia.org/resource/Helmuth_James_von_...,http://viaf.org/viaf/68949059,Helmuth James Graf von Moltke,1907
466577,1.009763,http://data.bnf.fr/ark:/12148/cb12029534q#about,http://viaf.org/viaf/110689328,Kurt Georg Kiesinger,1904,"Avocat, membre du parti chrétien-démocrate (CD...",http://dbpedia.org/resource/Kurt_Georg_Kiesinger,,Kurt Georg Kiesinger,1904


In [46]:
matched_bnf_dbp=matched_results[cols_bnf_dbp].query("best_match_score <= .5").sort_values(
    by=['best_match_score'], ascending=False)
print(len(matched_bnf_dbp))
matched_bnf_dbp[:10]

10882


Unnamed: 0,best_match_score,uri_bnf,viaf_bnf,name_bnf,year_bnf,bio_bnf,uri_dbp,viaf_dbp,name_dbp,year_dbp
501451,0.497742,http://data.bnf.fr/ark:/12148/cb119070534#about,,Friedrich August Hayek,1899,Économiste. - Prix Nobel d'économie (1974) ave...,http://dbpedia.org/resource/Friedrich_Hayek,http://viaf.org/viaf/2471646,Friedrich Hayek,1899
194401,0.49594,http://data.bnf.fr/ark:/12148/cb12283356m#about,http://viaf.org/viaf/44360349,Hans Gross,1847,Juriste. - Professeur de droit pénal à l'Unive...,http://dbpedia.org/resource/Hans_Gross,http://viaf.org/viaf/44360349,Hans Gross,1847
298980,0.49512,http://data.bnf.fr/ark:/12148/cb15772206m#about,http://viaf.org/viaf/28569136,Jesse James,1875,"Avocat. - Fils de : ""James, Jesse (1847-1882)""",http://dbpedia.org/resource/Jesse_E._James,,Jesse E. James,1875
517215,0.494536,http://data.bnf.fr/ark:/12148/cb13592522g#about,,Thomas Tooke,1774,Economiste,http://dbpedia.org/resource/Thomas_Tooke,http://viaf.org/viaf/69099901,Thomas Tooke,1774
112968,0.486569,http://data.bnf.fr/ark:/12148/cb11043700s#about,http://viaf.org/viaf/62455068,Mihai A. Antonescu,1907,Juriste. - Homme politique. - Vice-président d...,http://dbpedia.org/resource/Mihai_Antonescu,,Mihai Antonescu,1904
500067,0.485437,http://data.bnf.fr/ark:/12148/cb122779519#about,,Erik Robert Lindahl,1891,Economiste. - Spécialiste de la théorie économ...,http://dbpedia.org/resource/Erik_Lindahl,http://viaf.org/viaf/110766422,Erik Lindahl,1891
278649,0.483876,http://data.bnf.fr/ark:/12148/cb16912557k#about,http://viaf.org/viaf/305457158,Urbano Rattazzi,1918,Combattant 1939-1945. - Avocat,http://dbpedia.org/resource/Urbano_Rattazzi,,Urbano Rattazzi,1808
175402,0.482397,http://data.bnf.fr/ark:/12148/cb12205077k#about,http://viaf.org/viaf/49274108,Henry Wheaton,1785,Juriste. - Rapporteur des décisions de la Cour...,http://dbpedia.org/resource/Henry_Wheaton,http://viaf.org/viaf/49274108,Henry Wheaton,1785
69688,0.482011,http://data.bnf.fr/ark:/12148/cb109316994#about,http://viaf.org/viaf/73845703,József Antall,1896,"Juriste. - Homme politique, membre du FKgP, Fü...",http://dbpedia.org/resource/József_Antall,http://viaf.org/viaf/100281452,József Antall,1932
109197,0.478863,http://data.bnf.fr/ark:/12148/cb11552619r#about,http://viaf.org/viaf/63999619,Courtenay Ilbert,1841,Administrateur aux Indes. - Juriste,http://dbpedia.org/resource/Courtenay_Ilbert,,Sir Courtenay Ilbert,1841


In [149]:
bnf_dbp=pd.DataFrame(matched_bnf_dbp, columns=['uri_bnf','uri_dbp','viaf_bnf','name_bnf', 'year_bnf','bio_bnf'])

Unnamed: 0,uri_bnf,uri_dbp,viaf_bnf,name_bnf,year_bnf,bio_bnf
268744,http://data.bnf.fr/ark:/12148/cb10114251z#about,http://dbpedia.org/resource/Clément-Charles_Sa...,,Clément Charles Sabrevois de Bleury,1798,Avocat et député. - Cofondateur du journal can...
31393,http://data.bnf.fr/ark:/12148/cb11926733m#about,http://dbpedia.org/resource/Jean-Louis_Tixier-...,http://viaf.org/viaf/73860740,Jean-Louis Tixier-Vignancour,1907,Avocat. - Ancien député. - Candidat à l'électi...
45980,http://data.bnf.fr/ark:/12148/cb10011983m#about,http://dbpedia.org/resource/Oswald_de_Kerchove...,http://viaf.org/viaf/65994069,Oswald Kerchove de Denterghem,1844,Avocat. - A été sénateur provincial libéral du...
114765,http://data.bnf.fr/ark:/12148/cb11298933w#about,http://dbpedia.org/resource/Auguste_Champetier...,http://viaf.org/viaf/31990071,Auguste Champetier de Ribes,1882,Homme politique. - Avocat près la cour d'appel...
40725,http://data.bnf.fr/ark:/12148/cb12091668t#about,http://dbpedia.org/resource/Adam_von_Trott_zu_...,http://viaf.org/viaf/68956010,Adam von Trott zu Solz,1909,"Juriste allemand, exécuté après l'attentat du ..."
...,...,...,...,...,...,...
16036,http://data.bnf.fr/ark:/12148/cb119684600#about,http://dbpedia.org/resource/Friedrich_Litten,http://viaf.org/viaf/8180478,Friedrich Julius Stahl,1802,Juriste et homme politique
230094,http://data.bnf.fr/ark:/12148/cb122145877#about,http://dbpedia.org/resource/John_Peters_Humphrey,http://viaf.org/viaf/100966624,John Humphrey,1905,Juriste. - A été professeur de droit internati...
250042,http://data.bnf.fr/ark:/12148/cb12136045n#about,http://dbpedia.org/resource/Rafael_Núñez_(poli...,,Rafael Núñez,1825,"Avocat (1844). - Journaliste, critique littéra..."
62706,http://data.bnf.fr/ark:/12148/cb13623255k#about,http://dbpedia.org/resource/Yves_Guyot,http://viaf.org/viaf/56786852,Yves Gautier,1964,Professeur de droit public à l'Université Robe...


In [151]:
matched_results=fuzzymatcher.fuzzy_left_join(df_wk, df_dbp, left_on = "name_wk", right_on = "name_dbp")

In [153]:
cols_wk_dbp= ["best_match_score","uri_wk","viaf_wk", "name_wk", "year_wk", "uri_dbp", "viaf_dbp", "name_dbp", "year_dbp"]

In [154]:
matched_results[cols_wk_dbp].sort_values(by=["best_match_score"], ascending=False).head(10)


Unnamed: 0,best_match_score,uri_wk,viaf_wk,name_wk,year_wk,uri_dbp,viaf_dbp,name_dbp,year_dbp
5926,1.237479,http://www.wikidata.org/entity/Q76787,http://viaf.org/viaf/94496,Johann Heinrich von Thünen,1783,http://dbpedia.org/resource/Johann_Heinrich_vo...,http://viaf.org/viaf/94496,Johann Heinrich von Thünen,1783
187837,1.146062,http://www.wikidata.org/entity/Q7045351,,Noah Arthur William Cox-George,1915,http://dbpedia.org/resource/Noah_Arthur_Willia...,,Noah Arthur William Cox-George,1915
68486,1.107536,http://www.wikidata.org/entity/Q923932,http://viaf.org/viaf/109542626,Nicolaas Wilhelmus Posthumus,1880,http://dbpedia.org/resource/Nicolaas_Wilhelmus...,http://viaf.org/viaf/109542626,Nicolaas Wilhelmus Posthumus,1880
37514,1.058618,http://www.wikidata.org/entity/Q1397415,http://viaf.org/viaf/55359941,Pieter Cort van der Linden,1846,http://dbpedia.org/resource/Pieter_Cort_van_de...,http://viaf.org/viaf/55359941,Pieter Cort van der Linden,1846
6923,1.01736,http://www.wikidata.org/entity/Q124316,http://viaf.org/viaf/271422976,Beatrice Weder di Mauro,1965,http://dbpedia.org/resource/Beatrice_Weder_di_...,http://viaf.org/viaf/271422976,Beatrice Weder di Mauro,1965
134683,1.013436,http://www.wikidata.org/entity/Q6217103,http://viaf.org/viaf/26640819,Johannes de Villiers Graaff,1928,http://dbpedia.org/resource/Johannes_de_Villie...,http://viaf.org/viaf/26640819,Johannes de Villiers Graaff,1928
53456,0.995761,http://www.wikidata.org/entity/Q311535,http://viaf.org/viaf/2478543,Eugen Böhm von Bawerk,1851,http://dbpedia.org/resource/Eugen_von_Böhm-Bawerk,,Eugen von Böhm Bawerk,1851
41779,0.980262,http://www.wikidata.org/entity/Q717658,http://viaf.org/viaf/74536680,Jesús Huerta de Soto,1956,http://dbpedia.org/resource/Jesús_Huerta_de_Soto,http://viaf.org/viaf/74536680,Jesús Huerta de Soto,1956
7647,0.971187,http://www.wikidata.org/entity/Q99056,http://viaf.org/viaf/81290410,Juergen B. Donges,1940,http://dbpedia.org/resource/Juergen_B._Donges,http://viaf.org/viaf/81290410,Juergen B. Donges,1940
63297,0.938401,http://www.wikidata.org/entity/Q2911361,http://viaf.org/viaf/46870389,Richard Ritter von Strigl,1891,http://dbpedia.org/resource/Richard_Ritter_von...,http://viaf.org/viaf/46870389,Richard Ritter von Strigl,1891


In [162]:
matched_results[cols_wk_dbp].query("best_match_score <= .35").sort_values(
    by=['best_match_score'], ascending=False).head(5)


Unnamed: 0,best_match_score,uri_wk,viaf_wk,name_wk,year_wk,uri_dbp,viaf_dbp,name_dbp,year_dbp
69937,0.349337,http://www.wikidata.org/entity/Q2132497,http://viaf.org/viaf/287696779,Jan Kuperus,1929,http://dbpedia.org/resource/Jan_Kapras,,Jan Kapras,1880
140890,0.348196,http://www.wikidata.org/entity/Q5234253,http://viaf.org/viaf/44516500,David Gordon,1948,http://dbpedia.org/resource/David_Gordon_(econ...,http://viaf.org/viaf/88341444,David Gordon,1944
140789,0.348196,http://www.wikidata.org/entity/Q5234251,http://viaf.org/viaf/88341444,David Gordon,1944,http://dbpedia.org/resource/David_Gordon_(econ...,http://viaf.org/viaf/88341444,David Gordon,1944
287759,0.347937,http://www.wikidata.org/entity/Q15127189,http://viaf.org/viaf/281244795,Michael Corbey,1963,http://dbpedia.org/resource/Michael_Kirby_(judge),http://viaf.org/viaf/76378094,Michael Kirby,1939
164202,0.346746,http://www.wikidata.org/entity/Q5412242,,Fernando Londoño,1944,http://dbpedia.org/resource/Fernando_Londoño_y...,,Fernando Londoño y Londoño,1910


## Merge between different databases wich have a VIAF URI in common

### Between Wikidata and DBpedia

In [36]:
merged_df_1 = pd.merge( df_wk, df_dbp , on='viaf', how='inner', sort='viaf')
merged_df_1[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp
0,http://www.wikidata.org/entity/Q9387,http://viaf.org/viaf/100180950,Max Weber,1864,http://dbpedia.org/resource/Max_Weber,,1864
1,http://www.wikidata.org/entity/Q15999850,http://viaf.org/viaf/100246974,Peter J. Hammond,1945,http://dbpedia.org/resource/Peter_J._Hammond_(...,Peter Hammond,1945
2,http://www.wikidata.org/entity/Q157255,http://viaf.org/viaf/100258394,Merton Miller,1923,http://dbpedia.org/resource/Merton_Miller,Merton Miller,1923
3,http://www.wikidata.org/entity/Q15990097,http://viaf.org/viaf/100274542,Michael Kaser,1926,http://dbpedia.org/resource/Michael_Kaser,Michael Kaser,1926
4,http://www.wikidata.org/entity/Q9025488,http://viaf.org/viaf/100278275,Luiz Carlos Bresser Pereira,1934,http://dbpedia.org/resource/Luiz_Carlos_Bresse...,Luiz Carlos Bresser-Pereira,1934
5,http://www.wikidata.org/entity/Q968139,http://viaf.org/viaf/100304945,Paul A. Baran,1909,http://dbpedia.org/resource/Paul_A._Baran,Paul Alexander Baran,1909
6,http://www.wikidata.org/entity/Q6794860,http://viaf.org/viaf/100595201,Max Hirsch,1852,http://dbpedia.org/resource/Max_Hirsch_(econom...,Max Hirsch,1852
7,http://www.wikidata.org/entity/Q2608419,http://viaf.org/viaf/10127341,Thomas Woods,1972,http://dbpedia.org/resource/Thomas_Woods,Thomas Woods,1972
8,http://www.wikidata.org/entity/Q78604,http://viaf.org/viaf/101860744,Fritz Machlup,1902,http://dbpedia.org/resource/Fritz_Machlup,Fritz Machlup,1902
9,http://www.wikidata.org/entity/Q11763850,http://viaf.org/viaf/101991628,Ludwik Landau,1902,http://dbpedia.org/resource/Ludwik_Maurycy_Landau,Ludwik Maurycy Landau,1902


In [37]:
print("the number of merged data from DBpedia and Wikidata is ",len(merged_df_1), "rows.")
print("")
print("The proportion of the number of merged data from DBpedia with Wikidata is ",((len(merged_df_1))/(len(df_dbp))*100),"%")
print("")
print("proportion of the number of merged data from Wikidata with DBpedia is ",((len(merged_df_1))/(len(df_wk))*100),"%")

the number of merged data from DBpedia and Wikidata is  898 rows.

The proportion of the number of merged data from DBpedia with Wikidata is  54.42424242424242 %

proportion of the number of merged data from Wikidata with DBpedia is  4.152985247190491 %


### Between Wikidata and BnF Data

In [38]:
merged_df_2 = pd.merge( df_wk, df_bnf , on='viaf', how='inner', sort='viaf')
print(len(merged_df_2))
merged_df_2[:10]

112


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q540253,http://viaf.org/viaf/100277874,Clemens Maria Franz von Bönninghausen,1785,http://data.bnf.fr/ark:/12148/cb165146162#about,Clemens Maria Franz von Bönninghausen,,1785,"Avocat, agriculteur, botaniste, homéopathe. - ..."
1,http://www.wikidata.org/entity/Q11724800,http://viaf.org/viaf/101647977,Jerzy Karol Kurnatowski,1874,http://data.bnf.fr/ark:/12148/cb10528392p#about,Jerzy Kurnatowski,,1874,"Publiciste, juriste et économiste"
2,http://www.wikidata.org/entity/Q11738367,http://viaf.org/viaf/101863288,Kazimierz Studentowicz,1903,http://data.bnf.fr/ark:/12148/cb11261387v#about,Kazimierz Studentowicz,,1903,"Juriste, homme politique"
3,http://www.wikidata.org/entity/Q104820757,http://viaf.org/viaf/107036313,Félix Garcin,1879,http://data.bnf.fr/ark:/12148/cb13073916m#about,Félix Garcin,,1879,"Journaliste, directeur de ""Nouvelliste"", Lyon...."
4,http://www.wikidata.org/entity/Q1680590,http://viaf.org/viaf/107145857823423020439,Josef Redlich,1869,http://data.bnf.fr/ark:/12148/cb12818732h#about,Josef Redlich,,1869,Professeur de droit public et d'administration...
5,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
6,http://www.wikidata.org/entity/Q88911,http://viaf.org/viaf/108482851,Otto Nathan,1893,http://data.bnf.fr/ark:/12148/cb12874301d#about,Otto Nathan,,1893,Économiste. - Avocat. - Exécuteur testamentair...
7,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
8,http://www.wikidata.org/entity/Q87110924,http://viaf.org/viaf/110494713,Henri Denis,1913,http://data.bnf.fr/ark:/12148/cb12103181f#about,Henri Denis,,1913,"Docteur en droit (Paris, 1938). - Professeur d..."
9,http://www.wikidata.org/entity/Q1345621,http://viaf.org/viaf/111314151,Marco Biagi,1950,http://data.bnf.fr/ark:/12148/cb150888041#about,Marco Biagi,,1950,Juriste


In [39]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_2), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with Wikidata is ",((len(merged_df_2))/(len(df_bnf))*100),"%")

print("")

print("The proportion of the number of merged data from Wikidata with Wikidata is ",((len(merged_df_2))/(len(df_wk))*100),"%")

The number of merged data from BnF Data and Wikidata is  112 rows.

The proportion of the number of merged data from BnF Data with Wikidata is  1.224445173280857 %

The proportion of the number of merged data from Wikidata with Wikidata is  0.5179669796050502 %


### Between DBpedia and BnF Data

In [40]:
merged_df_3 = pd.merge( df_bnf, df_dbp , on='viaf', how='inner', sort='viaf')
print(len(merged_df_3))
merged_df_3[:10]

88


Unnamed: 0,uri_bnf,viaf,name_bnf,sName,year_bnf,bio_bnf,uri_dbp,name_dbp,year_dbp
0,http://data.bnf.fr/ark:/12148/cb122145877#about,http://viaf.org/viaf/100966624,John Humphrey,,1905,Juriste. - A été professeur de droit internati...,http://dbpedia.org/resource/John_Peters_Humphrey,John Peters Humphrey,1905
1,http://data.bnf.fr/ark:/12148/cb12327654n#about,http://viaf.org/viaf/107536763,Louis Renault,,1843,Juriste. - Professeur de droit international à...,http://dbpedia.org/resource/Louis_Renault_(jur...,Louis Renault,1843
2,http://data.bnf.fr/ark:/12148/cb122775427#about,http://viaf.org/viaf/108173876,Ronald Myles Dworkin,,1931,Juriste. - Professeur de jurisprudence à la Ya...,http://dbpedia.org/resource/Ronald_Dworkin,,1931
3,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://viaf.org/viaf/108188941,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,...",http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922
4,http://data.bnf.fr/ark:/12148/cb120906270#about,http://viaf.org/viaf/108565309,Paul Abraham Freund,,1908,"Professeur de droit, ""Harvard Law School""",http://dbpedia.org/resource/Paul_A._Freund,Paul Abraham Freund,1908
5,http://data.bnf.fr/ark:/12148/cb119084288#about,http://viaf.org/viaf/108587991,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938
6,http://data.bnf.fr/ark:/12148/cb128832222#about,http://viaf.org/viaf/108624624,Muḥammad Ẓafr Allāh H̱ān,,1893,"Juriste, diplomate et homme politique",http://dbpedia.org/resource/Muhammad_Zafarulla...,CH Muhammad Zafarullah Khan,1893
7,http://data.bnf.fr/ark:/12148/cb12299375j#about,http://viaf.org/viaf/108794549,Karl Engisch,,1899,Juriste. - Spécialiste de philosophie du droit...,http://dbpedia.org/resource/Karl_Engisch,Karl Engisch,1899
8,http://data.bnf.fr/ark:/12148/cb118935370#about,http://viaf.org/viaf/111389197,Georges Bousquet,,1846,Avocat au Barreau de Paris (en 1866). - Engagé...,http://dbpedia.org/resource/Georges_Hilaire_Bo...,Georges Hilaire Bousquet,1845
9,http://data.bnf.fr/ark:/12148/cb12328362p#about,http://viaf.org/viaf/11396531,John Paul Stevens,,1920,Juriste américain,http://dbpedia.org/resource/John_Paul_Stevens,John Paul Stevens,1920


In [41]:
print("The number of merged data from BnF Data and Wikidata is ",len(merged_df_3), "rows.")

print("")

print("The proportion of the number of merged data from BnF Data with DBpedia is ",((len(merged_df_3))/(len(df_dbp))*100),"%")

print("")

print("The proportion of the number of merged data from DBpedia with BnF Data is ",((len(merged_df_3))/(len(df_bnf))*100),"%")

The number of merged data from BnF Data and Wikidata is  88 rows.

The proportion of the number of merged data from BnF Data with DBpedia is  5.333333333333334 %

The proportion of the number of merged data from DBpedia with BnF Data is  0.9620640647206734 %


### Between Wikidata, BnF Data and DBpedia

In [42]:
merged_df = pd.merge( merged_df_1, df_bnf , on='viaf', how='inner', sort='viaf')
merged_df[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809,Juriste. - Spécialiste de droit international


In [43]:
print("The number of merged data from DBpedia, Wikidata and BnF Data is",len(merged_df),"rows.")
print("")
print("The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is ",(len(merged_df))/(len(df_bnf))*100,"%" )

The number of merged data from DBpedia, Wikidata and BnF Data is 8 rows.

The proportion of the number of merged data from DBpedia, Wikidata and BnF Data is  0.08746036952006123 %


### Append DBpedia, BnF Data and Wikidata to the merged Dataframe who don't have viaf.org values in common

In [44]:
# To achieve this, I am served on these pages: 
# https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html (en)
# https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html (en)
# http://www.python-simple.com/python-pandas/concatenations-joins-dataframe.php (fr)

result = merged_df.append([merged_df_1, merged_df_2, merged_df_3, df_bnf, df_wk, df_dbp], sort=False)
print(len(result))
result[:10]

33526


Unnamed: 0,uri_wk,viaf,name_wk,year_wk,uri_dbp,name_dbp,year_dbp,uri_bnf,name_bnf,sName,year_bnf,bio_bnf
0,http://www.wikidata.org/entity/Q518859,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,Gordon Tullock,1922,http://data.bnf.fr/ark:/12148/cb11927239j#about,Gordon Tullock,,1922.0,"Docteur en droit (University of Chicago, Ill.,..."
1,http://www.wikidata.org/entity/Q652154,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,Alexis Jacquemin,1938,http://data.bnf.fr/ark:/12148/cb119084288#about,Alexis Jacquemin,,1938.0,Juriste et économiste. - Professeur à l'Univer...
2,http://www.wikidata.org/entity/Q3085838,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,François Simiand,1873,http://data.bnf.fr/ark:/12148/cb12301152q#about,François Simiand,,1873.0,Philosophe. - Agrégé de philosophie. - Docteur...
3,http://www.wikidata.org/entity/Q61956,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,Lorenz von Stein,1815,http://data.bnf.fr/ark:/12148/cb12001622n#about,Lorenz von Stein,,1815.0,"Juriste et économiste. - Professeur à Kiel, Al..."
4,http://www.wikidata.org/entity/Q231690,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,Bhimrao Ramji Ambedkar,1891,http://data.bnf.fr/ark:/12148/cb12126992f#about,Bhimrao Ramji Ambedkar,,1891.0,Homme politique d'origine harijan mahar. - Étu...
5,http://www.wikidata.org/entity/Q215961,http://viaf.org/viaf/50021033,Franz Hermann Schulze-Delitzsch,1808,http://dbpedia.org/resource/Franz_Hermann_Schu...,Hermann Schulze-Delitzsch,1808,http://data.bnf.fr/ark:/12148/cb12088660j#about,Hermann Schulze-Delitzsch,,1808.0,"Juriste, homme politique et économiste alleman..."
6,http://www.wikidata.org/entity/Q4893263,http://viaf.org/viaf/69263532,Joan Sardà i Dexeus,1910,http://dbpedia.org/resource/Joan_Sardà_i_Dexeus,Joan Sardà i Dexeus,1910,http://data.bnf.fr/ark:/12148/cb158098327#about,Juan Sardá Dexeus,,1910.0,Docteur en droit. - Économiste
7,http://www.wikidata.org/entity/Q7836141,http://viaf.org/viaf/73921034,Travers Twiss,1809,http://dbpedia.org/resource/Travers_Twiss,Travers Twiss,1809,http://data.bnf.fr/ark:/12148/cb12314495r#about,Travers Twiss,,1809.0,Juriste. - Spécialiste de droit international
0,http://www.wikidata.org/entity/Q9387,http://viaf.org/viaf/100180950,Max Weber,1864,http://dbpedia.org/resource/Max_Weber,,1864,,,,,
1,http://www.wikidata.org/entity/Q15999850,http://viaf.org/viaf/100246974,Peter J. Hammond,1945,http://dbpedia.org/resource/Peter_J._Hammond_(...,Peter Hammond,1945,,,,,


In [45]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section
result_test=result

# Replace null value name

## BnF Data
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_dbp'],result['name_bnf'])
result_test['name'] = np.where(result['name_bnf'].isnull(),result['name_wk'],result['name_bnf'])
## DBpedia
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_bnf'],result['name_dbp'])
result_test['name'] = np.where(result['name_dbp'].isnull(),result['name_wk'],result['name_dbp'])
## Wikidata
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_bnf'],result['name_wk'])
result_test['name'] = np.where(result['name_wk'].isnull(),result['name_dbp'],result['name_wk'])

# Replace null value year

## BnF Data
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_dbp'],result['year_bnf'])
result_test['year'] = np.where(result['year_bnf'].isnull(),result['year_wk'],result['year_bnf'])
## DBpedia
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_bnf'],result['year_dbp'])
result_test['year'] = np.where(result['year_dbp'].isnull(),result['year_wk'],result['year_dbp'])
## Wikidata
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_bnf'],result['year_wk'])
result_test['year'] = np.where(result['year_wk'].isnull(),result['year_dbp'],result['year_wk'])

result_test=result_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
result_test[:5]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
0,http://viaf.org/viaf/108188941,Gordon Tullock,1922,http://dbpedia.org/resource/Gordon_Tullock,http://data.bnf.fr/ark:/12148/cb11927239j#about,http://www.wikidata.org/entity/Q518859
1,http://viaf.org/viaf/108587991,Alexis Jacquemin,1938,http://dbpedia.org/resource/Alexis_Jacquemin,http://data.bnf.fr/ark:/12148/cb119084288#about,http://www.wikidata.org/entity/Q652154
2,http://viaf.org/viaf/32062931,François Simiand,1873,http://dbpedia.org/resource/François_Simiand,http://data.bnf.fr/ark:/12148/cb12301152q#about,http://www.wikidata.org/entity/Q3085838
3,http://viaf.org/viaf/44308789,Lorenz von Stein,1815,http://dbpedia.org/resource/Lorenz_von_Stein,http://data.bnf.fr/ark:/12148/cb12001622n#about,http://www.wikidata.org/entity/Q61956
4,http://viaf.org/viaf/44331988,B. R. Ambedkar,1891,http://dbpedia.org/resource/B._R._Ambedkar,http://data.bnf.fr/ark:/12148/cb12126992f#about,http://www.wikidata.org/entity/Q231690


In [46]:
# Extract only rows without VIAF uri from DBpeida
df_dbp_test = pd.DataFrame(result_dbpedia, columns=['uri_dbp', 'viaf', 'name_dbp', 'year_dbp'])

df_dbp_test_mask=df_dbp_test['viaf']==''
filtered_df_dbp_test = df_dbp_test[df_dbp_test_mask]
print(len(filtered_df_dbp_test))
filtered_df_dbp_test[:10]

6798


Unnamed: 0,uri_dbp,viaf,name_dbp,year_dbp
1747,http://dbpedia.org/resource/Luc-Normand_Tellier,,Luc-Normand Tellier,1944
1748,http://dbpedia.org/resource/Madhu_Verma,,Madhu Verma,1961
1749,http://dbpedia.org/resource/Magda_Kandil,,Magda ElSayed Kandil,1958
1750,http://dbpedia.org/resource/Magnus_Johannesson,,Magnus Johannesson,1964
1751,http://dbpedia.org/resource/Mahendra_P._Lama,,Mahendra P. Lama,1961
1752,http://dbpedia.org/resource/Mainul_Islam,,Mainul Islam,1950
1753,http://dbpedia.org/resource/Urs_Meisterhans,,Urs Meisterhans,1960
1754,http://dbpedia.org/resource/Rosalind_Blauer,,Rosalind Blauer,1943
1755,http://dbpedia.org/resource/Makoto_Yano,,Makoto Yano,1952
1756,http://dbpedia.org/resource/Krzysztof_Zamasz,,Krzysztof Zamasz,1974


In [47]:
# drop duplicates in DBpedia dataframe
filtered_df_dbp_test.drop_duplicates(subset ="uri_dbp", keep = False, inplace=True)
print(len(filtered_df_dbp_test))

6320


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
# Extract only rows without VIAF uri from BnF Data
df_bnf_test = pd.DataFrame(result_bnf, columns=['uri_bnf', 'viaf', 'name_bnf', 'sName','year_bnf','bio_bnf'])

df_bnf_test_mask=df_bnf_test['viaf']==''
filtered_df_bnf_test = df_bnf_test[df_bnf_test_mask]
len(filtered_df_bnf_test)

2054

In [49]:
# drop duplicates in BnF Data dataframe

filtered_df_bnf_test.drop_duplicates(subset ="uri_bnf", keep = 'first', inplace=True)
print(len(filtered_df_bnf_test))

2054


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
##### Extract only rows without VIAF uri from BnF Data
df_wk_test = pd.DataFrame(result_wikidata, columns=['uri_wk', 'viaf', 'name_wk','year_wk'])

df_wk_test_mask=df_wk_test['viaf']==''
filtered_df_wk_test = df_wk_test[df_wk_test_mask]
len(filtered_df_wk_test)
filtered_df_wk_test[:10]

Unnamed: 0,uri_wk,viaf,name_wk,year_wk
22,http://www.wikidata.org/entity/Q116500,,Hans Kaufmann,1948
42,http://www.wikidata.org/entity/Q116475,,Hannes Germann,1956
83,http://www.wikidata.org/entity/Q92134,,Uta Nickel,1941
134,http://www.wikidata.org/entity/Q117426,,Peter Briner,1943
158,http://www.wikidata.org/entity/Q71778,,"Donatus, Landgrave of Hesse",1966
167,http://www.wikidata.org/entity/Q74023,,Liudmyla Denisova,1960
176,http://www.wikidata.org/entity/Q119987,,Martin Baltisser,1969
183,http://www.wikidata.org/entity/Q120799,,Arthur Loepfe,1942
355,http://www.wikidata.org/entity/Q75582,,Shkëlqim Cani,1956
364,http://www.wikidata.org/entity/Q123964,,Werner Hennig,1928


In [51]:
# drop duplicates in Wikidata dataframe

filtered_df_wk_test.drop_duplicates(subset ="uri_wk", keep = 'first', inplace=True)
print(len(filtered_df_wk_test))

7418


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
# merge the three dateframes 
filtered_dbp_bnf_test= filtered_df_dbp_test.append(filtered_df_bnf_test, sort=True)
len(filtered_dbp_bnf_test)
filtered_dbp_bnf_wk_test=filtered_dbp_bnf_test.append(filtered_df_wk_test, sort=True)
print(len(filtered_dbp_bnf_wk_test))
filtered_dbp_bnf_wk_test[-1500:]

15792


Unnamed: 0,bio_bnf,name_bnf,name_dbp,name_wk,sName,uri_bnf,uri_dbp,uri_wk,viaf,year_bnf,year_dbp,year_wk
27003,,,,Marek Matejun,,,,http://www.wikidata.org/entity/Q66974612,,,,1977
27008,,,,Lluís Mosella i Ximenez,,,,http://www.wikidata.org/entity/Q66978783,,,,1975
27009,,,,Fawzi Al-Qaisi,,,,http://www.wikidata.org/entity/Q67031593,,,,1926
27010,,,,Hasan Al-Ameri,,,,http://www.wikidata.org/entity/Q66828293,,,,1938
27013,,,,Isidre Sala Queralt,,,,http://www.wikidata.org/entity/Q67123989,,,,1973
27017,,,,Örs Farkas,,,,http://www.wikidata.org/entity/Q105079853,,,,1988
27018,,,,Zsófia Lakatos,,,,http://www.wikidata.org/entity/Q105098380,,,,1975
27019,,,,Nora Grisáková,,,,http://www.wikidata.org/entity/Q105178097,,,,1978
27020,,,,Darko Asomaning Nicholas,,,,http://www.wikidata.org/entity/Q105181487,,,,1939
27021,,,,Philipp Schmidt-Dengler,,,,http://www.wikidata.org/entity/Q105187538,,,,1974


In [53]:
# Replace each null value with the value from the other database.
# Create a column with the fresh values.
# I used this page: https://kanoki.org/2019/08/17/pandas-coalesce-replace-value-from-another-column/ 
# particuliarly, the "Using Numpy" section

result_f_test=filtered_dbp_bnf_wk_test

# Replace null value name

## BnF Data
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_bnf'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_bnf'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_bnf'])
## DBpedia
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_dbp'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_dbp'].isnull(),filtered_dbp_bnf_wk_test['name_wk'],filtered_dbp_bnf_wk_test['name_dbp'])
## Wikidata
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_bnf'],filtered_dbp_bnf_wk_test['name_wk'])
result_f_test['name'] = np.where(filtered_dbp_bnf_wk_test['name_wk'].isnull(),filtered_dbp_bnf_wk_test['name_dbp'],filtered_dbp_bnf_wk_test['name_wk'])

# Replace null value year

## BnF Data
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_bnf'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_bnf'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_bnf'])
## DBpedia
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_dbp'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_dbp'].isnull(),filtered_dbp_bnf_wk_test['year_wk'],filtered_dbp_bnf_wk_test['year_dbp'])
## Wikidata
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_bnf'],filtered_dbp_bnf_wk_test['year_wk'])
result_f_test['year'] = np.where(filtered_dbp_bnf_wk_test['year_wk'].isnull(),filtered_dbp_bnf_wk_test['year_dbp'],filtered_dbp_bnf_wk_test['year_wk'])

result_f_test=result_f_test.loc[:,['viaf','name','year','uri_dbp','uri_bnf', 'uri_wk']]
sort_rft=result_f_test.sort_values(by='name', ascending=False)
sort_rft[:10]

Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
22398,,Απέργης Νικόλαος,1962,,,http://www.wikidata.org/entity/Q38597547
26495,,ʻAbd al-Ḥusayn Waddāy al-ʻAṭīyah,1929,,,http://www.wikidata.org/entity/Q66428907
682,,Əvəz Ələkbərov,1952,,,http://www.wikidata.org/entity/Q1099741
16808,,Željko Topić,1959,,,http://www.wikidata.org/entity/Q17402923
4114,,Štefan Tiso,1897,http://dbpedia.org/resource/Štefan_Tiso,,
4113,,Štefan Osuský,1889,http://dbpedia.org/resource/Štefan_Osuský,,
24512,,Štefan Bukovec,1929,,,http://www.wikidata.org/entity/Q59851859
14999,,Şəfa Əliyev,1959,,,http://www.wikidata.org/entity/Q12849692
22039,,Şükrü Kızılot,1958,,,http://www.wikidata.org/entity/Q38170668


In [54]:
result_final = sort_rft.append([result_test], sort=False)
print(len(result_final))
result_final[200:250]

49318


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
5745,,Yuriy Kolobov,1973,,,http://www.wikidata.org/entity/Q4228078
24521,,Yuriy Dzhygyr,1975,,,http://www.wikidata.org/entity/Q64141038
8881,,Yuriy Bazhal,1950,,,http://www.wikidata.org/entity/Q4075070
12092,,Yurii Boiarskyi,1960,,,http://www.wikidata.org/entity/Q12084492
3426,,Yuri Poluneev,1956,,,http://www.wikidata.org/entity/Q800050
23457,,Yuri Movsisyan,1929,,,http://www.wikidata.org/entity/Q62605638
6397,,Yuri Matochkin,1931,,,http://www.wikidata.org/entity/Q4284901
12274,,Yuri Lohush,1945,,,http://www.wikidata.org/entity/Q12118265
6082,,Yuri Lastochkin,1965,,,http://www.wikidata.org/entity/Q4254837
22169,,Yun Hee-suk,1970,,,http://www.wikidata.org/entity/Q55732976


In [55]:
result_final["name"]=result_final["name"].astype(str)

In [58]:
# Drop duplicates while preserving NaN values
# cf. https://stackoverflow.com/questions/23512339/drop-duplicates-while-preserving-nans-in-pandas

## DBpedia
result_final=result_final[result_final['uri_dbp'].isnull() | ~result_final.duplicated(subset='uri_dbp',keep='first')]
## Wikidata
result_final=result_final[result_final['uri_wk'].isnull() | ~result_final.duplicated(subset='uri_wk',keep='first')]
## BnF_Data
result_final=result_final[result_final['uri_bnf'].isnull() | ~result_final.duplicated(subset='uri_bnf',keep='last')]

result_final=result_final.sort_values(by='name', ascending=False)
print(len(result_final))
result_final[:10]

46926


Unnamed: 0,viaf,name,year,uri_dbp,uri_bnf,uri_wk
1512,http://viaf.org/viaf/122083064,松本烝治,1877,http://dbpedia.org/resource/Jōji_Matsumoto,,
310,http://viaf.org/viaf/72766671,周鲠生,1889,http://dbpedia.org/resource/Zhou_Gengsheng,,
21523,http://viaf.org/viaf/311774563,برنارد جريتش,1953,,,http://www.wikidata.org/entity/Q57620
514,http://viaf.org/viaf/6088149844962902960006,Тамерлан Кимович Агузар,1963,http://dbpedia.org/resource/Tamerlan_Aguzarov,,
1171,http://viaf.org/viaf/26641927,Мақсұт Нәрікбаев,1940,http://dbpedia.org/resource/Maksut_Narikbaev,,
1507,http://viaf.org/viaf/122252130,Велко Вълканов,1927,http://dbpedia.org/resource/Velko_Valkanov,,
608,http://viaf.org/viaf/56155284772987061505,Андрей Милёхин,1964,http://dbpedia.org/resource/Andrey_Milekhin,,
20357,,Александр Валерьевич Дубилет,1962,,,http://www.wikidata.org/entity/Q20066943
10078,http://viaf.org/viaf/10743147,Аleksandr Rusov,1847,,,http://www.wikidata.org/entity/Q12149410
215,http://viaf.org/viaf/778940,Γεώργιος Χρηστάκης-Ζωγράφος,1863,http://dbpedia.org/resource/Georgios_Christaki...,,


In [63]:
# Test to find similarities between the names BnF Data, DBpedia and Wikidata with collocation

## cf. https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame
## https://www.nltk.org/howto/collocations.html
from nltk.corpus import stopwords
stopset = stopwords.words('english')
stopset = stopwords.words('french')

result_test['tokenized_sents'] = result_test.apply(lambda row: nltk.word_tokenize(row['name']), axis=1)

bcf = TrigramCollocationFinder.from_documents(result_test['name'])
filter_stops = lambda w: w in stopset
bcf.apply_word_filter(filter_stops)
f= bcf.nbest(TrigramAssocMeasures, 100)
print(f)

TypeError: ('expected string or bytes-like object', 'occurred at index 0')

In [42]:
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })

In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

In [11]:
df3_merged = pd.merge(df1, df2)
df3_merged 

Unnamed: 0,user_id,first_name,last_name,email,image_url
0,id001,Rivi,Valti,rvalti0@example.com,http://example.com/img/id001.png
1,id002,Wynnie,McMurty,wmcmurty1@example.com,http://example.com/img/id002.jpg
2,id003,Kristos,Ivanets,kivanets2@example.com,http://example.com/img/id003.bmp
3,id004,Madalyn,Max,mmax3@example.com,http://example.com/img/id004.jpg
4,id005,Tobe,Riddich,triddich4@example.com,http://example.com/img/id005.png
