Given two sets of person instances $S=\{s_1, s_2, \ldots, s_n\}$ and $T=\{t_1, t_2, \ldots, t_m\}$ belonging to two respective data sources, the NMVW and Bronbeek museums, the task is to find the set $M=\{(s_i, t_j) \in S \times T : s_i \text{ and } t_j \text{ refer to the same real-world entity}\}$, where $1 \le i \le n$ and $1 \le j \le m$.

The following code tries several algorithms to find matching constituents between NMVW and Bronbeek based on their name literals.

In [1]:
import sys
sys.path.append("..") 

In [2]:
import pandas, pickle

In [3]:
# Load the two constituent tables that every matcher below compares:
# df1 from the NMVW person-names pickle, df2 from the Bronbeek CSV dump.
nmvw_names_path = "../nmvw_data/ccrdfconst/person_names.pkl"
# NOTE(review): absolute, machine-specific path — confirm the location before
# running this cell on another machine.
bronbeek_csv_path = "/Users/sarah_shoilee/Desktop/Sarah/Bronbeek_Data/csv_dump/Constituents.csv"

df1 = pandas.read_pickle(nmvw_names_path)
df2 = pandas.read_csv(bronbeek_csv_path)

In [4]:
# Report the number of rows in each source table.
nmvw_count = len(df1.index)
bronbeek_count = len(df2.index)
print(f"The size of NMVW unique constituent: {nmvw_count} and size of Bronbeek constituent: {bronbeek_count}")

The size of NMVW unique constituent: 39567 and size of Bronbeek constituent: 15382


# Exact string matching

In [5]:
from matchexactstring.match_exact_string import matchExactString

In [6]:
# Run the exact-string matcher over both tables, then count the rows whose
# "MATCH" column is flagged "YES".
result_exact = matchExactString(df1, df2)
is_exact_hit = result_exact["MATCH"] == "YES"
exact_match = len(result_exact[is_exact_hit])
print(f"The number of positive match is {exact_match}")

15382it [31:54,  8.03it/s]

The number of positive match is 0





In [7]:
# Persist the exact-match result so it can be reloaded later without
# re-running the matcher.
with open("results/bronbeekToNmvwExactMatchResults.pkl", "wb") as out_file:
    pickle.dump(result_exact, out_file, pickle.HIGHEST_PROTOCOL)

In [4]:
# Reload the saved exact-match result and show ten random rows that were
# flagged "NO", restricted to the columns relevant for inspection.
df = pandas.read_pickle("results/bronbeekToNmvwExactMatchResults.pkl")
shown_columns = ['FullName', 'DisplayName', 'RetrievedNames', 'MATCH']
df.loc[df["MATCH"] == "NO", shown_columns].sample(n=10)

Unnamed: 0,FullName,DisplayName,RetrievedNames,MATCH
10062,Judith Slot,mw. Judith Slot,[],NO
13511,Aart Blaak,1e Luitenant KNIL Aart van de Blaak,[],NO
11788,,Mevr. Treur,[],NO
10350,Klaas Vuure,Landstorm Sergeant 2e klas KNIL Klaas van Vuure,[],NO
8794,Jan Johannes Ambrosius Neut,Jan Johannes Ambrosius van der Neut,[],NO
13959,Mahmud Abdul Jalil Rahmatsyah,Sultan Mahmud Abdul Jalil Rahmatsyah,[],NO
10126,W.J.K. Baaij,Generaal-majoor der Generale Staf W.J.K. Baaij,[],NO
13163,Steven Barneveld,Huzaar 2e klas Steven van Barneveld,[],NO
11844,C.J. Schalks,C.J. Schalks,[],NO
692,,Vice-admiraal Rijk,[],NO


# Surname Matching

In [8]:
from matchsurname.match_surname import matchLastName

In [9]:
# Run the surname matcher over both tables, then count the rows whose
# "MATCH" column is flagged "YES".
result_surname = matchLastName(df1, df2)
is_surname_hit = result_surname["MATCH"] == "YES"
surname_match = len(result_surname[is_surname_hit])
print(f"The number of positive match is {surname_match}")

15382it [35:54,  7.14it/s]

The number of positive match is 0





In [10]:
# Persist the surname-match result so it can be reloaded later without
# re-running the matcher.
with open("results/bronbeekToNmvwSurnameMatchResults.pkl", "wb") as out_file:
    pickle.dump(result_surname, out_file, pickle.HIGHEST_PROTOCOL)

In [11]:
# Reload the saved surname-match result and show ten random rows that were
# flagged "YES", restricted to the columns relevant for inspection.
df = pandas.read_pickle("results/bronbeekToNmvwSurnameMatchResults.pkl")
shown_columns = ['LastName', 'DisplayName', 'RetrievedNames', 'MATCH']
df.loc[df["MATCH"] == "YES", shown_columns].sample(n=10)

Unnamed: 0,LastName,DisplayName,RetrievedNames,MATCH
10186,Broek,1e Luitenant vlieger ML-KNIL H. van d. Broek,"[Ch.L.J. Palmer van den Broek, W.A. Pertri-van...",YES
15098,Braspot,J.C.A. (Jelle) Braspot,"[Braspot, Willem Adriaan Braspot]",YES
3547,Olland,W.J.Olland & Zoon Java,"[A S C Maier - Olland, H.R. Olland, Willem Jan...",YES
14598,Mulder,G.A. Mulder,"[Mw. F.C. Mulder, Geert-Jan Mulder, F. Mulder,...",YES
10967,Geus,A. de Geus,[Ir. J.G. de Geus],YES
1648,Noten,de Noten,"[F.L. van Noten, F. van Noten]",YES
9274,Visman,Dhr. Sierik Visman,[Dr. Visman],YES
14696,Koumans,J.C. Koumans,[Dr. F.P. Koumans],YES
14234,Drost,Dominee A. Drost ON4,"[L. Drost, D. Drost, Luitenant K. Drost, Drost...",YES
13156,Joosten,Pastoor Joosten,"[J.M. Joosten, Ben Joosten, K.G.R. Joosten, Be...",YES


In [12]:
# Number of rows in the currently loaded result frame flagged "YES".
matched_rows = df[df["MATCH"] == "YES"]
len(matched_rows)

6495

# Abbreviation Matching

In [13]:
from matchwithabbreviation.match_with_abbreviation import match_with_abbreviation

In [14]:
# Run the abbreviation matcher over both tables. Unlike the cells that filter
# on a "MATCH" column, every returned row is counted as a positive match here.
result_abbreviation = match_with_abbreviation(df1, df2)
abbreviationlen_match = result_abbreviation.shape[0]
print(f"The number of positive match is {abbreviationlen_match}")

The number of positive match is 1177


In [15]:
# Persist the abbreviation-match result so it can be reloaded later without
# re-running the matcher.
with open("results/bronbeekToNmvwAbbreviationMatchResults.pkl", "wb") as out_file:
    pickle.dump(result_abbreviation, out_file, pickle.HIGHEST_PROTOCOL)

In [16]:
# Reload the saved abbreviation matches and show ten random rows, restricted
# to the columns relevant for inspection. The sample is taken from the
# reloaded frame rather than the in-memory `result_abbreviation`, so this cell
# still runs after a kernel restart without re-running the matcher.
df = pandas.read_pickle("results/bronbeekToNmvwAbbreviationMatchResults.pkl")
df[['nmvw_uri', 'name_label', 'Abbreviations', 'FirstName', 'LastName', 'DisplayName']].sample(n=10)

Unnamed: 0,nmvw_uri,name_label,Abbreviations,FirstName,LastName,DisplayName
237,https://hdl.handle.net/20.500.11840/pi6890,J. Jansen,J. Jansen,J.,Jansen,J. Jansen
503,https://hdl.handle.net/20.500.11840/pi21474,J.C. Meyer,J. Meyer,J.G.K.,Meyer,J.G.K. Meyer
361,https://hdl.handle.net/20.500.11840/pi2125,J.M. Koster,J. Koster,J.W.,Koster,J.W. Koster
1001,https://hdl.handle.net/20.500.11840/pi33419,W. Wagner,W. Wagner,W.A.,Wagner,W.A. Wagter
586,https://hdl.handle.net/20.500.11840/pi63497,E.A. Nolle,E. Nolle,E.A.,Nolle,Mw. E.A. Nolle
369,https://hdl.handle.net/20.500.11840/pi50759,F.W. Kramer,F. Kramer,F.,Kramer,F. Kramer
259,https://hdl.handle.net/20.500.11840/pi64174,J. Jansen,J. Jansen,Johannes,Jansen,"Johannes Jansen, Korporaal der mariniers"
13,https://hdl.handle.net/20.500.11840/pi62641,Emile Deletaille,E. Deletaille,Emile,Deletaille,Emile Deletaille
614,https://hdl.handle.net/20.500.11840/pi30800,J.R.T.M. Peters,J. Peters,J.P.H.,Peters,J.P.H. Peters
959,https://hdl.handle.net/20.500.11840/pi80533,H. Voorrips,H. Voorrips,H.,Voorrips,H. Voorrips


# Fuzzy String Match

In [9]:
from matchfuzzystring.match_fuzzy_string import match_fuzzy_string

In [10]:
# Run the fuzzy-string matcher over both tables, then count the rows whose
# "MATCH" column is flagged "YES".
# NOTE(review): max_score=75 is presumably the similarity cut-off applied by
# the matcher — confirm against match_fuzzy_string.
result_fuzzymatch = match_fuzzy_string(df1, df2, max_score=75)
is_fuzzy_hit = result_fuzzymatch["MATCH"] == "YES"
fuzzy_match = len(result_fuzzymatch[is_fuzzy_hit])
print(f"The number of positive match is {fuzzy_match}")

15382it [29:26:59,  6.89s/it] 

                                    DisplayName  \
0                         Drs. P.J.C. Verhoeven   
1                             Koning Willem III   
2                             Dhr. A.C. Vongers   
3                     Arnhemse Burgervereniging   
4                                    Hornberger   
5  Minister van Marine W.F. van Erp Taalman Kip   
6                            Dhr. H. Achterberg   
7               Mevrouw C.W.M. Ruempol-Diderich   
8                               Dhr. H.J. Londo   
9                            Dhr. H. van Joolen   

                                      RetrievedNames MATCH  
0  [(Verhoeven, 100), (Verhoeven, 100), (Verhoeve...   YES  
1  [(Koning, 100), (Koning, 100), (Koning Willem ...   YES  
2   [(Dhr. Wong, 88), (S. GÃ¼ner, 80), (H. Vos, 80)]   YES  
3    [(Burger, 100), (A. Burger, 88), (Burgers, 86)]   YES  
4  [(H. (Heidi) Hornberger, 100), (A. Kornberger,...   YES  
5   [(A. Maine, 86), (T. Marin, 86), (E. Minns, 86)]   YES  
6  [(Achter




In [11]:
# Persist the fuzzy-match result so it can be reloaded later without
# re-running the matcher.
with open("results/bronbeekToNmvwFuzzyStringMatchResults.pkl", "wb") as out_file:
    pickle.dump(result_fuzzymatch, out_file, pickle.HIGHEST_PROTOCOL)

In [13]:
# Reload the saved fuzzy-match result and show ten random rows that were
# flagged "YES", restricted to the columns relevant for inspection.
df = pandas.read_pickle("results/bronbeekToNmvwFuzzyStringMatchResults.pkl")
shown_columns = ['FullName', 'RetrievedNames', 'MATCH']
df.loc[df["MATCH"] == "YES", shown_columns].sample(n=10)

Unnamed: 0,FullName,RetrievedNames,MATCH
2418,A.J. Kempees,"[(Mej. J.E. Kempees, 91), (J. Kemper, 88), (Ke...",YES
13180,M.J. Rijken-Valewink,"[(Alewijn, 86), (De Val, 83), (De Val, 83)]",YES
11462,Gwendolyn Hollander,"[(Holland, 100), (Holland, 100), (M. Hollander...",YES
3774,J.A.J. Bongers,"[(J. Kol, 100), (A. BÃ¢, 100), (J.J. Bogers, 90)]",YES
3998,H. Vink,"[(A.T. Vink, 88), (H. List, 83), (Kluit, 80)]",YES
9469,Anna Meijgaard-Hijmering,"[(...ei, 100), (A. Heim, 83), (M. King, 83)]",YES
262,Andries Cornelis Dirk Graeff,"[(S. Cornelis, 90), (Wandres, 86), (C.D. Corne...",YES
12335,Hendrik Johannes Wilhelm Leen,"[(Johan, 100), (Wilhelmy, 93), (C. Johannes, 90)]",YES
10141,Robert Frederik Birkenholz,"[((ed.), 100), ((red.), 100), (Robert, S., 100)]",YES
11338,J. Draaiier,"[(Majoor, 100), (Draaijer, 88), (Bergen, 83)]",YES
