# fuzzywuzzy: a comparison of fuzzy string matching methods on people's names

In [3]:
# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz
import pandas as pd

import sys
# Put the parent directory on the import path so the `heritageconnector` import
# below resolves (presumably the notebook sits one level below the repository
# root -- confirm).
sys.path.append('..')
from heritageconnector.utils import data_loaders

# Setting this option to None switches off pandas' SettingWithCopyWarning checks
# for the whole session.
pd.options.mode.chained_assignment = None

In [26]:
loader = data_loaders.local_loader()
people_df = loader.load_mimsy_people_data()

# Name-part columns whose missing values are replaced with empty strings.
name_columns = ['TITLE_NAME', 'FIRSTMID_NAME', 'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX']
# Columns that are additionally cast to str so they can be concatenated safely.
text_columns = ['FIRSTMID_NAME', 'LASTSUFF_NAME']

# Keep only rows whose GENDER is 'M' or 'F'. The explicit copy makes `people`
# an independent frame, so the column assignments below never write through to
# (or warn about) `people_df`.
people = people_df[people_df['GENDER'].isin(('M', 'F'))].copy()
people[name_columns] = people[name_columns].fillna("")
people[text_columns] = people[text_columns].astype(str)
people.head(4)

Unnamed: 0,LINK_ID,PREFERRED_NAME,TITLE_NAME,FIRSTMID_NAME,LASTSUFF_NAME,SUFFIX_NAME,HONORARY_SUFFIX,GENDER,BRIEF_BIO,DESCRIPTION,...,NATIONALITY,OCCUPATION,WEBSITE,AFFILIATION,LINGUISTIC_GROUP,TYPE,REFERENCE_NUMBER,SOURCE,CREATE_DATE,UPDATE_DATE
1,10245,"Zenthon, Edward Rupert",,Edward Rupert,Zenthon,,,M,Y,REF: http://www.iwm.org.uk/collections/item/ob...,...,British,engineer,,,,,,N,28-JAN-98,05-AUG-15
2,10269,"Troughton, John",,John,Troughton,,,M,Y,"1739 - Born in Corney, Cumbria, England; Appre...",...,English; British,mathematical instrument maker,,,,,,N,28-JAN-98,06-NOV-18
3,1027,"Link, O Winston",,O Winston,Link,,,M,Y,,...,American,photographer,,,,,,N,08-JUN-96,07-NOV-19
4,1030,"Walton, Stanley V",,Stanley V,Walton,,,M,N,,...,British,railway photographer,,,,,,N,08-JUN-96,06-NOV-18


In [39]:
def compare_fuzz(row):
    """Print four fuzzywuzzy similarity scores for one record's two name forms.

    The record's ``PREFERRED_NAME`` is compared with a name assembled by
    joining ``TITLE_NAME``, ``FIRSTMID_NAME`` and ``LASTSUFF_NAME`` with single
    spaces. Empty parts are not dropped, so an empty title leaves a leading
    space in the assembled name.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the four
            columns named above as strings.

    Returns:
        None. The header line and the four scores are written to stdout.
    """
    preferred = row['PREFERRED_NAME']
    name_parts = (row['TITLE_NAME'], row['FIRSTMID_NAME'], row['LASTSUFF_NAME'])
    assembled = " ".join(name_parts)

    print(preferred + " VS " + assembled)

    # Each entry is (scorer name on `fuzz`, text printed after the score).
    # The trailing space on the two token-based lines is part of the output.
    metrics = (
        ("ratio", ""),
        ("partial_ratio", ""),
        ("token_sort_ratio", " "),
        ("token_set_ratio", " "),
    )
    for metric, tail in metrics:
        score = getattr(fuzz, metric)(preferred, assembled)
        print(f"  {metric}: {score}{tail}")
    

In [40]:
# Draw 50 random records (a different set on every run, as no seed is given)
# and print the score comparison for each one.
sampled_people = people.sample(50)
for _label, person in sampled_people.iterrows():
    compare_fuzz(person)

Poulsen, Arnold VS  Arnold Poulsen
  ratio: 47
  partial_ratio: 64
  token_sort_ratio: 100 
  token_set_ratio: 100 
Roberts, David VS  David Roberts
  ratio: 50
  partial_ratio: 67
  token_sort_ratio: 100 
  token_set_ratio: 100 
Spooner, Paul VS  Paul Spooner
  ratio: 54
  partial_ratio: 70
  token_sort_ratio: 100 
  token_set_ratio: 100 
Marcet, Francois VS   Marcet, Francois
  ratio: 94
  partial_ratio: 100
  token_sort_ratio: 100 
  token_set_ratio: 100 
Syntax, Doctor VS  Doctor Syntax
  ratio: 50
  partial_ratio: 50
  token_sort_ratio: 100 
  token_set_ratio: 100 
Oliver, George VS  George Oliver
  ratio: 50
  partial_ratio: 50
  token_sort_ratio: 100 
  token_set_ratio: 100 
Knight, Geoffrey VS  Geoffrey Knight
  ratio: 56
  partial_ratio: 56
  token_sort_ratio: 100 
  token_set_ratio: 100 
Aslett, Alfred VS  Alfred Aslett
  ratio: 50
  partial_ratio: 50
  token_sort_ratio: 100 
  token_set_ratio: 100 
Marsh, Fred VS  Fred Marsh
  ratio: 45
  partial_ratio: 62
  token_sort_ratio

### Using `fuzzy_match` in heritageconnector

In [4]:
from heritageconnector.nlp.string_pairs import fuzzy_match 

In [5]:
# Try a "Last, First" vs "First Last" pair with a threshold of 90.
# NOTE(review): the scorer used by `fuzzy_match` is not visible here; presumably
# it is token-based, since plain `ratio` scored this pair well below 90 -- confirm.
fuzzy_match("Poulsen, Arnold", "Arnold Poulsen", threshold=90)

True