In [2]:
import pandas as pd
import numpy as np
from thefuzz import fuzz, process
import random
from tqdm import tqdm

In [3]:

# Function to generate name variations
def create_name_variation(name):
    variations = []
    # Original name
    first, last = name.split(' ')
    
    # Common variations
    variations.extend([
        f"{first}{last}",  # No space
        f"{first.lower()} {last}",  # Lower first name
        f"{first} {last.lower()}",  # Lower last name
        first[0] + ". " + last,  # Initial for first name
        first.replace('a', 'e'),  # Common vowel swap
        last.replace('o', 'ou'),  # Common addition
        first[:-1] + " " + last,  # Missing last letter in first name
        first + " " + last + "n",  # Extra n at end
        first.replace('ch', 'k'),  # Phonetic variation
        first + " " + last.replace('s', 'z')  # s/z swap
    ])
    
    return variations

# Generate large original dataset
def generate_names(n):
    first_names = [
        "James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph",
        "Thomas", "Charles", "Christopher", "Daniel", "Matthew", "Anthony", "Donald",
        "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Barbara", "Susan",
        "Jessica", "Sarah", "Karen", "Nancy", "Lisa", "Margaret", "Sandra", "Ashley"
    ]
    
    last_names = [
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis",
        "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson",
        "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Thompson", "White",
        "Harris", "Clark", "Lewis", "Robinson", "Walker", "Hall", "Young"
    ]
    
    names = []
    for _ in range(n):
        first = random.choice(first_names)
        last = random.choice(last_names)
        names.append(f"{first} {last}")
    
    return list(set(names))  # Remove duplicates

# Function to compare names using different fuzzy matching methods
def compare_names(name1, name2):
    ratio = fuzz.ratio(name1, name2)
    partial_ratio = fuzz.partial_ratio(name1, name2)
    token_sort_ratio = fuzz.token_sort_ratio(name1, name2)
    token_set_ratio = fuzz.token_set_ratio(name1, name2)
    
    return {
        'ratio': ratio,
        'partial_ratio': partial_ratio,
        'token_sort_ratio': token_sort_ratio,
        'token_set_ratio': token_set_ratio
    }

# Generate datasets
print("Generating datasets...")
original_names = generate_names(1000)
variant_names = []

# Create variations
print("Creating variations...")
for name in tqdm(original_names):
    variations = create_name_variation(name)
    variant_names.extend(variations)

# Add some completely different names to variant dataset
extra_names = generate_names(200)
variant_names.extend(extra_names)

# Convert to DataFrames
df_original = pd.DataFrame(original_names, columns=['name'])
df_variants = pd.DataFrame(variant_names, columns=['name'])


Generating datasets...
Creating variations...


100%|██████████| 622/622 [00:00<00:00, 617188.81it/s]


In [4]:

# Perform fuzzy matching
def perform_fuzzy_matching(df_original, df_variants, threshold=80):
    matches = []
    print("Performing fuzzy matching...")
    
    for original_name in tqdm(df_original['name']):
        # Get best matches using token_set_ratio (often best for names)
        best_matches = process.extractBests(
            original_name, 
            df_variants['name'].tolist(),
            scorer=fuzz.token_set_ratio,
            score_cutoff=threshold,
            limit=3
        )
        
        for match, score in best_matches:
            # Get detailed comparison scores
            detailed_scores = compare_names(original_name, match)
            matches.append({
                'original_name': original_name,
                'matched_name': match,
                'token_set_score': score,
                'ratio_score': detailed_scores['ratio'],
                'partial_ratio_score': detailed_scores['partial_ratio'],
                'token_sort_score': detailed_scores['token_sort_ratio']
            })
    
    return pd.DataFrame(matches)

# Perform the matching
results = perform_fuzzy_matching(df_original, df_variants)

# Analysis and visualization
print("\nMatching Results:")
print(f"Total original names: {len(df_original)}")
print(f"Total variant names: {len(df_variants)}")
print(f"Total matches found: {len(results)}")

# Display some example matches
print("\nExample matches (sorted by token_set_score):")
print(results.sort_values('token_set_score', ascending=False).head(10))

# Score distribution analysis
print("\nScore distribution:")
print(results[['token_set_score', 'ratio_score', 
              'partial_ratio_score', 'token_sort_score']].describe())

Performing fuzzy matching...


100%|██████████| 622/622 [00:08<00:00, 76.85it/s]


Matching Results:
Total original names: 622
Total variant names: 6404
Total matches found: 1866

Example matches (sorted by token_set_score):
         original_name      matched_name  token_set_score  ratio_score  \
1865        Sandra Lee            Sandra              100           75   
0         Daniel Jones      daniel Jones              100           92   
1         Daniel Jones      Daniel jones              100           92   
2         Daniel Jones            Daniel              100           67   
3     Jessica Martinez  jessica Martinez              100           94   
4     Jessica Martinez  Jessica martinez              100           94   
5     Jessica Martinez          Martinez              100           67   
6             John Lee          john Lee              100           88   
7             John Lee          John lee              100           88   
8             John Lee              John              100           67   

      partial_ratio_score  token_sort_scor




In [5]:
#https://github.com/Sekinal/fuzzyclass/tree/master
# https://github.com/Sekinal/fuzzyclass/issues/1
# average the 3 scores and return

In [6]:
print(results.sort_values('token_set_score', ascending=False).head(10))

         original_name      matched_name  token_set_score  ratio_score  \
1865        Sandra Lee            Sandra              100           75   
0         Daniel Jones      daniel Jones              100           92   
1         Daniel Jones      Daniel jones              100           92   
2         Daniel Jones            Daniel              100           67   
3     Jessica Martinez  jessica Martinez              100           94   
4     Jessica Martinez  Jessica martinez              100           94   
5     Jessica Martinez          Martinez              100           67   
6             John Lee          john Lee              100           88   
7             John Lee          John lee              100           88   
8             John Lee              John              100           67   

      partial_ratio_score  token_sort_score  
1865                  100                75  
0                      96               100  
1                      92               100  
2

In [7]:
# add new column for average

results['avg_score'] = (results['token_set_score'] + results['ratio_score'] + results['partial_ratio_score'] + results['token_sort_score']) / 4

In [None]:
best_matches = results.loc[results.groupby('original_name')['avg_score'].idxmax()]

print(best_matches)



         original_name matched_name  token_set_score  ratio_score  \
906   Anthony Anderson      Anthony              100           61   
1824     Anthony Brown      Anthony              100           70   
519      Anthony Clark      Anthony              100           70   
97       Anthony Davis      Anthony              100           70   
224     Anthony Harris      Anthony              100           67   
...                ...          ...              ...          ...   
235     William Walker      William              100           67   
952      William White      William              100           70   
1165  William Williams     Williams              100           67   
1677    William Wilson      William              100           67   
1797     William Young      William              100           70   

      partial_ratio_score  token_sort_score  avg_score  
906                   100                61       80.5  
1824                  100                70       85.0  


In [9]:
print(best_matches.sort_values('avg_score', ascending=False).head(700))

       original_name    matched_name  token_set_score  ratio_score  \
1793   Richard Young   Richard Young              100          100   
374   Richard Wilson  Richard Wilson              100          100   
1805   Richard Brown   Richard Brown              100          100   
1091   Richard Lopez   Richard Lopez              100          100   
917    Michael Brown   Michael Brown              100          100   
...              ...             ...              ...          ...   
1128   Lisa Anderson            Lisa              100           47   
309       Robert Lee             Lee              100           46   
984   Lisa Rodriguez            Lisa              100           44   
1491  Mary Rodriguez            Mary              100           44   
657      Michael Lee             Lee              100           43   

      partial_ratio_score  token_sort_score  avg_score  
1793                  100               100      100.0  
374                   100               100  