In [1]:
import numpy as np
import pandas as pd
import pandas as pd
import re
import nltk
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [2]:
# Load the canonical ("base") names; each row pairs a Base_Name_ID with a Base_Name.
base = pd.read_csv('base_names.csv')
base.head()

Unnamed: 0,Base_Name_ID,Base_Name
0,1,John Smith
1,2,Jennifer Brown
2,3,Michael O'Connor
3,4,Maria Garcia
4,5,Robert Lee


In [3]:
# (rows, columns) of the base-name table.
base.shape

(20, 2)

In [4]:
# Load the labelled name variations: each Variation is paired with the base
# name it is expected to match (Matches_With_Base_Name).
variation = pd.read_csv('name_variations.csv')
variation.head()

Unnamed: 0,Variation,Matches_With_Base_Name
0,Thomas King,Thomas King
1,ThomasKing,Thomas King
2,Maria Garcia,Maria Garcia
3,MaryLewis,Mary Lewis
4,Nancy W.,Nancy Wright


In [5]:
# (rows, columns) of the variations table.
variation.shape

(100, 2)

In [6]:
def preprocess_text(text, stop_words=None):
    """Normalise a name string so that it can be compared fuzzily.

    The text is lowercased, every run of non-alphanumeric characters
    (punctuation, whitespace and underscores) is collapsed to a single space,
    and tokens found in ``stop_words`` are dropped. The surviving tokens are
    joined with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (what pandas yields for a missing
        cell) are treated as empty input and produce ``''``.
    stop_words : iterable of str, optional
        Lowercase tokens to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached on the function, which
        requires the NLTK ``stopwords`` corpus to be installed. Pass an empty
        collection to keep every token.

    Returns
    -------
    str
        The normalised text.

    Notes
    -----
    With the default stop-word list, single-letter tokens such as name
    initials can be removed: in this notebook "Michael O'Connor" becomes
    "michael connor" and "David M." becomes "david". Pass ``stop_words=()``
    if initials must be preserved.
    """
    # Missing values (None / NaN) have no text to normalise.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        # Building the stop-word set is loop-invariant, so do it only once.
        cached = getattr(preprocess_text, '_default_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            preprocess_text._default_stop_words = cached
        stop_words = cached
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    # `\W` alone leaves underscores in place, so match them explicitly.
    text = re.sub(r'[\W_]+', ' ', text.lower())
    return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Attach each variation's ground-truth Base_Name_ID by looking up its labelled
# base name in `base`. validate='many_to_one' makes pandas raise a MergeError
# if a Base_Name occurs more than once in `base`, instead of silently
# duplicating variation rows.
variation = variation.merge(
    base[['Base_Name', 'Base_Name_ID']],
    left_on='Matches_With_Base_Name',
    right_on='Base_Name',
    how='left',
    validate='many_to_one',
)

In [8]:
# Give the merged ID column its ground-truth name and drop the redundant join
# key. Reassigning (instead of inplace=True) together with errors='ignore'
# means re-running this cell on its own no longer raises a KeyError for the
# already-removed 'Base_Name' column.
variation = (
    variation
    .rename(columns={'Base_Name_ID': 'Matches_With_Base_Name_ID'})
    .drop(columns='Base_Name', errors='ignore')
)
variation

Unnamed: 0,Variation,Matches_With_Base_Name,Matches_With_Base_Name_ID
0,Thomas King,Thomas King,15
1,ThomasKing,Thomas King,15
2,Maria Garcia,Maria Garcia,4
3,MaryLewis,Mary Lewis,12
4,Nancy W.,Nancy Wright,16
...,...,...,...
95,Jennifer- Brown,Jennifer Brown,2
96,Daniel- Scott,Daniel Scott,17
97,David M.,David Martinez,9
98,Paul Allen.,Paul Allen,13


In [9]:
# Normalise every name column with preprocess_text so that the base names, the
# variations and the ground-truth labels share one canonical form.
# The raw values in these columns are overwritten.
for frame, column in (
    (base, 'Base_Name'),
    (variation, 'Variation'),
    (variation, 'Matches_With_Base_Name'),
):
    frame[column] = frame[column].apply(preprocess_text)

In [10]:
# Inspect the base names after normalisation.
base

Unnamed: 0,Base_Name_ID,Base_Name
0,1,john smith
1,2,jennifer brown
2,3,michael connor
3,4,maria garcia
4,5,robert lee
5,6,linda johnson
6,7,william davis
7,8,elizabeth wilson
8,9,david martinez
9,10,susan clark


In [11]:
# Inspect the variations and their labels after normalisation.
variation

Unnamed: 0,Variation,Matches_With_Base_Name,Matches_With_Base_Name_ID
0,thomas king,thomas king,15
1,thomasking,thomas king,15
2,maria garcia,maria garcia,4
3,marylewis,mary lewis,12
4,nancy w,nancy wright,16
...,...,...,...
95,jennifer brown,jennifer brown,2
96,daniel scott,daniel scott,17
97,david,david martinez,9
98,paul allen,paul allen,13


In [12]:
def fuzzy_match(variation, base, base_name_id, threshold=80, scorer=None):
    """Find the base name that best matches ``variation``.

    Every candidate in ``base`` is scored against ``variation``; the
    highest-scoring candidate whose score is at least ``threshold`` wins.
    On ties the earliest candidate is kept.

    Parameters
    ----------
    variation : str
        The name to look up.
    base : iterable of str
        Candidate base names.
    base_name_id : iterable
        Identifiers for the candidates, in the same order as ``base``.
        Candidates and identifiers are paired by position, so a pandas
        Series with a non-default index works correctly.
    threshold : int, default 80
        Minimum score a candidate must reach to count as a match.
    scorer : callable, optional
        ``scorer(variation, candidate) -> number``. Defaults to
        ``fuzz.token_set_ratio``. That scorer compares whitespace-delimited
        tokens, so run-together variations (e.g. "thomasking") can score
        below the threshold; a character-level scorer such as ``fuzz.ratio``
        can be supplied instead.

    Returns
    -------
    tuple
        ``(best_match_id, best_score)``. When no candidate reaches the
        threshold the result is ``(None, 0)``.

    Raises
    ------
    ValueError
        If ``base`` and ``base_name_id`` have different lengths.
    """
    if scorer is None:
        scorer = fuzz.token_set_ratio

    candidates = list(base)
    candidate_ids = list(base_name_id)
    if len(candidates) != len(candidate_ids):
        raise ValueError(
            "base and base_name_id must have the same length "
            f"({len(candidates)} != {len(candidate_ids)})"
        )

    best_match_id = None
    best_score = 0
    # Pair names and IDs positionally rather than indexing base_name_id with
    # the enumerate counter, which would be a label lookup on a pandas Series.
    for candidate, candidate_id in zip(candidates, candidate_ids):
        score = scorer(variation, candidate)
        if score > best_score and score >= threshold:
            best_score = score
            best_match_id = candidate_id
    return best_match_id, best_score

In [13]:
# Re-display the normalised base names for reference before matching.
base

Unnamed: 0,Base_Name_ID,Base_Name
0,1,john smith
1,2,jennifer brown
2,3,michael connor
3,4,maria garcia
4,5,robert lee
5,6,linda johnson
6,7,william davis
7,8,elizabeth wilson
8,9,david martinez
9,10,susan clark


In [16]:
# Score every variation against all base names (threshold 80) and collect one
# record per variation: the variation text, the matched ID and the score.
scored = (
    (name, fuzzy_match(name, base['Base_Name'], base['Base_Name_ID'], threshold=80))
    for name in variation['Variation']
)
fuzzy_results = [
    {'Variation': name, 'Matches_With_Base_Name_ID': match_id, 'Score': score}
    for name, (match_id, score) in scored
]

In [17]:
# Build one DataFrame from the collected match records.
# Variations with no match carry a missing Matches_With_Base_Name_ID, which is
# why that column is displayed as float (e.g. 15.0) with blanks.
fuzzy_df = pd.DataFrame(fuzzy_results)
fuzzy_df

Unnamed: 0,Variation,Matches_With_Base_Name_ID,Score
0,thomas king,15.0,100
1,thomasking,,0
2,maria garcia,4.0,100
3,marylewis,,0
4,nancy w,16.0,83
...,...,...,...
95,jennifer brown,2.0,100
96,daniel scott,17.0,100
97,david,9.0,100
98,paul allen,13.0,100


In [18]:
# (rows, columns) of the match results; one row per variation.
fuzzy_df.shape

(100, 3)

In [22]:
# Accuracy = percentage of variations whose predicted base-name ID equals the
# labelled one. A missing prediction never compares equal, so unmatched
# variations count as errors.
id_column = 'Matches_With_Base_Name_ID'
expected_ids = variation[id_column]
predicted_ids = fuzzy_df[id_column]

correct = sum(
    expected == predicted
    for expected, predicted in zip(expected_ids, predicted_ids)
)

accuracy = (correct / variation.shape[0]) * 100
print("Accuracy: ",accuracy)

Accuracy:  94.0
