In [1]:
import re
import unicodedata

import pandas as pd
from rapidfuzz import fuzz, process
from sklearn.metrics import accuracy_score

In [2]:
# Read the two input tables in one pass: the base names first, then the
# name variations that will be matched against them.
base, vari = (
    pd.read_csv(csv_path)
    for csv_path in ("base_names.csv", "name_variations.csv")
)

In [3]:
def normalize(n):
    """Return a canonical, comparison-friendly form of a name.

    Steps, in order:
      1. Missing values (``None`` / ``NaN``) become the empty string.
      2. The text is stripped and lower-cased.
      3. Accented letters are folded to their unaccented base letter
         (``"josé"`` -> ``"jose"``) so they survive the ASCII filter below.
      4. ``"last, first"`` is reordered to ``"first last"``; only the first
         comma is treated as the separator.
      5. Every character other than ``a-z``, ``0-9`` or whitespace is
         replaced by a space.
      6. Runs of whitespace collapse to a single space and the ends are trimmed.

    Parameters
    ----------
    n : object
        The raw name; anything that is not missing is converted with ``str()``.

    Returns
    -------
    str
        The normalized name, or ``""`` for a missing value.
    """
    if pd.isna(n):
        return ""
    n = str(n).strip().lower()
    # Decompose each character (NFKD) and drop the combining marks. Without this
    # the ASCII-only filter below would replace "é" with a space and split the
    # name in the middle ("garcía" -> "garc a").
    n = "".join(
        ch for ch in unicodedata.normalize("NFKD", n) if not unicodedata.combining(ch)
    )
    if "," in n:
        # partition() splits on the first comma only, like split(",", 1)
        last, _, first = n.partition(",")
        n = first.strip() + " " + last.strip()
    n = re.sub(r"[^a-z0-9\s]", " ", n)
    n = re.sub(r"\s+", " ", n).strip()
    return n

# Store the normalized form of each name in a "norm" column on both tables;
# the original display columns are left untouched.
for frame, source_col in ((base, "Base_Name"), (vari, "Variation")):
    frame["norm"] = frame[source_col].apply(normalize)

In [4]:
# Fuzzy-match every variation against the base names and keep the top three
# candidates. Each entry of `rows` is
# [variation, best base name, best score, top-3 names, top-3 scores].

# Both lookups are loop-invariant, so build them once instead of per variation.
# process.extract() reports each hit as (choice, score, index), where the index
# is the position in `choices`, which is also the position in `base_names`.
choices = base["norm"].tolist()
base_names = base["Base_Name"].tolist()

rows = []
for v_show, v_norm in zip(vari["Variation"], vari["norm"]):
    # A blank query (missing or punctuation-only variation) scores 0 against
    # every choice with token_set_ratio, so extract() would still hand back
    # three arbitrary base names. Report such rows as unmatched instead.
    if v_norm:
        candidates = process.extract(
            v_norm,
            choices,
            scorer=fuzz.token_set_ratio,
            limit=3,
        )
    else:
        candidates = []

    if candidates:
        # extract() returns candidates sorted best-first
        _, best_score, best_idx = candidates[0]
        best_name = base_names[best_idx]
    else:
        best_name, best_score = "No Match", 0

    top_names = [base_names[idx] for _, _, idx in candidates]
    top_scores = [score for _, score, _ in candidates]

    rows.append([v_show, best_name, best_score, top_names, top_scores])

In [5]:
# Turn the collected match results into a table, one row per variation,
# and preview the first ten rows.
result_columns = ["Variation", "Best_Match", "Best_Score", "Top3_Names", "Top3_Scores"]
out = pd.DataFrame(data=rows, columns=result_columns)

print(out.iloc[:10])

       Variation     Best_Match  Best_Score  \
0   Thomas  King    Thomas King  100.000000   
1     ThomasKing    Thomas King   57.142857   
2   Maria Garcia   Maria Garcia  100.000000   
3      MaryLewis     Mary Lewis   52.631579   
4       Nancy W.   Nancy Wright   83.333333   
5   Dani3l Scott   Daniel Scott   91.666667   
6    JOHN  smith     John Smith  100.000000   
7  linda johnson  Linda Johnson  100.000000   
8   N@ncy Wright   Nancy Wright   91.666667   
9  William Davis  William Davis  100.000000   

                                       Top3_Names  \
0           [Thomas King, John Smith, Mary Lewis]   
1    [Thomas King, Linda Johnson, David Martinez]   
2      [Maria Garcia, David Martinez, Mary Lewis]   
3    [Mary Lewis, Elizabeth Wilson, Maria Garcia]   
4    [Nancy Wright, Paul Allen, Michael O'Connor]   
5   [Daniel Scott, Jessica Adams, David Martinez]   
6   [John Smith, Linda Johnson, Michael O'Connor]   
7        [Linda Johnson, John Smith, Sandra Hill]   
8    

In [6]:
# Score the best matches against the ground-truth column, when the input has one.
if "Matches_With_Base_Name" in vari.columns:
    # `out` is built with exactly one row per row of `vari`, in the same order,
    # so predictions and labels are lined up by position. Joining on
    # "Variation" instead pairs every duplicated spelling with every other copy
    # of itself, multiplying those rows and skewing the accuracy.
    assert len(out) == len(vari), "expected one prediction per variation"
    eval_df = pd.DataFrame({
        "Variation": vari["Variation"].to_numpy(),
        "Matches_With_Base_Name": vari["Matches_With_Base_Name"].to_numpy(),
        "Best_Match": out["Best_Match"].to_numpy(),
    })
    # A missing label is compared as the literal "No Match". Only the two label
    # columns are filled, so a missing Variation is not overwritten.
    label_cols = ["Matches_With_Base_Name", "Best_Match"]
    eval_df[label_cols] = eval_df[label_cols].fillna("No Match")

    acc = accuracy_score(eval_df["Matches_With_Base_Name"], eval_df["Best_Match"])
    print("Accuracy:", round(acc, 4))

Accuracy: 1.0
