In [64]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import ratio as levenshtein_ratio
import hdbscan
from tqdm import tqdm
import random


In [69]:
file_path = r"d:/Users/sinshi/Documents/Zothers/IndividualCustomer.xlsx"
df = pd.read_excel(file_path, usecols=['NameList', 'DateLst', 'IDLst'])
df.dropna(subset=['NameList'], inplace=True)
df['group_id'] = df.index

# Save original full alias list before exploding
df['NameList_original'] = df['NameList']

# Expand aliases and clean individual names
df = df.assign(alias_name=df['NameList'].str.split(',')).explode('alias_name')
df['clean_name'] = df['alias_name'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

df.drop_duplicates(subset=['group_id', 'clean_name'], inplace=True)
df = df.reset_index(drop=True)

# Restore NameList column
df.rename(columns={'NameList_original': 'NameList'}, inplace=True)
csv_file_path = "D:/project-folder/model/metadata.csv"
df.to_csv(csv_file_path, index=False)
#df.to_csv('D:/project-folder/model', index=False)
#df.head()



In [74]:
model = SentenceTransformer('all-MiniLM-L6-v2')

batch_size = 64
embeddings = []

for i in tqdm(range(0, len(df), batch_size)):
    batch = df['clean_name'].iloc[i:i+batch_size].tolist()
    emb = model.encode(batch, show_progress_bar=False)
    embeddings.append(emb)

embeddings = np.vstack(embeddings).astype('float32')
filepath = "D:/project-folder/model/name_embeddings.npy"
np.save(filepath, embeddings)
filepath_2 = "D:/project-folder/model/sentence_model"
model.save(filepath_2)
#index = faiss.IndexFlatL2(embeddings.shape[1])
#index.add(embeddings)
#filepath_2
#faiss.write_index(index, "D:/project-folder/model/faiss.index")

100%|████████████████████████████████████████████████████████████████████████████████| 134/134 [00:57<00:00,  2.33it/s]


In [71]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
df['cluster_id'] = clusterer.fit_predict(embeddings)
df.head()

Unnamed: 0,NameList,DateLst,IDLst,group_id,NameList.1,alias_name,clean_name,cluster_id
0,"Chol Ung Nam,Chol-Ung Nam",,,0,"Chol Ung Nam,Chol-Ung Nam",Chol Ung Nam,chol ung nam,-1
1,"Chol Ung Nam,Chol-Ung Nam",,,0,"Chol Ung Nam,Chol-Ung Nam",Chol-Ung Nam,cholung nam,-1
2,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",0/0/1964,,1,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",Apollinaire Hakizimana,apollinaire hakizimana,-1
3,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",0/0/1964,,1,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",Amikwe Lepic,amikwe lepic,-1
4,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",0/0/1964,,1,"Apollinaire Hakizimana,Amikwe Lepic,Poete,Le P...",Poete,poete,-1


In [78]:
def decide_final_match(row, similarity_threshold=0.75):
    if row['cosine_sim'] >= similarity_threshold and (row['dob_match'] or row['id_match']):
        return "Strong Match"
    elif row['cosine_sim'] >= similarity_threshold:
        return "Probable Match"
    else:
        return "Weak or No Match"

def find_similar_names(query_name, input_dob=None, input_id=None, top_k=5, threshold=0.7):
    name_clean = re.sub(r"[^\w\s]", "", query_name.lower()).strip()
    query_vec = model.encode([name_clean]).astype("float32")
    
    # Calculate cosine similarity between the query and all stored embeddings
    sims = cosine_similarity(query_vec, embeddings)[0]
    df['cosine_sim'] = sims  # Add cosine similarity to DataFrame

    # Get top_k matches based on cosine similarity
    top_matches = df.nlargest(top_k, 'cosine_sim').copy()

    # Rule-based filters: Match by Date of Birth (dob) and ID
    top_matches['dob_match'] = top_matches['DateLst'].astype(str) == str(input_dob) if input_dob else False
    top_matches['id_match'] = top_matches['IDLst'].astype(str) == str(input_id) if input_id else False

    # Filter out results that do not meet the cosine similarity threshold
    filtered = top_matches[top_matches['cosine_sim'] >= threshold]

    if filtered.empty:
        return "No match found"

    # Return relevant columns as a dictionary of top matches (including NameList, alias_name, etc.)
    return filtered[['NameList', 'alias_name', 'DateLst', 'IDLst', 'cosine_sim', 'dob_match', 'id_match']].to_dict(orient='records')


In [79]:
# Replace "sinshi" with any input name and optional DOB/ID
results = find_similar_names("mohammad akbar", input_dob=None, input_id=None)
print(results)


[{'NameList': 'Jamil Mukulu,Sheikh Jamil Mukulu,Steven Alirabaki,Abdullah Junju,Alilabaki Kyagulanyi,David Kyagulanyi,Nicolas Luumu,Hussein Muhammad,Musezi Talengelanimiro,Mzee Tutu,Talengelanimiro,Abdullah Junjuaka,Jamil Alirabaki,Mazengo David Amos,Lwanga Thomas Musisi,Nicholas Lumu,Philipp Nyanzi,Abdullah Jjungu,Petanguli Kalemire,Denis Kityo Musoke,David Amos Mazengo,Julius Elius Mashauri,Kyagulanyi Alibaki,Stephen Kyagulanyi,Jamil Ali Libaki,Ismael Rijab,Talengelanimiro Musezi,Kityo Denis Musoke,Jamil Alkyagulanyi,Mukongozzi Sengooba Kyakonye,Kalamire Patanguli,Moses Sngoba,Abdallah Jumju,Ismael Rajabu,Julius Elius Mashairi,Nyanzi Yafeesi Phillip,David Alilabaki,Jamil Makulu,Lumu Nicholas,Sheik Jamil Mukulu Kyagulanyi,Julius Nicholas,Patanguli Kalamire,Jimmy Makulu,Jjungu Abdallah,Julius Elius,Professor Musharaf,Yafesi,Musharaf', 'alias_name': 'Hussein Muhammad', 'DateLst': datetime.datetime(1964, 4, 17, 0, 0), 'IDLst': nan, 'cosine_sim': 0.8152835369110107, 'dob_match': False, 'i

  return filtered[['NameList', 'alias_name', 'DateLst', 'IDLst', 'cosine_sim', 'dob_match', 'id_match']].to_dict(orient='records')


"""pairs = []
labels = []

# Positive pairs: same cluster
for cluster_id in df['cluster_id'].unique():
    cluster_members = df[df['cluster_id'] == cluster_id]
    if cluster_id == -1 or len(cluster_members) < 2:
        continue
    idxs = cluster_members.index.tolist()
    for i in range(len(idxs)):
        for j in range(i+1, len(idxs)):
            pairs.append((idxs[i], idxs[j]))
            labels.append(1)

# Negative pairs: different clusters
negative_pairs_needed = len(pairs)
all_indices = df.index.tolist()

import random
while len(labels) < 2 * negative_pairs_needed:
    i, j = random.sample(all_indices, 2)
    if df.at[i, 'cluster_id'] != df.at[j, 'cluster_id']:
        pairs.append((i, j))
        labels.append(0)"""


In [18]:
"""def extract_features(idx1, idx2):
    name1 = df.loc[idx1, 'clean_name']
    name2 = df.loc[idx2, 'clean_name']
    embedding1 = embeddings[idx1]
    embedding2 = embeddings[idx2]
    
    cos_sim = cosine_similarity([embedding1], [embedding2])[0][0]
    lev_sim = levenshtein_ratio(name1, name2)
    dob_match = float(df.loc[idx1, 'DateLst'] == df.loc[idx2, 'DateLst'])
    id_match = float(df.loc[idx1, 'IDLst'] == df.loc[idx2, 'IDLst'])
    
    return [cos_sim, lev_sim, dob_match, id_match]

X = np.array([extract_features(i, j) for i, j in pairs])
y = np.array(labels)"""


"def extract_features(idx1, idx2):\n    name1 = df.loc[idx1, 'clean_name']\n    name2 = df.loc[idx2, 'clean_name']\n    embedding1 = embeddings[idx1]\n    embedding2 = embeddings[idx2]\n    \n    cos_sim = cosine_similarity([embedding1], [embedding2])[0][0]\n    lev_sim = levenshtein_ratio(name1, name2)\n    dob_match = float(df.loc[idx1, 'DateLst'] == df.loc[idx2, 'DateLst'])\n    id_match = float(df.loc[idx1, 'IDLst'] == df.loc[idx2, 'IDLst'])\n    \n    return [cos_sim, lev_sim, dob_match, id_match]\n\nX = np.array([extract_features(i, j) for i, j in pairs])\ny = np.array(labels)"

In [19]:
"""X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))"""


'X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nclf = RandomForestClassifier(n_estimators=100, random_state=42)\nclf.fit(X_train, y_train)\n\ny_pred = clf.predict(X_test)\nprint(classification_report(y_test, y_pred))'

In [20]:
"""def hybrid_name_matcher(input_name, input_dob=None, input_id=None, top_k=5, ml_threshold=0.5):
    name_clean = re.sub(r"[^\w\s]", "", input_name.lower()).strip()
    query_vec = model.encode([name_clean]).astype('float32')

    cos_sims = cosine_similarity(query_vec, embeddings)[0]
    best_index = np.argmax(cos_sims)
    cluster_id = df.loc[best_index, 'cluster_id']

    if cluster_id == -1:
        top_indices = np.argsort(cos_sims)[::-1][:top_k*3]
        candidates = df.loc[top_indices].copy()
        candidate_embeddings = embeddings[top_indices]
        candidate_cos_sims = cos_sims[top_indices]
    else:
        candidates = df[df['cluster_id'] == cluster_id].copy()
        candidate_embeddings = embeddings[candidates.index]
        candidate_cos_sims = cosine_similarity(query_vec, candidate_embeddings)[0]

    candidates = candidates.assign(cos_sim=candidate_cos_sims)
    candidates['levenshtein'] = candidates['clean_name'].apply(lambda x: levenshtein_ratio(name_clean, x))
    candidates['dob_match'] = candidates['DateLst'].astype(str) == str(input_dob) if input_dob else False
    candidates['id_match'] = candidates['IDLst'].astype(str) == str(input_id) if input_id else False

    features = candidates[['cos_sim', 'levenshtein', 'dob_match', 'id_match']].astype(float)
    candidates['match_prob'] = clf.predict_proba(features)[:, 1]

    final_matches = candidates[candidates['match_prob'] >= ml_threshold].sort_values(by='match_prob', ascending=False)

    if final_matches.empty:
        return "No match found"
    else:
        return final_matches.head(top_k)[
            ['alias_name', 'DateLst', 'IDLst', 'cos_sim', 'levenshtein', 'dob_match', 'id_match', 'match_prob']
        ]"""


  """def hybrid_name_matcher(input_name, input_dob=None, input_id=None, top_k=5, ml_threshold=0.5):


'def hybrid_name_matcher(input_name, input_dob=None, input_id=None, top_k=5, ml_threshold=0.5):\n    name_clean = re.sub(r"[^\\w\\s]", "", input_name.lower()).strip()\n    query_vec = model.encode([name_clean]).astype(\'float32\')\n\n    cos_sims = cosine_similarity(query_vec, embeddings)[0]\n    best_index = np.argmax(cos_sims)\n    cluster_id = df.loc[best_index, \'cluster_id\']\n\n    if cluster_id == -1:\n        top_indices = np.argsort(cos_sims)[::-1][:top_k*3]\n        candidates = df.loc[top_indices].copy()\n        candidate_embeddings = embeddings[top_indices]\n        candidate_cos_sims = cos_sims[top_indices]\n    else:\n        candidates = df[df[\'cluster_id\'] == cluster_id].copy()\n        candidate_embeddings = embeddings[candidates.index]\n        candidate_cos_sims = cosine_similarity(query_vec, candidate_embeddings)[0]\n\n    candidates = candidates.assign(cos_sim=candidate_cos_sims)\n    candidates[\'levenshtein\'] = candidates[\'clean_name\'].apply(lambda x: leven

NameError: name 'hybrid_name_matcher' is not defined