In [4]:
import pandas as pd
from metaphone import doublemetaphone
from difflib import SequenceMatcher

In [5]:
def check_match(name1, name2):
    """Decide whether two personal names plausibly refer to the same person.

    Both names are lower-cased, stripped of periods and split on whitespace.
    The first word is treated as the given name and the last word as the
    surname. Names with fewer than two words are rejected outright.

    A match is reported when either:
      * the surnames agree (equal, or one is the single-letter initial of the
        other) AND the given names agree, or
      * ``metaphone_match`` reports the two whole names as phonetically equal.

    Given names agree when they share the same first letter or, if both are
    full words, when ``levenshtein_ratio`` is at least 0.8. When at least one
    given name is a bare initial, only the initial comparison is used.

    Args:
        name1: First name string, e.g. "Vidur Singh".
        name2: Second name string, e.g. "V. Singh".

    Returns:
        True if the names are considered a match, False otherwise. A
        human-readable verdict is also printed.
    """
    # Remove periods, convert to lowercase, and split names
    name1_words = name1.replace('.', '').lower().split()
    name2_words = name2.replace('.', '').lower().split()

    # Check if either name is a single word (ignore such cases)
    if len(name1_words) < 2 or len(name2_words) < 2:
        print('One or both names do not have enough information (less than two words).')
        return False

    # Extract last names
    last_name1 = name1_words[-1]
    last_name2 = name2_words[-1]

    # Check if last names match or if one is the initial of the other
    last_names_match = (last_name1 == last_name2 or
                        (len(last_name1) == 1 and last_name1 == last_name2[0]) or
                        (len(last_name2) == 1 and last_name2 == last_name1[0]))

    # Extract first names or initials
    first_name1 = name1_words[0]
    first_name2 = name2_words[0]

    # Check if the leading letters of the given names correspond
    initials_match = (first_name1[0] == first_name2[0])

    if len(first_name1) > 1 and len(first_name2) > 1:
        # Both given names are full words: accept a shared initial or a
        # close spelling.
        first_names_match = (initials_match or
                             levenshtein_ratio(first_name1, first_name2) >= 0.8)
    else:
        # At least one side is a bare initial. There is nothing to compare
        # fuzzily, so the initial must genuinely correspond. Treating this
        # case as a perfect fuzzy score would let "A Singh" match
        # "Vidur Singh".
        first_names_match = initials_match

    # Check Metaphone phonetic similarity for both first and last names combined
    meta_match = metaphone_match(' '.join(name1_words), ' '.join(name2_words))

    # Determine final match based on OR condition
    if (last_names_match and first_names_match) or meta_match:
        print('Names match or sound similar.')
        return True
    else:
        print('Names do not match and do not sound similar.')
        return False

In [6]:
def levenshtein_ratio(s1, s2):
    """Return a similarity score between two strings.

    Despite the name, the score is ``difflib.SequenceMatcher(...).ratio()``
    rather than a Levenshtein edit distance.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The float produced by ``SequenceMatcher.ratio()`` (1.0 for identical
        inputs).
    """
    matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.ratio()

In [7]:
def metaphone_match(name1, name2):
    """Report whether two names share at least one Double Metaphone code.

    Each name is encoded with ``doublemetaphone``; the result is treated as a
    collection of codes, and falsy (empty) codes are ignored.

    Args:
        name1: First name string.
        name2: Second name string.

    Returns:
        True if any non-empty code of ``name1`` equals any non-empty code of
        ``name2``, otherwise False.
    """
    codes1 = {code for code in doublemetaphone(name1) if code}
    codes2 = {code for code in doublemetaphone(name2) if code}
    return not codes1.isdisjoint(codes2)

In [8]:
# Demo: name2 abbreviates both the given name and the surname to initials.
# The recorded output for this cell is True.
name1 = "Vidur Singh"
name2 = "V S"
print("Match:", check_match(name1, name2))

Names match or sound similar.
Match: True


In [11]:
# Demo: name2 abbreviates only the given name (with a trailing period, which
# check_match strips). The recorded output for this cell is True.
name1 = "Vidur Singh"
name2 = "V. Singh"
print("Match:", check_match(name1, name2))

Names match or sound similar.
Match: True


In [10]:
# Demo: name2 carries an honorific in the given-name position. check_match
# has no special handling for titles, so "mr" is compared as if it were a
# given name. The recorded output for this cell is False.
name1 = "Vidur Singh"
name2 = "Mr. Singh"
print("Match:", check_match(name1, name2))

Names do not match and do not sound similar.
Match: False


---
## Version 2
---

In [10]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


In [11]:
# Load the article dataset; the 'Text' column is the raw text used below.
# NOTE(review): this is an absolute, machine-specific path. Consider a
# configurable DATA_DIR so the notebook runs on other machines.
df = pd.read_csv('/home/vidur/mediagraph/data/TOI_FarmersProtests.csv')

# Replace missing 'Text' entries with empty strings so that the later
# per-row preprocessing always receives a str.
df['Text'] = df['Text'].fillna('')

# Preprocess text: tokenize, remove punctuation, etc.
def preprocess_text(text):
    """Turn raw text into a list of lowercase alphabetic tokens.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space), so "well-known" becomes "wellknown". The result
    is then lower-cased and split on whitespace.

    Args:
        text: Input string.

    Returns:
        List of token strings; empty if no letters remain.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower().split()

# Tokenise each row of 'Text' with preprocess_text and store the token list
# in a new 'Tokens' column.
df['Tokens'] = df['Text'].apply(preprocess_text)

# Build the corpus (list of token lists, one per row) passed to Word2Vec
# as `sentences`.
corpus = df['Tokens'].tolist()


In [12]:
# Set parameters for Word2Vec
embedding_size = 100  # Size of the embedding vectors
window_size = 5       # Context window size
min_count = 1         # Minimum word frequency (presumably 1 drops no token for rarity; confirm in gensim docs)
workers = 4           # Number of worker threads (NOTE(review): multi-threaded training may not be bit-for-bit reproducible; confirm)

# Initialize and train the model on the tokenised corpus. Passing `sentences`
# to the constructor is used here as the only training step; no separate
# train() call follows.
model = Word2Vec(sentences=corpus,
                 vector_size=embedding_size,
                 window=window_size,
                 min_count=min_count,
                 workers=workers,
                 sg=0)  # Use CBOW architecture


In [13]:
# Sample names and aliases for demonstration. The variants cover an initial
# ('r singh'), respellings ('rajindar singh', 'rjinder sngh'), a single-word
# name ('rajinder') and a three-word name ('rajinder kumar singh').
names = ['rajinder singh', 'r singh', 'rajindar singh', 'rjinder sngh', 'rajinder', 'rajinder kumar singh']

# Preprocess names
def preprocess_name(name):
    """Normalise a name string for lookup.

    Deletes every character that is neither an ASCII letter nor whitespace,
    lower-cases the result and trims leading/trailing whitespace. Interior
    whitespace is left as-is.

    Args:
        name: Raw name string.

    Returns:
        The cleaned name string.
    """
    return re.sub(r'[^a-zA-Z\s]', '', name).lower().strip()

# Normalise every sample name; this rebinds `names` to the cleaned list.
names = [preprocess_name(name) for name in names]


In [14]:
# Function to get the embedding for a name
def get_name_embedding(name, model):
    """Embed a name as the mean of its per-word vectors.

    The name is split on whitespace. Each word found in ``model.wv``
    contributes its vector; each word not found contributes a zero vector of
    length ``model.vector_size``. The vectors are then averaged element-wise.

    Args:
        name: Whitespace-separated name string.
        model: Object exposing ``wv`` (supports ``in`` and indexing by word)
            and ``vector_size``.

    Returns:
        A 1-D numpy array. It is all zeros when the name has no words or when
        none of its words are in the vocabulary.
    """
    vectors = [
        model.wv[token] if token in model.wv else np.zeros(model.vector_size)
        for token in name.split()
    ]
    if not vectors:
        # Name contained no words at all.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Generate embeddings for all names, in the same order as `names`, so that
# row i of the later similarity matrix corresponds to names[i].
name_embeddings = []
for name in names:
    embedding = get_name_embedding(name, model)
    name_embeddings.append(embedding)


In [15]:
# Compute the pairwise cosine similarity matrix between all name embeddings.
# In the recorded output, 'rjinder sngh' scores 0.0 against every name,
# including itself. Presumably neither token is in the vocabulary, so its
# embedding is the zero vector.
similarity_matrix = cosine_similarity(name_embeddings)

# Convert to DataFrame (rows and columns labelled by name) for better
# visualization.
similarity_df = pd.DataFrame(similarity_matrix, index=names, columns=names)
similarity_df


Unnamed: 0,rajinder singh,r singh,rajindar singh,rjinder sngh,rajinder,rajinder kumar singh
rajinder singh,1.0,0.980035,0.997709,0.0,0.610952,0.96911
r singh,0.980035,1.0,0.97264,0.0,0.659021,0.989328
rajindar singh,0.997709,0.97264,1.0,0.0,0.555995,0.962417
rjinder sngh,0.0,0.0,0.0,0.0,0.0,0.0
rajinder,0.610952,0.659021,0.555995,0.0,1.0,0.644426
rajinder kumar singh,0.96911,0.989328,0.962417,0.0,0.644426,1.0


In [16]:
# Use DBSCAN with cosine distance to group the name embeddings.
# Adjust 'eps' and 'min_samples' based on your data.
# NOTE(review): eps=0.5 is loose. In the recorded output 'rajinder' (cosine
# similarity about 0.56-0.66 to the others) still lands in cluster 0, and the
# all-zero 'rjinder sngh' embedding ends up alone in cluster 1.
clustering_model = DBSCAN(eps=0.5, min_samples=1, metric='cosine')
cluster_labels = clustering_model.fit_predict(name_embeddings)

# Create a DataFrame to display names and their cluster labels, ordered by
# cluster. sort_values returns a new frame; clusters_df itself is not
# reordered.
clusters_df = pd.DataFrame({'Name': names, 'Cluster': cluster_labels})
clusters_df.sort_values('Cluster')


Unnamed: 0,Name,Cluster
0,rajinder singh,0
1,r singh,0
2,rajindar singh,0
4,rajinder,0
5,rajinder kumar singh,0
3,rjinder sngh,1


In [17]:
# Group names by cluster label: `grouped` is a Series indexed by cluster id
# whose values are lists of the names in that cluster.
grouped = clusters_df.groupby('Cluster')['Name'].apply(list)

# Display the clusters and their associated names
for cluster_id, name_list in grouped.items():
    print(f"Cluster {cluster_id}: {name_list}")


Cluster 0: ['rajinder singh', 'r singh', 'rajindar singh', 'rajinder', 'rajinder kumar singh']
Cluster 1: ['rjinder sngh']


In [19]:
# Assuming you have a function to update your Elasticsearch index
def update_elasticsearch(name_list, cluster_id):
    """Placeholder for writing a resolved entity cluster to Elasticsearch.

    Currently a no-op. The intended logic is to merge the names that belong
    to the same cluster under a single entity.

    Args:
        name_list: Names assigned to the cluster.
        cluster_id: Identifier of the cluster.

    Returns:
        None.
    """
    # TODO: implement the Elasticsearch update.
    return None

# Update Elasticsearch with clusters: one call per cluster, echoing the
# cluster id and its names first. update_elasticsearch is a stub in this
# notebook, so only the print has a visible effect.
for cluster_id, name_list in grouped.items():
    print(cluster_id, name_list)
    update_elasticsearch(name_list, cluster_id)


0 ['rajinder singh', 'r singh', 'rajindar singh', 'rajinder', 'rajinder kumar singh']
1 ['rjinder sngh']
