In [12]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from metaphone import doublemetaphone
from difflib import SequenceMatcher
from ast import literal_eval
from elasticsearch import Elasticsearch


In [3]:
# Load your dataset
df = pd.read_csv('/home/vidur/mediagraph/data/output.csv')

# Fill missing values in 'Text' column
df['Text'] = df['Text'].fillna('')

In [15]:
# Sample data
data = {
    'Title': ['Punjab minister orders probe into jail assault on farm activist'],
    'Date': ['01-01-2020'],
    'Text': ['Your full text here'],
    'Entities': ["[('BATHINDA', 'GPE'), ('Punjab', 'GPE'), ('Sukhjinder Singh Randhawa', 'PERSON')]"],
    'Resolved Entities': ["[('BATHINDA', 'GPE'), ('Punjab', 'GPE'), ('Sukhjinder Singh Randhawa', 'PERSON')]"]
}

df = pd.DataFrame(data)


In [16]:
# Parse the 'Resolved Entities' column to extract entities
def parse_resolved_entities(resolved_entities_str):
    try:
        # Convert the string representation of the list to an actual list
        entities = literal_eval(resolved_entities_str)
        return entities
    except:
        return []

# Apply the parsing function
df['Parsed_Entities'] = df['Resolved Entities'].apply(parse_resolved_entities)

# Extract person-type entities
def extract_person_entities(entities):
    return [entity[0] for entity in entities if entity[1] in ['PERSON', 'POL', 'DIR']]

# Apply extraction to get person entities
df['Person_Entities'] = df['Parsed_Entities'].apply(extract_person_entities)


In [17]:
# Collect all person entities from all articles
all_entities = df['Person_Entities'].explode().dropna().unique().tolist()

# Preprocess entity names
def preprocess_name(name):
    name = re.sub(r'[^a-zA-Z\s\']', '', name)
    name = name.lower().strip()
    return name

all_entities = [preprocess_name(name) for name in all_entities]


In [18]:
# Rule-Based Matching Functions
def levenshtein_ratio(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def metaphone_match(name1, name2):
    meta1 = doublemetaphone(name1)
    meta2 = doublemetaphone(name2)
    return any(m1 == m2 for m1 in meta1 for m2 in meta2 if m1 and m2)

def check_match(name1, name2):
    # Remove periods, convert to lowercase, and split names
    name1_words = name1.replace('.', '').lower().split()
    name2_words = name2.replace('.', '').lower().split()
    
    # Check if either name is a single word (ignore such cases)
    if len(name1_words) < 2 or len(name2_words) < 2:
        return False
    
    # Extract last names
    last_name1 = name1_words[-1]
    last_name2 = name2_words[-1]
    
    # Check if last names match or if one is the initial of the other
    last_names_match = (last_name1 == last_name2 or 
                        (len(last_name1) == 1 and last_name1 == last_name2[0]) or 
                        (len(last_name2) == 1 and last_name2 == last_name1[0]))
    
    # Extract first names or initials
    first_name1 = name1_words[0]
    first_name2 = name2_words[0]
    
    # Check if first names match or initials correspond
    first_names_match = (first_name1[0] == first_name2[0])
    
    # Use a lower threshold for Levenshtein ratio when comparing full names
    if len(first_name1) > 1 and len(first_name2) > 1:
        lev_ratio = levenshtein_ratio(first_name1, first_name2)
    else:
        lev_ratio = 1  # Assume perfect match if one is just an initial
    
    # Check Metaphone phonetic similarity for both first and last names combined
    meta_match = metaphone_match(' '.join(name1_words), ' '.join(name2_words))
    
    # Determine final match based on OR condition
    if (last_names_match and (first_names_match or lev_ratio >= 0.8)) or meta_match:
        return True
    else:
        return False


In [19]:
# Build clusters based on rule-based matching
clusters = []
unclustered_entities = set(all_entities)

while unclustered_entities:
    base_entity = unclustered_entities.pop()
    cluster = [base_entity]
    to_check = set(unclustered_entities)
    for other_entity in to_check:
        if check_match(base_entity, other_entity):
            cluster.append(other_entity)
            unclustered_entities.remove(other_entity)
    clusters.append(cluster)


In [20]:
# Build a corpus for Word2Vec (using the text data)
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    return tokens

df['Tokens'] = df['Text'].apply(preprocess_text)
corpus = df['Tokens'].tolist()


In [21]:
# Train Word2Vec model
embedding_size = 100
window_size = 5
min_count = 1
workers = 4

model = Word2Vec(sentences=corpus,
                 vector_size=embedding_size,
                 window=window_size,
                 min_count=min_count,
                 workers=workers,
                 sg=0)


In [22]:
# Generate embeddings for entities
def get_name_embedding(name, model):
    words = name.split()
    word_vectors = []
    for word in words:
        if word in model.wv:
            word_vectors.append(model.wv[word])
        else:
            word_vectors.append(np.zeros(model.vector_size))
    if word_vectors:
        name_embedding = np.mean(word_vectors, axis=0)
    else:
        name_embedding = np.zeros(model.vector_size)
    return name_embedding

name_embeddings = [get_name_embedding(name, model) for name in all_entities]


In [23]:
# Apply clustering on embeddings
clustering_model = DBSCAN(eps=0.5, min_samples=1, metric='cosine')
cluster_labels = clustering_model.fit_predict(name_embeddings)

# Group entities by clusters
vector_clusters = {}
for label, name in zip(cluster_labels, all_entities):
    vector_clusters.setdefault(label, []).append(name)


In [24]:
# Combine clusters from both methods
# Map entities to their rule-based cluster IDs
entity_to_rule_cluster = {}
for cluster_id, cluster in enumerate(clusters):
    for entity in cluster:
        entity_to_rule_cluster[entity] = cluster_id

# Merge clusters based on overlapping entities
combined_clusters = {}
for vector_cluster_id, vector_cluster in vector_clusters.items():
    combined_cluster = set()
    for entity in vector_cluster:
        rule_cluster_id = entity_to_rule_cluster.get(entity)
        if rule_cluster_id is not None:
            combined_cluster.update(clusters[rule_cluster_id])
        else:
            combined_cluster.add(entity)
    combined_clusters[vector_cluster_id] = list(combined_cluster)


In [25]:
# Initialize Elasticsearch client
es = Elasticsearch(
    "https://your_elasticsearch_endpoint",
    http_auth=('username', 'password')  # Use appropriate authentication
)


  es = Elasticsearch(


In [26]:
# Function to update Elasticsearch index
def update_elasticsearch(name_list, cluster_id):
    # Decide on a representative name for the entity (e.g., the most frequent name)
    representative_name = max(name_list, key=len)
    
    # Prepare aliases (other names in the cluster)
    aliases = [name for name in name_list if name != representative_name]
    
    # Check if an entity with the representative name already exists
    search_body = {
        "query": {
            "match": {
                "name": representative_name
            }
        }
    }
    response = es.search(index="resolved_entities", body=search_body)
    if response['hits']['hits']:
        # Entity exists, update aliases
        entity_id = response['hits']['hits'][0]['_id']
        existing_aliases = response['hits']['hits'][0]['_source'].get('aliases', '')
        existing_aliases = existing_aliases.split(';') if existing_aliases else []
        all_aliases = set(existing_aliases + aliases)
        es.update(
            index="resolved_entities",
            id=entity_id,
            body={"doc": {"aliases": ';'.join(all_aliases)}}
        )
    else:
        # Entity does not exist, create a new one
        es.index(index="resolved_entities", body={
            "name": representative_name,
            "aliases": ';'.join(aliases),
            "type": "PERSON"  # Adjust the type as needed
        })


In [27]:
# Update Elasticsearch with combined clusters
for cluster_id, name_list in combined_clusters.items():
    update_elasticsearch(name_list, cluster_id)


ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NameResolutionError(<elastic_transport._node._urllib3_chain_certs.HTTPSConnection object at 0x7faeeedd7130>: Failed to resolve 'your_elasticsearch_endpoint' ([Errno -3] Temporary failure in name resolution)))

---
## FINAL
---

In [1]:
# Import necessary libraries
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
import numpy as np
import re
from metaphone import doublemetaphone
from difflib import SequenceMatcher
from ast import literal_eval


In [2]:
# Sample data
data = {
    'Title': ['Punjab minister orders probe into jail assault on farm activist'],
    'Date': ['01-01-2020'],
    'Text': ['Your full text here'],
    'Entities': ["[('BATHINDA', 'GPE'), ('Punjab', 'GPE'), ('Sukhjinder Singh Randhawa', 'PERSON')]"],
    'Resolved Entities': ["[('BATHINDA', 'GPE'), ('Punjab', 'GPE'), ('Sukhjinder Singh Randhawa', 'PERSON')]"]
}

df = pd.DataFrame(data)

In [3]:
# Parse the 'Resolved Entities' column to extract entities
def parse_resolved_entities(resolved_entities_str):
    try:
        # Convert the string representation of the list to an actual list
        entities = literal_eval(resolved_entities_str)
        return entities
    except:
        return []

# Apply the parsing function
df['Parsed_Entities'] = df['Resolved Entities'].apply(parse_resolved_entities)

# Extract person-type entities
def extract_person_entities(entities):
    return [entity[0] for entity in entities if entity[1] in ['PERSON', 'POL', 'DIR']]

# Apply extraction to get person entities
df['Person_Entities'] = df['Parsed_Entities'].apply(extract_person_entities)


In [4]:
# Collect all person entities from all articles
all_entities = df['Person_Entities'].explode().dropna().unique().tolist()

# Preprocess entity names
def preprocess_name(name):
    name = re.sub(r'[^a-zA-Z\s\']', '', name)
    name = name.lower().strip()
    return name

all_entities = [preprocess_name(name) for name in all_entities]


In [5]:
# Rule-Based Matching Functions
def levenshtein_ratio(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def metaphone_match(name1, name2):
    meta1 = doublemetaphone(name1)
    meta2 = doublemetaphone(name2)
    return any(m1 == m2 for m1 in meta1 for m2 in meta2 if m1 and m2)

def check_match(name1, name2):
    # Remove periods, convert to lowercase, and split names
    name1_words = name1.replace('.', '').lower().split()
    name2_words = name2.replace('.', '').lower().split()
    
    # Check if either name is a single word (ignore such cases)
    if len(name1_words) < 2 or len(name2_words) < 2:
        return False
    
    # Extract last names
    last_name1 = name1_words[-1]
    last_name2 = name2_words[-1]
    
    # Check if last names match or if one is the initial of the other
    last_names_match = (last_name1 == last_name2 or 
                        (len(last_name1) == 1 and last_name1 == last_name2[0]) or 
                        (len(last_name2) == 1 and last_name2 == last_name1[0]))
    
    # Extract first names or initials
    first_name1 = name1_words[0]
    first_name2 = name2_words[0]
    
    # Check if first names match or initials correspond
    first_names_match = (first_name1[0] == first_name2[0])
    
    # Use a lower threshold for Levenshtein ratio when comparing full names
    if len(first_name1) > 1 and len(first_name2) > 1:
        lev_ratio = levenshtein_ratio(first_name1, first_name2)
    else:
        lev_ratio = 1  # Assume perfect match if one is just an initial
    
    # Check Metaphone phonetic similarity for both first and last names combined
    meta_match = metaphone_match(' '.join(name1_words), ' '.join(name2_words))
    
    # Determine final match based on OR condition
    if (last_names_match and (first_names_match or lev_ratio >= 0.8)) or meta_match:
        return True
    else:
        return False


In [6]:
# Build clusters based on rule-based matching
clusters = []
unclustered_entities = set(all_entities)

while unclustered_entities:
    base_entity = unclustered_entities.pop()
    cluster = [base_entity]
    to_check = set(unclustered_entities)
    for other_entity in to_check:
        if check_match(base_entity, other_entity):
            cluster.append(other_entity)
            unclustered_entities.remove(other_entity)
    clusters.append(cluster)


In [7]:
# Build a corpus for Word2Vec (using the text data)
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    return tokens

df['Tokens'] = df['Text'].apply(preprocess_text)
corpus = df['Tokens'].tolist()


In [8]:
# Train Word2Vec model
embedding_size = 100
window_size = 5
min_count = 1
workers = 4

model = Word2Vec(sentences=corpus,
                 vector_size=embedding_size,
                 window=window_size,
                 min_count=min_count,
                 workers=workers,
                 sg=0)


In [9]:
# Generate embeddings for entities
def get_name_embedding(name, model):
    words = name.split()
    word_vectors = []
    for word in words:
        if word in model.wv:
            word_vectors.append(model.wv[word])
        else:
            word_vectors.append(np.zeros(model.vector_size))
    if word_vectors:
        name_embedding = np.mean(word_vectors, axis=0)
    else:
        name_embedding = np.zeros(model.vector_size)
    return name_embedding

name_embeddings = [get_name_embedding(name, model) for name in all_entities]


In [10]:
# Apply clustering on embeddings
clustering_model = DBSCAN(eps=0.5, min_samples=1, metric='cosine')
cluster_labels = clustering_model.fit_predict(name_embeddings)

# Group entities by clusters
vector_clusters = {}
for label, name in zip(cluster_labels, all_entities):
    vector_clusters.setdefault(label, []).append(name)


In [11]:
# Combine clusters from both methods
# Map entities to their rule-based cluster IDs
entity_to_rule_cluster = {}
for cluster_id, cluster in enumerate(clusters):
    for entity in cluster:
        entity_to_rule_cluster[entity] = cluster_id

# Merge clusters based on overlapping entities
combined_clusters = {}
for vector_cluster_id, vector_cluster in vector_clusters.items():
    combined_cluster = set()
    for entity in vector_cluster:
        rule_cluster_id = entity_to_rule_cluster.get(entity)
        if rule_cluster_id is not None:
            combined_cluster.update(clusters[rule_cluster_id])
        else:
            combined_cluster.add(entity)
    combined_clusters[vector_cluster_id] = list(combined_cluster)


In [12]:
# Update the DataFrame with resolved entities
def resolve_entities(entities):
    resolved = []
    for name in entities:
        preprocessed_name = preprocess_name(name)
        representative_name = entity_to_representative.get(preprocessed_name, name)
        resolved.append(representative_name)
    return resolved

# Apply the resolution to the 'Person_Entities' column
df['Resolved_Person_Entities'] = df['Person_Entities'].apply(resolve_entities)


NameError: name 'entity_to_representative' is not defined

In [1]:
# Import necessary libraries
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
import numpy as np
import re
from metaphone import doublemetaphone
from difflib import SequenceMatcher
from ast import literal_eval

# Load your dataset
df = pd.read_csv('/home/vidur/mediagraph/oldPythonFiles/2024output_PreAlias.csv')

# Fill missing values in 'Text' column
df['Text'] = df['Text'].fillna('')

# Parse the 'Resolved Entities' column to extract entities
def parse_resolved_entities(resolved_entities_str):
    try:
        # Convert the string representation of the list to an actual list
        entities = literal_eval(resolved_entities_str)
        return entities
    except:
        return []

# Apply the parsing function
df['Parsed_Entities'] = df['Resolved Entities'].apply(parse_resolved_entities)

# Extract person-type entities
def extract_person_entities(entities):
    return [entity[0] for entity in entities if entity[1] in ['PERSON', 'POL', 'DIR']]

# Apply extraction to get person entities
df['Person_Entities'] = df['Parsed_Entities'].apply(extract_person_entities)

# Collect all person entities from all articles
all_entities = df['Person_Entities'].explode().dropna().unique().tolist()

# Preprocess entity names
def preprocess_name(name):
    name = re.sub(r'[^a-zA-Z\s\']', '', name)
    name = name.lower().strip()
    return name

all_entities = [preprocess_name(name) for name in all_entities]

# Rule-Based Matching Functions
def levenshtein_ratio(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def metaphone_match(name1, name2):
    meta1 = doublemetaphone(name1)
    meta2 = doublemetaphone(name2)
    return any(m1 == m2 for m1 in meta1 for m2 in meta2 if m1 and m2)

def check_match(name1, name2):
    # Remove periods, convert to lowercase, and split names
    name1_words = name1.replace('.', '').lower().split()
    name2_words = name2.replace('.', '').lower().split()
    
    # Check if either name is a single word (ignore such cases)
    if len(name1_words) < 2 or len(name2_words) < 2:
        return False
    
    # Extract last names
    last_name1 = name1_words[-1]
    last_name2 = name2_words[-1]
    
    # Check if last names match or if one is the initial of the other
    last_names_match = (last_name1 == last_name2 or 
                        (len(last_name1) == 1 and last_name1 == last_name2[0]) or 
                        (len(last_name2) == 1 and last_name2 == last_name1[0]))
    
    # Extract first names or initials
    first_name1 = name1_words[0]
    first_name2 = name2_words[0]
    
    # Check if first names match or initials correspond
    first_names_match = (first_name1[0] == first_name2[0])
    
    # Use a lower threshold for Levenshtein ratio when comparing full names
    if len(first_name1) > 1 and len(first_name2) > 1:
        lev_ratio = levenshtein_ratio(first_name1, first_name2)
    else:
        lev_ratio = 1  # Assume perfect match if one is just an initial
    
    # Check Metaphone phonetic similarity for both first and last names combined
    meta_match = metaphone_match(' '.join(name1_words), ' '.join(name2_words))
    
    # Determine final match based on OR condition
    if (last_names_match and (first_names_match or lev_ratio >= 0.8)) or meta_match:
        return True
    else:
        return False

# Build clusters based on rule-based matching
clusters = []
unclustered_entities = set(all_entities)

while unclustered_entities:
    base_entity = unclustered_entities.pop()
    cluster = [base_entity]
    to_check = set(unclustered_entities)
    for other_entity in to_check:
        if check_match(base_entity, other_entity):
            cluster.append(other_entity)
            unclustered_entities.remove(other_entity)
    clusters.append(cluster)

# Build a corpus for Word2Vec (using the text data)
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    return tokens

df['Tokens'] = df['Text'].apply(preprocess_text)
corpus = df['Tokens'].tolist()

# Train Word2Vec model
embedding_size = 100
window_size = 5
min_count = 1
workers = 4

model = Word2Vec(sentences=corpus,
                 vector_size=embedding_size,
                 window=window_size,
                 min_count=min_count,
                 workers=workers,
                 sg=0)

# Generate embeddings for entities
def get_name_embedding(name, model):
    words = name.split()
    word_vectors = []
    for word in words:
        if word in model.wv:
            word_vectors.append(model.wv[word])
        else:
            word_vectors.append(np.zeros(model.vector_size))
    if word_vectors:
        name_embedding = np.mean(word_vectors, axis=0)
    else:
        name_embedding = np.zeros(model.vector_size)
    return name_embedding

name_embeddings = [get_name_embedding(name, model) for name in all_entities]

# Apply clustering on embeddings
clustering_model = DBSCAN(eps=0.5, min_samples=1, metric='cosine')
cluster_labels = clustering_model.fit_predict(name_embeddings)

# Group entities by clusters
vector_clusters = {}
for label, name in zip(cluster_labels, all_entities):
    vector_clusters.setdefault(label, []).append(name)

# Combine clusters from both methods
# Map entities to their rule-based cluster IDs
entity_to_rule_cluster = {}
for cluster_id, cluster in enumerate(clusters):
    for entity in cluster:
        entity_to_rule_cluster[entity] = cluster_id

# Merge clusters based on overlapping entities
combined_clusters = {}
for vector_cluster_id, vector_cluster in vector_clusters.items():
    combined_cluster = set()
    for entity in vector_cluster:
        rule_cluster_id = entity_to_rule_cluster.get(entity)
        if rule_cluster_id is not None:
            combined_cluster.update(clusters[rule_cluster_id])
        else:
            combined_cluster.add(entity)
    combined_clusters[vector_cluster_id] = list(combined_cluster)

# Create a mapping from entity to representative name
entity_to_representative = {}
for cluster_id, name_list in combined_clusters.items():
    # Decide on a representative name for the entity (e.g., the longest name)
    representative_name = max(name_list, key=len)
    for name in name_list:
        entity_to_representative[name] = representative_name

# Update the DataFrame with resolved entities
def resolve_entities(entities, entity_to_representative):
    resolved = []
    for name in entities:
        preprocessed_name = preprocess_name(name)
        representative_name = entity_to_representative.get(preprocessed_name, name)
        resolved.append(representative_name)
    return resolved

# Apply the resolution to the 'Person_Entities' column
df['Resolved_Person_Entities'] = df['Person_Entities'].apply(lambda x: resolve_entities(x, entity_to_representative))


In [3]:
df['Person_Entities'][300]

['Modi', 'Shivraj Singh Chouhan', 'Savitri Thakur', 'Bharti Pardhi', 'Cong']

In [9]:
df.to_csv(f'output_final2.csv', sep=',')