In [2]:
import pandas as pd

alias_mapping = {
    "UP": "Uttar Pradesh",
    "U.P.": "Uttar Pradesh",
    "Uttar Pradesh": "Uttar Pradesh",
    "Uttar Pradesh 's" : "Uttar Pradesh" # Ensuring canonical name maps to itself
    # Add more mappings as needed
}

# Load entities.csv
entities_df = pd.read_csv('cleaned_entities2024.csv')

# Replace aliases with canonical names
entities_df['entity'] = entities_df['entity'].apply(lambda x: alias_mapping.get(x, x))

# Remove duplicate entities
entities_df = entities_df.drop_duplicates(subset=['entity', 'type']).reset_index(drop=True)

# Save the updated entities.csv
entities_df.to_csv('/home/vidur/mediagraph/Python/final files/cleaned_entities2024.csv', index=False)

print("Entities have been updated and duplicates removed.")


Entities have been updated and duplicates removed.


In [1]:
import pandas as pd
import ast

# Define the alias to canonical mapping
alias_mapping = {
    "UP": "Uttar Pradesh",
    "U.P.": "Uttar Pradesh",
    "Uttar Pradesh": "Uttar Pradesh",  # Ensuring canonical name maps to itself
    # Add more mappings as needed
    # "Delhi": "NCT of Delhi",
    # "NCT Delhi": "NCT of Delhi",
}

# Paths to the CSV files
entities_path = '/home/vidur/mediagraph/Python/cleaned_entities2024.csv'
cooccurrences_path = '/home/vidur/mediagraph/Python/cleaned_co_occurrence2024.csv'
updated_entities_path = '/home/vidur/mediagraph/Python/cleaned_entities2024_2.csv'
updated_cooccurrences_path = '/home/vidur/mediagraph/Python/cleaned_co_occurrence2024_2.csv'

# Step 1: Load and Clean cooccurrences.csv
cooccurrences_df = pd.read_csv(cooccurrences_path)

print("Initial Data:")
print(cooccurrences_df.head())

# Check for missing values in 'entity1', 'entity2', and 'dates'
missing_entity1 = cooccurrences_df['entity1'].isnull().sum()
missing_entity2 = cooccurrences_df['entity2'].isnull().sum()
missing_dates = cooccurrences_df['dates'].isnull().sum()

print(f"\nMissing 'entity1' entries: {missing_entity1}")
print(f"Missing 'entity2' entries: {missing_entity2}")
print(f"Missing 'dates' entries: {missing_dates}")

# Drop rows with missing 'entity1' or 'entity2'
cooccurrences_df = cooccurrences_df.dropna(subset=['entity1', 'entity2'])

# Handle missing 'dates' by replacing them with empty lists
cooccurrences_df['dates'] = cooccurrences_df['dates'].fillna('[]')

# Convert 'entity1' and 'entity2' to strings
cooccurrences_df['entity1'] = cooccurrences_df['entity1'].astype(str)
cooccurrences_df['entity2'] = cooccurrences_df['entity2'].astype(str)

# Replace aliases with canonical names
cooccurrences_df['entity1'] = cooccurrences_df['entity1'].apply(lambda x: alias_mapping.get(x, x))
cooccurrences_df['entity2'] = cooccurrences_df['entity2'].apply(lambda x: alias_mapping.get(x, x))

# Function to create a sorted tuple key for each pair to handle unordered pairs
def create_sorted_pair(row):
    return tuple(sorted([row['entity1'], row['entity2']]))

# Apply the function to create a new column with sorted pairs
cooccurrences_df['pair'] = cooccurrences_df.apply(create_sorted_pair, axis=1)

# Function to safely evaluate 'dates' strings to lists
def safe_eval_dates(dates_str):
    try:
        dates = ast.literal_eval(dates_str)
        if isinstance(dates, list):
            return dates
        else:
            return []
    except (ValueError, SyntaxError):
        return []

# Apply the function to ensure 'dates' are lists
cooccurrences_df['dates'] = cooccurrences_df['dates'].apply(safe_eval_dates)

# Group by the sorted pair and aggregate weights and dates
aggregated_df = cooccurrences_df.groupby('pair').agg({
    'weight': 'sum',
    'dates': lambda dates: sorted(set([date for sublist in dates for date in sublist]))
}).reset_index()

# Split the pair back into 'entity1' and 'entity2'
aggregated_df[['entity1', 'entity2']] = pd.DataFrame(aggregated_df['pair'].tolist(), index=aggregated_df.index)

# Drop the 'pair' column as it's no longer needed
aggregated_df = aggregated_df.drop(columns=['pair'])

# Convert the dates list to string format suitable for Neo4j
aggregated_df['dates'] = aggregated_df['dates'].apply(lambda x: str(x))

# Display the aggregated DataFrame
print("\nAggregated Data:")
print(aggregated_df.head())

# Step 2: Load and Update entities.csv
entities_df = pd.read_csv(entities_path)

# Replace aliases with canonical names
entities_df['entity'] = entities_df['entity'].apply(lambda x: alias_mapping.get(x, x))

# Remove duplicate entities
entities_df = entities_df.drop_duplicates(subset=['entity', 'type']).reset_index(drop=True)

# Save the updated entities.csv
entities_df.to_csv(updated_entities_path, index=False)
print("\nEntities have been updated and duplicates removed.")

# Step 3: Save the updated cooccurrences.csv
aggregated_df.to_csv(updated_cooccurrences_path, index=False)
print("Co-occurrences have been updated and aggregated successfully.")



Initial Data:
                entity1                   entity2  weight  \
0               Haryana  Jammu & Kashmir Alliance       8   
1                 Noida                     YEIDA       1   
2  Ekta Dakaunda-Dhaner             Swaiman Singh       1   
3               Madhura             Swaiman Singh       1   
4     M S Swaminathan's             Swaiman Singh       1   

                                dates  
0  02-2024, 03-2024, 04-2024, 05-2024  
1                             01-2024  
2                             02-2024  
3                             02-2024  
4                             02-2024  

Missing 'entity1' entries: 0
Missing 'entity2' entries: 0
Missing 'dates' entries: 0

Aggregated Data:
   weight dates entity1         entity2
0       2    []    "BJP    Aditya Yadav
1       2    []    "BJP  Akhilesh Yadav
2       2    []    "BJP             BJP
3       2    []    "BJP        Bareilly
4       2    []    "BJP          Budaun

Entities have been updated and dup