In [4]:
import pandas as pd

# Load the data from the raw data directory
df = pd.read_csv('../data/processed/cleaned_election_results.csv')

In [5]:
# Comprehensive alliance mapping for Indian political parties
alliance_mapping = {
    # NDA (National Democratic Alliance) Parties
    'bjp': 'NDA',
    'bharatiya janata party': 'NDA',
    'shiv sena': 'NDA',  # Pre-2019
    'shivsena': 'NDA',
    'lok janashakti party': 'NDA',
    'aiadmk': 'NDA',  # All India Anna Dravida Munnetra Kazhagam
    'all india anna dravida munnetra kazhagam': 'NDA',
    'janata dal (united)': 'NDA',
    'jdu': 'NDA',
    'akali dal': 'NDA',
    
    # UPA (United Progressive Alliance) Parties
    'inc': 'UPA',
    'indian national congress': 'UPA',
    'nationalist congress party': 'UPA',
    'ncp': 'UPA',
    'dmk': 'UPA',  # Dravida Munnetra Kazhagam
    'dravida munnetra kazhagam': 'UPA',
    'rashtriya janata dal': 'UPA',
    'rjd': 'UPA',
    'jharkhand mukti morcha': 'UPA',
    'jmm': 'UPA',
    
    # Third Front/Other Alliances
    'sp': 'Third Front',  # Samajwadi Party
    'samajwadi party': 'Third Front',
    'bsp': 'Third Front',  # Bahujan Samaj Party
    'bahujan samaj party': 'Third Front',
    'aam aadmi party': 'Third Front',
    'aap': 'Third Front',
    'trinamool congress': 'Third Front',
    'tmc': 'Third Front',
    'left front': 'Third Front',
    'cpi': 'Third Front',  # Communist Party of India
    'cpi(m)': 'Third Front',  # Communist Party of India (Marxist)
    
    # Regional Alliances (example for Maharashtra)
    'maha agadi': 'Maha Vikas Aghadi',
    'mva': 'Maha Vikas Aghadi',
    
    # Independent and Others
    'independent': 'Independent',
    'ind': 'Independent'
}

# Alternative: Year-specific mapping (more accurate)
alliance_mapping_2019 = {
    'bjp': 'NDA', 'bharatiya janata party': 'NDA',
    'shiv sena': 'NDA', 'aiadmk': 'NDA', 'jdu': 'NDA',
    'inc': 'UPA', 'indian national congress': 'UPA',
    'dmk': 'UPA', 'aap': 'Third Front', 'tmc': 'Third Front'
}

alliance_mapping_2014 = {
    'bjp': 'NDA', 'bharatiya janata party': 'NDA',
    'shiv sena': 'NDA', 'aiadmk': 'NDA',
    'inc': 'UPA', 'indian national congress': 'UPA',
    'dmk': 'UPA', 'aap': 'Third Front'
}

In [6]:
import pandas as pd
import numpy as np

def create_alliance_column(df, party_column='Party', year_column=None):
    """
    Create Alliance column based on party mapping with data cleaning
    
    Parameters:
    df: DataFrame containing party data
    party_column: Name of the column containing party names
    year_column: Optional column for year-specific mapping
    """
    
    # Clean party names
    df['Party_Clean'] = (
        df[party_column]
        .str.lower()
        .str.strip()
        .str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
        .str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with single space
    )
    
    # Apply mapping
    df['Alliance'] = df['Party_Clean'].map(alliance_mapping)
    
    # Handle year-specific mappings if year column provided
    if year_column and year_column in df.columns:
        for year in df[year_column].unique():
            if year == 2019:
                year_mask = df[year_column] == year
                df.loc[year_mask, 'Alliance'] = (
                    df.loc[year_mask, 'Party_Clean']
                    .map(alliance_mapping_2019)
                )
            elif year == 2014:
                year_mask = df[year_column] == year
                df.loc[year_mask, 'Alliance'] = (
                    df.loc[year_mask, 'Party_Clean']
                    .map(alliance_mapping_2014)
                )
    
    # Fill missing values
    df['Alliance'] = df['Alliance'].fillna('Other')
    
    # Drop temporary clean column
    df.drop('Party_Clean', axis=1, inplace=True)
    
    return df

# Usage example
df = create_alliance_column(df, party_column='Party', year_column='Year')

In [7]:
df.head()

Unnamed: 0,State_Name,Constituency_No,Year,Party,Candidate,Incumbent,N_Cand,Constituency_Type,Votes,Valid_Votes,Vote_Share,Alliance
0,andaman_&_nicobar_islands,1,2019,inc,KULDEEP RAI SHARMA,False,15,GEN,95308.0,207296,45.976768,UPA
1,andaman_&_nicobar_islands,1,2019,bjp,VISHAL JOLLY,False,15,GEN,93901.0,207296,45.298028,NDA
2,andaman_&_nicobar_islands,1,2019,ind,PARITOSH KUMAR HALDAR,False,15,GEN,5341.0,207296,2.576509,Other
3,andaman_&_nicobar_islands,1,2019,aaap,SANJAY MESHACK,False,15,GEN,2839.0,207296,1.369539,Other
4,andaman_&_nicobar_islands,1,2019,bsp,PRAKASH MINJ,False,15,GEN,2486.0,207296,1.199251,Other


In [8]:
from pathlib import Path

# Go up one level from notebooks folder to project root
output_path = Path('../data/processed/featured_data.csv')

# Create parent directories if they don't exist
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the DataFrame
df.to_csv(output_path, index=False)

print(f"File saved to: {output_path}")

File saved to: ..\data\processed\featured_data.csv
