In [2]:
import pandas as pd
import numpy as np

def clean_excel_file(df):
    """Normalise one parsed polling-station sheet and keep only self-consistent rows.

    The sheet is expected to be laid out positionally as:
    serial number, polling-station number, an optional polling-station *name*
    column (detected by ``'name'`` appearing in the third header), one column
    per candidate, and finally five summary columns (total valid votes,
    rejected votes, NOTA, total, tendered votes).

    Processing steps:
      1. Rename columns by position; drop the optional name column.
      2. Strip every character that is not alphanumeric or ``'.'`` from every cell.
      3. Turn the literal string ``'nan'`` back into a real NaN.
      4. Convert every column except ``SN`` / ``Polling_Station`` to numeric
         (unparseable values become NaN).
      5. Keep only rows whose candidate columns sum to ``Total_Valid_Votes``.
      6. Coerce the whole result to numeric (non-numeric ``SN`` /
         ``Polling_Station`` values become NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as read from Excel. It is NOT modified; a copy is used.

    Returns
    -------
    pandas.DataFrame
        Cleaned, fully numeric frame with the renamed columns.

    Raises
    ------
    ValueError
        If the sheet has too few columns to match the expected layout.
    """
    summary_cols = ['Total_Valid_Votes', 'Total_Rejected_Votes', 'NOTA',
                    'Total', 'Total_Votes_Tendered']

    # Work on a copy so the caller's frame keeps its original headers; this
    # makes repeated calls on the same frame (e.g. re-running a cell) safe.
    df = df.copy()

    # Step 1: rename columns based on their expected order and content.
    num_cols = df.shape[1]
    if num_cols < 3:
        raise ValueError(
            f"Expected at least 3 columns to detect the sheet layout, got {num_cols}"
        )

    core_cols = ['SN', 'Polling_Station']
    # str() guards against non-string headers (e.g. integer column labels).
    has_name_col = 'name' in str(df.columns[2]).lower()
    if has_name_col:
        core_cols.append('Polling_Station_Name')
        start_dynamic_cols = 4
    else:
        start_dynamic_cols = 3

    # One generic name per candidate column; numbering follows 1-based position.
    candidate_cols = [f'col_{i}' for i in range(start_dynamic_cols, num_cols - 5 + 1)]
    core_cols += candidate_cols + summary_cols
    if len(core_cols) != num_cols:
        raise ValueError(
            f"Sheet has {num_cols} columns, too few for the expected layout "
            f"(needs at least {len(core_cols)})"
        )
    df.columns = core_cols

    if has_name_col:
        df = df.drop(columns='Polling_Station_Name')

    # Step 2: strip unwanted characters from every cell. Series.map is used
    # per column instead of the deprecated DataFrame.applymap.
    def _strip_symbols(value):
        return ''.join(ch for ch in str(value) if ch.isalnum() or ch == '.')

    df = df.apply(lambda column: column.map(_strip_symbols))

    # Step 3: str(NaN) produced the text 'nan' above; restore real NaN values.
    df = df.replace(to_replace='^nan$', value=np.nan, regex=True)

    # Step 4: convert everything except the two identifier columns to numeric.
    # After errors='coerce' every cell is a number or NaN, so no further
    # "is this numeric?" row filter is needed.
    numeric_cols = df.columns.difference(['SN', 'Polling_Station'])
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Step 5: keep rows whose candidate votes add up to the reported total.
    # Candidate columns are selected by name so the check stays correct even
    # after the optional name column has been dropped (a positional slice based
    # on the pre-drop column count would wrongly include Total_Valid_Votes).
    candidate_sum = df[candidate_cols].sum(axis=1)
    consistent_rows = df[candidate_sum == df['Total_Valid_Votes']]

    # Step 6: make the identifier columns numeric too (text such as a
    # postal-ballot label becomes NaN).
    return consistent_rows.apply(pd.to_numeric, errors='coerce')

In [3]:

import os 


def clean_and_dump_excel_files(excel_dir, output_dir, log_file_name, year, state='MH'):
    """Clean every ``.xlsx`` file in a directory and write one cleaned file per constituency.

    For each workbook the constituency number is taken from the file name
    (the text after the last ``_`` in the part before the first ``.``), the
    sheet is cleaned with ``clean_excel_file``, ``State`` / ``Year`` /
    ``Constituency`` columns are added, and the result is written to
    ``<output_dir>/<constituency>.xlsx``.

    Failures on individual files are appended to ``log_file_name`` and do not
    stop the batch.

    Parameters
    ----------
    excel_dir : str
        Directory containing the parsed ``.xlsx`` files.
    output_dir : str
        Directory for cleaned files; created if missing.
    log_file_name : str
        Path of the text log that receives one line per failed file.
    year : int
        Election year written to the ``Year`` column. Only 2014 and 2019 have
        a known file-naming rule; any other value is logged as an error for
        each file.
    state : str, optional
        Value written to the ``State`` column (default ``'MH'``).

    Raises
    ------
    FileNotFoundError
        If ``excel_dir`` does not exist (raised by ``os.listdir``).
    """
    os.makedirs(output_dir, exist_ok=True)

    # sorted() makes the processing order deterministic, which also fixes
    # which file wins if two inputs map to the same constituency.
    for filename in sorted(os.listdir(excel_dir)):
        if not filename.endswith('.xlsx'):
            continue
        try:
            # Validate the cheap things first so a bad year or file name does
            # not cost a full read + clean of the workbook.
            if year not in (2014, 2019):
                raise ValueError(
                    f"Unsupported year {year!r}: constituency naming rule is "
                    "only known for 2014 and 2019"
                )
            # Strip everything from the first '.' on, then take the last
            # '_'-separated token, e.g. 'x_AC_199.json.xlsx' -> '199'.
            constituency = filename.split('.')[0].split('_')[-1]
            constituency_number = int(constituency)

            df = pd.read_excel(os.path.join(excel_dir, filename))
            cleaned_df = clean_excel_file(df)

            cleaned_df['State'] = state
            cleaned_df['Year'] = year
            cleaned_df['Constituency'] = constituency_number

            output_file_path = os.path.join(output_dir, f"{constituency}.xlsx")
            cleaned_df.to_excel(output_file_path, index=False)
        except Exception as exc:
            # Best-effort batch: record the failure and move on to the next file.
            # Make sure the log's directory exists so logging itself cannot crash.
            log_dir = os.path.dirname(log_file_name)
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)
            with open(log_file_name, 'a') as f:
                f.write(f"Error processing file: {filename} - {exc}\n")
        

        

In [6]:
# Driver cell: choose one input/output/log configuration and run the batch cleaner.
# The commented-out blocks below are earlier run configurations kept for reference.

# excel_dir = 'results/Parsed_Excel/Maharastra/Assembly Election 2019'
# output_dir = 'results/cleaned_election_data/MH/2019'
# log_file_name = "logs/MH_assembly_election_2019_cleaning_log.txt"
# year = 2019
# clean_and_dump_excel_files(excel_dir, output_dir, log_file_name, year) 

# excel_dir = 'results/Parsed_Excel/Maharastra/Assembly Election 2014'
# output_dir = 'results/cleaned_election_data/MH/AE_2014'
# log_file_name = "logs/MH_assembly_election_2014_cleaning_log.txt"
# year = 2014
# clean_and_dump_excel_files(excel_dir, output_dir, log_file_name, year)

# Active configuration.
# NOTE(review): `excel_dir` names the "Assembly Election 2019" folder while
# `output_dir`, `log_file_name` and `year` all refer to a 2014 run - confirm the
# intended input folder. The recorded output of this cell is a FileNotFoundError
# for this `excel_dir` (it is rooted at 'output/' where the configurations above
# use 'results/').
excel_dir = 'output/Parsed_Excel/Maharastra/Assembly Election 2019'
output_dir = 'output/cleaned_election_data/MH/GE_2014'
log_file_name = "logs/MH_loksabha_election_2014_cleaning_log.txt"
year = 2014
clean_and_dump_excel_files(excel_dir, output_dir, log_file_name, year)

FileNotFoundError: [Errno 2] No such file or directory: 'output/Parsed_Excel/Maharastra/Assembly Election 2019'

In [2]:
### Test: run the cleaning function on a single workbook

In [18]:
import pandas as pd
import numpy as np

def clean_excel_file(df):
    """Normalise one parsed polling-station sheet and keep only self-consistent rows.

    The sheet is expected to be laid out positionally as:
    serial number, polling-station number, an optional polling-station *name*
    column (detected by ``'name'`` appearing in the third header), one column
    per candidate, and finally five summary columns (total valid votes,
    rejected votes, NOTA, total, tendered votes).

    Processing steps:
      1. Rename columns by position; drop the optional name column.
      2. Strip every character that is not alphanumeric or ``'.'`` from every cell.
      3. Turn the literal string ``'nan'`` back into a real NaN.
      4. Convert every column except ``SN`` / ``Polling_Station`` to numeric
         (unparseable values become NaN).
      5. Keep only rows whose candidate columns sum to ``Total_Valid_Votes``.
      6. Coerce the whole result to numeric (non-numeric ``SN`` /
         ``Polling_Station`` values become NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as read from Excel. It is NOT modified; a copy is used.

    Returns
    -------
    pandas.DataFrame
        Cleaned, fully numeric frame with the renamed columns.

    Raises
    ------
    ValueError
        If the sheet has too few columns to match the expected layout.
    """
    summary_cols = ['Total_Valid_Votes', 'Total_Rejected_Votes', 'NOTA',
                    'Total', 'Total_Votes_Tendered']

    # Work on a copy so the caller's frame keeps its original headers; this
    # makes repeated calls on the same frame (e.g. re-running a cell) safe.
    df = df.copy()

    # Step 1: rename columns based on their expected order and content.
    num_cols = df.shape[1]
    if num_cols < 3:
        raise ValueError(
            f"Expected at least 3 columns to detect the sheet layout, got {num_cols}"
        )

    core_cols = ['SN', 'Polling_Station']
    # str() guards against non-string headers (e.g. integer column labels).
    has_name_col = 'name' in str(df.columns[2]).lower()
    if has_name_col:
        core_cols.append('Polling_Station_Name')
        start_dynamic_cols = 4
    else:
        start_dynamic_cols = 3

    # One generic name per candidate column; numbering follows 1-based position.
    candidate_cols = [f'col_{i}' for i in range(start_dynamic_cols, num_cols - 5 + 1)]
    core_cols += candidate_cols + summary_cols
    if len(core_cols) != num_cols:
        raise ValueError(
            f"Sheet has {num_cols} columns, too few for the expected layout "
            f"(needs at least {len(core_cols)})"
        )
    df.columns = core_cols

    if has_name_col:
        df = df.drop(columns='Polling_Station_Name')

    # Step 2: strip unwanted characters from every cell. Series.map is used
    # per column instead of the deprecated DataFrame.applymap.
    def _strip_symbols(value):
        return ''.join(ch for ch in str(value) if ch.isalnum() or ch == '.')

    df = df.apply(lambda column: column.map(_strip_symbols))

    # Step 3: str(NaN) produced the text 'nan' above; restore real NaN values.
    df = df.replace(to_replace='^nan$', value=np.nan, regex=True)

    # Step 4: convert everything except the two identifier columns to numeric.
    # After errors='coerce' every cell is a number or NaN, so no further
    # "is this numeric?" row filter is needed.
    numeric_cols = df.columns.difference(['SN', 'Polling_Station'])
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Step 5: keep rows whose candidate votes add up to the reported total.
    # Candidate columns are selected by name so the check stays correct even
    # after the optional name column has been dropped (a positional slice based
    # on the pre-drop column count would wrongly include Total_Valid_Votes).
    candidate_sum = df[candidate_cols].sum(axis=1)
    consistent_rows = df[candidate_sum == df['Total_Valid_Votes']]

    # Step 6: make the identifier columns numeric too (text such as a
    # postal-ballot label becomes NaN).
    final_df = consistent_rows.apply(pd.to_numeric, errors='coerce')

    return final_df

In [19]:
# Path to a single parsed workbook used for the manual test in the following cells.
# NOTE: despite its name, `excel_dir` holds a file path here, not a directory.
excel_dir = 'results/Parsed_Excel/Maharastra/Assembly Election 2014/combined_JSON_Maharastra_2014_AC_199.json.xlsx'
# Earlier batch-run attempt, left disabled:
# df = pd.read_excel(excel_dir)
# output_dir = 'results'
# log_file_name = "logs/test.txt"
# year = 2014
# clean_and_dump_excel_files(excel_dir, output_dir, log_file_name, year)

In [20]:
# Load the single test workbook and display the raw frame for inspection.
df = pd.read_excel(excel_dir)
df

Unnamed: 0,Sr.no.,Polling Station Number,Kiran Shravan Pol (BSP),Takawan e Atmaram Sahebrao (INC),Rameshr ao Kisan Thorat (NCP),Rajaram Maruti Tambe (MNS),Rajendra Shankar Khati (Shivsen a),Kul Rahul Subhash rao (RSP),Kailas Vitthal Kamble (HJP),Vikas (Aba) Takawan e (P&WPI),...,Rajaram Shivaji Kadam (Indepen dent),Shivaji Rambha u Nandkhil e (Indepen dent),Shinde Narendra Balasahe b (Indepen dent),Suresh Somnath Ghatole (Indepen dent),Sanjay Ambadas Kamble (Indepen dent),Total valid Votes,Total invalid Votes,NOTA,Total Votes Recorded in CU,Tended Votes
0,1,1,1,0,144,54,5,668,8,1,...,1,0,0,0,2,897,0.0,5.0,902,
1,2,2,0,1,140,21,4,387,0,16,...,1,0,1,0,0,576,0.0,1.0,577,
2,3,3,4,0,224,79,4,508,3,20,...,3,0,0,1,1,854,0.0,3.0,857,
3,4,4,0,3,319,71,3,464,4,3,...,0,0,0,0,1,874,0.0,19.0,893,
4,5,5,3,3,176,30,10,573,2,51,...,0,0,1,2,1,872,0.0,1.0,873,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,287,283,0,3,441,5,7,283,3,1,...,0,3,0,0,0,755,0.0,4.0,759,
285,288,284,1,7,255,155,3,255,4,20,...,0,10,0,1,2,723,0.0,2.0,725,
286,289,285,10,7,276,334,23,193,2,15,...,0,5,3,3,4,884,0.0,12.0,896,
287,PostalBallot,,5,2,466,34,63,534,3,14,...,0,0,0,0,0,1130,23.0,4.0,27,


In [25]:
# Run the cleaner on the loaded workbook and display the cleaned frame.
cleaned_df = clean_excel_file(df)
cleaned_df

  df = df.applymap(lambda x: ''.join(c for c in str(x) if c.isalnum() or c == '.'))
  df = df[df[columns_to_process].applymap(lambda x: pd.isna(x) or isinstance(x, (int, float))).all(axis=1)]


Unnamed: 0,SN,Polling_Station,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_16,col_17,col_18,col_19,col_20,Total_Valid_Votes,Total_Rejected_Votes,NOTA,Total,Total_Votes_Tendered
0,1,1.0,1.0,0,144,54,5,668,8,1,...,1,0,0,0,2,897,0.0,5.0,902,
1,2,2.0,0.0,1,140,21,4,387,0,16,...,1,0,1,0,0,576,0.0,1.0,577,
2,3,3.0,4.0,0,224,79,4,508,3,20,...,3,0,0,1,1,854,0.0,3.0,857,
3,4,4.0,0.0,3,319,71,3,464,4,3,...,0,0,0,0,1,874,0.0,19.0,893,
4,5,5.0,3.0,3,176,30,10,573,2,51,...,0,0,1,2,1,872,0.0,1.0,873,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,285,281.0,16.0,2,150,91,15,248,30,7,...,2,1,0,0,1,569,0.0,3.0,572,
283,286,282.0,1.0,1,179,67,6,266,2,31,...,0,4,1,1,0,572,0.0,10.0,582,
284,287,283.0,0.0,3,441,5,7,283,3,1,...,0,3,0,0,0,755,0.0,4.0,759,
285,288,284.0,1.0,7,255,155,3,255,4,20,...,0,10,0,1,2,723,0.0,2.0,725,
