In [1]:
import pandas as pd
import os
import glob
import re

In [2]:
## Declare constants

# Election year; interpolated into the file names of the cleaned CSVs written at the end of the notebook.
election_year = 2018

In [3]:
## Get all file paths of scraped raw CSVs

# os.path.dirname("__file__") is applied to a string literal, so it evaluates to ''
# and the pattern is simply 'data/*' relative to the current working directory.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# row order of the concatenated frames (and the written CSVs) reproducible.
folders_list = sorted(glob.glob(os.path.join(os.path.dirname("__file__"),'data/*')))

In [4]:
## Collect all the similar types of CSV files into separate lists

def _paths_containing(tag, paths):
    """Return the entries of ``paths`` whose path string contains ``tag``."""
    return [candidate for candidate in paths if tag in candidate]

state_result_paths = _paths_containing("state_result", folders_list)
parliament_result_paths = _paths_containing("parliament_result", folders_list)
state_fact_paths = _paths_containing("state_fact", folders_list)
parliament_fact_paths = _paths_containing("parliament_fact", folders_list)

In [5]:
## Append CSVs into a dataframe

def append_all(paths):
    """Read every CSV in ``paths`` and stack them into a single DataFrame.

    Each file is read with its first line as the header and no index column.
    The frames are concatenated row-wise in the order given, and the result
    gets a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in paths]
    return pd.concat(frames, axis=0, ignore_index=True)

In [6]:
## Make int into 3 digit string with 0s as prefix

def make_3_digit(num):
    """Return ``num`` as a string left-padded with zeros to three characters.

    Values below 10 get two leading zeros, values below 100 get one, and
    anything else is returned as plain ``str(num)``.
    """
    # Checked in ascending order so the first threshold that applies wins.
    for upper_bound, zeros in ((10, '00'), (100, '0')):
        if num < upper_bound:
            return zeros + str(num)
    return str(num)

In [7]:
## Clean votes data

def clean_votes(raw_df):
    """Parse scraped candidate rows into a tidy per-candidate votes table.

    Reads the columns ``name``, ``number_of_voters``, ``panel_code``,
    ``panel_name`` and ``state_region`` from ``raw_df``.

    Returns a new DataFrame with one row per input row and the columns
    ``name``, ``coalition``, ``party_code``, ``votes``, ``vote_share``,
    ``parliament_code_digits``, ``constituency``, ``state``, ``winner`` and
    ``total_votes``:

    * ``coalition`` is '' when the name carries only a party, e.g. "(PAS)".
    * ``votes`` is an integer; ``vote_share`` is kept as the matched text
      (e.g. '43.1'), not converted to a number.
    * ``parliament_code_digits`` is the zero-padded seat number ("010"); for
      codes containing 'N' it is "PPP-NNN", with the parliament number taken
      from the "(P<n>)" suffix of ``panel_name``.
    * ``winner`` is 1 for the single row with the most votes within each
      ``parliament_code_digits`` group (first row on ties), else 0.
    * ``total_votes`` is the sum of ``votes`` within that group.

    Raises ``ValueError`` when a name, vote string or state panel name does not
    match the expected pattern.
    """

    ## Patterns are compiled once here rather than on every row.

    ## Name column
    ## e.g. Mordi Bimol (PH - DAP)
    ## e.g. Hamdan Sani (PAS)
    name_regex = re.compile(
        r"(?P<candidate>[a-zA-Z @]+\S)\s*(?:\(\s*(?P<coalition>[a-zA-Z' ]+\S)\s*-\s*(?P<party>[a-zA-Z' ]+)\s*\)|\(\s*(?P<party_only>[a-zA-Z' ]+\S)\s*\))"
    )

    ## Votes column
    ## e.g. 43.1% (12,771)
    vote_regex = re.compile(r"(?P<vote_share>[\d.]+)%\s\((?P<votes>[\d,]+)\)")

    ## Parliament seat code embedded in a state seat's panel name, e.g. "(P12)"
    p_code_regex = re.compile(r"\(P(?P<p_code>[0-9]+)\)")

    output_columns = ['name','coalition','party_code','votes','vote_share',
                      'parliament_code_digits','constituency','state']

    ## Collect plain dicts and build the frame once: DataFrame.append was removed
    ## in pandas 2.0 and re-copied the whole frame on every row.
    records = []

    for index, row in raw_df.iterrows():

        ## Clean name column
        name_info = name_regex.search(row['name'])
        if name_info is None:
            raise ValueError(f"Row {index!r}: cannot parse candidate name {row['name']!r}")
        name_groups = name_info.groupdict()

        name = name_groups['candidate']
        coalition = name_groups['coalition'] if name_groups['coalition'] is not None else ''
        party_code = name_groups['party'] if name_groups['party'] is not None else name_groups['party_only']

        ## Clean votes column
        vote_info = vote_regex.search(row['number_of_voters'])
        if vote_info is None:
            raise ValueError(f"Row {index!r}: cannot parse votes {row['number_of_voters']!r}")
        vote_groups = vote_info.groupdict()

        vote_share = vote_groups['vote_share']
        votes = int(vote_groups['votes'].replace(',', ''))

        ## Clean parliament/state code
        ## e.g. P10, P201
        parliament_code_int = int(row['panel_code'].replace('P','').replace('N',''))
        parliament_code_digits = make_3_digit(parliament_code_int)

        ## For state constituencies, need to append Parliament seat code to State seat code PXXX-NXXX
        if 'N' in row['panel_code']:
            p_code_info = p_code_regex.search(row['panel_name'])
            if p_code_info is None:
                raise ValueError(
                    f"Row {index!r}: no parliament code like '(P12)' in panel name {row['panel_name']!r}"
                )
            parliament_p_code_digits = make_3_digit(int(p_code_info.group('p_code')))
            parliament_code_digits = parliament_p_code_digits + '-' + parliament_code_digits

        records.append({'name':name, 'coalition':coalition, 'party_code':party_code,
                        'votes':votes, 'vote_share':vote_share,
                        'parliament_code_digits':parliament_code_digits,
                        ## Constituency name and state are passed through unchanged
                        'constituency':row['panel_name'], 'state':row['state_region']})

    ## Nothing to aggregate: return an empty frame with the full output schema.
    if not records:
        return pd.DataFrame(columns=output_columns + ['winner', 'total_votes'])

    clean_df = pd.DataFrame(records, columns=output_columns)

    ## Get winner and total votes for each constituency
    votes_by_seat = clean_df.groupby('parliament_code_digits')['votes']

    ## idxmax yields the index label of the top row in each seat. Flagging by row
    ## (instead of comparing party codes) keeps a second candidate who shares the
    ## winner's party code, e.g. another independent, from being marked as winner.
    winner_rows = votes_by_seat.idxmax()
    seat_totals = votes_by_seat.transform('sum')

    clean_df['winner'] = clean_df.index.isin(winner_rows).astype('int64')
    clean_df['total_votes'] = seat_totals

    return clean_df

In [8]:
# Load the raw state-seat result CSVs, then clean them into the per-candidate votes table.
state_results_raw = append_all(state_result_paths)
state_votes = clean_votes(state_results_raw)

In [9]:
# Load the raw parliament-seat result CSVs, then clean them into the per-candidate votes table.
parliament_results_raw = append_all(parliament_result_paths)
parliament_votes = clean_votes(parliament_results_raw)

In [10]:
# to_csv does not create missing directories, so make sure the output folder exists first.
states_csv_path = 'data/cleaned/votes_'+str(election_year)+'_states.csv'
os.makedirs(os.path.dirname(states_csv_path), exist_ok=True)
state_votes.to_csv(states_csv_path)

In [11]:
# to_csv does not create missing directories, so make sure the output folder exists first.
parliament_csv_path = 'data/cleaned/votes_'+str(election_year)+'_parliament.csv'
os.makedirs(os.path.dirname(parliament_csv_path), exist_ok=True)
parliament_votes.to_csv(parliament_csv_path)