In [1]:
import pandas as pd
import os
import glob
import re
import json

In [2]:
## Folder containing the raw JSON files (relative to the working directory)
data_write_path = "data/raw/undi_dot_info/"

## Keys under which each general election's record is stored in a JSON entry
year_keys = [
    '2004-GE',
    '2008-GE',
    '2013-GE',
    '2018-GE',
]

## Two-letter state abbreviation -> state name used in the cleaned output
states_dict = dict(
    PL='Perlis',
    KD='Kedah',
    PN='Penang',
    PR='Perak',
    KE='Kelantan',
    TR='Terengganu',
    PH='Pahang',
    SL='Selangor',
    WP='FederalTerritories',
    NS='NegriSembilan',
    MK='Melaka',
    JH='Johor',
    SW='Sarawak',
    SB='Sabah',
)

## Utility functions

In [3]:
def read_json(filename,data_write_path=data_write_path):
    '''
    Read a JSON file from the designated raw data folder.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside ``data_write_path``.
    data_write_path : str
        Folder holding the raw files. Defaults to the module-level
        ``data_write_path`` as it was when this function was defined.

    Returns
    -------
    The deserialized content of the file, as produced by ``json.load``.
    '''

    # os.path.join works whether or not the folder ends with a separator
    json_path = os.path.join(data_write_path, filename)

    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [4]:
def make_3_digit(num):
    '''
    Turn an int into a string of at least 3 characters, left-padded with 0s.

    Values that already need 3 or more characters are returned unpadded
    (e.g. 1000 -> '1000'). A leading minus sign is kept in front of the
    padding (e.g. -5 -> '-05').
    '''

    # str.zfill pads after any sign prefix, so negatives stay well-formed
    return str(num).zfill(3)

In [5]:
def clean_consituency_code(row):
    '''
    Build the zero-padded constituency code for a seat record.

    The parliament code (e.g. P10, P201) is stripped of its 'P' and padded
    to 3 digits, giving XXX. When the record also carries a 'StateCode'
    (e.g. N5), that code is stripped of its 'N', padded the same way and
    appended, giving XXX-XXX.
    '''

    def _padded_number(raw_code, letter):
        ## Drop the letter, parse what is left as an int, then zero-pad it
        return make_3_digit(int(raw_code.replace(letter, '')))

    parliament_part = _padded_number(row['ParliamentCode'], 'P')

    ## Parliament seats have no state code: the parliament part is the whole code
    if 'StateCode' not in row:
        return parliament_part

    ## State seats are identified by parliament code plus state code
    state_part = _padded_number(row['StateCode'], 'N')
    return parliament_part + '-' + state_part

## Wrangle data from JSON files

In [6]:
def add_totals_for_results(results_df):
    '''
    Add winner flag, total votes and vote share for each constituency.

    Rows are grouped by ("year", "parliament_code_digits"). The input frame
    is not modified; a new frame with a fresh 0..n-1 index is returned.

    Parameters
    ----------
    results_df : pandas.DataFrame
        One row per candidate, with at least the columns "year",
        "parliament_code_digits", "Party" and "Votes".

    Returns
    -------
    pandas.DataFrame
        The input rows in their original order plus three columns:
        "winner"      - 1 for the single row with the most votes in its
                        group (first such row on a tie), otherwise 0
        "total_votes" - sum of "Votes" over the row's group
        "vote_share"  - 100 * Votes / total_votes, or 0 when total_votes is 0
    '''

    group_keys = ["year", "parliament_code_digits"]

    # A fresh unique index guarantees each idxmax label points at exactly one row
    results = results_df.reset_index(drop=True)
    votes_per_seat = results.groupby(group_keys)["Votes"]

    # Flag the winning ROW rather than every row sharing the winner's party:
    # comparing party labels marks all same-party candidates in a seat
    # (e.g. several independents) as winners.
    winning_rows = votes_per_seat.idxmax()
    seat_totals = votes_per_seat.transform("sum")

    results["winner"] = results.index.isin(winning_rows.to_numpy()).astype(int)
    results["total_votes"] = seat_totals

    # Division by a zero total yields NaN/inf here; those rows are set to 0
    share = 100 * results["Votes"] / results["total_votes"]
    results["vote_share"] = share.where(results["total_votes"] != 0, 0)

    return results

In [7]:
def json_data_to_df(json_filename):
    '''
    Read a results JSON file and return dataframes for results and constituency info.

    Parameters
    ----------
    json_filename : str
        File name handed to ``read_json``.

    Returns
    -------
    (results_df, constituency_info_df)
        results_df: one row per distinct candidate record, tagged with year,
        state, constituency and cleaned code, extended by
        ``add_totals_for_results``; missing "majority" is filled with 0 and
        missing "NameBallot" with "", and Votes/Name/Party are renamed to
        votes/name/party_code.
        constituency_info_df: one row per distinct constituency info record,
        sorted by year and cleaned code.
    '''

    raw_data = read_json(json_filename)

    results_list = []
    constituency_list = []

    for entry in raw_data:
        for year_key in year_keys:
            if year_key not in entry:
                continue

            seat = entry[year_key]

            # Fields shared by every candidate row and by the constituency info row
            seat_fields = {
                'year': seat['year'],
                'state': states_dict[seat['State']],
                'constituency': seat['SeatName'],
                'parliament_code_digits': clean_consituency_code(seat),
            }

            # Parse results: tag each candidate record with its seat
            candidates = seat['candidates']
            for candidate in candidates:
                candidate.update(seat_fields)

            # extend() appends in place; `list + list` re-copied the whole
            # accumulated list for every seat
            results_list.extend(candidates)

            # Parse constituency info (keys from 'info' override the shared ones)
            constituency_list.append({**seat_fields, **seat['info']})

    results_df = (
        pd.DataFrame(results_list)
        .sort_values(by=['year', 'parliament_code_digits', 'Votes'],
                     ascending=[True, True, False])
        .drop_duplicates()
    )
    results_df = add_totals_for_results(results_df)
    results_df[['majority']] = results_df[['majority']].fillna(value=0)
    results_df[['NameBallot']] = results_df[['NameBallot']].fillna(value="")
    results_df = (
        results_df
        .rename(columns={"Votes": "votes", "Name": "name", "Party": "party_code"})
        .reset_index(drop=True)
    )

    constituency_info_df = (
        pd.DataFrame(constituency_list)
        .drop_duplicates()
        .sort_values(by=['year', 'parliament_code_digits'])
        .reset_index(drop=True)
    )

    return results_df, constituency_info_df

In [8]:
# Parse the parliament results file into a per-candidate results frame and a constituency info frame
results_parliament_df, constituency_info_parliament_df = json_data_to_df('results_parliament.json')

In [9]:
# Parse the state results file into a per-candidate results frame and a constituency info frame
results_states_df, constituency_info_states_df = json_data_to_df('results_states.json')

## Write results as CSVs

In [10]:
# Write the parliament results frame to CSV (to_csv also writes the DataFrame index by default)
results_parliament_df.to_csv('data/cleaned/undi_dot_info/results_parliament.csv')

In [11]:
# Write the state results frame to CSV (to_csv also writes the DataFrame index by default)
results_states_df.to_csv('data/cleaned/undi_dot_info/results_states.csv')

In [12]:
# Write the parliament constituency info frame to CSV (to_csv also writes the DataFrame index by default)
constituency_info_parliament_df.to_csv('data/cleaned/undi_dot_info/constituency_info_parliament.csv')

In [13]:
# Write the state constituency info frame to CSV (to_csv also writes the DataFrame index by default)
constituency_info_states_df.to_csv('data/cleaned/undi_dot_info/constituency_info_states.csv')