In [1]:
import pandas as pd
import os
import glob
import re
import json

In [2]:
# Folder (with trailing slash) that holds the raw JSON files; read_json uses it as its default prefix.
data_write_path = "data/raw/undi_dot_info/"

# Keys under which each raw JSON entry stores the data of one election.
year_keys = [str(election_year) + '-GE' for election_year in (2004, 2008, 2011, 2013, 2016, 2018)]

# Two-letter state abbreviations found in the raw data, mapped to the state names used in the output.
states_dict = dict(
    PL='Perlis',
    KD='Kedah',
    PN='Penang',
    PR='Perak',
    KE='Kelantan',
    TR='Terengganu',
    PH='Pahang',
    SL='Selangor',
    WP='Federal Territories',
    NS='Negeri Sembilan',
    MK='Melaka',
    JH='Johor',
    SW='Sarawak',
    SB='Sabah',
)

## Utility functions

In [3]:
def read_json(filename,data_write_path=data_write_path):
    '''
    Read a JSON file from the raw data folder and return its parsed content.

    Parameters
    ----------
    filename : str
        Name of the JSON file, relative to ``data_write_path``.
    data_write_path : str
        Folder that contains the file. Defaults to the module-level
        ``data_write_path`` as it was when this function was defined.

    Returns
    -------
    The deserialised document, i.e. whatever ``json.load`` produces for the file.
    '''

    # os.path.join works whether or not the folder is given with a trailing separator.
    # The encoding is pinned to UTF-8 so parsing does not depend on the platform's default encoding.
    with open(os.path.join(data_write_path, filename), "r", encoding="utf-8") as f:
        return json.load(f)

In [4]:
def make_3_digit(num):
    '''
    Return ``num`` as a string left-padded with zeros to at least 3 characters.

    Values that already need 3 or more characters are returned unpadded
    (e.g. 5 -> '005', 42 -> '042', 123 -> '123', 1000 -> '1000').
    For a negative number the zeros are inserted after the sign
    (e.g. -5 -> '-05') rather than in front of it.
    '''

    # str.zfill is the standard-library form of the manual '00'/'0' prefixing.
    return str(num).zfill(3)

In [5]:
def clean_consituency_code(row):
    '''
    Build the zero-padded constituency code for one raw record.

    ``row['ParliamentCode']`` (e.g. 'P10', 'P201') is stripped of its 'P' and
    padded to 3 digits. When the record also has a 'StateCode' key (e.g. 'N5'),
    its 'N' is stripped, the number is padded to 3 digits and appended after a
    dash.

    Returns 'XXX' for a record without 'StateCode' and 'XXX-XXX' otherwise.
    '''

    parliament_part = make_3_digit(int(row['ParliamentCode'].replace('P', '')))

    # Records without a state seat are identified by the parliament code alone.
    if 'StateCode' not in row:
        return parliament_part

    # State seats are prefixed with their parliament seat: PXXX-NXXX -> 'XXX-XXX'.
    state_part = make_3_digit(int(row['StateCode'].replace('N', '')))
    return parliament_part + '-' + state_part

## Wrangle data from JSON files

In [6]:
def add_totals_for_results(results_df):
    '''
    Add winner flag, total votes and vote share to a candidate-level results frame.

    Rows are grouped by ('year', 'parliament_code_digits'), i.e. one group per
    constituency and election.

    Parameters
    ----------
    results_df : pandas.DataFrame
        One row per candidate. Must contain the columns 'year',
        'parliament_code_digits' and 'Votes'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``results_df`` with a fresh 0..n-1 index, the original row
        order, and three extra columns appended:

        * 'winner'      - 1 for the row with the most votes in its group
                          (the first such row on a tie), 0 otherwise
        * 'total_votes' - sum of 'Votes' over the group
        * 'vote_share'  - 100 * Votes / total_votes, or 0 when total_votes is 0
    '''

    group_keys = ["year", "parliament_code_digits"]

    # Copy with a clean RangeIndex so the labels returned by idxmax are unique row positions.
    results = results_df.reset_index(drop=True)
    votes_by_seat = results.groupby(group_keys)["Votes"]

    # Compute both aggregates before any column is added to the frame.
    winner_rows = votes_by_seat.idxmax()
    seat_totals = votes_by_seat.transform('sum')

    # Flag the winning ROW itself. Matching winners back by party label would also flag
    # every other candidate in the seat that carries the same label (e.g. several independents).
    results['winner'] = 0
    results.loc[winner_rows, 'winner'] = 1

    results['total_votes'] = seat_totals

    # Seats with zero total votes get a share of 0 instead of NaN/inf from the division.
    results['vote_share'] = (100 * results['Votes'] / seat_totals).where(seat_totals != 0, 0.0)

    return results

In [7]:
def swap_years(year):
    '''
    Replace the year strings '2011' and '2016' with '2008' and '2013'.

    Any other value is returned unchanged.
    '''
    # NOTE(review): presumably this files off-cycle elections under the
    # preceding general-election year - confirm against the data source.
    replacements = {'2011': '2008', '2016': '2013'}
    return replacements.get(year, year)

In [8]:
def json_data_to_df(json_filename):
    '''
    Read a results JSON file and return dataframes for results and constituency info.

    Every entry of the file is scanned for each key in ``year_keys``. For each
    key present, the record under that key supplies 'year', 'State',
    'SeatName', a list of 'candidates' and an 'info' dict.

    Parameters
    ----------
    json_filename : str
        File name passed to ``read_json``.

    Returns
    -------
    (results_df, constituency_info_df)
        * results_df - one row per candidate, de-duplicated, sorted by year,
          constituency code and descending votes, with the columns added by
          ``add_totals_for_results``. 'Votes', 'Name' and 'Party' are renamed
          to 'votes', 'name' and 'party_code'.
        * constituency_info_df - one row per constituency and election,
          de-duplicated and sorted by year and constituency code.
    '''

    raw_data = read_json(json_filename)

    results_list = []
    constituency_list = []

    for entry in raw_data:
        for year_key in year_keys:
            if year_key not in entry:
                continue

            election = entry[year_key]

            year = swap_years(election['year'])
            state = states_dict[election['State']]
            constituency = election['SeatName']
            clean_c_code = clean_consituency_code(election)

            # Parse results: tag each candidate record (in place) with its constituency identifiers
            results_for_constituency = election['candidates']
            for result in results_for_constituency:
                result['year'] = year
                result['state'] = state
                result['constituency'] = constituency
                result['parliament_code_digits'] = clean_c_code

            # extend() grows the list in place; rebuilding it with `+` on every
            # iteration copies all previous records each time (quadratic).
            results_list.extend(results_for_constituency)

            # Parse constituency info: identifiers first, then the raw 'info' fields
            # (which win if a key is present in both).
            constituency_list.append({
                'year': year,
                'state': state,
                'constituency': constituency,
                'parliament_code_digits': clean_c_code,
                **election['info'],
            })

    results_df = (
        pd.DataFrame(results_list)
        .sort_values(by=['year', 'parliament_code_digits', 'Votes'],
                     ascending=[True, True, False])
        .drop_duplicates()
    )
    results_df = add_totals_for_results(results_df)
    results_df['majority'] = results_df['majority'].fillna(value=0)
    results_df['NameBallot'] = results_df['NameBallot'].fillna(value="")
    results_df = results_df.rename(columns={"Votes": "votes", "Name": "name", "Party": "party_code"})
    # Dropping the 'Unopposed' column from original data because it is not consistent and is almost all blanks
    if 'Unopposed' in results_df.columns:
        results_df = results_df.drop(columns=['Unopposed'])
    results_df = results_df.reset_index(drop=True)

    constituency_info_df = (
        pd.DataFrame(constituency_list)
        .drop_duplicates()
        .sort_values(by=['year', 'parliament_code_digits'])
        .reset_index(drop=True)
    )

    return results_df, constituency_info_df

In [9]:
# Load the parliamentary seats: json_data_to_df returns (candidate results, constituency info).
results_parliament_df, constituency_info_parliament_df = json_data_to_df('results_parliament.json')

In [10]:
# Load the state seats: json_data_to_df returns (candidate results, constituency info).
results_states_df, constituency_info_states_df = json_data_to_df('results_states.json')

## Write results as CSVs

In [11]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
results_parliament_df.to_csv('data/cleaned/undi_dot_info/results_parliament.csv')

In [12]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
results_states_df.to_csv('data/cleaned/undi_dot_info/results_states.csv')

In [13]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
constituency_info_parliament_df.to_csv('data/cleaned/undi_dot_info/constituency_info_parliament.csv')

In [14]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
constituency_info_states_df.to_csv('data/cleaned/undi_dot_info/constituency_info_states.csv')

## Code Book

### Prepare Code Book for results data

In [15]:
results_parliament_df.head(5)

Unnamed: 0,party_code,name,votes,majority,year,state,constituency,parliament_code_digits,NameBallot,winner,total_votes,vote_share
0,UMNO,DATUK SERI AZMI KHALID,18322,9264.0,2004,Perlis,PADANG BESAR,1,,1,27380,66.917458
1,PAS,WAN KHARIZAL WAN KHAZIM,9058,0.0,2004,Perlis,PADANG BESAR,1,,0,27380,33.082542
2,UMNO,DATUK RADZI SHEIKH AHMAD,22498,12548.0,2004,Perlis,KANGAR,2,,1,32448,69.335552
3,PAS,ISHAR SAAD,9950,0.0,2004,Perlis,KANGAR,2,,0,32448,30.664448
4,UMNO,DATUK SERI SYED RAZLAN SYED PUTRA JAMALULLAIL,17367,3243.0,2004,Perlis,ARAU,3,,1,31491,55.14909


In [16]:
results_states_df.head(5)

Unnamed: 0,party_code,name,votes,majority,year,state,constituency,parliament_code_digits,NameBallot,winner,total_votes,vote_share
0,MCA,LOH YOON FOO,3668,2192.0,2004,Perlis,TITI TINGGI,001-001,,1,5144,71.306376
1,PKR,KO CHU LIANG,1476,0.0,2004,Perlis,TITI TINGGI,001-001,,0,5144,28.693624
2,UMNO,DATUK ZAHIDI ZAINUL ABIDIN,3492,1871.0,2004,Perlis,BESERI,001-002,,1,5113,68.296499
3,PAS,MOHD ANUAR MOHD TAHIR,1621,0.0,2004,Perlis,BESERI,001-002,,0,5113,31.703501
4,UMNO,MANSOR JUSOH,4632,2590.0,2004,Perlis,CHUPING,001-003,,1,6674,69.403656


In [17]:
results_states_df.columns

Index(['party_code', 'name', 'votes', 'majority', 'year', 'state',
       'constituency', 'parliament_code_digits', 'NameBallot', 'winner',
       'total_votes', 'vote_share'],
      dtype='object')

In [18]:
len(results_states_df.columns)

12

In [19]:
# Code book for the results CSVs: one record per column, built from
# (variable name, description, type) rows.
results_codebook_dict = [
    {
        "Variable Name": variable_name,
        "Variable Description": variable_description,
        "Variable Type": variable_type,
    }
    for variable_name, variable_description, variable_type in [
        ("party_code",
         "Short name of the coalition that the candidate's party belongs to",
         "string"),
        ("name",
         "Name of candidate",
         "string"),
        ("votes",
         "Number of votes awarded to the candidate",
         "number"),
        ("majority",
         "The majority of votes by which the winning candidate won",
         "number"),
        ("year",
         "Election year",
         "number"),
        ("state",
         "Name of the state that the constituency belongs to",
         "string"),
        ("constituency",
         "Name of the constituency",
         "string"),
        ("parliament_code_digits",
         "For parliamentary constituencies: 3 digit code of constituency "
         "(e.g. P125 for Putrajaya is coded 125). "
         "For state constituencies: 3 digit parliament code followed by 3 digit state constituency code "
         "(e.g. N33 for Air Itam (P51) in Penang is coded 051-033)",
         "string (3 numerical digits for parliament, 3-3 for state)"),
        ("NameBallot",
         "Name of the candidate as it appears on the ballot. "
         "This data is only available for the 2018 elections data",
         "string"),
        ("winner",
         "Boolean indicating whether a candidate won in the constituency. "
         "Coded 1 if they are a winner, 0 if they are not.",
         "number (1 or 0)"),
        ("total_votes",
         "Sum of the votes that were awarded to all the candidates in that constituency.",
         "number"),
        ("vote_share",
         "Percentage of votes awarded to the candidate in this constituency",
         "number (percentage)"),
    ]
]


In [20]:
# One row per documented variable; the columns come from the keys of the code book records.
results_codebook_df = pd.DataFrame(results_codebook_dict)

In [21]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
results_codebook_df.to_csv('data/cleaned/undi_dot_info/results_CODEBOOK.csv')

### Prepare Code Book for constituency information data

In [22]:
constituency_info_parliament_df.head(5)

Unnamed: 0,year,state,constituency,parliament_code_digits,EligibleVoters,MalayVoters,ChineseVoters,IndianVoters,OtherVoters,BumiMuslimVoters,BumiNonmuslimVoters,SabahBumiVoters,Early,Postal,Spoilt,Turnout,Unreturned
0,2004,Perlis,PADANG BESAR,1,33899,84,11,1,3,0,0,0,0,0,672,28453,400
1,2004,Perlis,KANGAR,2,40516,79,18,2,1,0,0,0,0,0,548,32996,99
2,2004,Perlis,ARAU,3,38067,87,9,2,2,0,0,0,0,0,533,32062,38
3,2004,Kedah,LANGKAWI,4,27980,90,7,3,0,0,0,0,0,0,560,22844,36
4,2004,Kedah,JERLUN,5,44148,90,9,0,1,0,0,0,0,0,718,36822,0


In [23]:
constituency_info_states_df.head(5)

Unnamed: 0,year,state,constituency,parliament_code_digits,EligibleVoters,MalayVoters,ChineseVoters,IndianVoters,OtherVoters,BumiMuslimVoters,BumiNonmuslimVoters,SabahBumiVoters,Early,Postal,Spoilt,Turnout,Unreturned
0,2004,Perlis,TITI TINGGI,001-001,6887,71,25,3,0,0,0,0,0,0,253,5593,196
1,2004,Perlis,BESERI,001-002,6645,75,23,1,1,0,0,0,0,0,109,5222,0
2,2004,Perlis,CHUPING,001-003,7802,85,4,2,10,0,0,0,0,0,122,6797,1
3,2004,Perlis,MATA AYER,001-004,5557,90,4,2,4,0,0,0,0,0,56,4563,28
4,2004,Perlis,SANTAN,001-005,7008,99,1,0,0,0,0,0,0,0,73,6063,0


In [24]:
constituency_info_states_df.columns

Index(['year', 'state', 'constituency', 'parliament_code_digits',
       'EligibleVoters', 'MalayVoters', 'ChineseVoters', 'IndianVoters',
       'OtherVoters', 'BumiMuslimVoters', 'BumiNonmuslimVoters',
       'SabahBumiVoters', 'Early', 'Postal', 'Spoilt', 'Turnout',
       'Unreturned'],
      dtype='object')

In [25]:
len(constituency_info_states_df.columns)

17

In [26]:
# Code book for the constituency info CSVs: one record per column of the
# constituency info dataframes, in column order, built from
# (variable name, description, type) rows.
constituency_info_codebook_dict = [
    {
        "Variable Name": variable_name,
        "Variable Description": variable_description,
        "Variable Type": variable_type,
    }
    for variable_name, variable_description, variable_type in [
        ("year",
         "Election year",
         "number"),
        ("state",
         "Name of the state that the constituency belongs to",
         "string"),
        ("constituency",
         "Name of the constituency",
         "string"),
        ("parliament_code_digits",
         "For parliamentary constituencies: 3 digit code of constituency "
         "(e.g. P125 for Putrajaya is coded 125). "
         "For state constituencies: 3 digit parliament code followed by 3 digit state constituency code "
         "(e.g. N33 for Air Itam (P51) in Penang is coded 051-033)",
         "string (3 numerical digits for parliament, 3-3 for state)"),
        ("EligibleVoters",
         "Number of eligible voters in the constituency",
         "number"),
        ("MalayVoters",
         "Percentage of Malay voters in the constituency",
         "number (percentage)"),
        ("ChineseVoters",
         "Percentage of Chinese voters in the constituency",
         "number (percentage)"),
        ("IndianVoters",
         "Percentage of Indian voters in the constituency",
         "number (percentage)"),
        ("OtherVoters",
         "Percentage of Other voters in the constituency",
         "number (percentage)"),
        ("BumiMuslimVoters",
         "Percentage of Bumi Muslim voters in the constituency",
         "number (percentage)"),
        ("BumiNonmuslimVoters",
         "Percentage of Bumi Non-Muslim voters in the constituency",
         "number (percentage)"),
        ("SabahBumiVoters",
         "Percentage of Sabah Bumi voters in the constituency",
         "number (percentage)"),
        ("Early",
         "Number of early votes in the constituency",
         "number"),
        ("Postal",
         "Number of postal votes in the constituency",
         "number"),
        # The column is named 'Spoilt'; the code book previously listed it as 'Sploit'.
        ("Spoilt",
         "Number of spoilt votes in the constituency",
         "number"),
        ("Turnout",
         "Voter turnout in number of votes",
         "number"),
        # 'Unreturned' is a column of the dataframe that was missing from the code book.
        # NOTE(review): description inferred from the column name - confirm against the data source.
        ("Unreturned",
         "Number of unreturned ballots in the constituency",
         "number"),
    ]
]

In [27]:
# One row per documented variable; the columns come from the keys of the code book records.
constituency_info_codebook_df = pd.DataFrame(constituency_info_codebook_dict)

In [28]:
constituency_info_codebook_df

Unnamed: 0,Variable Name,Variable Description,Variable Type
0,year,Election year,number
1,state,Name of the state that the constituency belong...,string
2,constituency,Name of the constituency,string
3,parliament_code_digits,For parliamentary constituencies: 3 digit code...,"string (3 numerical digits for parliament, 3-3..."
4,EligibleVoters,Number of eligible voters in the constituency,number
5,MalayVoters,Percentage of Malay voters in the constituency,number (percentage)
6,ChineseVoters,Percentage of Chinese voters in the constituency,number (percentage)
7,IndianVoters,Percentage of Indian voters in the constituency,number (percentage)
8,OtherVoters,Percentage of Other voters in the constituency,number (percentage)
9,BumiMuslimVoters,Percentage of Bumi Muslim voters in the consti...,number (percentage)


In [29]:
# Written with to_csv defaults, so the dataframe index is saved as an unnamed first column.
constituency_info_codebook_df.to_csv('data/cleaned/undi_dot_info/constituency_info_CODEBOOK.csv')