In [1]:
## Import required libraries
import re
import pandas as pd
import requests
import urllib
import json
import json5
from time import sleep
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [2]:
spr_results_js_file_url = "https://dashboard.spr.gov.my/js/penamaan.js"
spr_party_json_path = "./data/raw/spr/ge15_results/spr_parties.json"

## Utility functions

In [3]:
def read_json(filepath):
    '''
    Read JSON file from designated raw data folder
    '''
    
    users = ''
    with open(filepath, "r") as f:
        users = json.load(f)
        
    return users

In [4]:
def get_spr_data(url):
    '''
    Get data from SPR dashboard
    Params: endpoint
    Returns: dict parsed results data
    '''
    
    raw_data = ''
    raw_dict = {}

    print("Requesting... "+ url)
    try:
        r = requests.get(url, timeout=5)
        print("Status: "+ str(r.status_code))
        raw_data = r.text
    except:
        print("Something went wrong")
    
    finally:
        raw_data = r.text.replace('var dataPenamaan = ','')[:-1]
        raw_dict = json5.loads(raw_data)
    
    return raw_dict

In [5]:
def match_coal_or_party(row):
  coal_or_party = ''

  # For the first pass, take the coalition name if available
  if row['coalition'] == '':
    coal_or_party = row['party_code']
  else:
    coal_or_party = row['coalition']

  # Code independents as BEBAS
  if coal_or_party == 'IND':
    coal_or_party = 'BEBAS'

  # SPR data reports candidates from these parties as their parties, not their coalitions, so use party to match if it's one of these
  if row['party_code'] in ["PEJUANG","PAS","PUTRA","WARISAN","PRM","MUDA","PBM","PUR","PSM","PSB","KDM","SEDAR","PBDS","PPRS","PBRS","DAP","PBK"]:
    coal_or_party = row['party_code']
    

  return coal_or_party

In [6]:
def make_parliament_code_digits(codeId):
  p_code = codeId[:3]
  n_code = codeId[3:]

  if len(n_code) == 2:
    n_code = "0" + n_code

  if n_code == "000":
    return p_code
  else:
    return p_code + "-" + n_code

## Scrape SPR Data

In [7]:
raw_dict = get_spr_data(spr_results_js_file_url)

Requesting... https://dashboard.spr.gov.my/js/penamaan.js
Status: 200


In [8]:
raw_results_df = pd.DataFrame(raw_dict)

In [9]:
raw_results_df

Unnamed: 0,id,t,jp,pid,s,kid,kt,i,st,mi,nc,ju,mj,ut
0,20,MOHAMMAD RAFFI BERAN,,25,L,15900,parlimen,,HD,2022-11-05T10:25:28.890Z,4,1003,31558,1298.0
1,68,NOOR AZLEEN AMBROS,,1,L,15900,parlimen,,KLH,2022-11-05T10:25:28.890Z,1,37369,31558,1298.0
2,170,HASSAN ABDUL KARIM,,31,L,15900,parlimen,,MNG,2022-11-05T10:25:28.890Z,2,71233,31558,1298.0
3,384,MOHAMAD FARID BIN ABDUL RAZAK,,27,L,15900,parlimen,,KLH,2022-11-05T10:25:28.890Z,3,39675,31558,1298.0
4,332,ROSLI,TIADA,25,L,04000,parlimen,,HD,2022-11-05T10:29:18.173Z,1,506,27179,1288.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,1151,MEOR SHAHIMUDIN BIN HJ HASIM,,20,L,06633,dun,005,HD,2022-11-05T17:19:09.293Z,3,259,11372,373.0
1371,1034,BRYAN YONG WU SEAN,,42,L,06633,dun,,HD,2022-11-05T17:19:09.293Z,2,710,11372,373.0
1372,1063,STEVEN TIW,,31,L,06633,dun,,MNG,2022-11-05T17:19:09.293Z,1,15602,11372,373.0
1373,784,ROSLAN BIN ISMAIL,,25,L,06633,dun,,HD,2022-11-05T17:19:09.293Z,5,478,11372,373.0


In [10]:
parties = read_json(spr_party_json_path)

In [11]:
parties_df = pd.DataFrame(parties)

In [12]:
combined_df = pd.merge(raw_results_df, parties_df, how="left", left_on="pid", right_on="id")

In [13]:
combined_df

Unnamed: 0,id_x,t,jp,pid,s,kid,kt,i_x,st,mi,nc,ju,mj,ut,id_y,n,a,i_y,c
0,20,MOHAMMAD RAFFI BERAN,,25,L,15900,parlimen,,HD,2022-11-05T10:25:28.890Z,4,1003,31558,1298.0,25,PARTI PEJUANG TANAHAIR,PEJUANG,pejuang.png,#09618A
1,68,NOOR AZLEEN AMBROS,,1,L,15900,parlimen,,KLH,2022-11-05T10:25:28.890Z,1,37369,31558,1298.0,1,BARISAN NASIONAL OF MALAYSIA,BN,bn.png,#031A93
2,170,HASSAN ABDUL KARIM,,31,L,15900,parlimen,,MNG,2022-11-05T10:25:28.890Z,2,71233,31558,1298.0,31,PAKATAN HARAPAN,PH,ph.png,#D7292F
3,384,MOHAMAD FARID BIN ABDUL RAZAK,,27,L,15900,parlimen,,KLH,2022-11-05T10:25:28.890Z,3,39675,31558,1298.0,27,PERIKATAN NASIONAL,PN,pn.png,#043253
4,332,ROSLI,TIADA,25,L,04000,parlimen,,HD,2022-11-05T10:29:18.173Z,1,506,27179,1288.0,25,PARTI PEJUANG TANAHAIR,PEJUANG,pejuang.png,#09618A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,1151,MEOR SHAHIMUDIN BIN HJ HASIM,,20,L,06633,dun,005,HD,2022-11-05T17:19:09.293Z,3,259,11372,373.0,20,BEBAS,BEBAS,bebas.png,#993300
1371,1034,BRYAN YONG WU SEAN,,42,L,06633,dun,,HD,2022-11-05T17:19:09.293Z,2,710,11372,373.0,42,PARTI BANGSA MALAYSIA,PBM,pbm.png,#323467
1372,1063,STEVEN TIW,,31,L,06633,dun,,MNG,2022-11-05T17:19:09.293Z,1,15602,11372,373.0,31,PAKATAN HARAPAN,PH,ph.png,#D7292F
1373,784,ROSLAN BIN ISMAIL,,25,L,06633,dun,,HD,2022-11-05T17:19:09.293Z,5,478,11372,373.0,25,PARTI PEJUANG TANAHAIR,PEJUANG,pejuang.png,#09618A


In [14]:
combined_df['a'].unique()

array(['PEJUANG', 'BN', 'PH', 'PN', 'PAS', 'BEBAS', 'PUTRA', 'GPS',
       'WARISAN', 'PRM', 'MUDA', 'PBM', 'PUR', 'PSM', 'PSB', 'KDM', 'GRS',
       'SEDAR', 'PBDS', 'PPRS', 'PBRS', 'DAP', 'PBK'], dtype=object)

In [15]:
combined_df.to_csv("./data/raw/spr/ge15_results/results_with_party_data.csv")

## Prepare the Kini Data for merging

In [16]:
ge15_candidates_clean_df = pd.read_csv('./data/cleaned/malaysiakini_newslab/ge15_candidates_clean.csv', converters = {'parliament_code_digits': str})
ge15_candidates_clean_df.fillna('',inplace=True)
ge15_candidates_clean_df.drop(columns=['Unnamed: 0'], inplace=True)

In [17]:
ge15_candidates_clean_df['party_code'].unique()

array(['PUTRA', 'BERSATU', 'UMNO', 'AMANAH', 'WARISAN', 'PBM', 'PKR',
       'IND', 'PEJUANG', 'PAS', 'MMSP', 'MCA', 'PRM', 'MIC', 'BERJASA',
       'DIRECT', 'DAP', 'GERAKAN', 'MUDA', 'PSM', 'IMAN', 'PCM', 'IPF',
       'PUR', 'KIMMA', 'UPKO', 'KDM', 'PPRS', 'STAR', 'PBS', 'SAPP',
       'PBRS', 'SUPP', 'PSB', 'PBB', 'PRS', 'PBDS', 'PBK', 'SEDAR', 'PDP'],
      dtype=object)

In [18]:
ge15_candidates_clean_df['coalition'].unique()

array(['GTA', 'PN', 'BN', 'PH', '', 'GRS', 'GPS'], dtype=object)

In [19]:
#ge15_candidates_clean_df

In [20]:
#ge15_candidates_clean_df['coal_or_party'] = ge15_candidates_clean_df.apply(lambda x: match_coal_or_party(x),axis = 1) 

In [21]:
#ge15_candidates_clean_df.to_csv('./data/raw/malaysiakini_newslab/candidates_parliament_prep_merge_with_ge15_spr.csv')

In [22]:
def custom_fixes(kini_df):

  kini_df.loc[kini_df['name'] == 'TOO CHENG HUAT','name'] = 'TOO GAO LAN'
  kini_df.loc[kini_df['name'] == 'AZLAN SANI ZAWAWI','name'] = 'AZLAN SANI ZAWAWI (LANDO BROTHERHOOD)'
  kini_df.loc[kini_df['name'] == 'BASIR AB RAHMAN','name'] = 'LT KOL (B) BASIR'
  kini_df.loc[kini_df['name'] == 'TOMSON ANGO','name'] = 'IANA ANAK AKAM'
  kini_df.loc[kini_df['name'] == 'IANA ANAK AKAM','gender'] = 'F'
  kini_df.loc[kini_df['coalition'] == 'IANA ANAK AKAM','coalition'] = ''
  kini_df.loc[kini_df['party_code'] == 'IANA ANAK AKAM','party_code'] = 'PSB'

  return kini_df

In [23]:
def add_party_lookup_for_kini_data(kini_df_original,spr_df_original):
  spr_df = spr_df_original.copy()
  kini_df = kini_df_original.copy()
  kini_df['fuzzy'] = 0

  kini_df = custom_fixes(kini_df)

  spr_df['parliament_code_digits'] = spr_df.apply(lambda x: make_parliament_code_digits(x['kid']), axis=1)

  for index, row in kini_df.iterrows():

    spr_id = ''
    spr_df_pcode_subset = spr_df[spr_df['parliament_code_digits'] == row['parliament_code_digits']]

    #display(spr_df_pcode_subset)

    #print(index)


    ## Check for independent
    if row['party_code'] == 'IND':
      indies = spr_df_pcode_subset[spr_df_pcode_subset['a'] == 'BEBAS'].copy()
      if (len(indies) >= 1):
        indies['fuzzy_val'] = indies.apply(lambda x: fuzz.partial_ratio(x['t'],row['name']),axis=1)
        indies.sort_values(by='fuzzy_val', ascending=False, inplace=True)
        spr_id = indies['id_x'].iloc[0]
        kini_df.loc[index,'fuzzy'] = 1
      if (len(indies) == 1):
        spr_id = indies['id_x'].iloc[0]
     

        
    else:
      for i,r in spr_df_pcode_subset.iterrows():
        
        ## Check if coalition is same
        if r['a'] == row['coalition']:
          spr_id = r['id_x']
          break

        ## Check if party is same
        if r['a'] == row['party_code']:
          spr_id = r['id_x']
          break

    if spr_id == '' and len(spr_df_pcode_subset) > 0:
      #print("fuzzy matching..." + str(index))
      spr_df_pcode_subset['fuzzy_val'] = spr_df_pcode_subset.apply(lambda x: fuzz.partial_ratio(x['t'],row['name']),axis=1)
      spr_df_pcode_subset.sort_values(by='fuzzy_val', ascending=False, inplace=True)
      spr_id = spr_df_pcode_subset['id_x'].iloc[0]
      kini_df.loc[index,'fuzzy'] = 1

    kini_df.loc[index,'spr_id'] = spr_id

  kini_df = pd.merge(kini_df,spr_df[['id_x','t','kid','a','ju','mj']], how="left", left_on="spr_id", right_on="id_x")
  kini_df['votes'] = kini_df['ju'].fillna(0).astype('int')

  return kini_df
        


In [24]:
kini_matched_df = add_party_lookup_for_kini_data(ge15_candidates_clean_df,combined_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spr_df_pcode_subset['fuzzy_val'] = spr_df_pcode_subset.apply(lambda x: fuzz.partial_ratio(x['t'],row['name']),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spr_df_pcode_subset.sort_values(by='fuzzy_val', ascending=False, inplace=True)


In [25]:
kini_matched_df

Unnamed: 0,year,name,coalition,party_code,votes,vote_share,parliament_code_digits,constituency,state,winner,...,gender,results_added,fuzzy,spr_id,id_x,t,kid,a,ju,mj
0,2022,RAMLE MAT DALY,GTA,PUTRA,90,0,166,LABUAN,Sabah,0,...,F,0,1,606,606.0,RAMLE,16600,PEJUANG,90.0,708.0
1,2022,SUHAILI ABDUL RAHMAN,PN,BERSATU,8124,0,166,LABUAN,Sabah,0,...,F,0,0,203,203.0,DATO' DR. SUHAILI ABDUL RAHMAN,16600,PN,8124.0,708.0
2,2022,BASHIR ALIAS,BN,UMNO,7416,0,166,LABUAN,Sabah,0,...,F,0,0,347,347.0,BASHIR BIN ALIAS,16600,BN,7416.0,708.0
3,2022,RAMLI TAHIR,PH,AMANAH,5307,0,166,LABUAN,Sabah,0,...,F,0,0,508,508.0,DATUK WIRA DR RAMLI,16600,PH,5307.0,708.0
4,2022,ROZMAN ISLI,,WARISAN,7310,0,166,LABUAN,Sabah,0,...,F,0,0,432,432.0,DATUK ROZMAN ISLI,16600,WARISAN,7310.0,708.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,2022,LIDANG DISEN,GPS,PDP,11794,0,192,MAS GADING,Sarawak,0,...,F,0,0,1089,1089.0,LIDANG DISEN,19200,GPS,11794.0,5480.0
941,2022,MORDI BIMOL,PH,DAP,17274,0,192,MAS GADING,Sarawak,0,...,F,0,0,1259,1259.0,MORDI ANAK BIMOL,19200,DAP,17274.0,5480.0
942,2022,CHONG CHIENG JEN,PH,DAP,39310,0,196,STAMPIN,Sarawak,0,...,F,0,0,953,953.0,CHONG CHIENG JEN,19600,DAP,39310.0,7158.0
943,2022,LO KHERE CHANG,GPS,SUPP,32152,0,196,STAMPIN,Sarawak,0,...,F,0,0,883,883.0,LO KHERE CHIANG,19600,GPS,32152.0,7158.0


In [26]:
kini_matched_df.to_csv('./data/raw/malaysiakini_newslab/candidates_parliament_spr_ids_matched.csv')

## Calculate vote shres, total votes and margins

In [27]:
def calculate_totals_winners_shares(clean_df_original):

    clean_df = clean_df_original.copy()

    ## Get winner and total votes for each constituency
        
    total_votes_list = clean_df.groupby(["parliament_code_digits"])["votes"].sum().reset_index()
    total_votes_list.rename(columns={'votes':'total_votes'},inplace=True)

    winner_indices = clean_df.groupby('parliament_code_digits')['votes'].agg(lambda x: x.idxmax())
    winner_list = clean_df.loc[winner_indices,['parliament_code_digits','votes','name']]
    # winner_list.rename(columns={'name':'winner'},inplace=True)

    winner_list['winner'] = winner_list.apply(lambda row: 0 if row['votes'] == 0 else 1,axis=1)

    display(winner_list)
    display(total_votes_list)


    winner_list.drop(columns=['votes'],inplace=True)

    clean_df.drop(columns=['winner'],inplace=True)
    clean_df.drop(columns=['total_votes'],inplace=True)

    clean_df_merged_with_winner = pd.merge(clean_df, winner_list, how="left", on=["parliament_code_digits","name"])
    clean_df_merged_with_winner['winner'].fillna(0,inplace=True)
    display(clean_df_merged_with_winner)
    # clean_df_merged_with_winner['winner'] = clean_df_merged_with_winner \
    #     .apply(lambda row : 1 if row['name'] == row['winner'] else 0, axis = 1)

    clean_df_merged_with_total = pd.merge(clean_df_merged_with_winner, total_votes_list, 
                                        how="left", on=["parliament_code_digits"])

    clean_df_merged_with_total['vote_share'] = clean_df_merged_with_total.apply(lambda x: x['votes'] if x['total_votes'] == 0 else 100*x['votes']/x['total_votes'],axis=1)    
    clean_df_merged_with_total['results_added'] = clean_df_merged_with_total.apply(lambda x: 0 if x['total_votes'] == 0 else 1,axis=1)    
    clean_df_merged_with_total.sort_values(by=['parliament_code_digits','vote_share'], ascending=[True,False], inplace=True)

    cols_to_keep = ['year', 'name', 'coalition', 'party_code', 'votes', 'vote_share',
       'parliament_code_digits', 'constituency', 'state', 'total_votes',
       'gender', 'results_added', 'spr_id','winner']
    clean_df_merged_with_total = clean_df_merged_with_total[cols_to_keep]

    return clean_df_merged_with_total

In [28]:
cleaned_df_with_totals = calculate_totals_winners_shares(kini_matched_df)

Unnamed: 0,parliament_code_digits,votes,name,winner
641,001,24267,RUSYDAN RUSMI,1
289,002,24562,ZAKRI HASSAN,1
613,003,31458,SHAHIDAN KASSIM,1
241,004,25463,SUHAIMI ABDULLAH,1
521,005,31685,ABD GHANI AHMAD,1
...,...,...,...,...
910,218,22150,LUKANISMAN AWANG SAUNI,1
933,219,39549,CHIEW CHOON MAN,1
907,220,0,ANYI NGAU,0
879,221,14897,HASBI HABIBOLLAH,1


Unnamed: 0,parliament_code_digits,total_votes
0,001,45288
1,002,56200
2,003,46789
3,004,47480
4,005,52207
...,...,...
217,218,33916
218,219,78148
219,220,0
220,221,19796


Unnamed: 0,year,name,coalition,party_code,votes,vote_share,parliament_code_digits,constituency,state,gender,results_added,fuzzy,spr_id,id_x,t,kid,a,ju,mj,winner
0,2022,RAMLE MAT DALY,GTA,PUTRA,90,0,166,LABUAN,Sabah,F,0,1,606,606.0,RAMLE,16600,PEJUANG,90.0,708.0,0.0
1,2022,SUHAILI ABDUL RAHMAN,PN,BERSATU,8124,0,166,LABUAN,Sabah,F,0,0,203,203.0,DATO' DR. SUHAILI ABDUL RAHMAN,16600,PN,8124.0,708.0,1.0
2,2022,BASHIR ALIAS,BN,UMNO,7416,0,166,LABUAN,Sabah,F,0,0,347,347.0,BASHIR BIN ALIAS,16600,BN,7416.0,708.0,0.0
3,2022,RAMLI TAHIR,PH,AMANAH,5307,0,166,LABUAN,Sabah,F,0,0,508,508.0,DATUK WIRA DR RAMLI,16600,PH,5307.0,708.0,0.0
4,2022,ROZMAN ISLI,,WARISAN,7310,0,166,LABUAN,Sabah,F,0,0,432,432.0,DATUK ROZMAN ISLI,16600,WARISAN,7310.0,708.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,2022,LIDANG DISEN,GPS,PDP,11794,0,192,MAS GADING,Sarawak,F,0,0,1089,1089.0,LIDANG DISEN,19200,GPS,11794.0,5480.0,0.0
941,2022,MORDI BIMOL,PH,DAP,17274,0,192,MAS GADING,Sarawak,F,0,0,1259,1259.0,MORDI ANAK BIMOL,19200,DAP,17274.0,5480.0,1.0
942,2022,CHONG CHIENG JEN,PH,DAP,39310,0,196,STAMPIN,Sarawak,F,0,0,953,953.0,CHONG CHIENG JEN,19600,DAP,39310.0,7158.0,1.0
943,2022,LO KHERE CHANG,GPS,SUPP,32152,0,196,STAMPIN,Sarawak,F,0,0,883,883.0,LO KHERE CHIANG,19600,GPS,32152.0,7158.0,0.0


In [29]:
cleaned_df_with_totals.to_csv('./data/cleaned/combined/combined_results_parliament_ge15.csv')