In [1]:
import pandas as pd
import os
import glob
import re
import json

## Utility functions

In [2]:
def read_json(filename,data_write_path):
    '''
    Read a JSON file from the designated raw data folder.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside ``data_write_path``.
    data_write_path : str
        Folder that holds the file. A trailing path separator is optional.

    Returns
    -------
    The Python object produced by ``json.load`` for that file.
    '''

    # os.path.join only inserts a separator when one is missing, so callers
    # that already pass a trailing "/" keep getting the same path.
    json_path = os.path.join(data_write_path, filename)

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which would break on non-ASCII text on some systems.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [3]:
def make_parliament_code_digits(codeId):
  '''
  Build the constituency code string from a ``codeId`` string.

  The first three characters are the parliament part and the remainder is
  the sub-code. A sub-code of "00" returns the parliament part on its own;
  any other two-character sub-code is left-padded with "0" and joined to
  the parliament part with "-".

  Examples: "16600" -> "166", "00101" -> "001-001".
  '''
  p_code = codeId[:3]
  n_code = codeId[3:]

  # This check has to run before the padding below: once "00" has been
  # padded to "000" it can no longer match, which made the branch dead.
  if n_code == "00":
    return p_code

  if len(n_code) == 2:
    n_code = "0" + n_code

  return p_code + "-" + n_code

## Make dataframes from attribute data of GeoJSON from Malaysiakini site

In [4]:
# Load both GeoJSON files (parliament first, then state seats) from the
# Malaysiakini raw data folder.
parliament_geo, states_geo = (
    read_json(geojson_name, './data/raw/malaysiakini_newslab/')
    for geojson_name in ('parliament_ge14.json', 'dun_ge14.json')
)

In [5]:
def get_incumbent_and_candidates(item):
  '''
  Split one GeoJSON feature into candidate rows and a constituency row.

  Parameters
  ----------
  item : dict
      A feature whose ``properties`` dict holds the constituency fields
      (including ``codeId``, ``state`` and ``parName``) and, optionally, a
      ``candidates`` list of dicts.

  Returns
  -------
  tuple
      ``(candidates, [constituency_info])``: the candidate dicts, each
      extended with the constituency's ``codeId``, ``state`` and
      ``parName``, and a one-element list holding the properties without
      the ``candidates`` entry.

  The input feature is left untouched, so the function can be called on
  the same GeoJSON more than once.
  '''
  properties = item['properties']

  # Build a new dict instead of popping from the input: popping removed the
  # key from the loaded GeoJSON and made a second run fail with KeyError.
  constituency_info = {key: value for key, value in properties.items() if key != 'candidates'}

  candidates = []
  # A missing or empty candidate list yields no candidate rows.
  for source_candidate in properties.get('candidates') or []:
    candidate = dict(source_candidate)  # copy so the GeoJSON entry is not edited
    candidate['codeId'] = constituency_info['codeId']
    candidate['state'] = constituency_info['state']
    candidate['parName'] = constituency_info['parName']
    candidates.append(candidate)

  return candidates, [constituency_info]

In [6]:
def get_all_incumbent_and_candidates(geojson):
  '''
  Flatten every feature of a GeoJSON dict into two dataframes.

  Each entry of ``geojson['features']`` is passed to
  ``get_incumbent_and_candidates``; the per-feature results are then
  concatenated in feature order.

  Returns ``(candidates_df, constituencies_df)``.
  '''
  per_feature = [get_incumbent_and_candidates(feature) for feature in geojson['features']]

  candidate_rows = [row for feature_candidates, _ in per_feature for row in feature_candidates]
  constituency_rows = [row for _, feature_info in per_feature for row in feature_info]

  return pd.DataFrame(candidate_rows),pd.DataFrame(constituency_rows)

In [7]:
# Flatten the parliament GeoJSON into a candidates frame and a constituencies frame.
candidates_parliament_df, constituencies_parliament_df = get_all_incumbent_and_candidates(parliament_geo)

In [8]:
# Same extraction for the state-seat GeoJSON.
candidates_states_df, constituencies_states_df = get_all_incumbent_and_candidates(states_geo)

In [9]:
# Save the four extracted frames as CSV in the raw Malaysiakini data folder.
for raw_frame, raw_csv_path in (
    (candidates_parliament_df, './data/raw/malaysiakini_newslab/candidates_parliament.csv'),
    (constituencies_parliament_df, './data/raw/malaysiakini_newslab/constituencies_parliament.csv'),
    (candidates_states_df, './data/raw/malaysiakini_newslab/candidates_states.csv'),
    (constituencies_states_df, './data/raw/malaysiakini_newslab/constituencies_states.csv'),
):
    raw_frame.to_csv(raw_csv_path)

In [10]:
# Display the parliament constituencies frame.
constituencies_parliament_df

Unnamed: 0,parCode,codeId,state,ge14WonParty,ge14WonCoalition,ge14Majority,parName,zhParName,voters,incumbentName,zhName,incumbentCoalition,incumbentParty,forLiveSite
0,P166,16600,Sabah,umno,bn,1450,Labuan,纳闽,44484,Rozman Isli,罗兹曼,,warisan,WARISAN
1,P085,08500,Pahang,umno,bn,24859,Pekan,北根,119443,Najib Abdul Razak,纳吉,bn,umno,BN
2,P006,00600,Kedah,bersatu,ph,13009,Kubang Pasu,古邦巴素,108217,Amiruddin Hamzah,阿米鲁丁,gta,pejuang,GTA
3,P047,04700,Penang,pkr,ph,15817,Nibong Tebal,高渊,100062,Mansor Othman,曼梳,pn,bersatu,PN
4,P028,02800,Kelantan,pas,gagasan,1360,Pasir Puteh,巴西富地,113070,Nik Muhammad Zawawi Salleh,聂莫哈末扎瓦威,pn,pas,PN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,P219,21900,Sarawak,pkr,ph,13663,Miri,美里,143229,Michael Teo Yu Keng,张有庆,ph,pkr,HARAPAN
218,P207,20700,Sarawak,pbb,bn,8495,Igan,依干,28290,Ahmad Johnie Zawawi,阿末佐尼,gps,pbb,GPS
219,P195,19500,Sarawak,dap,ph,35973,Bandar Kuching,古晋,109710,Kelvin Yii Lee Wuen,俞利文,ph,dap,HARAPAN
220,P192,19200,Sarawak,dap,ph,3024,Mas Gading,玛士加丁,47171,Mordi Bimol,莫迪,ph,dap,HARAPAN


In [11]:
# Display the state-seat constituencies frame.
constituencies_states_df

Unnamed: 0,parCode,dunCode,codeId,state,ge14WonParty,ge14WonCoalition,ge14Majority,parName,zhParName,dunName,zhDunName,voters,incumbentName,incumbentCoalition,incumbentParty
0,P001,N01,00101,Perlis,mca,bn,142,Padang Besar,巴当勿刹,Titi Tinggi,知知丁宜,13403,Teh Chai Aan,bn,mca
1,P001,N02,00102,Perlis,umno,bn,416,Padang Besar,巴当勿刹,Beseri,柏斯里,12128,Ruzaini Rais,bn,umno
2,P001,N03,00103,Perlis,umno,bn,1367,Padang Besar,巴当勿刹,Chuping,朱宾,14248,Asmaiza Ahmad,bn,umno
3,P001,N04,00104,Perlis,umno,bn,720,Padang Besar,巴当勿刹,Mata Ayer,马打亚逸,9662,Siti Berenee Yahaya,bn,umno
4,P001,N05,00105,Perlis,umno,bn,949,Padang Besar,巴当勿刹,Santan,山丹,10751,Azizan Sulaiman,bn,umno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,P090,N39,09039,Pahang,umno,bn,4618,Bera,百乐,Kemayan,金马扬,27366,Mohd Fadil Osman,bn,umno
113,P091,N40,09140,Pahang,umno,bn,2478,Rompin,云冰,Bukit Ibam,武吉依班,31003,Samsiah Arshad,bn,umno
114,P091,N41,09141,Pahang,umno,bn,4840,Rompin,云冰,Muadzam Shah,姆阿占沙,29663,Razali Kassim,bn,umno
115,P091,N42,09142,Pahang,umno,bn,1280,Rompin,云冰,Tioman,刁曼,28465,Mohd Johari Hussain,bn,umno


In [12]:
# Display the parliament candidates frame.
candidates_parliament_df

Unnamed: 0,coalition,party,logo,name,gender,codeId,state,parName
0,gta,putra,pejuang,Ramle Mat Daly,L,16600,Sabah,Labuan
1,pn,bersatu,pn,Suhaili Abdul Rahman,L,16600,Sabah,Labuan
2,bn,umno,bn,Bashir Alias,L,16600,Sabah,Labuan
3,ph,amanah,ph,Ramli Tahir,L,16600,Sabah,Labuan
4,,warisan,warisan,Rozman Isli,L,16600,Sabah,Labuan
...,...,...,...,...,...,...,...,...
940,gps,pdp,gps,Lidang Disen,L,19200,Sarawak,Mas Gading
941,ph,dap,dap,Mordi Bimol,L,19200,Sarawak,Mas Gading
942,ph,dap,dap,Chong Chieng Jen,L,19600,Sarawak,Stampin
943,gps,supp,gps,Lo Khere Chang,L,19600,Sarawak,Stampin


In [13]:
# Distinct coalition values among parliament candidates (missing values show up as None).
candidates_parliament_df['coalition'].unique()

array(['gta', 'pn', 'bn', 'ph', None, 'grs', 'gps'], dtype=object)

In [14]:
# Number of parliament candidates per gender code, as stored in the source data.
candidates_parliament_df['gender'].value_counts()

L    818
P    127
Name: gender, dtype: int64

In [15]:
# Party codes, ordered from most to fewest candidates (value_counts sorts descending).
candidates_parliament_df['party'].value_counts().index

Index(['umno', 'ind', 'pkr', 'bersatu', 'pejuang', 'pas', 'dap', 'amanah',
       'warisan', 'mca', 'putra', 'gerakan', 'prm', 'direct', 'pbb', 'mic',
       'berjasa', 'kdm', 'supp', 'prs', 'muda', 'iman', 'psb', 'pbk', 'upko',
       'pbm', 'pdp', 'pbs', 'pbds', 'pbrs', 'star', 'pcm', 'mmsp', 'ipf',
       'sedar', 'pprs', 'kimma', 'sapp', 'pur', 'psm'],
      dtype='object')

In [16]:
# Display the parliament constituencies frame again; its incumbent columns feed the prep step below.
constituencies_parliament_df

Unnamed: 0,parCode,codeId,state,ge14WonParty,ge14WonCoalition,ge14Majority,parName,zhParName,voters,incumbentName,zhName,incumbentCoalition,incumbentParty,forLiveSite
0,P166,16600,Sabah,umno,bn,1450,Labuan,纳闽,44484,Rozman Isli,罗兹曼,,warisan,WARISAN
1,P085,08500,Pahang,umno,bn,24859,Pekan,北根,119443,Najib Abdul Razak,纳吉,bn,umno,BN
2,P006,00600,Kedah,bersatu,ph,13009,Kubang Pasu,古邦巴素,108217,Amiruddin Hamzah,阿米鲁丁,gta,pejuang,GTA
3,P047,04700,Penang,pkr,ph,15817,Nibong Tebal,高渊,100062,Mansor Othman,曼梳,pn,bersatu,PN
4,P028,02800,Kelantan,pas,gagasan,1360,Pasir Puteh,巴西富地,113070,Nik Muhammad Zawawi Salleh,聂莫哈末扎瓦威,pn,pas,PN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,P219,21900,Sarawak,pkr,ph,13663,Miri,美里,143229,Michael Teo Yu Keng,张有庆,ph,pkr,HARAPAN
218,P207,20700,Sarawak,pbb,bn,8495,Igan,依干,28290,Ahmad Johnie Zawawi,阿末佐尼,gps,pbb,GPS
219,P195,19500,Sarawak,dap,ph,35973,Bandar Kuching,古晋,109710,Kelvin Yii Lee Wuen,俞利文,ph,dap,HARAPAN
220,P192,19200,Sarawak,dap,ph,3024,Mas Gading,玛士加丁,47171,Mordi Bimol,莫迪,ph,dap,HARAPAN


## Prep data for combining and nesting

In [17]:
def prep_parliament_data_for_ge15(original_candidates_df):
    """
    Reshape the raw parliament candidates frame into the cleaned layout.

    The input frame is not modified. It must provide the columns ``name``,
    ``coalition``, ``party``, ``parName``, ``codeId``, ``gender`` and
    ``state``.

    The returned frame has exactly the columns in ``cols_to_keep``:
    - ``year`` is the string '2022';
    - ``name``, ``coalition``, ``party_code`` (from ``party``) and
      ``constituency`` (from ``parName``) are upper-cased;
    - ``parliament_code_digits`` is derived from ``codeId`` with
      ``make_parliament_code_digits``;
    - ``gender`` is translated from the source codes to 'M'/'F';
    - ``votes``, ``vote_share``, ``winner``, ``total_votes`` and
      ``results_added`` are placeholders set to 0;
    - remaining missing values are replaced with ''.
    """
    cols_to_keep = ['year', 'name', 'coalition', 'party_code', 'votes', 'vote_share',
        'parliament_code_digits', 'constituency', 'state', 'winner',
        'total_votes', 'gender', 'results_added']

    # The source uses Malay abbreviations: L = lelaki (male) and
    # P = perempuan (female). The previous mapping had them inverted.
    gender_by_source_code = {'L': 'M', 'P': 'F'}

    candidates_df = original_candidates_df.copy()
    candidates_df = candidates_df.rename(columns={'party': 'party_code', 'parName': 'constituency'})

    candidates_df['year'] = '2022'
    for text_col in ('name', 'coalition', 'party_code', 'constituency'):
        candidates_df[text_col] = candidates_df[text_col].str.upper()

    candidates_df['parliament_code_digits'] = candidates_df['codeId'].apply(make_parliament_code_digits)

    # Codes outside the mapping become NaN here and '' after the fillna
    # below, rather than being silently counted as one of the genders.
    candidates_df['gender'] = candidates_df['gender'].map(gender_by_source_code)

    # No results yet: numeric result fields start at 0.
    for placeholder_col in ('votes', 'vote_share', 'winner', 'total_votes', 'results_added'):
        candidates_df[placeholder_col] = 0

    # Selecting by list both filters and orders the columns, and still
    # raises KeyError if an expected source column is missing.
    return candidates_df[cols_to_keep].fillna(value='')

In [18]:
# Build the cleaned GE15 candidate rows from the raw parliament candidates frame.
ge15_candidates_clean_df = prep_parliament_data_for_ge15(candidates_parliament_df)

In [19]:
def prep_parliament_data_for_incumbents(original_candidates_df):
    """
    Turn the parliament constituencies frame into one incumbent row each.

    Reads ``incumbentName``, ``incumbentCoalition``, ``incumbentParty``,
    ``parName``, ``codeId`` and ``state`` from the input (which is left
    unmodified) and returns a frame with exactly the columns listed in
    ``cols_to_keep``. ``year`` is '2022 Incumbents', ``winner`` is 1,
    ``gender`` is blank, the vote-related fields are 0, and missing values
    are replaced with ''.
    """
    cols_to_keep = ['year', 'name', 'coalition', 'party_code', 'votes', 'vote_share',
        'parliament_code_digits', 'constituency', 'state', 'winner',
        'total_votes', 'gender', 'results_added']

    source_df = original_candidates_df

    # assign() returns a new frame, so the caller's frame is never touched.
    incumbents_df = source_df.assign(
        year='2022 Incumbents',
        name=source_df['incumbentName'].str.upper(),
        coalition=source_df['incumbentCoalition'].str.upper(),
        party_code=source_df['incumbentParty'].str.upper(),
        votes=0,
        vote_share=0,
        parliament_code_digits=source_df['codeId'].apply(make_parliament_code_digits),
        constituency=source_df['parName'].str.upper(),
        winner=1,
        total_votes=0,
        gender='',
        results_added=0,
    )

    trimmed_df = incumbents_df[cols_to_keep].reindex(columns=cols_to_keep)
    return trimmed_df.fillna(value='')

In [20]:
# Build one incumbent row per parliament constituency from the constituencies frame.
ge15_incumbents_clean_df = prep_parliament_data_for_incumbents(constituencies_parliament_df)

In [21]:

# Create the output folder if needed: to_csv does not create missing
# directories, so a fresh checkout would otherwise fail on this cell.
os.makedirs('./data/cleaned/malaysiakini_newslab/', exist_ok=True)

# Write the cleaned candidate and incumbent frames.
ge15_candidates_clean_df.to_csv('./data/cleaned/malaysiakini_newslab/ge15_candidates_clean.csv')
ge15_incumbents_clean_df.to_csv('./data/cleaned/malaysiakini_newslab/ge15_incumbents_clean.csv')