In [None]:
from selenium import webdriver

from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

from time import sleep
import pandas as pd

import os
import glob

In [None]:
# Location of the chromedriver binary handed to get_new_driver(). Defaults to
# the original macOS path; set the CHROMEDRIVER_PATH environment variable to
# point at a different binary without editing the notebook. An empty value is
# treated as "not set".
DRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH') or '/Applications/chromedriver'
# URL the browser is pointed at when a new driver is started.
SPR_PORTAL_URL = 'https://mysprsemak.spr.gov.my/semakan/calon/'

## Utilities to get browser running

In [None]:
def is_browser_alive(driver):
    """Report whether `driver` can still be queried.

    Args:
        driver: a WebDriver instance, or any placeholder value (e.g. '' or
            None) standing in for "no browser yet".

    Returns:
        True when reading `driver.current_url` succeeds, False when it raises
        any ordinary exception (including AttributeError for placeholders).
    """
    try:
        # Used purely as a liveness probe; the value itself is discarded.
        # (driver.title would serve the same purpose.)
        driver.current_url
        return True
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as "dead".
        return False

In [None]:
def get_new_driver(binary_path,initial_url):
    """Start a Chrome WebDriver session and open `initial_url` in it.

    Args:
        binary_path: filesystem path of the chromedriver executable.
        initial_url: page to load as soon as the browser is up.

    Returns:
        The newly created Chrome driver, already navigated to `initial_url`.
    """
    # Local import: only this function needs the Service class.
    from selenium.webdriver.chrome.service import Service

    try:
        # Selenium 4 style. The `executable_path` keyword on Chrome() is
        # deprecated in Selenium 4 and rejected by newer releases.
        driver = webdriver.Chrome(service=Service(executable_path=binary_path))
    except TypeError:
        # Older Selenium whose Chrome() has no `service` keyword: fall back to
        # the call the notebook originally used.
        driver = webdriver.Chrome(executable_path=binary_path)
    driver.get(initial_url)
    return driver


In [None]:
# Reuse the browser left over from a previous run of this cell when it still
# responds; only start a new one on the first run or after the old session
# died. (Unconditionally resetting `driver` first would make the liveness
# check always fail and open an extra browser on every re-run.)
driver = globals().get('driver')
if not is_browser_alive(driver):
  driver = get_new_driver(DRIVER_PATH,SPR_PORTAL_URL)

## Functions to interact with DOM elements

In [None]:
def get_to_state_combo_listing():
  """Pause three seconds, then click the button of the page's first form.

  Operates on the module-level `driver`; returns nothing.
  """
  submit_xpath = "/html/body/div[1]/main/form[1]/button"
  sleep(3)
  driver.find_element("xpath", submit_xpath).click()

In [None]:
def select_state_from_combobox_and_get_seats(index,parliament_type):
  """Pick a state in the 'negeri' combobox and collect its seat elements.

  Args:
    index: position of the state option to select in the combobox.
    parliament_type: 'parliament' collects elements with class
      "parlimen-content"; any other value first clicks the "dun-tab"
      element and collects elements with class "dun-content".

  Returns:
    (state, seats_list): the text shown in the "select2-negeri-container"
    element right after selecting, and the list of matching seat elements.
  """
  Select(driver.find_element(By.ID,'negeri')).select_by_index(index)
  state = driver.find_element("id","select2-negeri-container").text

  wants_parliament = parliament_type == 'parliament'
  sleep(5)
  if wants_parliament:
    list_class = "parlimen-content"
  else:
    list_class = "dun-content"
    # Switch to the DUN tab and wait again before reading its list.
    driver.find_element(By.ID,"dun-tab").click()
    sleep(5)

  seats_list = driver.find_elements(By.CLASS_NAME,list_class)
  print(f"number of seats in {state}:{len(seats_list)}")
  return state,seats_list

In [None]:
def get_list_of_candidates(seat):
  """Click a seat element and return the text of each candidate entry.

  Args:
    seat: a seat element as returned by
      select_state_from_combobox_and_get_seats.

  Returns:
    A list with the `.text` of every "col-span-4" element found inside the
    element whose id is "nama-calon", read four seconds after the click.
  """
  print("Getting candidates for "+seat.text)
  seat.click()
  sleep(4)

  panel = driver.find_element("id","nama-calon")
  texts = []
  for entry in panel.find_elements(By.CLASS_NAME,"col-span-4"):
    texts.append(entry.text)
  return texts

In [None]:
def build_dict_for_candidate(parliament_type,state,seat_name,seat_id,parliament_code,candidate_info):
  """Build one flat record describing a candidate.

  Args:
    parliament_type, state, seat_name, seat_id, parliament_code: copied
      unchanged into the record under keys of the same name.
    candidate_info: text made of exactly three newline-separated fields;
      the second is stored as 'name' and the third as 'party', both
      stripped of surrounding whitespace. The first field is ignored.

  Returns:
    A dict with keys parliament_type, state, seat_name, seat_id,
    parliament_code, name, party (in that order).

  Raises:
    ValueError: if `candidate_info` does not split into exactly three lines.
  """
  fields = [piece.strip() for piece in candidate_info.split('\n')]
  _, name, party = fields

  return {
    'parliament_type': parliament_type,
    'state': state,
    'seat_name': seat_name,
    'seat_id': seat_id,
    'parliament_code': parliament_code,
    'name': name,
    'party': party,
  }


## Start scraping

In [None]:
# Wait, then click the first form's button on the page the driver opened
# (see get_to_state_combo_listing).
# NOTE(review): presumably this leads to the page holding the state combobox
# used by the scraping functions below — confirm against the live portal.
get_to_state_combo_listing()


In [None]:
def scrape_seats_for_states(start_index,end_index,parliament_type):
    """Scrape the candidates of every seat for a range of states.

    Args:
        start_index: first state-combobox index to scrape (inclusive).
        end_index: state-combobox index to stop at (exclusive).
        parliament_type: forwarded to
            select_state_from_combobox_and_get_seats and stored in every
            candidate record.

    Returns:
        (errors, candidates): `errors` lists the label of each seat whose
        scrape failed; `candidates` lists the dicts produced by
        build_dict_for_candidate for every seat that succeeded. A failing
        seat contributes no candidates at all.
    """
    candidates = []
    errors = []
    for index in range(start_index, end_index):
        print(index)
        sleep(5)
        state, seats_list = select_state_from_combobox_and_get_seats(index, parliament_type)
        print(state)
        for i, seat in enumerate(seats_list):
            sleep(1)

            # Fallback label in case even reading the element's text fails,
            # so the failure can still be recorded rather than aborting.
            seat_label = "seat index " + str(i) + " in " + str(state)
            try:
                # Read the text once; re-reading it inside the error handler
                # could raise again and kill the whole run.
                seat_label = seat.text
                print("Seat index " + str(i) + " " + seat_label)
                parliament_code, seat_name = [text.strip() for text in seat_label.split('\n')]
                seat_id = seat.get_attribute("id")
                new_candidates = [
                    build_dict_for_candidate(
                        parliament_type,
                        state,
                        seat_name,
                        seat_id,
                        parliament_code,
                        candidate,
                    )
                    for candidate in get_list_of_candidates(seat)
                ]
                candidates.extend(new_candidates)
            except Exception as exc:
                # `Exception`, not a bare except: Ctrl-C / kernel interrupt
                # must still be able to stop a long scrape.
                print("Error occured while scraping " + str(seat_label) + ": " + repr(exc))
                errors.append(seat_label)

    return errors, candidates

## Scrape parliament seats

In [None]:
# errors_1_6, candidates_list_1_6 = scrape_seats_for_states(1,6,'parliament')
# errors_6_10, candidates_list_6_10 = scrape_seats_for_states(6,10,'parliament')
# errors_10_15, candidates_list_10_15 = scrape_seats_for_states(10,15,'parliament')
# errors_15_17, candidates_list_15_17 = scrape_seats_for_states(15,17,'parliament')

In [None]:
# candidates_list_df_1_to_6 = pd.DataFrame(candidates_list_1_6)
# candidates_list_df_6_to_10 = pd.DataFrame(candidates_list_6_10)
# candidates_list_df_10_to_15 = pd.DataFrame(candidates_list_10_15)
# candidates_list_df_15_to_17 = pd.DataFrame(candidates_list_15_17)

In [None]:
# candidates_list_df_1_to_6.to_csv('data/raw/spr/ge15/candidates_list_1_to_6.csv')
# candidates_list_df_6_to_10.to_csv('data/raw/spr/ge15/candidates_list_6_to_10.csv')
# candidates_list_df_10_to_15.to_csv('data/raw/spr/ge15/candidates_list_10_to_15.csv')
# candidates_list_df_15_to_17.to_csv('data/raw/spr/ge15/candidates_list_15_to_17.csv')

# Seats recorded in the `errors` lists of the parliament runs above:
# ['P.100\nPANDAN', 'P.103\nPUCHONG']

In [None]:
# errors_15_17

## Scrape state seats

In [None]:
#errors_states_1_6, candidates_list_states_1_6 = scrape_seats_for_states(1,6,'states')
# errors_states_6_17, candidates_list_states_6_17 = scrape_seats_for_states(6,17,'states')

In [None]:
# candidates_list_states_df_1_6 = pd.DataFrame(candidates_list_states_1_6)
# candidates_list_states_df_6_17 = pd.DataFrame(candidates_list_states_6_17)

In [None]:
# candidates_list_states_df_1_6.to_csv('data/raw/spr/ge15/candidates_list_states_1_to_6.csv')
# candidates_list_states_df_6_17.to_csv('data/raw/spr/ge15/candidates_list_states_6_to_17.csv')

# Seats recorded in the `errors` lists of the state-seat runs above:
# none


## Combine all results

In [None]:
# Raw scrape outputs, looked up relative to the current working directory
# (the previous os.path.dirname("__file__") was applied to a string literal
# and always evaluated to ''). Only *.csv files are taken so stray files in
# the folder cannot break pd.read_csv, and the list is sorted because glob
# returns entries in arbitrary order.
csvs_list = sorted(glob.glob(os.path.join('data/raw/spr/ge15', '*.csv')))

In [None]:
def append_all(paths):
    '''
    Read every CSV in `paths` and stack them into one DataFrame.

    Args:
        paths: iterable of CSV file paths. Each file must contain the
            "parliament_type" and "parliament_code" columns.

    Returns:
        A new DataFrame with all rows, without the "Unnamed: 0" column that
        appears when a CSV was saved together with its index, sorted by
        parliament_type then parliament_code, and with a fresh 0..n-1 index.

    Raises:
        ValueError: if `paths` is empty.
    '''
    paths = list(paths)
    if not paths:
        # pd.concat would raise ValueError as well, with a less helpful text.
        raise ValueError("append_all() needs at least one CSV path")

    frames = [pd.read_csv(filename, index_col=None, header=0) for filename in paths]

    return (
        pd.concat(frames, axis=0, ignore_index=True)
        # errors="ignore": files written with index=False have no such column.
        .drop(columns=["Unnamed: 0"], errors="ignore")
        .sort_values(by=["parliament_type", "parliament_code"])
        .reset_index(drop=True)
    )

In [None]:
# Concatenate every file listed in csvs_list into one sorted DataFrame
# (see append_all).
ge15_combined_candidates_df = append_all(csvs_list)

In [None]:
# Show the combined frame as this cell's output.
ge15_combined_candidates_df

In [None]:
# Write the combined table. The output folder is created first because
# DataFrame.to_csv fails when the parent directory does not exist.
os.makedirs('data/cleaned/spr/ge15', exist_ok=True)
ge15_combined_candidates_df.to_csv('data/cleaned/spr/ge15/ge15_combined_candidates.csv')