### Calonkeadilan API Scraping
**To-Do List**

- [x] Click dropdown to get option values
- [x] Scrape area names from dropdown 
- [x] Call the API using the area names (in lowercase)

In [1]:
## import libraries
import json
import re
from pathlib import Path

import pandas as pd
import requests
from requests_html import HTMLSession

In [75]:
## Click dropdown to get ul list of area names
## If you don't want to install and use Playwright, skip this cell and instead uncomment and run the small_letters list in the next cell
from playwright.async_api import async_playwright

small_letters = []
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False, slow_mo=50)
    page = await browser.new_page()
    await page.goto("https://calonkeadilan.org/")
    print(await page.title())
    # await page.get_by_role("button").click()
    await page.get_by_text("Negeri").click()
    print("Done Click!")
    await page.wait_for_timeout(3000)
    
    ul_selector = "ul.MuiList-root"
    test = await page.locator(ul_selector).inner_html()
    data = re.findall(r'data-value=\"([\w\s]+)\"',test)
    small_letters = [x.lower() for x in data[1:]]
    # print(small_letters)
    await page.wait_for_timeout(1000)
    await browser.close()
print(small_letters)

Calon Keadilan | Ini Calon Kita | Kita Boleh
Done Click!
['johor', 'kedah', 'kelantan', 'melaka', 'negeri sembilan', 'pahang', 'pulau pinang', 'perak', 'perlis', 'sabah', 'sarawak', 'selangor', 'terengganu', 'wilayah persekutuan']


In [None]:
## If you don't want to install & run Playwright, you can just use this block.
## The only purpose of using Playwright is to obtain this small_letters list, which is then used in the API calls.

## Uncomment the line below if you need to run this cell
# small_letters = ['johor', 'kedah', 'kelantan', 'melaka', 'negeri sembilan', 'pahang', 'pulau pinang', 'perak', 'perlis', 'sabah', 'sarawak', 'selangor', 'terengganu', 'wilayah persekutuan']

In [77]:
## Download the candidate list of every state and stack the results into raw_df.
## Expects `small_letters` (lower-case state names) from one of the cells above.
api_url = "https://itcwu78qt3.execute-api.ap-southeast-1.amazonaws.com/dev/api/calon/view-negeri"
headers = {
    'content-type': 'application/json'
}
## requests applies no timeout by default, so a stalled server would block this cell forever.
request_timeout = 30  # seconds

appended_data = []
failed_states = []  # states skipped below; reported once the loop is done
for state in small_letters:
    print("Doing {}".format(state))
    payload = json.dumps({
        "negeri": state
    })

    try:
        response = requests.post(api_url, headers=headers, data=payload, timeout=request_timeout)
    except requests.RequestException as error:
        # Stay best-effort: a network failure for one state must not abort the other states.
        print("Connection Error! ({})".format(error))
        failed_states.append(state)
        continue

    if response.status_code == 200:
        # The candidate records are read from the 'data' key of the JSON body.
        single_data = response.json()['data']
        single_df = pd.json_normalize(single_data)
        print(single_df.shape)
        appended_data.append(single_df)
    else:
        print("Connection Error! (HTTP {})".format(response.status_code))
        failed_states.append(state)

if failed_states:
    print("Skipped states: {}".format(failed_states))
if not appended_data:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError("No candidate data was downloaded; check small_letters and the API responses above.")

## ignore_index gives raw_df one unique 0..n-1 index instead of repeating each state's 0..k labels.
raw_df = pd.concat(appended_data, ignore_index=True)
print(raw_df.shape)
print(raw_df.columns)

Doing johor
(11, 8)
Doing kedah
(9, 8)
Doing kelantan
(6, 8)
Doing melaka
(2, 8)
Doing negeri sembilan
(3, 8)
Doing pahang
(23, 10)
Doing pulau pinang
(4, 8)
Doing perak
(31, 10)
Doing perlis
(11, 10)
Doing sabah
(11, 10)
Doing sarawak
(16, 8)
Doing selangor
(11, 8)
Doing terengganu
(4, 8)
Doing wilayah persekutuan
(6, 8)
(148, 10)
Index(['_id', 'b_pemohon', 'c_pendidikan', 'd_kerjaya', 'e_politik',
       'kod_parlimen', 'negeri', 'parlimen', 'dun', 'kod_dun'],
      dtype='object')


### English Definitions

- pemohon - Applicant
- pendidikan - Education
- kerjaya - Career
- politik - Politics

In [82]:
## Split raw_df into one frame per topic (applicant, education, career, politics).
## Every frame keeps the shared _id column and gets a fresh 0..n-1 index.
## NOTE(review): raw_df also has 'dun' and 'kod_dun' columns that are not carried into applicant_df -- confirm this is intended.
applicant_columns = ['_id', 'b_pemohon', 'kod_parlimen', 'negeri', 'parlimen']
applicant_df = raw_df.loc[:, applicant_columns].reset_index(drop=True)
education_df = raw_df.loc[:, ['_id', 'c_pendidikan']].reset_index(drop=True)
career_df = raw_df.loc[:, ['_id', 'd_kerjaya']].reset_index(drop=True)
politics_df = raw_df.loc[:, ['_id', 'e_politik']].reset_index(drop=True)


def _expand_nested(frame, column):
    """Turn every value of `column` into its own DataFrame and stack them.

    Each cell value is passed to pd.DataFrame (presumably a list of record
    dicts -- confirm against the API payload). The stacked result is indexed
    by the label of the row in `frame` that each record came from, so it can
    be joined back onto `frame` by index.
    """
    pieces = [pd.DataFrame(records) for records in frame[column]]
    stacked = pd.concat(pieces, keys=frame.index)
    # Drop the inner per-record counter and keep only the source-row label.
    return stacked.reset_index(level=1, drop=True)


## Expand each nested column into separate value columns
pemohon_df = _expand_nested(applicant_df, 'b_pemohon')
pendidikan_df = _expand_nested(education_df, 'c_pendidikan')
kerjaya_df = _expand_nested(career_df, 'd_kerjaya')
politik_df = _expand_nested(politics_df, 'e_politik')


In [83]:
## Sanity check before joining: shapes of both frames, then their indexes
print(
    applicant_df.shape,
    pemohon_df.shape,
    pemohon_df.index,
    applicant_df.index,
    sep="\n",
)

(148, 5)
(148, 2)
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            138, 139, 140, 141, 142, 143, 144, 145, 146, 147],
           dtype='int64', length=148)
RangeIndex(start=0, stop=148, step=1)


In [84]:
## Replace each nested column with its expanded columns, matching rows by index
def _swap_in_expanded(frame, nested_column, expanded):
    """Drop `nested_column` from `frame`, join `expanded` on the index, and renumber the rows."""
    without_nested = frame.drop(columns=nested_column)
    joined = without_nested.join(expanded)
    return joined.reset_index(drop=True)


applicant_df = _swap_in_expanded(applicant_df, 'b_pemohon', pemohon_df)
print(applicant_df.shape)

education_df = _swap_in_expanded(education_df, 'c_pendidikan', pendidikan_df)
print(education_df.shape)

career_df = _swap_in_expanded(career_df, 'd_kerjaya', kerjaya_df)
print(career_df.shape)

politics_df = _swap_in_expanded(politics_df, 'e_politik', politik_df)
print(politics_df.shape)

(148, 6)
(374, 5)
(484, 5)
(618, 5)


In [96]:
## Export as CSV to stay aligned with the other scraped datasets.
## to_csv does not create missing folders, so make sure the target directory exists first.
output_dir = Path('./data/raw/calonkeadilan')
output_dir.mkdir(parents=True, exist_ok=True)

applicant_df.to_csv(output_dir / 'applicant.csv', encoding='utf-8')
education_df.to_csv(output_dir / 'education.csv', encoding='utf-8')
career_df.to_csv(output_dir / 'career.csv', encoding='utf-8')
politics_df.to_csv(output_dir / 'politics.csv', encoding='utf-8')

## Test with Excel
# applicant_df.to_excel('./data/raw/calonkeadilan/applicant.xlsx')

## Export JSON >> raw/calonkeadilan folder
# applicant_df.to_json('./data/raw/calonkeadilan/applicant.json', orient='index')
# applicant_df.to_json('./data/raw/calonkeadilan/applicant_compared.json', orient='records')
# result = applicant_df.to_json(orient='index')
