In [20]:
import os
import random
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [21]:
# Directory layout: the project root sits two levels above the current
# working directory, and the scraped files live in data/dti-data beneath it.

WORKINGDIR = Path.cwd()
PROJECTROOT = WORKINGDIR.parents[1]
DATADIR = Path("data") / "dti-data"

In [22]:
# Output folder for the per-city CSV files; created (with parents) when missing.
new_dir = PROJECTROOT / DATADIR
new_dir.mkdir(exist_ok=True, parents=True)

# Scraping necessary tags via bs4

In [23]:
# Data-portal page; its static HTML is parsed below for the LGU dropdown
# options and the indicator checkboxes.
URL = "https://cmci.dti.gov.ph/data-portal.php"
# A timeout keeps the notebook from hanging forever on an unresponsive server,
# and raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()

## Scraping names of cities in dropdown

In [24]:
# Parse the downloaded portal page.
# NOTE(review): "html" names a parser feature rather than one specific parser,
# so the parser bs4 picks presumably depends on what is installed — consider
# pinning one explicitly.
soup = BeautifulSoup(markup=r.content, features="html")

In [25]:
# Collect the LGU names offered by the <select id="lgu"> dropdown.
lgu_dropdown = soup.find("select", {"id": "lgu"})
if lgu_dropdown is None:
    # Fail here with a clear message instead of a NameError on `cities`
    # several cells later.
    raise RuntimeError('Could not find <select id="lgu"> in the portal page')

cities = [
    option.string
    for option in lgu_dropdown.find_all("option")
    # Skip options without a plain text label as well as the placeholder entry.
    if option.string and option.string != "Choose LGU(s)"
]

In [26]:
# href targets of the pillar links that scrape_city clicks, one per pillar code.
hrefs = [f"#collapse_{code}" for code in ("ed", "ge", "in", "re", "iv")]

# Web scraping via Selenium

In [27]:
def scrape_city(city):
    """Scrape the data-portal result table for a single city/municipality.

    Opens a fresh Chrome session, selects ``city`` in the LGU dropdown,
    clicks every pillar link listed in the module-level ``hrefs``, ticks the
    checkboxes discovered in the module-level ``soup``, submits the form and
    reads the table from the second browser window/tab.

    Parameters
    ----------
    city : str
        LGU name typed into the dropdown's search field.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``html_to_df`` with an added
        ``City_Municipality`` column holding ``city``.

    Raises
    ------
    Exception
        Selenium and parsing errors propagate to the caller. The browser is
        always shut down first, so a failed city no longer leaves a Chrome
        process behind.
    """
    # Work out which checkbox labels to click before launching the browser.
    # Labels without a "for" attribute are skipped, as are those whose target
    # id contains "chk" or "checkbox".
    checkbox_targets = []
    for box in soup.find_all("div", {"class": "checkbox-nice"}):
        for label in box.find_all("label"):
            for_attr = label.get("for")
            if for_attr and "chk" not in for_attr and "checkbox" not in for_attr:
                checkbox_targets.append(for_attr)

    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s)
    try:
        driver.maximize_window()
        driver.get("https://cmci.dti.gov.ph/data-portal.php")

        time.sleep(1)

        # Click dropdown
        lgu_select = driver.find_element(By.ID, "select2-lgu-container")
        lgu_select.click()
        time.sleep(1)

        # Type city name and then select
        lgu_input = driver.find_element(By.CLASS_NAME, "select2-search__field")
        lgu_input.send_keys(city)
        lgu_input.send_keys(Keys.ENTER)

        # Click the pillars
        for href in hrefs:
            driver.find_element(By.XPATH, f"//a[@href='{href}']").click()

        # Click checkboxes
        for target in checkbox_targets:
            driver.find_element(By.XPATH, f"//label[@for='{target}']").click()

        # Click Submit
        submit = driver.find_element(
            By.XPATH, "//button[@onclick='return processSelections()']"
        )
        submit.click()

        # Move to new tab
        chld = driver.window_handles[1]
        driver.switch_to.window(chld)

        # Convert html table to df
        table_df = html_to_df(driver)

        # Adding new column for city name
        table_df["City_Municipality"] = city

        return table_df
    finally:
        # quit() closes every window and ends the driver session, on success
        # and on failure alike.
        driver.quit()

In [28]:
def html_to_df(driver):
    """Read the ``table_data2`` table of the driver's current page into a DataFrame.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver already switched to the window that shows the results.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the ``<table id="table_data2">`` element.

    Raises
    ------
    ValueError
        If the page contains no ``<table id="table_data2">`` element.
    """
    # Local import keeps this cell self-contained.
    from io import StringIO

    # NOTE(review): presumably this click confirms the results page has
    # rendered its controls before the source is read — confirm.
    driver.find_element(By.ID, "select2-chart-type-container").click()

    soup_table = BeautifulSoup(driver.page_source, "html")

    table_ = soup_table.find("table", {"id": "table_data2"})
    if table_ is None:
        # Without this guard the literal text "None" would be handed to pandas.
        raise ValueError("Result table 'table_data2' was not found in the page")

    # Wrap the markup in a file-like object; recent pandas versions deprecate
    # passing literal HTML strings to read_html.
    table_df = pd.read_html(StringIO(str(table_)))[0]

    return table_df

In [29]:
# File names currently present in the output folder.
dir_ = [entry.name for entry in new_dir.iterdir()]
dir_

['Garcia-Hernandez-dti-data.csv',
 'Giporlos-dti-data.csv',
 'Boac-dti-data.csv',
 'Carmen(AN)-dti-data.csv',
 'Tago-dti-data.csv',
 'Nagcarlan-dti-data.csv',
 'Dapitan-dti-data.csv',
 'LaLibertad(NR)-dti-data.csv',
 'Moncada-dti-data.csv',
 'Tagbina-dti-data.csv',
 'Mabini(BL)-dti-data.csv',
 'Llanera-dti-data.csv',
 'SantaMonica-dti-data.csv',
 'Arayat-dti-data.csv',
 'Masinloc-dti-data.csv',
 'Saguiaran-dti-data.csv',
 'Guinayangan-dti-data.csv',
 'Gainza-dti-data.csv',
 'Concepcion(TC)-dti-data.csv',
 'Hinunangan-dti-data.csv',
 'Taysan-dti-data.csv',
 'Balamban-dti-data.csv',
 'Baras(CT)-dti-data.csv',
 'Malita-dti-data.csv',
 'Ipil-dti-data.csv',
 'Argao-dti-data.csv',
 'Dingalan-dti-data.csv',
 'OldPanamao-dti-data.csv',
 'Sulat-dti-data.csv',
 'Siruma-dti-data.csv',
 'Kitaotao-dti-data.csv',
 'GeneralSantos-dti-data.csv',
 'Talusan-dti-data.csv',
 'Gen.EmilioAguinaldo-dti-data.csv',
 'Hingyon-dti-data.csv',
 'PadreGarcia-dti-data.csv',
 'SanFrancisco(AS)-dti-data.csv',
 'Castil

In [30]:
# Cities whose expected CSV (name without spaces + "-dti-data.csv") is not in dir_.
unscraped = list(
    filter(
        lambda name: name.replace(" ", "") + "-dti-data.csv" not in dir_,
        cities,
    )
)
unscraped

["Brooke's Point", "T'boli"]

In [31]:
# Scrape every remaining LGU and save one CSV per city. A failure for one city
# is reported with its cause and the loop moves on to the next city.
total = len(unscraped)
for position, city in enumerate(unscraped, start=1):
    try:
        print(f"Scraping: {city}")
        print(f"{position} out of {total} cities/municipalities done")
        df = scrape_city(city)
        print("=========================")
        df.to_csv(new_dir / f"{city.replace(' ','')}-dti-data.csv")
        # Random pause between cities to avoid hammering the site.
        time.sleep(random.uniform(2, 5))
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        print(f"Scraping errored for {city}")
        print(f"  -> {type(exc).__name__}: {exc}")
        print("=========================")

Scraping: Brooke's Point
1 out of 2 cities/municipalities done
Scraping errored for Brooke's Point
Scraping: T'boli
2 out of 2 cities/municipalities done
Scraping errored for T'boli


I tried retrieving the features for these two manually, but the DTI website seems to have a bug: city names that contain an apostrophe are not recognized as an LGU.