In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

from pathlib import Path

import time 
import random

In [2]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
DATADIR = Path("data", "dti-data")
PROJECTROOT = WORKINGDIR.parents[1]

In [3]:
new_dir = Path(PROJECTROOT/DATADIR)
new_dir.mkdir(parents=True, exist_ok=True)

# Scraping necessary tags via bs4

In [4]:
URL = "https://cmci.dti.gov.ph/data-portal.php"
r = requests.get(URL)

## Scraping names of cities in dropdown

In [5]:
soup = BeautifulSoup(r.content, "html")

In [6]:
for i in soup.find_all("select", {"id": "lgu"}):
    cities = [j.string for j in i.find_all("option") if j.string != "Choose LGU(s)"]

In [7]:
hrefs = [
    "#collapse_ed",
    "#collapse_ge",
    "#collapse_in",
    "#collapse_re",
    "#collapse_iv",
]

# Web scraping via Selenium

In [8]:
def scrape_city(city):
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s)
    driver.maximize_window()
    driver.get("https://cmci.dti.gov.ph/data-portal.php")
        
    lgu_select = driver.find_element(By.ID, "select2-lgu-container")
    lgu_select.click()
    
    lgu_input = driver.find_element(By.CLASS_NAME, "select2-search__field")
    lgu_input.send_keys(city)
    lgu_input.send_keys(Keys.ENTER)
    
    # Clicking the pillars
    for href in hrefs:
        pillars = driver.find_element(By.XPATH, f"//a[@href='{href}']")
        pillars.click()
        
    ff = []

    for i in soup.find_all("div", {"class": "checkbox-nice"}):
        for j in i.find_all("label"):
            for_attr = j.get("for")
            if "chk" not in for_attr and "checkbox" not in for_attr:
                ff.append(for_attr)
        
    for f_ in ff:
        driver.find_element(By.XPATH, f"//label[@for='{f_}']").click()
        
    submit = driver.find_element(
            By.XPATH, f"//button[@onclick='return processSelections()']"
        )
    
    submit.click()
    
    parent = driver.window_handles[0]
    chld = driver.window_handles[1]
    driver.switch_to.window(chld)
    
    table_df = html_to_df(driver)
    
    table_df["City_Municipality"] = city
    
    driver.close()
    driver.quit()
    
    return table_df

In [9]:
def html_to_df(driver):
    driver.find_element(By.ID, "select2-chart-type-container").click()
        
    soup_table = BeautifulSoup(driver.page_source, "html")
    
    table_ = soup_table.find("table", {"id": "table_data2"})
    
    table_df = pd.read_html(str(table_))[0]
    
    return table_df

In [10]:
errored = []

In [11]:
for i in range(len(cities)): 
    try:
        print(f"Scraping: {cities[i]}")
        print(f"{i+1} out of 1634 cities/municipalities done")
        df = scrape_city(cities[i])
        print("=========================")
        df.to_csv(Path(new_dir/f"{cities[i].replace(' ','')}-dti-data.csv"))
        time.sleep(random.uniform(2, 5))
    except:
        print(f"Scraping errored for {cities[i]}")
        errored.append(cities[i])
        

Scraping: Aborlan
1 out of 1634 cities/municipalities done
Scraping errored for Aborlan
Scraping: Abra De Ilog
2 out of 1634 cities/municipalities done
Scraping errored for Abra De Ilog
Scraping: Abucay
3 out of 1634 cities/municipalities done
Scraping: Abulug
4 out of 1634 cities/municipalities done
Scraping: Abuyog
5 out of 1634 cities/municipalities done
Scraping errored for Abuyog
Scraping: Adams
6 out of 1634 cities/municipalities done
Scraping: Agdangan
7 out of 1634 cities/municipalities done
Scraping: Aglipay
8 out of 1634 cities/municipalities done
Scraping: Agno
9 out of 1634 cities/municipalities done
Scraping: Agoncillo
10 out of 1634 cities/municipalities done
Scraping: Agoo
11 out of 1634 cities/municipalities done
Scraping: Aguilar
12 out of 1634 cities/municipalities done
Scraping errored for Aguilar
Scraping: Aguinaldo
13 out of 1634 cities/municipalities done
Scraping errored for Aguinaldo
Scraping: Agutaya
14 out of 1634 cities/municipalities done
Scraping: Ajuy
15 o

In [12]:
print(errored)

['Aborlan', 'Abra De Ilog', 'Abuyog', 'Aguilar', 'Aguinaldo', 'Alabat', 'Alaminos (LA)', 'Albuera', 'Alburquerque', 'Alfonso', 'Alfonso Castaneda', 'Aliaga', 'Alimodian', 'Aloguinsan', 'Altavas', 'Alubijid', 'Ambaguio', 'Amlan', 'Anahawan', 'Angadanan', 'Angat', 'Apalit', 'Aparri', 'Araceli', 'Aringay', 'Arteche', 'Asingan', 'Atok', 'Aurora (IA)', 'Ayungon', 'Baclayon', 'Bacolod (LN)', 'Bacolod-Kalawi', 'Bacong', 'Badoc', 'Bagamanoc', 'Baganga', 'Baggao', 'Bagulin', 'Bagumbayan', 'Bais', 'Bakun', 'Balamban', 'Balanga', 'Balaoan', 'Balbalan', 'Baleno', 'Balete (BS)', 'Baliangao', 'Balingasag', 'Balingoan', 'Baliwag', 'Ballesteros', 'Bamban', 'Bambang', 'Banaybanay', 'Bangued', 'Bani', 'Banisilan', 'Banna', 'Bansud', 'Barcelona', 'Barira', 'Baroy', 'Basista', 'Batac', 'Batad', 'Batangas', 'Bato (LE)', 'Batuan (MS)', 'Bauan', 'Bauang', 'Bauko', 'Bautista', 'Bay', 'Bayawan', 'Bayombong', 'Binan', 'Bindoy', 'Binmaley', 'Binuangan', 'Bislig', 'Bobon', 'Bokod', 'Bolinao', 'Boliney', 'Bongabon

In [14]:
errored_batch2=[]

In [18]:
for i in range(len(errored[i])): 
    try:
        print(f"Scraping: {errored[i]}")
        print(f"{i+1} out of 1634 cities/municipalities done")
        df = scrape_city(errored[i])
        print("=========================")
        df.to_csv(Path(new_dir/f"{errored[i].replace(' ','')}-dti-data.csv"))
        time.sleep(random.uniform(2, 5))
    except:
        print(f"Scraping errored for {errored[i]}")
        errored.append(errored_batch2[i])
        

IndexError: list index out of range