In [1]:
import random
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Setting up directories

# Path.cwd() keeps this cell pathlib-only; the notebook never imports `os`,
# so the previous os.getcwd() call failed on a fresh kernel.
WORKINGDIR = Path.cwd()
# Location of the scraped CSVs, relative to the project root.
DATADIR = Path("data", "dti-data")
# The project root is taken to be two levels above the working directory.
PROJECTROOT = WORKINGDIR.parents[1]

In [3]:
# Output folder for the per-city CSV files; created on demand together with
# any missing parent folders, and left alone when it already exists.
new_dir = PROJECTROOT / DATADIR
new_dir.mkdir(exist_ok=True, parents=True)

# Scraping necessary tags via bs4

In [4]:
URL = "https://cmci.dti.gov.ph/data-portal.php"
# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(URL, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
r.raise_for_status()

## Scraping names of cities in dropdown

In [5]:
# Parse the downloaded page once; later cells and scrape_city read `soup`.
soup = BeautifulSoup(markup=r.content, features="html")

In [6]:
# Look the LGU dropdown up directly and fail with a clear message when it is
# missing; the previous loop silently left `cities` undefined in that case.
lgu_dropdown = soup.find("select", {"id": "lgu"})
if lgu_dropdown is None:
    raise ValueError("LGU dropdown (<select id='lgu'>) not found in the portal page")

# Every option except the placeholder entry is a city/municipality name.
cities = [
    option.string
    for option in lgu_dropdown.find_all("option")
    if option.string != "Choose LGU(s)"
]

In [7]:
# href targets of the five collapsible panels that scrape_city clicks open.
hrefs = ["#collapse_ed", "#collapse_ge", "#collapse_in", "#collapse_re", "#collapse_iv"]

# Web scraping via Selenium

In [21]:
def scrape_city(city):
    """Scrape the CMCI data-portal results table for one city/municipality.

    Starts a fresh Chrome session, picks ``city`` in the LGU dropdown, opens
    every panel listed in the module-level ``hrefs``, ticks every indicator
    label found in the module-level ``soup``, submits the form and parses the
    table shown in the window that the submit button opens.

    Parameters
    ----------
    city : str
        Name typed into the LGU search box (ENTER selects the match).

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``html_to_df`` with an extra
        ``City_Municipality`` column holding ``city``.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        If an element cannot be located/clicked, or (TimeoutException) if no
        second window appears after submitting.
    ValueError
        Propagated from ``html_to_df`` when the results table is missing.
    """
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # Everything after the browser starts runs under try/finally so that a
    # failing city never leaves an orphaned Chrome process behind.
    try:
        driver.maximize_window()
        driver.get("https://cmci.dti.gov.ph/data-portal.php")

        time.sleep(1)

        driver.find_element(By.ID, "select2-lgu-container").click()

        time.sleep(1)

        lgu_input = driver.find_element(By.CLASS_NAME, "select2-search__field")
        lgu_input.send_keys(city)
        lgu_input.send_keys(Keys.ENTER)

        # Clicking the pillars
        for href in hrefs:
            driver.find_element(By.XPATH, f"//a[@href='{href}']").click()

        # Collect the `for` targets of the indicator labels from the statically
        # parsed page, skipping labels without a `for` attribute and the
        # "chk"/"checkbox" ones.
        indicator_ids = []
        for box in soup.find_all("div", {"class": "checkbox-nice"}):
            for label in box.find_all("label"):
                for_attr = label.get("for")
                if for_attr and "chk" not in for_attr and "checkbox" not in for_attr:
                    indicator_ids.append(for_attr)

        for indicator_id in indicator_ids:
            driver.find_element(By.XPATH, f"//label[@for='{indicator_id}']").click()

        driver.find_element(
            By.XPATH, "//button[@onclick='return processSelections()']"
        ).click()

        # The results open in a second window; wait for it to exist instead of
        # indexing window_handles immediately after the click.
        WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
        driver.switch_to.window(driver.window_handles[1])

        table_df = html_to_df(driver)
        table_df["City_Municipality"] = city

        return table_df
    finally:
        # quit() closes every window of the session and stops the driver.
        driver.quit()

In [22]:
def html_to_df(driver):
    """Parse the results table of the driver's current window into a DataFrame.

    Clicks the element with id ``select2-chart-type-container``, then reads
    ``driver.page_source`` and converts the ``<table id="table_data2">``
    element into a DataFrame.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver already switched to the window that shows the results.

    Returns
    -------
    pandas.DataFrame
        First table parsed by ``pandas.read_html`` from the table's HTML.

    Raises
    ------
    ValueError
        If the page contains no table with id ``table_data2``.
    """
    driver.find_element(By.ID, "select2-chart-type-container").click()

    page = BeautifulSoup(driver.page_source, "html")
    table = page.find("table", {"id": "table_data2"})
    if table is None:
        # str(None) would otherwise be handed to read_html and fail obscurely.
        raise ValueError("Results table 'table_data2' not found in the page source")

    # Wrap the markup in StringIO: passing literal HTML as a plain string to
    # read_html is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(table)))[0]

In [10]:
# errored = []

In [11]:
# for i in range(len(cities)): 
#     try:
#         print(f"Scraping: {cities[i]}")
#         print(f"{i+1} out of 1634 cities/municipalities done")
#         df = scrape_city(cities[i])
#         print("=========================")
#         df.to_csv(Path(new_dir/f"{cities[i].replace(' ','')}-dti-data.csv"))
#         time.sleep(random.uniform(2, 5))
#     except:
#         print(f"Scraping errored for {cities[i]}")
#         errored.append(cities[i])

In [12]:
# Hard-coded snapshot of the cities to retry (presumably the failures of the
# commented-out first pass above -- confirm before relying on it).
# Two names were previously split across physical lines inside their quotes,
# which is a syntax error; every name now sits on a single line.
errored = [
    'Aborlan', 'Abra De Ilog', 'Abuyog', 'Aguilar', 'Aguinaldo',
    'Alabat', 'Alaminos (LA)', 'Albuera', 'Alburquerque', 'Alfonso',
    'Alfonso Castaneda', 'Aliaga', 'Alimodian', 'Aloguinsan', 'Altavas',
    'Alubijid', 'Ambaguio', 'Amlan', 'Anahawan', 'Angadanan',
    'Angat', 'Apalit', 'Aparri', 'Araceli', 'Aringay',
    'Arteche', 'Asingan', 'Atok', 'Aurora (IA)', 'Ayungon',
    'Baclayon', 'Bacolod (LN)', 'Bacolod-Kalawi', 'Bacong', 'Badoc',
    'Bagamanoc', 'Baganga', 'Baggao', 'Bagulin', 'Bagumbayan',
    'Bais', 'Bakun', 'Balamban', 'Balanga', 'Balaoan',
    'Balbalan', 'Baleno', 'Balete (BS)', 'Baliangao', 'Balingasag',
    'Balingoan', 'Baliwag', 'Ballesteros', 'Bamban', 'Bambang',
    'Banaybanay', 'Bangued', 'Bani', 'Banisilan', 'Banna',
    'Bansud', 'Barcelona', 'Barira', 'Baroy', 'Basista',
    'Batac', 'Batad', 'Batangas', 'Bato (LE)', 'Batuan (MS)',
    'Bauan', 'Bauang', 'Bauko', 'Bautista', 'Bay',
    'Bayawan', 'Bayombong', 'Binan', 'Bindoy', 'Binmaley',
    'Binuangan', 'Bislig', 'Bobon', 'Bokod', 'Bolinao',
    'Boliney', 'Bongabong', 'Bonifacio', 'Bontoc (MP)', 'Borongan',
    'Botolan', 'Braulio E. Dujali', "Brooke's Point", 'Bucay', 'Buenavista (GS)',
    'Bugasong', 'Bula', 'Buldon', 'Burdeos', 'Burgos (IA)',
    'Burgos (IN)', 'Burgos (IS)', 'Buug', 'Cabagan', 'Cabangan',
    'Cabiao', 'Cabucgayan', 'Cabusao', 'Cabuyao', 'Calamba (LA)',
    'Calamba (MC)', 'Calanogas', 'Calapan', 'Calatrava (NO)', 'Calayan',
    'Calbiga', 'Calintaan', 'Caloocan', 'Calubian', 'Calumpit',
    'Caluya', 'Camalaniugan', 'Camiling', 'Candelaria (QN)', 'Caoayan',
    'Capoocan', 'Caraga', 'Caramoan', 'Carcar', 'Carles',
    'Carmen (AN)', 'Carmen (DN)', 'Carmona', 'Carranglan', 'Cataingan',
    'Catanauan', 'Catarman (CM)', 'Catarman (NS)', 'Catubig', 'Cauayan (IA)',
    'Cauayan (NO)', 'Cawayan', 'Clarin (BL)', 'Claveria (MO)', 'Concepcion (IO)',
    'Cuenca', 'Currimao', 'Damulog', 'Danao (BL)', 'Dangcagan',
    'Danglas', 'Dasmarinas', 'Dasol', 'Datu Abdullah Sangki', 'Datu Blah Sinsuat',
    'Datu Odin Sinsuat', 'Datu Piang', 'Datu Unsay', 'Dauin', 'Dauis',
    'Del Gallego', 'Delfin Albano', 'Dilasag', 'Dimasalang', 'Dimiao',
    'Dinagat', 'Dinalupihan', 'Dinas', 'Dingalan', 'Divilacan',
    'Don Carlos', 'Duero', 'Dupax Del Norte', 'Dupax Del Sur', 'E. B. Magalona',
    'Esperanza (SK)', 'Estancia', 'Famy', 'Flora', 'Gamu',
    'Gandara', 'Gasan', 'Gattaran', 'Gen S.K Pendatun', 'Gen. Emilio Aguinaldo',
    'General Luna (QN)', 'General Nakar', 'Gerona', 'Gigmoto', 'Goa',
    'Godod', 'Gonzaga', 'Gregorio Del Pilar', 'Guiguinto (BU)', 'Guihulngan',
    'Guindulman', 'Guipos', 'Gumaca', 'Gutalac', 'Hagonoy (DS)',
    'Hamtic', 'Hermosa', 'Hernani', 'Hinigaran', 'Hinundayan',
    'Ibajay', 'Iloilo', 'Indanan', 'Infanta (QN)', 'Initao',
    'Ipil', 'Isabel', 'Isulan', 'Ivana', 'Jabonga',
    'Jamindan', 'Janiuay', 'Jomalig', 'Jones', 'Jose Abad Santos',
    'Jose Panganiban', 'Josefina', 'Juban', 'Julita', 'Kabuntalan',
    'Kadingilan', 'Kalamansig', 'Kalayaan (PN)', 'Kalibo', 'Kalilangan',
    'Kalingalan Caluang', 'Kapalong', 'Katipunan', 'Kauswagan', 'Kawit',
    'Kitaotao', 'Kitcharao', 'Koronadal', 'La Carlota', 'La Libertad (NR)',
    'La Libertad (ZN)', 'La Paz (LE)', 'La Trinidad', 'Labangan', 'Labason',
    'Lagawe', 'Lagayan', 'Lala', 'Lallo', 'Lambayong',
    'Lamitan', 'Laoac', 'Lapinig', 'Larena', 'Las Pinas',
    'Laur', 'Laurel', 'Lebak', 'Lian', 'Libacao',
    'Libagon', 'Libjo', 'Licuan-Baay', 'Lidlidda', 'Liloan (SL)',
    'Linapacan', 'Lingayen', 'Lingig', 'Llanera', 'Loay',
    'Lobo', 'Looc (RN)', 'Lope De Vega', 'Lopez Jaena', 'Lubao',
    'Lumba-Bayabao', 'Luna (AO)', 'Luna (IA)', 'Luna (LU)', 'Lupi',
    'Lutayan', 'Mabalacat', 'Mabini (CV)', 'Mabuhay', 'Macabebe',
    'Macrohon', 'Madalum', 'Madridejos', 'Magdalena', 'Magpet',
    'Magsaysay (OM)', 'Magsingal', 'Mahaplag', 'Mahatao', 'Mahayag',
    'Makilala', 'Malabang', 'Malaybalay', 'Malinao (AK)', 'Manay',
    'Mandaue', 'Mangaldan', 'Manila', 'Manjuyod', 'Mankayan',
    'Manticao', 'Manukan', 'Mapanas', 'Margosatubig', 'Marikina',
    'Maripipi', 'Masantol', 'Mataasnakahoy', 'Matag-Ob', 'Matalam',
    'Matalom', 'Mati', 'Matnog', 'Matungao', 'Mauban',
    'Mayantoc', 'Mendez', 'Mercedes', 'Milaor', 'Minglanilla',
    'Monkayo', 'Mulanay', 'Mulondo', 'Munai', 'Mutia',
    'Nabua', 'Naga (CS)', 'Naga (CU)', 'Nagcarlan', 'Nagtipunan',
    'Naguilian (LU)', 'Narvacan', 'Nasipit', 'Nasugbu', 'Natividad',
    'Naval', 'Northern Kabuntalan', 'Numancia', 'Oas', 'Obando',
    'Omar', 'Orani', 'Paete', 'Pagadian', 'Pagalungan',
    'Pagbilao', 'Palanan', 'Palauig', 'Palayan', 'Palo',
    'Paluan', 'Panaon', 'Panay', 'Pandami', 'Pantabangan',
    'Paoay', 'Paracale', 'Pasig', 'Passi', 'Patnanungan',
    'Patnongon', 'Pavia', 'Payao', 'Pigcawayan', 'Pilar (CU)',
    'Pilar (SN)', 'Pili', 'Pililla', 'Pinili', 'Pitogo (QN)',
    'Plaridel (MC)', 'Plaridel (QN)', 'Pola', 'Poona Bayabao', 'Poro',
    'President Roxas (CZ)', 'President Roxas (NC)', 'Prieto Diaz', 'Pualas', 'Pura',
    'Quezon (BK)', 'Quezon (IA)', 'Quezon (PN)', 'Reina Mercedes', 'Rizal (CG)',
    'Rizal (KA)', 'Rizal (OM)', 'Rodriguez', 'Romblon', 'Rosario (AS)',
    'Rosario (BS)', 'Rosario (LU)', 'Roxas (PN)', 'Sagay (CM)', 'Sagnay',
    'Saguday', 'Salug', 'Sampaloc', 'San Agustin (RN)', 'San Andres (QN)',
    'San Antonio (NE)', 'San Antonio (NS)', 'San Benito', 'San Dionisio', 'San Emilio',
    'San Esteban', 'San Fernando (PA)', 'San Francisco (CU)', 'San Isidro (AA)', 'San Isidro (NS)',
    'San Jacinto (MS)', 'San Jacinto (PS)', 'San Joaquin', 'San Jose (CS)', 'San Jose (DI)',
    'San Jose (NS)', 'San Jose (OM)', 'San Jose Del Monte', 'San Juan (IS)', 'San Juan (LU)',
    'San Leonardo', 'San Manuel (IA)', 'San Marcelino', 'San Miguel (SS)', 'San Narciso (QN)',
    'San Narciso (ZA)', 'San Pablo (LA)', 'San Pascual (BS)', 'San Pedro', 'San Quintin (AA)',
    'San Quintin (PS)', 'San Rafael (BU)', 'San Remigio (CU)', 'San Ricardo', 'San Roque',
    'San Vicente (PN)', 'Santa Cruz (IS)', 'Santa Fe (LE)', 'Santa Ignacia', 'Santa Lucia',
    'Santa Maria (BU)', 'Santa Maria (IA)', 'Santa Maria (PS)', 'Santa Praxedes', 'Santa Rita (PA)',
    'Santa Rita (WS)', 'Santa Rosa (NE)', 'Santa Teresita (BS)', 'Santiago (AN)', 'Santiago (IS)',
    'Santo Nino (SC)', 'Santol', 'Sapa-Sapa', 'Sapang Dalaga', 'Sara',
    'Sarangani', 'Sariaya', 'Sarrat', 'Sasmuan', 'Sergio Osmena',
    'Shariff Aguak', 'Shariff Saydona Mustapha', 'Siasi', 'Siaton', 'Siayan',
    'Sibagat', 'Sigma', 'Sinacaban', 'Sipalay', 'Sipocot',
    'Siquijor', 'Sirawai', 'Sison (PS)', 'Sogod (CU)', 'Sogod (SL)',
    'Solano', 'Sorsogon', 'Sual', 'Sugbongcogon', 'Sultan Dumalondong',
    'Sultan Mastura', "T'boli", 'Tabaco', 'Tabontabon', 'Tabuan-Lasa',
    'Tacloban', 'Taft', 'Tagaytay', 'Tagbina', 'Tagkawayan',
    'Tago', 'Tagoloan Ii', 'Talakag', 'Talavera', 'Talisay  (CU)',
    'Talisay (CN)', 'Talitay', 'Tambulig', 'Tampilisan', 'Tanauan (LE)',
    'Tandag', 'Tangalan', 'Tanjay', 'Tantangan', 'Tanza',
    'Tapaz', 'Taraka', 'Tarangnan', 'Tarlac', 'Tayasan',
    'Taytay (PN)', 'Taytay (RL)', 'Tayug', 'Teresa', 'Ternate',
    'Tigbao', 'Tinglayan', 'Tingloy', 'Tipo-Tipo', 'Tiwi',
    'Tongkil', 'Torrijos', 'Trento', 'Tuao', 'Tubajon',
    'Tubao', 'Tubay', 'Tubigon', 'Tublay', 'Tubod (LN)',
    'Tuburan (BA)', 'Tudela (CU)', 'Tudela (MC)', 'Tuguegarao', 'Tulunan',
    'Turtle Islands', 'Valencia (NR)', 'Vallehermoso', 'Victoria (LA)', 'Victoria (NS)',
    'Villasis', 'Vincenzo A Sagun', 'Wao', 'Zarraga', 'Zumarraga',
]

In [14]:
# Retry every city in `errored`; cities that fail again are collected in
# `errored_batch2`, which starts empty for each run of this cell.
errored_batch2 = []

for position, city in enumerate(errored, start=1):
    try:
        print(f"Scraping: {city}")
        print(f"{position} out of {len(errored)} cities/municipalities done")
        df = scrape_city(city)
        print("=========================")
        df.to_csv(new_dir / f"{city.replace(' ','')}-dti-data.csv")
        # Random pause between requests.
        time.sleep(random.uniform(2, 5))
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long-running loop.
        print(f"Scraping errored for {city}")
        errored_batch2.append(city)
        

Scraping: Aborlan
1 out of 550 cities/municipalities done
Scraping errored for Aborlan
Scraping: Abra De Ilog
2 out of 550 cities/municipalities done
Scraping errored for Abra De Ilog
Scraping: Abuyog
3 out of 550 cities/municipalities done
Scraping errored for Abuyog
Scraping: Aguilar
4 out of 550 cities/municipalities done
Scraping: Aguinaldo
5 out of 550 cities/municipalities done
Scraping: Alabat
6 out of 550 cities/municipalities done
Scraping errored for Alabat
Scraping: Alaminos (LA)
7 out of 550 cities/municipalities done
Scraping errored for Alaminos (LA)
Scraping: Albuera
8 out of 550 cities/municipalities done
Scraping: Alburquerque
9 out of 550 cities/municipalities done
Scraping: Alfonso
10 out of 550 cities/municipalities done
Scraping: Alfonso Castaneda
11 out of 550 cities/municipalities done
Scraping: Aliaga
12 out of 550 cities/municipalities done
Scraping: Alimodian
13 out of 550 cities/municipalities done
Scraping errored for Alimodian
Scraping: Aloguinsan
14 out of

In [None]:
# Retry every city in `errored`; cities that fail again are collected in
# `errored_batch2`, which starts empty for each run of this cell.
errored_batch2 = []

for position, city in enumerate(errored, start=1):
    try:
        print(f"Scraping: {city}")
        print(f"{position} out of {len(errored)} cities/municipalities done")
        df = scrape_city(city)
        print("=========================")
        df.to_csv(new_dir / f"{city.replace(' ','')}-dti-data.csv")
        # Random pause between requests.
        time.sleep(random.uniform(2, 5))
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long-running loop.
        print(f"Scraping errored for {city}")
        errored_batch2.append(city)
        

Scraping: Aborlan
1 out of 550 cities/municipalities done
Scraping errored for Aborlan
Scraping: Abra De Ilog
2 out of 550 cities/municipalities done
Scraping errored for Abra De Ilog
Scraping: Abuyog
3 out of 550 cities/municipalities done
Scraping errored for Abuyog
Scraping: Aguilar
4 out of 550 cities/municipalities done
Scraping: Aguinaldo
5 out of 550 cities/municipalities done
Scraping: Alabat
6 out of 550 cities/municipalities done
Scraping errored for Alabat
Scraping: Alaminos (LA)
7 out of 550 cities/municipalities done
Scraping errored for Alaminos (LA)
Scraping: Albuera
8 out of 550 cities/municipalities done
Scraping: Alburquerque
9 out of 550 cities/municipalities done
Scraping: Alfonso
10 out of 550 cities/municipalities done
Scraping: Alfonso Castaneda
11 out of 550 cities/municipalities done
Scraping: Aliaga
12 out of 550 cities/municipalities done
Scraping: Alimodian
13 out of 550 cities/municipalities done
Scraping errored for Alimodian
Scraping: Aloguinsan
14 out of

In [18]:
# Starts empty; a later cell iterates over this list for another retry pass.
errored_batch3 = list()

In [32]:
# Retry every city in `errored`. Failures are recorded in `errored_batch3`
# (previously they were only printed), so the later pass over that list has
# something to work on.
for position, city in enumerate(errored, start=1):
    try:
        print(f"Scraping: {city}")
        print(f"{position} out of {len(errored)} cities/municipalities done")
        df = scrape_city(city)
        print("=========================")
        df.to_csv(new_dir / f"{city.replace(' ','')}-dti-data.csv")
        # Random pause between requests.
        time.sleep(random.uniform(2, 5))
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long-running loop.
        print(f"Scraping errored for {city}")
        errored_batch3.append(city)
        

Scraping: Asingan
1 out of 72 cities/municipalities done
Scraping errored for Asingan
Scraping: Ayungon
2 out of 72 cities/municipalities done
Scraping errored for Ayungon
Scraping: Baclayon
3 out of 72 cities/municipalities done
Scraping errored for Baclayon
Scraping: Badoc
4 out of 72 cities/municipalities done
Scraping errored for Badoc
Scraping: Baganga
5 out of 72 cities/municipalities done
Scraping errored for Baganga
Scraping: Bagulin
6 out of 72 cities/municipalities done
Scraping errored for Bagulin
Scraping: Banisilan
7 out of 72 cities/municipalities done
Scraping errored for Banisilan
Scraping: Barira
8 out of 72 cities/municipalities done


In [None]:
# Starts empty; the retry loop over `errored_batch3` appends its failures here.
errored_batch4 = list()

In [None]:
# Retry every city in `errored_batch3`; cities that fail again are appended to
# `errored_batch4`.
for position, city in enumerate(errored_batch3, start=1):
    try:
        print(f"Scraping: {city}")
        print(f"{position} out of {len(errored_batch3)} cities/municipalities done")
        df = scrape_city(city)
        print("=========================")
        df.to_csv(new_dir / f"{city.replace(' ','')}-dti-data.csv")
        # Random pause between requests.
        time.sleep(random.uniform(2, 5))
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long-running loop.
        print(f"Scraping errored for {city}")
        errored_batch4.append(city)
        

Scraping: Aborlan
1 out of 215 cities/municipalities done
Scraping errored for Aborlan
Scraping: Abra De Ilog
2 out of 215 cities/municipalities done
Scraping errored for Abra De Ilog
Scraping: Abuyog
3 out of 215 cities/municipalities done
Scraping errored for Abuyog
Scraping: Alabat
4 out of 215 cities/municipalities done
Scraping: Alaminos (LA)
5 out of 215 cities/municipalities done
Scraping: Alimodian
6 out of 215 cities/municipalities done
Scraping: Aloguinsan
7 out of 215 cities/municipalities done
Scraping: Angadanan
8 out of 215 cities/municipalities done
Scraping: Apalit
9 out of 215 cities/municipalities done
Scraping errored for Apalit
Scraping: Aparri
10 out of 215 cities/municipalities done
Scraping errored for Aparri
Scraping: Araceli
11 out of 215 cities/municipalities done
Scraping: Aringay
12 out of 215 cities/municipalities done
Scraping: Asingan
13 out of 215 cities/municipalities done
Scraping errored for Asingan
Scraping: Ayungon
14 out of 215 cities/municipalitie

In [27]:
# Names of the entries currently in the output folder. Path.iterdir() is used
# because the notebook never imports `os`, so os.listdir() raised NameError.
dir_ = [entry.name for entry in new_dir.iterdir()]

In [29]:
# A city still needs scraping when its CSV name is absent from the listing of
# the output folder held in `dir_`.
errored = [
    lgu
    for lgu in cities
    if f"{lgu.replace(' ','')}-dti-data.csv" not in dir_
]
errored

['Asingan',
 'Ayungon',
 'Baclayon',
 'Badoc',
 'Baganga',
 'Bagulin',
 'Banisilan',
 'Barira',
 'Bauko',
 'Binan',
 'Bolinao',
 "Brooke's Point",
 'Bula',
 'Buldon',
 'Burdeos',
 'Candelaria (QN)',
 'Carles',
 'Carmen (AN)',
 'Catarman (NS)',
 'Cawayan',
 'Dangcagan',
 'Divilacan',
 'Flora',
 'Gigmoto',
 'Guindulman',
 'Jabonga',
 'Jomalig',
 'Julita',
 'Kalingalan Caluang',
 'Lagawe',
 'Lala',
 'Las Pinas',
 'Lingayen',
 'Loay',
 'Magsingal',
 'Mangaldan',
 'Mankayan',
 'Masantol',
 'Mataasnakahoy',
 'Matalom',
 'Matnog',
 'Matungao',
 'Mulanay',
 'Mutia',
 'Nagtipunan',
 'Oas',
 'Paete',
 'Palanan',
 'Romblon',
 'San Andres (QN)',
 'San Antonio (NE)',
 'San Antonio (NS)',
 'San Emilio',
 'San Fernando (PA)',
 'San Jacinto (PS)',
 'San Juan (LU)',
 'San Miguel (SS)',
 'San Pedro',
 'Santa Maria (PS)',
 'Santa Praxedes',
 'Sapang Dalaga',
 'Sariaya',
 'Sibagat',
 'Sipocot',
 'Siquijor',
 'Sultan Dumalondong',
 "T'boli",
 'Tacloban',
 'Tagoloan Ii',
 'Tambulig',
 'Valencia (NR)',
 'Zum

In [30]:
# Number of cities currently listed in `errored`.
len(errored)

72

In [23]:
# Show the resolved output directory.
new_dir

PosixPath('/Users/ian/Documents/MachineLearning/omdena/mapping-urban-vulnerability-areas/philippines-chapter-urban-vunerability/src/data/dti-data')

In [31]:
# Total number of LGU names collected in `cities`.
len(cities)

1634