In [1]:
import os
import random
import re
import time
from pathlib import Path

import geopandas as gp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

# Helper Functions

In [2]:
def data_cleaner(text):
    """Normalise a city or province name into a compact matching key.

    The steps run in this order:

    1. lower-case the input;
    2. delete every occurrence of "city", then of " of ";
    3. drop every character that is neither a word character nor whitespace;
    4. delete space characters;
    5. apply the naming fixes: Compostela Valley -> Davao de Oro, any value
       containing "maguindanao" -> exactly "maguindanao", North Cotabato ->
       Cotabato, "santo" -> "sto", "santa" -> "sta".

    Parameters
    ----------
    text : str
        Raw name.

    Returns
    -------
    str
        The normalised key.
    """
    key = text.lower()

    # Plain substring removals: they hit "city" / " of " anywhere in the
    # name, not only where they stand as separate words.
    for fragment in ("city", " of "):
        key = key.replace(fragment, "")

    # Strip punctuation first, then the spaces themselves.
    key = re.sub(r"[^\w\s]", "", key)
    key = key.replace(" ", "")

    key = key.replace("compostelavalley", "davaodeoro")

    # Every Maguindanao variant (e.g. "... del Norte" / "... del Sur")
    # collapses to the single key "maguindanao".
    if "maguindanao" in key:
        key = "maguindanao"

    # Order matters only in that "santo" is handled before "santa".
    for old, new in (
        ("northcotabato", "cotabato"),
        ("santo", "sto"),
        ("santa", "sta"),
    ):
        key = key.replace(old, new)

    return key

In [3]:
def correct_index(index):
    """Replace known mismatching "city, province" keys with their corrected form.

    Matching is on the whole key. The previous implementation chained
    ``.replace`` calls, which on a plain ``str`` replaced *substrings* and
    therefore corrupted keys that were already correct (for example
    "zamboanga, zamboangadelsur" became "zamboanga, zamboangadelsurdelsur").
    On a pandas Series the chained calls were whole-value replacements, so
    this version gives the same result for Series input and makes ``str``
    input behave consistently with it.

    Parameters
    ----------
    index : str or pandas object with a dict-accepting ``.replace``
        A single key, or e.g. a Series of keys.

    Returns
    -------
    Same kind as ``index``
        The key(s) with every listed misspelling substituted; anything not
        listed is returned unchanged.
    """
    # No corrected value is itself a key, so applying the table once is
    # equivalent to applying the entries one after another.
    corrections = {
        "bacunganleonbpostigo, zamboangadelnorte": "bacungan, zamboangadelnorte",
        "baliwag, bulacan": "baliuag, bulacan",
        "bantayanisland, cebu": "bantayan, cebu",
        "binan, laguna": "biñan, laguna",
        "bulakan, bulacan": "bulacan, bulacan",
        "dasmarinas, cavite": "dasmariñas, cavite",
        "datublahsinsuat, maguindanao": "datublahtsinsuat, maguindanao",
        "donaremediostrinidad, bulacan": "doñaremediostrinidad, bulacan",
        "duenas, iloilo": "dueñas, iloilo",
        "ebmagalona, negrosoccidental": "enriquebmagalona, negrosoccidental",
        "genemilioaguinaldo, cavite": "generalemilioaguinaldo, cavite",
        "igacos, davaodelnorte": "islandgardensamal, davaodelnorte",
        "isabela, basilanix": "isabela, basilan",
        "laspinas, metropolitanmanila": "laspiñas, metropolitanmanila",
        "losbanos, laguna": "losbaños, laguna",
        "paranaque, metropolitanmanila": "parañaque, metropolitanmanila",
        "penablanca, cagayan": "peñablanca, cagayan",
        "penaranda, nuevaecija": "peñaranda, nuevaecija",
        "penarrubia, abra": "peñarrubia, abra",
        "pigcawayan, cotabato": "pigkawayan, cotabato",
        "pinan, zamboangadelnorte": "piñan, zamboangadelnorte",
        "presidentgarcia, bohol": "presidentcarlospgarcia, bohol",
        "rtlim, zamboangasibugay": "rosellerlim, zamboangasibugay",
        "roxas, zamboangadelnorte": "presmanuelaroxas, zamboangadelnorte",
        "sagnay, camarinessur": "sagñay, camarinessur",
        "stonino, cagayan": "stoniño, cagayan",
        "stonino, southcotabato": "stoniño, southcotabato",
        "stonino, samar": "stoniño, samar",
        "sciencemunoz, nuevaecija": "sciencemuñoz, nuevaecija",
        "senatorninoyaquino, sultankudarat": "senninoyaquino, sultankudarat",
        "sergioosmena, zamboangadelnorte": "sergioosmeñasr, zamboangadelnorte",
        "sofronioespanola, palawan": "sofronioespañola, palawan",
        "zamboanga, zamboanga": "zamboanga, zamboangadelsur",
    }

    # Single key: exact lookup, falling back to the input itself.
    if isinstance(index, str):
        return corrections.get(index, index)

    # pandas input (the notebook passes a Series): dict-based replace
    # substitutes whole values, exactly like the lookup above.
    return index.replace(corrections)

In [4]:
# Directory setup: the project root is taken to be two levels above the
# directory the notebook is run from.
WORKINGDIR = Path.cwd()
PROJECTROOT = WORKINGDIR.parents[1]

# Data file locations. NAMESFILE and SHAPEFILE are joined onto PROJECTROOT
# when they are read.
NAMESFILE = Path("tasks") / "task-1-data-collection" / "city_names.csv"
SHAPEFILE = Path("data") / "geolocation" / "ph_cities_joined" / "ph_cities.shp"
SHAPEFILE_V2 = (
    Path("data") / "geolocation" / "ph_cities_joined_v2" / "ph_cities_v2.shp"
)

In [5]:
# Read the shapefile at SHAPEFILE (relative to the project root) with geopandas.
gdf = gp.read_file(PROJECTROOT / SHAPEFILE)

In [6]:
# Order rows by the shapefile's "city, province" key. The previous row labels
# are kept as an "index" column (drop=False is the default), which is why
# `gdf.head()` further down shows one.
# NOTE: re-running this cell on the already-reset frame changes the columns again.
gdf = gdf.sort_values(by="clean_inde").reset_index(drop=False)

In [7]:
# City/municipality names as a plain list, in the CSV's row order.
names = list(pd.read_csv(PROJECTROOT / NAMESFILE)["City_Municipality"])

In [8]:
# NOTE: disabled scraper, kept for reference. It requests the CMCI LGU profile
# page for `name` and returns one cell (column 3, row 1) of the first table
# found under the "economy-overview-bottom" div; the commented-out loop in the
# following cell stores that value as the LGU's province. Because of the bare
# `except`, any failure (network, parsing, missing table) yields None.
# def scrape(name):
#     try:
#         URL = f"https://cmci.dti.gov.ph/lgu-profile.php?lgu={name}"
#         r = requests.get(url=URL)
#         soup = BeautifulSoup(r.content, 'lxml')

#         table = soup.find("div", {"class":"economy-overview-bottom"})

#         table_df = pd.read_html(str(table))[0]

#         return table_df[3].iloc[1]
#     except:
#         return None

In [9]:
# provinces =[]

In [10]:
# for name in names:
#     province = scrape(name)

#     if province:
#         print(f"success for {name}")
#         print(f"province: {province}")
#         print("")
#     else:
#         print(f"errored for {name}")

#     provinces.append(province)

In [11]:
# scraped_tuples = pd.DataFrame(zip(names, provinces), columns=["name", "province"])
# scraped_tuples.to_csv("scraped_tuples.csv")

In [12]:
def correct_province(province):
    """Collapse known province-name variants to a single spelling.

    Each variant is replaced wherever it occurs in ``province`` (substring
    replacement), so a full "city, province" key can be passed as well as a
    bare province key.

    Parameters
    ----------
    province : str
        Normalised province text.

    Returns
    -------
    str
        The text with every listed variant substituted.
    """
    fixes = (
        ("cotabatocotabato", "cotabato"),
        ("samarwesternsamar", "samar"),
        ("metromanila", "metropolitanmanila"),
    )

    for variant, canonical in fixes:
        province = province.replace(variant, canonical)

    return province

In [13]:
# Quick check: the doubled "cotabatocotabato" should collapse to "cotabato".
correct_province("alamada, cotabatocotabato")

'alamada, cotabato'

In [14]:
# Load the cached (name, province) pairs from the current working directory
# (the commented-out cell above writes a file with this name).
# The "Unnamed: 0" / "Unnamed: 0.1" columns in the output are presumably row
# indexes persisted by earlier CSV round-trips - TODO confirm.
name_and_province = pd.read_csv("scraped_tuples.csv")
name_and_province

Unnamed: 0.1,Unnamed: 0,name,province
0,0,Aborlan,Palawan
1,1,Abra De Ilog,Occidental Mindoro
2,2,Abucay,Bataan
3,3,Abulug,Cagayan
4,4,Abuyog,Leyte
...,...,...,...
1629,1629,Zamboanga,Zamboanga City
1630,1630,Zamboanguita,Negros Oriental
1631,1631,Zaragoza,Nueva Ecija
1632,1632,Zarraga,Iloilo


In [15]:
# Keep only the text before the first " (" of each name; names without a
# parenthesised part are kept whole.
names_no_abbrevs = [raw_name.partition(" (")[0] for raw_name in names]

In [16]:
# Attach the shortened names positionally: this relies on `names` (read from
# NAMESFILE) having the same length and row order as scraped_tuples.csv.
name_and_province["name_no_abbrev"] = names_no_abbrevs

In [17]:
# Province key: normalise the scraped province, then collapse the known
# province variants.
name_and_province["clean_province"] = (
    name_and_province["province"].apply(data_cleaner).apply(correct_province)
)

# City key: normalise the name that has its parenthesised part removed.
name_and_province["clean_city"] = name_and_province["name_no_abbrev"].apply(
    data_cleaner
)

In [18]:
# Build the "city, province" key, the same format as the values in
# gdf["clean_inde"].
name_and_province["clean_index"] = (
    name_and_province["clean_city"].astype(str)
    + ", "
    + name_and_province["clean_province"].astype(str)
)

# correct_index is handed the whole Series (not applied per element), so the
# `.replace` it calls is pandas' Series.replace, which substitutes whole values.
name_and_province["clean_index"] = correct_index(name_and_province["clean_index"])

In [19]:
# Inspect the derived clean_province / clean_city / clean_index columns.
name_and_province

Unnamed: 0.1,Unnamed: 0,name,province,name_no_abbrev,clean_province,clean_city,clean_index
0,0,Aborlan,Palawan,Aborlan,palawan,aborlan,"aborlan, palawan"
1,1,Abra De Ilog,Occidental Mindoro,Abra De Ilog,occidentalmindoro,abradeilog,"abradeilog, occidentalmindoro"
2,2,Abucay,Bataan,Abucay,bataan,abucay,"abucay, bataan"
3,3,Abulug,Cagayan,Abulug,cagayan,abulug,"abulug, cagayan"
4,4,Abuyog,Leyte,Abuyog,leyte,abuyog,"abuyog, leyte"
...,...,...,...,...,...,...,...
1629,1629,Zamboanga,Zamboanga City,Zamboanga,zamboanga,zamboanga,"zamboanga, zamboangadelsur"
1630,1630,Zamboanguita,Negros Oriental,Zamboanguita,negrosoriental,zamboanguita,"zamboanguita, negrosoriental"
1631,1631,Zaragoza,Nueva Ecija,Zaragoza,nuevaecija,zaragoza,"zaragoza, nuevaecija"
1632,1632,Zarraga,Iloilo,Zarraga,iloilo,zarraga,"zarraga, iloilo"


In [20]:
# Peek at the shapefile frame. Its `clean_inde` column holds the same kind of
# "city, province" key as `clean_index` (name presumably shortened by the
# shapefile field-name length limit - TODO confirm).
gdf.head()

Unnamed: 0,index,psgc,province,city_munic,longitude,latitude,clean_prov,clean_city,clean_inde,coords,geometry
0,522,1705301000,Palawan,Aborlan,118.548417,9.437101,palawan,aborlan,"aborlan, palawan","9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9..."
1,496,1705101000,Occidental Mindoro,Abra De Ilog,120.726826,13.443721,occidentalmindoro,abradeilog,"abradeilog, occidentalmindoro","13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373..."
2,218,300801000,Bataan,Abucay,120.53487,14.721315,bataan,abucay,"abucay, bataan","14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696..."
3,131,201501000,Cagayan,Abulug,121.457273,18.443485,cagayan,abulug,"abulug, cagayan","18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ..."
4,965,803701000,Leyte,Abuyog,125.011485,10.747102,leyte,abuyog,"abuyog, leyte","10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576..."


In [21]:
# Key columns of both sources as plain Python lists.
new_indices = name_and_province["clean_index"].tolist()
old_indices = gdf["clean_inde"].tolist()

In [25]:
# Sort both frames by their own "city, province" key. gdf was already sorted
# by this column in cell 6; unlike there, the index is not reset here, so row
# labels keep their pre-sort values.
gdf = gdf.sort_values("clean_inde")
name_and_province = name_and_province.sort_values("clean_index")

In [28]:
# Key lists taken from each frame in its current row order:
# g from the shapefile frame, n from the scraped frame.
# NOTE(review): presumably meant for comparing the two key sets - the cells
# that use them are not part of this view.
g = list(gdf["clean_inde"])
n = list(name_and_province["clean_index"])