In [1]:
import os
import re
from pathlib import Path

import geopandas as gp
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Helper Functions

In [2]:
def correct_province(province):
    province = province.replace("cotabatocotabato", "cotabato")
    province = province.replace("samarwesternsamar", "samar")
    province = province.replace("metromanila", "metropolitanmanila")

    return province

In [3]:
def data_cleaner(text):
    # Make lower caps
    text = text.lower()

    # Remove "city"
    text = text.replace("city", "")

    # Remove " of "
    text = text.replace(" of ", "")

    # Remove punctuation marks
    text = re.sub(r"[^\w\s]", "", text)

    # Remove spaces
    text = text.replace(" ", "")

    # Replace compostela valley with Davao de Oro
    text = text.replace("compostelavalley", "davaodeoro")

    # Replace Maguindanao with del Norte/del Sur
    if "maguindanao" in text:
        text = "maguindanao"

    # Replace North Cotabato with Cotabato
    text = text.replace("northcotabato", "cotabato")

    # Replace santo with sto
    text = text.replace("santo", "sto")

    # Replace santa with sta
    text = text.replace("santa", "sta")

    return text

In [4]:
def correct_index(index):
    index = index.replace(
        "bacunganleonbpostigo, zamboangadelnorte", "bacungan, zamboangadelnorte"
    )
    index = index.replace("baliwag, bulacan", "baliuag, bulacan")
    index = index.replace("bantayanisland, cebu", "bantayan, cebu")
    index = index.replace("binan, laguna", "biñan, laguna")
    index = index.replace("bulakan, bulacan", "bulacan, bulacan")
    index = index.replace("dasmarinas, cavite", "dasmariñas, cavite")
    index = index.replace(
        "datublahsinsuat, maguindanao", "datublahtsinsuat, maguindanao"
    )
    index = index.replace(
        "donaremediostrinidad, bulacan", "doñaremediostrinidad, bulacan"
    )
    index = index.replace("duenas, iloilo", "dueñas, iloilo")
    index = index.replace(
        "ebmagalona, negrosoccidental", "enriquebmagalona, negrosoccidental"
    )
    index = index.replace(
        "genemilioaguinaldo, cavite", "generalemilioaguinaldo, cavite"
    )
    index = index.replace("igacos, davaodelnorte", "islandgardensamal, davaodelnorte")
    index = index.replace("isabela, basilanix", "isabela, basilan")
    index = index.replace(
        "laspinas, metropolitanmanila", "laspiñas, metropolitanmanila"
    )
    index = index.replace("losbanos, laguna", "losbaños, laguna")
    index = index.replace(
        "paranaque, metropolitanmanila", "parañaque, metropolitanmanila"
    )
    index = index.replace("penablanca, cagayan", "peñablanca, cagayan")
    index = index.replace("penaranda, nuevaecija", "peñaranda, nuevaecija")
    index = index.replace("penarrubia, abra", "peñarrubia, abra")
    index = index.replace("pigcawayan, cotabato", "pigkawayan, cotabato")
    index = index.replace("pinan, zamboangadelnorte", "piñan, zamboangadelnorte")
    index = index.replace("presidentgarcia, bohol", "presidentcarlospgarcia, bohol")
    index = index.replace("rtlim, zamboangasibugay", "rosellerlim, zamboangasibugay")
    index = index.replace(
        "roxas, zamboangadelnorte", "presmanuelaroxas, zamboangadelnorte"
    )
    index = index.replace("sagnay, camarinessur", "sagñay, camarinessur")
    index = index.replace("stonino, cagayan", "stoniño, cagayan")
    index = index.replace("stonino, southcotabato", "stoniño, southcotabato")
    index = index.replace("stonino, samar", "stoniño, samar")
    index = index.replace("sciencemunoz, nuevaecija", "sciencemuñoz, nuevaecija")
    index = index.replace(
        "senatorninoyaquino, sultankudarat", "senninoyaquino, sultankudarat"
    )
    index = index.replace(
        "sergioosmena, zamboangadelnorte", "sergioosmeñasr, zamboangadelnorte"
    )
    index = index.replace("sofronioespanola, palawan", "sofronioespañola, palawan")
    index = index.replace("zamboanga, zamboanga", "zamboanga, zamboangadelsur")

    return index

In [5]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

NAMESFILE = Path("tasks", "task-1-data-collection", "city_names.csv")
SHAPEFILE = Path("data", "geolocation", "ph_cities_joined", "ph_cities.shp")
SHAPEFILE_V2 = Path("data", "geolocation", "ph_cities_joined_v2", "ph_cities_v2.shp")

In [6]:
gdf = gp.read_file(Path(PROJECTROOT, SHAPEFILE))

In [7]:
names = pd.read_csv(Path(PROJECTROOT, NAMESFILE))
names = list(names["City_Municipality"])

In [8]:
# def scrape(name):
#     try:
#         URL = f"https://cmci.dti.gov.ph/lgu-profile.php?lgu={name}"
#         r = requests.get(url=URL)
#         soup = BeautifulSoup(r.content, 'lxml')

#         table = soup.find("div", {"class":"economy-overview-bottom"})

#         table_df = pd.read_html(str(table))[0]

#         return table_df[3].iloc[1]
#     except:
#         return None

In [9]:
# provinces =[]

In [10]:
# for name in names:
#     province = scrape(name)

#     if province:
#         print(f"success for {name}")
#         print(f"province: {province}")
#         print("")
#     else:
#         print(f"errored for {name}")

#     provinces.append(province)

In [11]:
# scraped_tuples = pd.DataFrame(zip(names, provinces), columns=["name", "province"])
# scraped_tuples.to_csv("scraped_tuples.csv")

In [12]:
name_and_province = pd.read_csv("scraped_tuples.csv")
name_and_province

Unnamed: 0.1,Unnamed: 0,name,province
0,0,Aborlan,Palawan
1,1,Abra De Ilog,Occidental Mindoro
2,2,Abucay,Bataan
3,3,Abulug,Cagayan
4,4,Abuyog,Leyte
...,...,...,...
1629,1629,Zamboanga,Zamboanga City
1630,1630,Zamboanguita,Negros Oriental
1631,1631,Zaragoza,Nueva Ecija
1632,1632,Zarraga,Iloilo


In [13]:
names_no_abbrevs = [name.split(" (")[0] for name in names]

In [14]:
name_and_province["name_no_abbrev"] = names_no_abbrevs

In [15]:
name_and_province["clean_province"] = (
    name_and_province["province"].apply(data_cleaner).apply(correct_province)
)

name_and_province["clean_city"] = name_and_province["name_no_abbrev"].apply(
    data_cleaner
)

In [16]:
name_and_province["clean_idx"] = (
    name_and_province["clean_city"].astype(str)
    + ", "
    + name_and_province["clean_province"].astype(str)
)

name_and_province["clean_idx"] = correct_index(name_and_province["clean_idx"])

In [17]:
name_and_province

Unnamed: 0.1,Unnamed: 0,name,province,name_no_abbrev,clean_province,clean_city,clean_idx
0,0,Aborlan,Palawan,Aborlan,palawan,aborlan,"aborlan, palawan"
1,1,Abra De Ilog,Occidental Mindoro,Abra De Ilog,occidentalmindoro,abradeilog,"abradeilog, occidentalmindoro"
2,2,Abucay,Bataan,Abucay,bataan,abucay,"abucay, bataan"
3,3,Abulug,Cagayan,Abulug,cagayan,abulug,"abulug, cagayan"
4,4,Abuyog,Leyte,Abuyog,leyte,abuyog,"abuyog, leyte"
...,...,...,...,...,...,...,...
1629,1629,Zamboanga,Zamboanga City,Zamboanga,zamboanga,zamboanga,"zamboanga, zamboangadelsur"
1630,1630,Zamboanguita,Negros Oriental,Zamboanguita,negrosoriental,zamboanguita,"zamboanguita, negrosoriental"
1631,1631,Zaragoza,Nueva Ecija,Zaragoza,nuevaecija,zaragoza,"zaragoza, nuevaecija"
1632,1632,Zarraga,Iloilo,Zarraga,iloilo,zarraga,"zarraga, iloilo"


In [18]:
gdf.head()

Unnamed: 0,psgc,province,city_munic,longitude,latitude,clean_prov,clean_city,clean_inde,coords,geometry
0,102801000,Ilocos Norte,Adams,120.903571,18.461311,ilocosnorte,adams,"adams, ilocosnorte","18.4613108, 120.903571","POLYGON ((120.96105 18.44746, 120.96137 18.446..."
1,102802000,Ilocos Norte,Bacarra,120.610659,18.251738,ilocosnorte,bacarra,"bacarra, ilocosnorte","18.2517377, 120.6106589","POLYGON ((120.64023 18.25566, 120.63957 18.254..."
2,102803000,Ilocos Norte,Badoc,120.474089,17.926701,ilocosnorte,badoc,"badoc, ilocosnorte","17.9267006, 120.4740893","MULTIPOLYGON (((120.54331 17.91912, 120.54858 ..."
3,102804000,Ilocos Norte,Bangui,120.765756,18.536739,ilocosnorte,bangui,"bangui, ilocosnorte","18.5367387, 120.7657563","POLYGON ((120.79987 18.51717, 120.79823 18.514..."
4,102805000,Ilocos Norte,City of Batac,120.58175,18.0373,ilocosnorte,batac,"batac, ilocosnorte","18.0373, 120.58175","POLYGON ((120.62589 18.06064, 120.62514 18.058..."


In [19]:
gdf = gdf.sort_values("clean_inde")
name_and_province = name_and_province.sort_values("clean_idx")

In [20]:
name_and_province.columns

Index(['Unnamed: 0', 'name', 'province', 'name_no_abbrev', 'clean_province',
       'clean_city', 'clean_idx'],
      dtype='object')

In [21]:
gdf = gdf.merge(name_and_province[["name", "clean_idx"]], 
                how="left", 
                left_on="clean_inde",
                right_on="clean_idx")

In [22]:
gdf

Unnamed: 0,psgc,province,city_munic,longitude,latitude,clean_prov,clean_city,clean_inde,coords,geometry,name,clean_idx
0,1705301000,Palawan,Aborlan,118.548417,9.437101,palawan,aborlan,"aborlan, palawan","9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9...",Aborlan,"aborlan, palawan"
1,1705101000,Occidental Mindoro,Abra De Ilog,120.726826,13.443721,occidentalmindoro,abradeilog,"abradeilog, occidentalmindoro","13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373...",Abra De Ilog,"abradeilog, occidentalmindoro"
2,300801000,Bataan,Abucay,120.534870,14.721315,bataan,abucay,"abucay, bataan","14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696...",Abucay,"abucay, bataan"
3,201501000,Cagayan,Abulug,121.457273,18.443485,cagayan,abulug,"abulug, cagayan","18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ...",Abulug,"abulug, cagayan"
4,803701000,Leyte,Abuyog,125.011485,10.747102,leyte,abuyog,"abuyog, leyte","10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576...",Abuyog,"abuyog, leyte"
...,...,...,...,...,...,...,...,...,...,...,...,...
1629,931700000,Zamboanga del Sur,City of Zamboanga,122.079000,6.921400,zamboangadelsur,zamboanga,"zamboanga, zamboangadelsur","6.9214, 122.079","MULTIPOLYGON (((122.06639 6.86972, 122.06639 6...",Zamboanga,"zamboanga, zamboangadelsur"
1630,704625000,Negros Oriental,Zamboanguita,123.199424,9.100465,negrosoriental,zamboanguita,"zamboanguita, negrosoriental","9.1004649, 123.1994244","POLYGON ((123.20750 9.10485, 123.20722 9.10443...",Zamboanguita,"zamboanguita, negrosoriental"
1631,304932000,Nueva Ecija,Zaragoza,120.793554,15.447583,nuevaecija,zaragoza,"zaragoza, nuevaecija","15.4475833, 120.7935538","POLYGON ((120.81170 15.47132, 120.81309 15.470...",Zaragoza,"zaragoza, nuevaecija"
1632,603047000,Iloilo,Zarraga,122.609582,10.822379,iloilo,zarraga,"zarraga, iloilo","10.8223786, 122.6095819","POLYGON ((122.65892 10.79784, 122.65959 10.796...",Zarraga,"zarraga, iloilo"


In [23]:
keep_cols = ["psgc", "name", "city_munic", "province", "clean_idx", "geometry", "longitude", "latitude", "coords"]

In [24]:
gdf = gdf[keep_cols]

In [26]:
gdf.head()

Unnamed: 0,psgc,name,city_munic,province,clean_idx,geometry,longitude,latitude,coords
0,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9...",118.548417,9.437101,"9.4371009, 118.5484168"
1,1705101000,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro","POLYGON ((120.60896 13.35233, 120.60797 13.373...",120.726826,13.443721,"13.4437209, 120.7268262"
2,300801000,Abucay,Abucay,Bataan,"abucay, bataan","POLYGON ((120.45676 14.69671, 120.45620 14.696...",120.53487,14.721315,"14.7213146, 120.5348704"
3,201501000,Abulug,Abulug,Cagayan,"abulug, cagayan","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ...",121.457273,18.443485,"18.4434854, 121.4572732"
4,803701000,Abuyog,Abuyog,Leyte,"abuyog, leyte","POLYGON ((125.04650 10.56751, 125.04588 10.576...",125.011485,10.747102,"10.747102, 125.0114853"


In [25]:
gdf.to_file(Path(PROJECTROOT, SHAPEFILE_V2), index=False)