In [28]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import geopandas as gp
import re

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/geopandas deprecation and chained-assignment warnings.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [29]:
# Project paths, all derived from the directory the kernel was started in.

WORKINGDIR = Path.cwd()
# The project root is taken to be two levels above the working directory.
PROJECTROOT = WORKINGDIR.parents[1]

# Input: poverty estimates table (despite the _DIR suffix this is a CSV file path).
POVERTY_ESTIMATES_DIR = PROJECTROOT / "data" / "gathered-datasets" / "poverty-estimates.csv"
# Input: shapefile used as the reference list of cities/municipalities.
COMPLETE_DATASET = PROJECTROOT / "data" / "geolocation" / "ph_cities_joined_v2" / "ph_cities_v2.shp"
# Output: cleaned poverty estimates.
OUTPUT_DATASET = PROJECTROOT / "data" / "gathered-datasets" / "poverty-estimates-cleaned.csv"

In [30]:
# Poverty estimates to be cleaned (CSV).
df = pd.read_csv(POVERTY_ESTIMATES_DIR)
# Reference table of cities/municipalities, read from the shapefile.
ref_df = gp.read_file(COMPLETE_DATASET)

In [31]:
# Preview the first rows of the reference table as loaded.
ref_df.head()

Unnamed: 0,psgc,name,city_munic,province,clean_idx,longitude,latitude,coords,geometry
0,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9..."
1,1705101000,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro",120.726826,13.443721,"13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373..."
2,300801000,Abucay,Abucay,Bataan,"abucay, bataan",120.53487,14.721315,"14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696..."
3,201501000,Abulug,Abulug,Cagayan,"abulug, cagayan",121.457273,18.443485,"18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ..."
4,803701000,Abuyog,Abuyog,Leyte,"abuyog, leyte",125.011485,10.747102,"10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576..."


In [32]:
# Keep only the naming columns and the matching key from the reference table;
# the remaining columns are dropped here.
keep_cols = ["name", "clean_idx", "province", "city_munic"]
# Rebinds ref_df to the reduced column subset.
ref_df = ref_df[keep_cols]

# Preview the reduced reference table.
ref_df.head()

Unnamed: 0,name,clean_idx,province,city_munic
0,Aborlan,"aborlan, palawan",Palawan,Aborlan
1,Abra De Ilog,"abradeilog, occidentalmindoro",Occidental Mindoro,Abra De Ilog
2,Abucay,"abucay, bataan",Bataan,Abucay
3,Abulug,"abulug, cagayan",Cagayan,Abulug
4,Abuyog,"abuyog, leyte",Leyte,Abuyog


In [33]:
# Preview the first rows of the raw poverty estimates.
df.head()

Unnamed: 0,psgc,province,municipality-city,2009-poverty-inc,2012-poverty-inc,2015-poverty-inc,2009-cov,2012-cov,2015-cov,2009-conf-int-lowerlimit,2009-conf-int-upper-limit,2012-conf-int-lowerlimit,2012-conf-int-upper-limit,2015-conf-int-lowerlimit,2015-conf-int-upper-limit
0,133901,Metropolitan Manila,Tondo,29,31,56,275,204,107,16,42,21,42,46,66
1,133902,Metropolitan Manila,Binondo,10,15,35,680,419,228,0,22,5,25,22,48
2,133903,Metropolitan Manila,Quiapo,21,21,71,424,281,179,6,35,11,31,50,92
3,133904,Metropolitan Manila,San Nicolas,24,30,67,570,536,272,1,47,4,55,37,97
4,133905,Metropolitan Manila,Santa Cruz,17,15,47,325,195,127,8,25,10,20,37,57


In [34]:
def data_cleaner(text):
    """Normalise a place name into a compact, lowercase matching key.

    Steps, in order:
      1. lowercase the text;
      2. delete every occurrence of the substring "city", then of " of ";
      3. keep only what precedes the first " (" (this drops any parenthesised
         suffix, "(Capital)" included);
      4. strip punctuation (anything that is neither a word character nor
         whitespace) and then all space characters;
      5. rename "compostelavalley" to "davaodeoro";
      6. collapse any value containing "maguindanao" to exactly "maguindanao";
      7. abbreviate "santo" to "sto" and "santa" to "sta".

    Args:
        text: the name to normalise; must be a ``str``.

    Returns:
        The normalised key as a ``str``.
    """
    key = text.lower().replace("city", "").replace(" of ", "")

    # Everything from the first " (" onwards is discarded.
    key = key.split(" (")[0]

    # Punctuation goes first, then plain spaces.
    key = re.sub(r"[^\w\s]", "", key).replace(" ", "")

    # Compostela Valley is matched under its current name, Davao de Oro.
    key = key.replace("compostelavalley", "davaodeoro")

    # Any Maguindanao variant is reduced to the bare name "maguindanao".
    if "maguindanao" in key:
        key = "maguindanao"

    # Abbreviate saint prefixes: "santo" -> "sto", then "santa" -> "sta".
    for long_form, short_form in (("santo", "sto"), ("santa", "sta")):
        key = key.replace(long_form, short_form)

    return key

In [35]:
def correct_province(province):
    """Map an alternative province spelling to the one used for matching.

    "North Cotabato" becomes "Cotabato" and "Samar (Western Samar)" becomes
    "Samar"; any other value is returned unchanged.

    Args:
        province: province name exactly as it appears in the source table.

    Returns:
        The corrected province name, or the input itself when no rule applies.
    """
    renames = (
        ("North Cotabato", "Cotabato"),
        ("Samar (Western Samar)", "Samar"),
    )
    for old_name, new_name in renames:
        if province == old_name:
            return new_name
    return province

In [36]:
def correct_index(index):
    """Apply hand-curated spelling fixes to a "city, province" matching key.

    Each (wrong, right) pair is applied in order via ``index.replace``.
    For a ``str`` that is a substring replacement. The notebook passes a
    pandas Series, in which case ``.replace`` is ``Series.replace`` and
    substitutes whole values that equal the wrong spelling.

    Args:
        index: a key (or a collection of keys exposing ``.replace(old, new)``).

    Returns:
        The result of chaining all replacements, same type as ``.replace``
        returns for the input.
    """
    corrections = (
        ("divilican, isabela", "divilacan, isabela"),
        ("cobarronguis, quirino", "cabarroguis, quirino"),
        ("sanidelfonso, bulacan", "sanildefonso, bulacan"),
        ("muñoz, nuevaecija", "sciencemuñoz, nuevaecija"),
        ("belizon, antique", "belison, antique"),
        ("prescarlospgarcia, bohol", "presidentcarlospgarcia, bohol"),
        ("cordoba, cebu", "cordova, cebu"),
        ("pinamungahan, cebu", "pinamungajan, cebu"),
        ("ozamis, misamisoccidental", "ozamiz, misamisoccidental"),
        ("koronodal, southcotabato", "koronadal, southcotabato"),
        ("bumbaran, lanaodelsur", "amaimanabilang, lanaodelsur"),
    )

    fixed = index
    for wrong, right in corrections:
        fixed = fixed.replace(wrong, right)

    return fixed

In [37]:
# Harmonise province spellings element by element before building match keys.
df["province"] = df["province"].map(correct_province)

In [38]:
# Sanity check: the corrected spelling "Cotabato" is present among the provinces.
"Cotabato" in set(df["province"])

True

In [39]:
# Build normalised matching keys for the province and city names.
# data_cleaner() returns lowercase, space-free text, which can never equal the
# title-case names that correct_province() looks for ("North Cotabato",
# "Samar (Western Samar)"). Chaining correct_province after data_cleaner was
# therefore a no-op and is omitted; the raw "province" column is what gets
# corrected.
df["clean_province"] = df["province"].apply(data_cleaner)

df["clean_city"] = df["municipality-city"].apply(data_cleaner)

In [40]:
# Preview the table with the new clean_province / clean_city columns.
df.head()

Unnamed: 0,psgc,province,municipality-city,2009-poverty-inc,2012-poverty-inc,2015-poverty-inc,2009-cov,2012-cov,2015-cov,2009-conf-int-lowerlimit,2009-conf-int-upper-limit,2012-conf-int-lowerlimit,2012-conf-int-upper-limit,2015-conf-int-lowerlimit,2015-conf-int-upper-limit,clean_province,clean_city
0,133901,Metropolitan Manila,Tondo,29,31,56,275,204,107,16,42,21,42,46,66,metropolitanmanila,tondo
1,133902,Metropolitan Manila,Binondo,10,15,35,680,419,228,0,22,5,25,22,48,metropolitanmanila,binondo
2,133903,Metropolitan Manila,Quiapo,21,21,71,424,281,179,6,35,11,31,50,92,metropolitanmanila,quiapo
3,133904,Metropolitan Manila,San Nicolas,24,30,67,570,536,272,1,47,4,55,37,97,metropolitanmanila,sannicolas
4,133905,Metropolitan Manila,Santa Cruz,17,15,47,325,195,127,8,25,10,20,37,57,metropolitanmanila,stacruz


In [41]:
# Composite matching key "<clean_city>, <clean_province>", in the same format
# as the reference table's clean_idx column.
df["clean_idx"] = (
    df["clean_city"].astype(str).add(", ").add(df["clean_province"].astype(str))
)

In [42]:
# Apply the hand-curated key corrections.
# The whole Series is passed in, so every `.replace` inside correct_index is
# pandas' Series.replace (whole-value substitution), not str.replace.
# NOTE(review): confirm whole-value matching is intended here rather than an
# element-wise `.apply(correct_index)`, which would do substring replacement.
df["clean_idx"] = correct_index(df["clean_idx"])

In [43]:
# Preview the table with the composite clean_idx key added.
df.head()

Unnamed: 0,psgc,province,municipality-city,2009-poverty-inc,2012-poverty-inc,2015-poverty-inc,2009-cov,2012-cov,2015-cov,2009-conf-int-lowerlimit,2009-conf-int-upper-limit,2012-conf-int-lowerlimit,2012-conf-int-upper-limit,2015-conf-int-lowerlimit,2015-conf-int-upper-limit,clean_province,clean_city,clean_idx
0,133901,Metropolitan Manila,Tondo,29,31,56,275,204,107,16,42,21,42,46,66,metropolitanmanila,tondo,"tondo, metropolitanmanila"
1,133902,Metropolitan Manila,Binondo,10,15,35,680,419,228,0,22,5,25,22,48,metropolitanmanila,binondo,"binondo, metropolitanmanila"
2,133903,Metropolitan Manila,Quiapo,21,21,71,424,281,179,6,35,11,31,50,92,metropolitanmanila,quiapo,"quiapo, metropolitanmanila"
3,133904,Metropolitan Manila,San Nicolas,24,30,67,570,536,272,1,47,4,55,37,97,metropolitanmanila,sannicolas,"sannicolas, metropolitanmanila"
4,133905,Metropolitan Manila,Santa Cruz,17,15,47,325,195,127,8,25,10,20,37,57,metropolitanmanila,stacruz,"stacruz, metropolitanmanila"


In [44]:
# Sanity check: number of rows still labelled "North Cotabato" (expected 0).
df["province"].eq("North Cotabato").sum()

0

In [45]:
# Distinct matching keys in the reference table, in order of first appearance.
ref_idx = ref_df["clean_idx"].unique().tolist()

In [46]:
# Distinct matching keys in the poverty table, in order of first appearance.
df_idx = df["clean_idx"].unique().tolist()

In [47]:
# Keys present in the poverty table but absent from the reference table.
# Membership is tested against a set (constant-time lookup) instead of
# scanning the ref_idx list once per key; the result keeps df_idx order.
ref_idx_lookup = set(ref_idx)
not_in_ref = [name for name in df_idx if name not in ref_idx_lookup]
not_in_ref

['tondo, metropolitanmanila',
 'binondo, metropolitanmanila',
 'quiapo, metropolitanmanila',
 'sannicolas, metropolitanmanila',
 'stacruz, metropolitanmanila',
 'sampaloc, metropolitanmanila',
 'sanmiguel, metropolitanmanila',
 'ermita, metropolitanmanila',
 'intramuros, metropolitanmanila',
 'malate, metropolitanmanila',
 'paco, metropolitanmanila',
 'pandacan, metropolitanmanila',
 'portarea, metropolitanmanila',
 'staana, metropolitanmanila']

In [48]:
# Keys present in the reference table but absent from the poverty table.
# Membership is tested against a set (constant-time lookup) instead of
# scanning the df_idx list once per key; the result keeps ref_idx order.
df_idx_lookup = set(df_idx)
not_in_df = [name for name in ref_idx if name not in df_idx_lookup]
not_in_df

['cotabato, maguindanao',
 'isabela, basilan',
 'kalayaan, palawan',
 'manila, metropolitanmanila']

In [49]:
# Left-join the poverty estimates onto the reference table by clean_idx:
# every reference row is kept, and poverty rows without a matching key are dropped.
# This rebinds `df`, so re-running the cell merges the already-merged frame.
df = pd.merge(ref_df, df, how="left", on="clean_idx")

In [50]:
# Preview the merged table (shared column names get _x / _y suffixes).
df.head()

Unnamed: 0,name,clean_idx,province_x,city_munic,psgc,province_y,municipality-city,2009-poverty-inc,2012-poverty-inc,2015-poverty-inc,...,2012-cov,2015-cov,2009-conf-int-lowerlimit,2009-conf-int-upper-limit,2012-conf-int-lowerlimit,2012-conf-int-upper-limit,2015-conf-int-lowerlimit,2015-conf-int-upper-limit,clean_province,clean_city
0,Aborlan,"aborlan, palawan",Palawan,Aborlan,175301.0,Palawan,Aborlan,234,224,216,...,124,118,166,302,178,269,174,257,palawan,aborlan
1,Abra De Ilog,"abradeilog, occidentalmindoro",Occidental Mindoro,Abra De Ilog,175101.0,Occidental Mindoro,Abra de Ilog,375,323,348,...,138,125,274,475,250,396,276,419,occidentalmindoro,abradeilog
2,Abucay,"abucay, bataan",Bataan,Abucay,30801.0,Bataan,Abucay,70,40,145,...,258,205,36,104,23,56,96,194,bataan,abucay
3,Abulug,"abulug, cagayan",Cagayan,Abulug,21501.0,Cagayan,Abulug,216,209,144,...,99,150,173,259,175,243,108,179,cagayan,abulug
4,Abuyog,"abuyog, leyte",Leyte,Abuyog,83701.0,Leyte,Abuyog,354,359,361,...,62,77,321,386,323,396,315,406,leyte,abuyog


In [51]:
# Reshape from wide (one column per year and metric) to long (one row per
# place and year): build one frame per survey year with the year prefix
# stripped from the metric columns, plus an explicit "year" column.
years = ["2009", "2012", "2015"]

df_list = []

for year in years:
    # Year columns are named "<year>-<metric>". Matching on the prefix (rather
    # than "year appears anywhere in the name") keeps the slice below in step
    # with the columns actually selected.
    prefix = f"{year}-"
    col_years = [col for col in df.columns if col.startswith(prefix)]

    # rename() and assign() each return a new frame, so nothing is written
    # into a slice of `df` (no chained assignment, no inplace mutation).
    df_year = (
        df[["name"] + col_years]
        .rename(columns={col: col[len(prefix):] for col in col_years})
        .assign(year=str(year))
    )
    df_list.append(df_year)

In [52]:
# Stack the per-year frames row-wise. Each frame keeps its own index labels,
# so the labels repeat once per year in the result.
df_merged = pd.concat(df_list)

In [53]:
# Write the long-format table to OUTPUT_DATASET. to_csv is called with its
# defaults, so the frame's index is written as the first, unnamed column.
df_merged.to_csv(OUTPUT_DATASET)
# Display the exported frame.
df_merged

Unnamed: 0,name,poverty-inc,cov,conf-int-lowerlimit,conf-int-upper-limit,year
0,Aborlan,234,177,166,302,2009
1,Abra De Ilog,375,164,274,475,2009
2,Abucay,70,295,36,104,2009
3,Abulug,216,121,173,259,2009
4,Abuyog,354,55,321,386,2009
...,...,...,...,...,...,...
1629,Zamboanga,173,90,147,198,2015
1630,Zamboanguita,399,101,333,465,2015
1631,Zaragoza,132,155,98,166,2015
1632,Zarraga,174,148,132,216,2015
