In [423]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [424]:
def replace_region(name):
    try:
        name = name.replace("MIMAROPA REGION", "REGION IV-B (MIMAROPA)")
        name = name.replace("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)")
        name = name.replace("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)")
        name = name.replace(
            "REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"
        )
        name = name.replace("REGION I - ILOCOS", "REGION I (Ilocos Region)")
        name = name.replace("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)")
        name = name.replace(
            "CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"
        )
        name = name.replace(
            "REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"
        )
        name = name.replace(
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        )
        name = name.replace("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)")
        name = name.replace(
            "REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"
        )
        name = name.replace("REGION XIII - CARAGA", "REGION XIII (Caraga)")
        name = name.replace(
            "REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"
        )
        name = name.replace(
            "REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"
        )
        name = name.replace("REGION V - BICOL", "REGION V (Bicol Region)")
        name = name.replace("REGION XI - DAVAO", "REGION XI (Davao Region)")
        name = name.replace("NATIONAL CAPITAL REGION", "NCR - National Capital Region")
        return name
    except:
        return name

In [425]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

DATASET = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-housetype-2015.csv",
)

REG_PROV = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "region-province.csv",
)

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [426]:
df = pd.read_csv(DATASET)
df = df.iloc[:, :7]  # Take only first 7 columns

In [427]:
df.head()

Unnamed: 0,Type of Building and City/Municipality,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units
0,,,,,,,
1,NATIONAL CAPITAL REGION,,,,,,
2,Total,2.968.651,3.095.484,12.786.611,413.0,104.0,431.0
3,Single house,1.333.067,1.396.332,6.378.542,457.0,105.0,478.0
4,Duplex,410.775,427.606,1.733.813,405.0,104.0,422.0


In [428]:
df = df.loc[df["Type of Building and City/Municipality"] != "Total"]

In [429]:
# Group into regions
df["group"] = df.isnull().all(axis=1).cumsum()

In [430]:
regions = list(df["group"].unique())

In [431]:
df.head()

Unnamed: 0,Type of Building and City/Municipality,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units,group
0,,,,,,,,1
1,NATIONAL CAPITAL REGION,,,,,,,1
3,Single house,1.333.067,1.396.332,6.378.542,457.0,105.0,478.0,1
4,Duplex,410.775,427.606,1.733.813,405.0,104.0,422.0,1
5,Multi-unit residential,1.207.374,1.252.815,4.608.041,368.0,104.0,382.0,1


In [432]:
df["Type of Building and City/Municipality"] = df[
    "Type of Building and City/Municipality"
].apply(replace_region)

In [433]:
housing_stats = [
    " Occupied Housing Units ",
    " Number of Households* ",
    " Household Population* ",
    " Average Household Size ",
    " Ratio of Households to Occupied Housing Units ",
    " Ratio of Household Population to Occupied Housing Units ",
]

In [434]:
bldg_types = [
    "Single house",
    "Duplex",
    "Multi-unit residential",
    "Commercial/industrial/agricultural",
    "Institutional living quarter",
    "Others",
    "Not Reported",
]

In [435]:
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [539]:
def parse_region(df):
    df = df.copy()
    
    region_name = df["Type of Building and City/Municipality"].loc[1]
    unique_vals = df["Type of Building and City/Municipality"].dropna().unique()
    

    provinces = list(rp_df["province"].loc[rp_df["region"]==region_name].unique()) 
    
    df["province_no"] = df["Type of Building and City/Municipality"].isin(provinces).cumsum()
    
    province_no_list = df["province_no"].unique()
    
    for province in province_no_list:
            province_df = df.loc[df["province_no"]==province]
            parse_province(province_df, region_name)

In [540]:
def parse_province(df, region_name):
    df = df.copy()
    province_name = df["Type of Building and City/Municipality"].iloc[0]
    print(province_name)
    unique_vals = df["Type of Building and City/Municipality"].dropna().unique()
    
    cities = [val for val in unique_vals if val not in [region_name, province_name] and val not in bldg_types]
    
    for city in cities:
        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = (
            df.loc[df["Type of Building and City/Municipality"] == city].index
        ).tolist()[0]
        
        print(f"idx: {idx}")
        print(df["Type of Building and City/Municipality"].loc[idx])
        
        city_vals_list = parse_city(city.strip(), region_name, df, idx)

        for row in city_vals_list:
            all_vals.append(row)

In [541]:
def parse_city(city, region_name, df, idx):
    df = df.reset_index().copy()
    df = df[idx:idx+8].reset_index()
    print(df)


    city_vals_list = []

    for bldg_type in bldg_types:
        city_vals = {}
        city_vals["city"] = city
        city_vals["region_name"] = region_name
        city_vals["bldg_type"] = bldg_type

        for housing_stat in housing_stats:

            df_ = df.loc[df["Type of Building and City/Municipality"] == bldg_type]
            print(df_)

            replace_period = [
                " Occupied Housing Units ",
                " Number of Households* ",
                " Household Population* ",
            ]

            if housing_stat in replace_period:
                val = df_[housing_stat].values
                print(val)
                val = str(val[0]).strip()
                val = val.replace(".", "")

            else:
                val = df_[housing_stat].values
                val = str(val[0]).strip()
                val = val.replace(",", ".")

            try:
                city_vals[housing_stat.strip()] = float(val)
            except:
                city_vals[housing_stat.strip()] = float(np.nan)

            city_vals_list.append(city_vals)

    return city_vals_list

In [542]:
for region in regions:
    print(region)
    df_ = df.loc[df["group"] == region].reset_index()
    region = df_.iloc[1]["Type of Building and City/Municipality"]
    parse_region(df_)

1
nan
 METROPOLITAN MANILA
>>> Parsing  CITY OF MANILA, METROPOLITAN MANILA in NCR - National Capital Region...
idx: 17
 CITY OF MANILA
   level_0  index Type of Building and City/Municipality  \
0       26     30                           Single house   

   Occupied Housing Units   Number of Households*   Household Population*   \
0                  34.162                  35.511                 167.356    

   Average Household Size   Ratio of Households to Occupied Housing Units   \
0                    4,71                                            1,04    

   Ratio of Household Population to Occupied Housing Units   group  \
0                                              4,90              1   

   province_no  
0            1  
['  34.162 ']
   level_0  index Type of Building and City/Municipality  \
0       26     30                           Single house   

   Occupied Housing Units   Number of Households*   Household Population*   \
0                  34.162                

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()

## Name matching

In [88]:
def data_cleaner(text):
    # Make lower caps
    text = text.lower()

    # Remove "city"
    text = text.replace("city", "")

    # Remove " of "
    text = text.replace(" of ", "")

    # Remove all strings within parentheses
    text = text.split(" (")[0]

    # Remove (Capital)
    text = text.replace(" (capital)", "")

    # Remove punctuation marks
    text = re.sub(r"[^\w\s]", "", text)

    # Remove spaces
    text = text.replace(" ", "")

    # Replace compostela valley with Davao de Oro
    text = text.replace("compostelavalley", "davaodeoro")

    # Replace Maguindanao with del Norte/del Sur
    if "maguindanao" in text:
        text = "maguindanao"

    # Replace santo with sto
    text = text.replace("santo", "sto")

    # Replace santa with sta
    text = text.replace("santa", "sta")

    return text

In [90]:
REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [91]:
ref_df = gp.read_file(REF_DF)

In [92]:
ref_df.head()

Unnamed: 0,name,city_munic,province,region,coords,geometry
0,Aborlan,Aborlan,Palawan,REGION IV-B (MIMAROPA),"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9..."
1,Abra De Ilog,Abra De Ilog,Occidental Mindoro,REGION IV-B (MIMAROPA),"13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373..."
2,Abucay,Abucay,Bataan,REGION III (Central Luzon),"14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696..."
3,Abulug,Abulug,Cagayan,REGION II (Cagayan Valley),"18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ..."
4,Abuyog,Abuyog,Leyte,REGION VIII (Eastern Visayas),"10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576..."


In [93]:
ref_df["clean_city"] = ref_df["city_munic"].apply(data_cleaner)
ref_df["clean_reg"] = ref_df["region"].apply(data_cleaner)

ref_df["clean_idx"] = (
    ref_df["clean_city"].astype(str) + ", " + ref_df["clean_reg"].astype(str)
)

In [94]:
ref_df.head()

Unnamed: 0,name,city_munic,province,region,coords,geometry,clean_city,clean_reg,clean_idx
0,Aborlan,Aborlan,Palawan,REGION IV-B (MIMAROPA),"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9...",aborlan,regionivb,"aborlan, regionivb"
1,Abra De Ilog,Abra De Ilog,Occidental Mindoro,REGION IV-B (MIMAROPA),"13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373...",abradeilog,regionivb,"abradeilog, regionivb"
2,Abucay,Abucay,Bataan,REGION III (Central Luzon),"14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696...",abucay,regioniii,"abucay, regioniii"
3,Abulug,Abulug,Cagayan,REGION II (Cagayan Valley),"18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ...",abulug,regionii,"abulug, regionii"
4,Abuyog,Abuyog,Leyte,REGION VIII (Eastern Visayas),"10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576...",abuyog,regionviii,"abuyog, regionviii"


In [95]:
final_df["region_name"] = final_df["region_name"].apply(replace_region)

In [96]:
final_df["clean_city"] = final_df["city"].apply(data_cleaner)
final_df["clean_reg"] = final_df["region_name"].apply(data_cleaner)

final_df["clean_idx"] = (
    final_df["clean_city"].astype(str) + ", " + final_df["clean_reg"].astype(str)
)

Some cities and regions are non-unique. Let's deal with those.

In [203]:
slice = final_df.loc[final_df["bldg_type"] == "Single house"]

In [204]:
unique_idx_reg = list(slice["clean_idx"])

In [205]:
newlist = []  # empty list to hold unique elements from the list
duplist = []  # empty list to hold the duplicate elements from the list
for i in unique_idx_reg:
    if i not in newlist:
        newlist.append(i)
    else:
        duplist.append(i)

In [206]:
duplist

['sanjuan, regioni',
 'sanjose, regioniii',
 'quezon, regioniva',
 'rizal, regionivb',
 'danao, regionvii',
 'valencia, regionvii',
 'magsaysay, regionx',
 'sanfrancisco, regionxiii']

In [222]:
final_df = final_df.loc[final_df["city"] != "BATANGAS"]

In [223]:
dup_idx = "sanjuan, regioni"

In [226]:
slice.loc[slice["city"] == "BATANGAS"]

Unnamed: 0,city,region_name,bldg_type,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units,clean_city,clean_reg,clean_idx


In [225]:
ref_df.loc[ref_df["clean_idx"] == dup_idx]

Unnamed: 0,name,city_munic,province,region,coords,geometry,clean_city,clean_reg,clean_idx
1282,San Juan (IS),San Juan,Ilocos Sur,REGION I (Ilocos Region),"17.7422338, 120.4583071","POLYGON ((120.47397 17.71095, 120.47037 17.712...",sanjuan,regioni,"sanjuan, regioni"
1283,San Juan (LU),San Juan,La Union,REGION I (Ilocos Region),"16.6699328, 120.3385507","POLYGON ((120.43591 16.63852, 120.42805 16.639...",sanjuan,regioni,"sanjuan, regioni"


In [200]:
new_idx = "sanjuan-is, regioni"

In [201]:
final_df.loc[final_df["city"] == "SAN JUAN (LAPOG)"]["clean_idx"] = new_idx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[final_df["city"]=="SAN JUAN (LAPOG)"]["clean_idx"] = new_idx


In [227]:
provinces = list(ref_df["province"].unique())
provinces

['Palawan',
 'Occidental Mindoro',
 'Bataan',
 'Cagayan',
 'Leyte',
 'Ilocos Norte',
 'Quezon',
 'Quirino',
 'Pangasinan',
 'Batangas',
 'La Union',
 'Ifugao',
 'Iloilo',
 'Basilan',
 'Sarangani',
 'Cotabato',
 'Laguna',
 'Bohol',
 'Cebu',
 'Romblon',
 'Surigao del Norte',
 'Cavite',
 'Nueva Vizcaya',
 'Nueva Ecija',
 'Isabela',
 'Zamboanga Sibugay',
 'Ilocos Sur',
 'Northern Samar',
 'Samar',
 'Biliran',
 'Misamis Occidental',
 'Aklan',
 'Misamis Oriental',
 'Lanao del Sur',
 'Negros Oriental',
 'Maguindanao del Sur',
 'Southern Leyte',
 'Tarlac',
 'Bulacan',
 'Pampanga',
 'Rizal',
 'Antique',
 'Masbate',
 'Eastern Samar',
 'Davao del Norte',
 'Benguet',
 'Zamboanga del Sur',
 'Camarines Sur',
 'Albay',
 'Oriental Mindoro',
 'Lanao del Norte',
 'Negros Occidental',
 'Zamboanga del Norte',
 'Catanduanes',
 'Davao Oriental',
 'Sultan Kudarat',
 'Kalinga',
 'Aurora',
 'South Cotabato',
 'Abra',
 'Davao del Sur',
 'Sorsogon',
 'Maguindanao del Norte',
 'Mountain Province',
 'Surigao del S

In [238]:
mm_pr = list(final_df["city"].unique())
mm_pr = [val.split(" (e")[0] for val in mm_pr]
mm_pr.sort()

mm_pr

['ABORLAN',
 'ABRA',
 'ABRA DE ILOG',
 'ABUCAY',
 'ABULUG',
 'ABUYOG',
 'ADAMS',
 'AGDANGAN',
 'AGLIPAY',
 'AGNO',
 'AGONCILLO',
 'AGOO',
 'AGUILAR',
 'AGUINALDO',
 'AGUSAN DEL NORTE',
 'AGUSAN DEL SUR',
 'AGUTAYA',
 'AJUY',
 'AKBAR',
 'AKLAN',
 'AL-BARKA',
 'ALABAT',
 'ALABEL (Capital)',
 'ALAMADA',
 'ALAMINOS',
 'ALANGALANG',
 'ALBAY',
 'ALBUERA',
 'ALBURQUERQUE',
 'ALCALA',
 'ALCANTARA',
 'ALCOY',
 'ALEGRIA',
 'ALEOSAN',
 'ALFONSO',
 'ALFONSO CASTANEDA',
 'ALFONSO LISTA (POTIA)',
 'ALIAGA',
 'ALICIA',
 'ALILEM',
 'ALIMODIAN',
 'ALITAGTAG',
 'ALLACAPAN',
 'ALLEN',
 'ALMAGRO',
 'ALMERIA',
 'ALOGUINSAN',
 'ALORAN',
 'ALTAVAS',
 'ALUBIJID',
 'AMADEO',
 'AMBAGUIO',
 'AMLAN (AYUQUITAN)',
 'AMPATUAN',
 'AMULUNG',
 'ANAHAWAN',
 'ANAO',
 'ANDA',
 'ANGADANAN',
 'ANGAT',
 'ANGELES CITY',
 'ANGONO',
 'ANILAO',
 'ANINI-Y',
 'ANTEQUERA',
 'ANTIPAS',
 'ANTIQUE',
 'APALIT',
 'APARRI',
 'APAYAO',
 'ARACELI',
 'ARAKAN',
 'ARAYAT',
 'ARGAO',
 'ARINGAY',
 'ARITAO',
 'AROROY',
 'ARTECHE',
 'ASINGAN',
 '

In [235]:
ok = [col for col in mm_pr if "(excluding" in col]
ok.sort()
ok

['AGUSAN DEL NORTE (excluding Butuan City)',
 'BENGUET (excluding Baguio City)',
 'DAVAO DEL SUR (excluding Davao City)',
 'LANAO DEL NORTE (excluding Iligan City)',
 'MISAMIS ORIENTAL (excluding Cagayan de Oro City)',
 'NEGROS OCCIDENTAL (excluding Bacolod City)',
 'PALAWAN (excluding Puerto Princesa City)',
 'PAMPANGA (excluding Angeles City)',
 'QUEZON (excluding Lucena City)',
 'SOUTH COTABATO (excluding General Santos City)',
 'ZAMBALES (excluding Olongapo City)',
 'ZAMBOANGA DEL SUR (excluding Zamboanga City)']

In [239]:
not_in_mmpr = [prov for prov in provinces if prov.upper() not in mm_pr]
not_in_mmpr.sort()

not_in_mmpr

['Batangas',
 'Cavite',
 'Cebu',
 'Cotabato',
 'Davao de Oro',
 'Iloilo',
 'Maguindanao del Norte',
 'Maguindanao del Sur',
 'Masbate',
 'Metropolitan Manila',
 'Romblon',
 'Samar',
 'Siquijor',
 'Sorsogon',
 'Tarlac']

In [211]:
final_df.loc[final_df["city"] == "SAN JUAN (LAPOG)"]["clean_idx"]

6006    sanjuan, regioni
6012    sanjuan, regioni
6018    sanjuan, regioni
6024    sanjuan, regioni
6030    sanjuan, regioni
6036    sanjuan, regioni
6042    sanjuan, regioni
Name: clean_idx, dtype: object

In [202]:
ref_df.loc[ref_df["name"] == "San Juan (IS)"]["clean_idx"] = new_idx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
