In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def replace_region(name):
    """Rewrite a census region heading into the canonical region label.

    Every known heading found inside ``name`` is substituted; text that
    contains none of them is returned unchanged.

    Parameters
    ----------
    name : object
        Cell value from the label column. Usually a string, but blank rows
        arrive as NaN (or None).

    Returns
    -------
    object
        The rewritten string, or ``name`` itself when it is not a string.
    """
    # Non-string cells (NaN, None, ...) have nothing to rename. Checking the
    # type explicitly replaces a bare ``except`` that would also have hidden
    # KeyboardInterrupt and genuine programming errors.
    if not isinstance(name, str):
        return name

    # (heading as it appears in the census file, canonical label).
    # The substitutions are applied in this order.
    renames = (
        ("MIMAROPA REGION", "REGION IV-B (MIMAROPA)"),
        ("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)"),
        ("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)"),
        ("REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"),
        ("REGION I - ILOCOS", "REGION I (Ilocos Region)"),
        ("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)"),
        (
            "CORDILLERA ADMINISTRATIVE REGION",
            "CAR - Cordillera Administrative Region",
        ),
        ("REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"),
        (
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        ),
        ("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)"),
        ("REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"),
        ("REGION XIII - CARAGA", "REGION XIII (Caraga)"),
        ("REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"),
        ("REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"),
        ("REGION V - BICOL", "REGION V (Bicol Region)"),
        ("REGION XI - DAVAO", "REGION XI (Davao Region)"),
        ("NATIONAL CAPITAL REGION", "NCR - National Capital Region"),
    )
    for census_heading, canonical in renames:
        name = name.replace(census_heading, canonical)
    return name

In [3]:
# Setting up directories

# Path.cwd() is used instead of os.getcwd(): `os` is not imported in this
# notebook, so the os-based form raises NameError on a fresh kernel.
WORKINGDIR = Path.cwd()
# The project root is taken to be the grandparent of the working directory.
PROJECTROOT = WORKINGDIR.parents[1]

# All housing-census inputs and outputs live in the same folder.
HOUSING_CENSUS_DIR = Path(PROJECTROOT, "data", "gathered-datasets", "housing-census")

# Raw census table read by this notebook.
DATASET = HOUSING_CENSUS_DIR / "housing-census-housetype-2015.csv"

# Region -> province lookup table.
REG_PROV = HOUSING_CENSUS_DIR / "region-province.csv"

# Destination of the flattened table written at the end of the notebook.
DATASET_DEST = HOUSING_CENSUS_DIR / "housing-census-housetype-2015-flattened.csv"

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [4]:
# Load the raw census table and keep only its first 7 columns.
df = pd.read_csv(DATASET).iloc[:, :7]

In [5]:
# Preview the trimmed raw table.
df.head()

Unnamed: 0,Type of Building and City/Municipality,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units
0,,,,,,,
1,NATIONAL CAPITAL REGION,,,,,,
2,Total,2.968.651,3.095.484,12.786.611,413.0,104.0,431.0
3,Single house,1.333.067,1.396.332,6.378.542,457.0,105.0,478.0
4,Duplex,410.775,427.606,1.733.813,405.0,104.0,422.0


In [6]:
# Drop the per-area "Total" rows. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df.loc[df["Type of Building and City/Municipality"] != "Total"].copy()

In [7]:
# Group into regions
# A row whose cells are all null marks the start of a new region block; the
# running count of such rows gives every row its region group id.
# Any existing "group" column is excluded from the null check so that
# re-running this cell gives the same ids (with the column included, no row
# is entirely null and every id would collapse to 0).
df["group"] = (
    df.drop(columns="group", errors="ignore").isnull().all(axis=1).cumsum()
)

In [8]:
# Distinct region group ids, in order of first appearance.
regions = list(df["group"].unique())

In [9]:
# Preview the table with the new "group" column.
df.head()

Unnamed: 0,Type of Building and City/Municipality,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units,group
0,,,,,,,,1
1,NATIONAL CAPITAL REGION,,,,,,,1
3,Single house,1.333.067,1.396.332,6.378.542,457.0,105.0,478.0,1
4,Duplex,410.775,427.606,1.733.813,405.0,104.0,422.0,1
5,Multi-unit residential,1.207.374,1.252.815,4.608.041,368.0,104.0,382.0,1


In [10]:
# Rewrite region headings in the label column to their canonical names;
# all other labels pass through replace_region unchanged.
df["Type of Building and City/Municipality"] = df[
    "Type of Building and City/Municipality"
].map(replace_region)

In [11]:
# Statistic columns read for every (city, building type) pair.
# The names carry leading/trailing spaces; parse_city uses them verbatim as
# column keys and strips them only for the output keys.
# NOTE(review): presumably these match the raw CSV headers exactly - confirm.
housing_stats = [
    " Occupied Housing Units ",
    " Number of Households* ",
    " Household Population* ",
    " Average Household Size ",
    " Ratio of Households to Occupied Housing Units ",
    " Ratio of Household Population to Occupied Housing Units ",
]

In [12]:
# Building-type labels as they appear in the label column. parse_province
# uses this list to tell building-type rows from city rows, and parse_city
# emits one record per entry.
bldg_types = [
    "Single house",
    "Duplex",
    "Multi-unit residential",
    "Commercial/industrial/agricultural",
    "Institutional living quarter",
    "Others",
    "Not Reported",
]

In [13]:
# Region -> province lookup table; parse_region reads it as a global to find
# the province names that belong to each region.
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
# Global accumulator: parse_city appends one dict per (city, building type).
all_vals = []

In [15]:
def parse_region(df):
    """Split one region's rows into province chunks and parse each chunk.

    The region heading is read from index label 1 of the label column, so
    ``df`` is expected to have a fresh 0..n-1 index. Rows whose label is one
    of this region's provinces (looked up in the global ``rp_df``) start a new
    chunk; the rows before the first province form chunk 0. Every chunk,
    including chunk 0, is handed to ``parse_province``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single region. It is copied, not modified.

    Returns
    -------
    None
    """
    name_col = "Type of Building and City/Municipality"

    df = df.copy()
    region_name = df[name_col].loc[1]

    # Provinces that the lookup table assigns to this region.
    in_region = rp_df["region"] == region_name
    provinces = list(rp_df.loc[in_region, "province"].unique())

    # Running count of province header rows = chunk number of each row.
    df["province_no"] = df[name_col].isin(provinces).cumsum()

    # sort=False keeps the chunks in order of first appearance.
    for _, province_df in df.groupby("province_no", sort=False):
        parse_province(province_df, region_name)

In [16]:
def parse_province(province_df, region_name):
    """Find the city rows inside one province chunk and parse each city.

    The label in the first row is treated as the province name. Every other
    distinct, non-null label that is neither the region name, the province
    name, nor a building type (global ``bldg_types``) is treated as a city.
    For each city a progress line is printed and ``parse_city`` is called with
    the position of the first row carrying that label.

    Parameters
    ----------
    province_df : pandas.DataFrame
        Rows of one province chunk. It is copied and re-indexed 0..n-1.
    region_name : str
        Canonical name of the enclosing region.

    Returns
    -------
    None
    """
    name_col = "Type of Building and City/Municipality"

    province_df = province_df.copy().reset_index()
    labels = province_df[name_col]
    province_name = labels.iloc[0]
    not_cities = [region_name, province_name]

    for city in labels.dropna().unique():
        if city in not_cities or city in bldg_types:
            continue

        print(f">>> Parsing {city},{province_name} in {region_name}...")
        # Position of the first row carrying this city label.
        idx = province_df.loc[labels == city].index.tolist()[0]

        parse_city(city.strip(), region_name, province_df, idx)

In [17]:
def parse_city(city, region_name, province_df, idx):
    """Build one record per building type for a single city.

    The city's rows are the slice of ``province_df`` starting at position
    ``idx`` (the city header row) and spanning one further row per entry of
    the global ``bldg_types``. For every building type the statistics listed
    in the global ``housing_stats`` are converted to floats.

    Each record is appended to the global ``all_vals`` and also collected in
    the returned list.

    Parameters
    ----------
    city : str
        City name stored in each record.
    region_name : str
        Region name stored in each record.
    province_df : pandas.DataFrame
        Province chunk containing the city's rows.
    idx : int
        Position of the city header row inside ``province_df``.

    Returns
    -------
    list of dict
        One dict per building type with keys ``city``, ``region_name``,
        ``bldg_type`` and one key per statistic (name stripped of surrounding
        spaces). A value that is missing or not numeric is stored as NaN.
    """
    name_col = "Type of Building and City/Municipality"

    # Count columns use "." as a thousands separator; the remaining columns
    # use "," as the decimal mark.
    count_stats = {
        " Occupied Housing Units ",
        " Number of Households* ",
        " Household Population* ",
    }

    # City header row plus one row per building type.
    city_df = province_df[idx : idx + len(bldg_types) + 1]

    city_vals_list = []

    for bldg_type in bldg_types:
        # The matching row does not depend on the statistic, so look it up once.
        rows = city_df.loc[city_df[name_col] == bldg_type]

        city_vals = {
            "city": city,
            "region_name": region_name,
            "bldg_type": bldg_type,
        }

        for housing_stat in housing_stats:
            cells = rows[housing_stat].values
            try:
                # IndexError: this city has no row for the building type.
                raw = str(cells[0]).strip()
                if housing_stat in count_stats:
                    raw = raw.replace(".", "")
                else:
                    raw = raw.replace(",", ".")
                # ValueError: the cell holds something that is not a number.
                parsed = float(raw)
            except (IndexError, ValueError):
                parsed = float("nan")

            city_vals[housing_stat.strip()] = parsed

        all_vals.append(city_vals)
        city_vals_list.append(city_vals)

    return city_vals_list

In [18]:
# Parse every region block; the records accumulate in the global all_vals.
for region in regions:
    # parse_region reads the region heading at index label 1, so the chunk
    # needs a fresh 0..n-1 index.
    df_ = df.loc[df["group"] == region].reset_index()
    parse_region(df_)

>>> Parsing  CITY OF MANILA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MANDALUYONG, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MARIKINA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF PASIG, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  QUEZON CITY, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF SAN JUAN, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CALOOCAN CITY, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MALABON, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF NAVOTAS, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF VALENZUELA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF LAS PIÑAS, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MAKATI, METROPOLITAN MANILA in 

In [19]:
# Turn the accumulated records into a table; exact duplicate rows are dropped.
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,bldg_type,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units
0,CITY OF MANILA,NCR - National Capital Region,Single house,108277.0,118203.0,538084.0,4.55,1.09,4.97
1,CITY OF MANILA,NCR - National Capital Region,Duplex,57101.0,60966.0,248820.0,4.08,1.07,4.36
2,CITY OF MANILA,NCR - National Capital Region,Multi-unit residential,238997.0,250012.0,953978.0,3.82,1.05,3.99
3,CITY OF MANILA,NCR - National Capital Region,Commercial/industrial/agricultural,2427.0,2668.0,9526.0,3.57,1.10,3.93
4,CITY OF MANILA,NCR - National Capital Region,Institutional living quarter,95.0,104.0,367.0,3.53,1.09,3.86
...,...,...,...,...,...,...,...,...,...
11440,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,Multi-unit residential,,,,,,
11441,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,Commercial/industrial/agricultural,6.0,7.0,41.0,5.86,1.17,6.83
11442,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,Institutional living quarter,,,,,,
11443,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,Others,,,,,,


In [20]:
# Write the flattened table; the DataFrame index is written as the first column.
final_df.to_csv(DATASET_DEST)

In [22]:
# Preview the flattened result.
final_df.head()

Unnamed: 0,city,region_name,bldg_type,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units
0,CITY OF MANILA,NCR - National Capital Region,Single house,108277.0,118203.0,538084.0,4.55,1.09,4.97
1,CITY OF MANILA,NCR - National Capital Region,Duplex,57101.0,60966.0,248820.0,4.08,1.07,4.36
2,CITY OF MANILA,NCR - National Capital Region,Multi-unit residential,238997.0,250012.0,953978.0,3.82,1.05,3.99
3,CITY OF MANILA,NCR - National Capital Region,Commercial/industrial/agricultural,2427.0,2668.0,9526.0,3.57,1.1,3.93
4,CITY OF MANILA,NCR - National Capital Region,Institutional living quarter,95.0,104.0,367.0,3.53,1.09,3.86
