In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Ordered (census spelling, lookup-table spelling) pairs. They are applied one
# after another as plain substring replacements.
_REGION_RENAMES = (
    ("MIMAROPA REGION", "REGION IV-B (MIMAROPA)"),
    ("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)"),
    ("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)"),
    ("REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"),
    ("REGION I - ILOCOS", "REGION I (Ilocos Region)"),
    ("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)"),
    ("CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"),
    ("REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"),
    (
        "AUTONOMOUS REGION IN MUSLIM MINDANAO",
        "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
    ),
    ("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)"),
    ("REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"),
    ("REGION XIII - CARAGA", "REGION XIII (Caraga)"),
    ("REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"),
    ("REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"),
    ("REGION V - BICOL", "REGION V (Bicol Region)"),
    ("REGION XI - DAVAO", "REGION XI (Davao Region)"),
    ("NATIONAL CAPITAL REGION", "NCR - National Capital Region"),
)


def replace_region(name):
    """Rewrite a census region name into the spelling used by the lookup table.

    Parameters
    ----------
    name : object
        A cell value from the label column. Strings have every known region
        spelling replaced; strings without a known spelling come back unchanged.

    Returns
    -------
    object
        The rewritten string, or ``name`` itself when it is not a string
        (for example the NaN of an empty cell).
    """
    # Non-string cells (NaN, None) are passed through untouched. This replaces
    # a bare ``except:`` that silently swallowed every error to get the same
    # pass-through.
    if not isinstance(name, str):
        return name

    for census_name, lookup_name in _REGION_RENAMES:
        name = name.replace(census_name, lookup_name)
    return name

In [3]:
# Setting up directories
#
# All paths are derived from the kernel's working directory; the project root
# is taken to be two levels above it.
# Path.cwd() is used instead of os.getcwd(): `os` is never imported in this
# notebook, so the original line raised NameError on a fresh kernel.

WORKINGDIR = Path.cwd()
PROJECTROOT = WORKINGDIR.parents[1]

# Folder holding the housing-census inputs and the flattened output.
HOUSING_CENSUS_DIR = Path(PROJECTROOT, "data", "gathered-datasets", "housing-census")

# Raw census table that this notebook flattens.
DATASET = Path(HOUSING_CENSUS_DIR, "housing-census-housing-material-2015.csv")

# Region -> province lookup table.
REG_PROV = Path(HOUSING_CENSUS_DIR, "region-province.csv")

# Destination of the flattened (long-format) table.
DATASET_DEST = Path(
    HOUSING_CENSUS_DIR, "housing-census-housing-material-2015-flattened.csv"
)

# Shapefile path; defined here but not read in the cells below.
REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [4]:
# Load the raw census table from the CSV at DATASET.
# NOTE(review): the preview below shows figures such as 2.968.651 next to
# 196.38 and 190.0 — "." looks like a thousands separator that has partly been
# turned into a decimal point. Confirm against the source before trusting the
# numeric columns.
df = pd.read_csv(DATASET)
# Preview the first rows.
df.head()

Unnamed: 0,Construction Materials of the Outer Walls and City/Municipality,Total Occupied Housing Units,Construction Materials of the Roof,Galvanized iron/aluminum,Tile/concrete/clay tile,Half galvanized iron and half concrete,Bamboo/cogon/ nipa/anahaw,Asbestos,Makeshift/ salvaged/ improvised materials,Trapal,Others,Not Reported
0,,,,,,,,,,,,
1,NATIONAL CAPITAL REGION,,,,,,,,,,,
2,Total,2.968.651,2.522.479,196.38,223.553,5.88,2.845,8.938,6.1,2.105,371,
3,Concrete/brick/stone,1.996.376,1.786.599,173.355,33.435,190.0,1.928,226.0,129.0,514.0,-,
4,Wood,327.593,279.929,10.444,28.26,3.704,485.0,2.079,1.685,1.005,2,


In [5]:
# Drop every row whose label is exactly "Total".
# .copy() makes the result an independent frame: the following cells assign
# new columns to it, which on a filtered slice triggers pandas'
# SettingWithCopyWarning and may not write where expected.
df = df.loc[
    df["Construction Materials of the Outer Walls and City/Municipality"] != "Total"
].copy()

In [6]:
# Group into regions
# Each completely empty row starts a new group: the running count of such rows
# is used as the group id.
# The existing "group" column (if any) is left out of the emptiness test.
# Otherwise, on a re-run of this cell, no row is all-null any more and every
# row would be assigned group 0.
df["group"] = (
    df.drop(columns="group", errors="ignore").isnull().all(axis=1).cumsum()
)

In [7]:
# Distinct group ids, in order of first appearance; despite the name these are
# the numeric ids from the "group" column, not region names.
regions = list(df["group"].unique())

In [8]:
# Preview the frame now that the "Total" rows are gone and "group" is present.
df.head()

Unnamed: 0,Construction Materials of the Outer Walls and City/Municipality,Total Occupied Housing Units,Construction Materials of the Roof,Galvanized iron/aluminum,Tile/concrete/clay tile,Half galvanized iron and half concrete,Bamboo/cogon/ nipa/anahaw,Asbestos,Makeshift/ salvaged/ improvised materials,Trapal,Others,Not Reported,group
0,,,,,,,,,,,,,1
1,NATIONAL CAPITAL REGION,,,,,,,,,,,,1
3,Concrete/brick/stone,1.996.376,1.786.599,173.355,33.435,190.0,1.928,226.0,129.0,514.0,-,,1
4,Wood,327.593,279.929,10.444,28.26,3.704,485.0,2.079,1.685,1.005,2,,1
5,Half concrete/brick/stone and half wood,611.058,437.499,12.005,159.641,462.0,340.0,721.0,134.0,256.0,-,,1


In [9]:
# Run every label through replace_region: region names are rewritten to the
# spelling used in the region/province lookup, all other labels (and empty
# cells) come back unchanged.
df["Construction Materials of the Outer Walls and City/Municipality"] = df[
    "Construction Materials of the Outer Walls and City/Municipality"
].apply(replace_region)

In [10]:
# Roof-material labels, taken from the column headers by position
# (columns 3 through 11).
# NOTE(review): in the raw preview the "Total" row only adds up to
# "Total Occupied Housing Units" when the value under
# "Construction Materials of the Roof" is counted as the first roof category,
# and "Not Reported" is empty. The headers may sit one column to the right of
# their values — confirm against the source table.
roof_matls = df.columns[3:12].tolist()
roof_matls

['Galvanized iron/aluminum',
 'Tile/concrete/clay tile',
 'Half galvanized iron and half concrete',
 'Bamboo/cogon/ nipa/anahaw',
 'Asbestos',
 'Makeshift/ salvaged/ improvised materials',
 'Trapal',
 'Others',
 'Not Reported']

In [11]:
# Wall-material labels, read from the label column with a label-based slice
# (index labels 2 through 14, both ends inclusive). The row labelled 2 was a
# "Total" row and has already been filtered out, so the slice yields the 12
# material rows shown below.
wall_matls = df["Construction Materials of the Outer Walls and City/Municipality"].loc[2:14].to_list()
wall_matls

['Concrete/brick/stone',
 'Wood',
 'Half concrete/brick/stone and half wood',
 'Galvanized iron/aluminum',
 'Bamboo/sawali/cogon/nipa',
 'Asbestos',
 'Glass',
 'Makeshift/salvaged/improvised materials',
 'Trapal',
 'Others',
 'No walls',
 'Not Reported']

In [12]:
# Every (wall material, roof material) pairing, walls varying slowest.
combos = list(itertools.product(wall_matls, roof_matls))

# Print each pairing as two lines followed by a "===" separator line.
for wall_matl, roof_matl in combos:
    print(wall_matl, roof_matl, "===", sep="\n")

Concrete/brick/stone
Galvanized iron/aluminum
===
Concrete/brick/stone
Tile/concrete/clay tile
===
Concrete/brick/stone
Half galvanized iron and half concrete
===
Concrete/brick/stone
Bamboo/cogon/ nipa/anahaw
===
Concrete/brick/stone
Asbestos
===
Concrete/brick/stone
Makeshift/ salvaged/ improvised materials
===
Concrete/brick/stone
Trapal
===
Concrete/brick/stone
Others
===
Concrete/brick/stone
Not Reported
===
Wood
Galvanized iron/aluminum
===
Wood
Tile/concrete/clay tile
===
Wood
Half galvanized iron and half concrete
===
Wood
Bamboo/cogon/ nipa/anahaw
===
Wood
Asbestos
===
Wood
Makeshift/ salvaged/ improvised materials
===
Wood
Trapal
===
Wood
Others
===
Wood
Not Reported
===
Half concrete/brick/stone and half wood
Galvanized iron/aluminum
===
Half concrete/brick/stone and half wood
Tile/concrete/clay tile
===
Half concrete/brick/stone and half wood
Half galvanized iron and half concrete
===
Half concrete/brick/stone and half wood
Bamboo/cogon/ nipa/anahaw
===
Half concrete/brick/

In [13]:
# Load the region/province lookup table from the CSV at REG_PROV.
rp_df = pd.read_csv(REG_PROV)
# Preview the first rows.
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
# Remove leading/trailing whitespace from the province names in the lookup.
rp_df["province"] = rp_df["province"].str.strip()

In [15]:
# Module-level accumulator: parse_city appends one record (dict) per
# city / wall material / roof material combination to this list.
all_vals = []

In [16]:
def parse_region(df):
    """Split one region's rows into per-province chunks and parse each chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single region group. The region name is read from the
        second row of the label column (the first row is the empty separator
        row that opened the group).

    Returns
    -------
    None
        Each chunk is handed to ``parse_province`` for further parsing.

    Notes
    -----
    Province names are looked up in the module-level ``rp_df``. A row starts a
    new chunk when its label is one of the region's province names; rows that
    come before the first province label form chunk 0.
    """
    label_col = "Construction Materials of the Outer Walls and City/Municipality"

    # Work on a copy so the helper column below never leaks into the caller's frame.
    df = df.copy()

    # Positional lookup, so this does not depend on the caller's index labels.
    region_name = df[label_col].iloc[1]

    provinces = list(rp_df["province"].loc[rp_df["region"] == region_name].unique())

    # Running count of province-name rows = id of the province chunk each row belongs to.
    df["province_no"] = df[label_col].isin(provinces).cumsum()

    for province_no in df["province_no"].unique():
        province_df = df.loc[df["province_no"] == province_no]
        parse_province(province_df, region_name)

In [17]:
def parse_province(province_df, region_name):
    """Find the city labels inside one province chunk and parse each city.

    The province name is the label of the chunk's first row. A label counts as
    a city when it is not empty, is not the region name, the province name or
    "Not Reported", and is not one of the wall-material labels. Each city is
    passed to ``parse_city`` together with the row position of its first
    occurrence in the chunk.
    """
    label_col = "Construction Materials of the Outer Walls and City/Municipality"

    # Renumber the rows from 0 so an index label doubles as a row position.
    province_df = province_df.copy().reset_index()
    labels = province_df[label_col]

    province_name = labels.iloc[0]
    non_city_labels = [region_name, province_name, "Not Reported"]

    for city in labels.dropna().unique():
        if city in non_city_labels or city in wall_matls:
            continue

        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = province_df.loc[labels == city].index.tolist()[0]

        parse_city(city.strip(), region_name, province_df, idx, province_name)

In [18]:
def parse_city(city, region_name, province_df, idx, province_name):
    """Append one record per (wall material, roof material) pair for a city.

    Parameters
    ----------
    city : str
        City/municipality name stored in each record.
    region_name : str
        Region name stored in each record.
    province_df : pandas.DataFrame
        Rows of the province the city belongs to.
    idx : int
        Row position of the city's own label row inside ``province_df``.
    province_name : str
        Province name stored in each record.

    Returns
    -------
    None
        Records are appended to the module-level ``all_vals`` list.

    Raises
    ------
    IndexError
        If one of ``wall_matls`` has no row inside the city's window.
    """
    label_col = "Construction Materials of the Outer Walls and City/Municipality"

    # The city's window: its own label row followed by one row per wall
    # material (13 rows for the 12 wall materials used in this notebook).
    city_df = province_df[idx : idx + len(wall_matls) + 1]
    wall_labels = city_df[label_col]

    for wall_mat in wall_matls:
        wall_rows = city_df.loc[wall_labels == wall_mat]

        for roof_mat in roof_matls:
            raw = str(wall_rows[roof_mat].values[0]).strip()
            # "." is stripped as a thousands separator.
            # NOTE(review): this is lossy if a value was already read as a
            # float — "190.0" becomes "1900" and "28.26" becomes "2826".
            # Confirm how the CSV stores these counts.
            digits = raw.replace(".", "")

            try:
                count = float(digits)
            except ValueError:
                # Placeholders such as "-" are not numbers: record them as missing.
                count = np.nan

            all_vals.append(
                {
                    "city": city,
                    "region_name": region_name,
                    "province": province_name,
                    "outer_wall_materials": wall_mat.strip(),
                    "roof_materials": roof_mat.strip(),
                    "count": count,
                }
            )

In [19]:
# Parse every region group into records in all_vals.
# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every record.
all_vals.clear()

for region in regions:
    # Renumber rows from 0: parse_region reads the region name from the second row.
    df_ = df.loc[df["group"] == region].reset_index()
    parse_region(df_)

>>> Parsing CITY OF MANILA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MANDALUYONG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MARIKINA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF PASIG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing QUEZON CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF SAN JUAN,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CALOOCAN CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MALABON,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF NAVOTAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF VALENZUELA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF LAS PIÑAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MAKATI,METROPOLITAN MANILA in NCR - National Capital R

In [20]:
# Build the long-format table from the accumulated records and drop rows that
# are exact duplicates of an earlier row.
# NOTE(review): drop_duplicates also hides records that were appended twice
# (e.g. the parsing loop run more than once on the same all_vals) — confirm
# that genuine duplicates are not expected in the data.
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,outer_wall_materials,roof_materials,count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Galvanized iron/aluminum,21164.0
1,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Tile/concrete/clay tile,3284.0
2,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Half galvanized iron and half concrete,4.0
3,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Bamboo/cogon/ nipa/anahaw,55.0
4,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Asbestos,19.0
...,...,...,...,...,...,...
176467,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Asbestos,
176468,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Makeshift/ salvaged/ improvised materials,
176469,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Trapal,
176470,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Others,


In [21]:
# Sanity check: list the distinct roof-material labels present in the result.
final_df["roof_materials"].unique()

array(['Galvanized iron/aluminum', 'Tile/concrete/clay tile',
       'Half galvanized iron and half concrete',
       'Bamboo/cogon/ nipa/anahaw', 'Asbestos',
       'Makeshift/ salvaged/ improvised materials', 'Trapal', 'Others',
       'Not Reported'], dtype=object)

In [22]:
# Inspect the first 40 records of the result.
final_df.head(40)

Unnamed: 0,city,region_name,province,outer_wall_materials,roof_materials,count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Galvanized iron/aluminum,21164.0
1,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Tile/concrete/clay tile,3284.0
2,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Half galvanized iron and half concrete,4.0
3,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Bamboo/cogon/ nipa/anahaw,55.0
4,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Asbestos,19.0
5,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Makeshift/ salvaged/ improvised materials,42.0
6,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Trapal,75.0
7,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Others,
8,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Concrete/brick/stone,Not Reported,
9,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Wood,Galvanized iron/aluminum,1168.0


In [23]:
# Write the flattened table to DATASET_DEST. With to_csv's defaults the
# DataFrame index is written as the first, unnamed column.
final_df.to_csv(DATASET_DEST)