In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Ordered (old, new) substring pairs. They are applied in exactly this order,
# which is the order the replacements were originally written in.
_REGION_RENAMES = (
    ("MIMAROPA REGION", "REGION IV-B (MIMAROPA)"),
    ("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)"),
    ("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)"),
    ("REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"),
    ("REGION I - ILOCOS", "REGION I (Ilocos Region)"),
    ("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)"),
    ("CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"),
    ("REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"),
    (
        "AUTONOMOUS REGION IN MUSLIM MINDANAO",
        "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
    ),
    ("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)"),
    ("REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"),
    ("REGION XIII - CARAGA", "REGION XIII (Caraga)"),
    ("REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"),
    ("REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"),
    ("REGION V - BICOL", "REGION V (Bicol Region)"),
    ("REGION XI - DAVAO", "REGION XI (Davao Region)"),
    ("NATIONAL CAPITAL REGION", "NCR - National Capital Region"),
)


def replace_region(name):
    """Rewrite upper-case region headings to the notebook's canonical region names.

    Parameters
    ----------
    name : object
        A cell value from the label column. Only ``str`` values are rewritten.

    Returns
    -------
    object
        ``name`` with every known region heading substituted, or ``name``
        itself, untouched, when it is not a string (for example the NaN of an
        empty cell).
    """
    # Explicit type guard instead of a bare ``except``: missing cells reach this
    # function as float NaN, and nothing else should be silently swallowed.
    if not isinstance(name, str):
        return name

    for old, new in _REGION_RENAMES:
        name = name.replace(old, new)
    return name

In [3]:
# Setting up directories

# Path.cwd() replaces os.getcwd(): `os` is never imported in this notebook, so
# the original call raised NameError on a fresh kernel.
WORKINGDIR = Path.cwd()
# The project root is two levels above the directory the notebook runs from.
PROJECTROOT = WORKINGDIR.parents[1]

# Raw housing-tenure census table (input).
DATASET = (
    PROJECTROOT
    / "data"
    / "gathered-datasets"
    / "housing-census"
    / "housing-census-housing-tenure-2015.csv"
)

# Region -> province lookup table (input).
REG_PROV = (
    PROJECTROOT
    / "data"
    / "gathered-datasets"
    / "housing-census"
    / "region-province.csv"
)

# Flattened, one-row-per-record version of DATASET (output).
DATASET_DEST = (
    PROJECTROOT
    / "data"
    / "gathered-datasets"
    / "housing-census"
    / "housing-census-housing-tenure-2015-flattened.csv"
)

# Path to the shapefile in the cleaned-datasets folder.
REF_DF = PROJECTROOT / "data" / "cleaned-datasets" / "ph-shp-file" / "ph-shp-file.shp"

In [4]:
# Read every column as text. The counts use "." as a thousands separator
# ("1.382.724"), so letting pandas infer dtypes turns any column whose cells all
# look numeric into floats ("2.646" -> 2.646, "268" -> 268.0) and the original
# digits can no longer be recovered. Empty cells are still read as NaN.
df = pd.read_csv(DATASET, dtype=str)
df.head(20)

Unnamed: 0,Tenure Status of the Housing Unit and Lot and City/Municipality,Number of Households*,Single house,Duplex,Multi-unit residential,Commercial/ industrial/ agricultural,Institutional living quarter,Others,Not Reported
0,,,,,,,,,
1,NATIONAL CAPITAL REGION,,,,,,,,
2,Total,3.095.484,1.396.332,427.606,1.252.815,11.388,483,2.646,4.214
3,Own or owner like possession of house and lot,1.382.724,784.385,197.353,397.363,2.043,103,268.0,1.209
4,Rent house/room including lot,1.156.484,333.491,144.578,672.561,4.735,145,87.0,887
5,Own house rent lot,56.152,33.155,8.587,14.226,130,3,6.0,45
6,Own house rent-free lot with consent of owner,196.709,101.266,30.444,63.685,567,24,137.0,586
7,Own house rent-free lot without consent of owner,105.081,60.744,17.120,26.305,95,9,310.0,498
8,Rent-free house and lot with consent of owner,176.145,73.702,26.555,71.208,3.717,189,211.0,563
9,Rent-free house and lot without consent of owner,18.801,8.743,2.759,6.011,98,10,755.0,425


In [5]:
# Header of the label column. Its cells hold the region, province and
# city/municipality headings as well as the tenure-status labels.
first_col = "Tenure Status of the Housing Unit and Lot and City/Municipality"	

In [6]:
# Drop the per-section "Total" rows. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df.loc[df[first_col] != "Total"].copy()

In [7]:
# Group into regions
# A completely empty row marks the start of a new section, so the running count
# of such rows gives every row its section number.
# The helper column itself is left out of the emptiness test: once "group"
# exists it is never null, so re-running this cell would otherwise find no
# empty rows and reset every group to 0.
df["group"] = (
    df.drop(columns="group", errors="ignore").isnull().all(axis=1).cumsum()
)

In [8]:
# Distinct section numbers from the "group" column, in order of first appearance.
regions = list(df["group"].unique())

In [9]:
# Preview the first rows of the working frame, including the "group" column.
df.head(12)

Unnamed: 0,Tenure Status of the Housing Unit and Lot and City/Municipality,Number of Households*,Single house,Duplex,Multi-unit residential,Commercial/ industrial/ agricultural,Institutional living quarter,Others,Not Reported,group
0,,,,,,,,,,1
1,NATIONAL CAPITAL REGION,,,,,,,,,1
3,Own or owner like possession of house and lot,1.382.724,784.385,197.353,397.363,2.043,103,268.0,1.209,1
4,Rent house/room including lot,1.156.484,333.491,144.578,672.561,4.735,145,87.0,887,1
5,Own house rent lot,56.152,33.155,8.587,14.226,130,3,6.0,45,1
6,Own house rent-free lot with consent of owner,196.709,101.266,30.444,63.685,567,24,137.0,586,1
7,Own house rent-free lot without consent of owner,105.081,60.744,17.120,26.305,95,9,310.0,498,1
8,Rent-free house and lot with consent of owner,176.145,73.702,26.555,71.208,3.717,189,211.0,563,1
9,Rent-free house and lot without consent of owner,18.801,8.743,2.759,6.011,98,10,755.0,425,1
10,Not Applicable,866,-,-,-,-,-,866.0,-,1


In [10]:
# Rewrite the region headings in the label column to the names used by the
# region/province lookup table; every other label passes through unchanged.
df[first_col] = df[first_col].map(replace_region)

In [11]:
# Column positions 2-8 are taken as the building-type columns; positions 0 and 1
# (the label column and the column after it) are skipped.
# The header names are kept verbatim, including surrounding spaces, because they
# are used to index the frame; parse_city() strips them when recording a value.
bldg_types = df.columns[2:9].tolist()
bldg_types

[' Single house ',
 ' Duplex ',
 ' Multi-unit residential ',
 ' Commercial/ industrial/ agricultural ',
 ' Institutional living quarter ',
 ' Others ',
 ' Not Reported ']

In [12]:
# Tenure-status labels, read from the label column with a *label-based* slice
# (.loc includes both endpoints).
# NOTE(review): this relies on the first section's tenure rows sitting at index
# labels up to 11 in the source file - confirm if the input layout changes.
tenure_types = df[first_col].loc[2:11].to_list()
tenure_types

['Own or owner like possession of house and lot',
 'Rent house/room including lot',
 'Own house rent lot',
 'Own house rent-free lot with consent of owner',
 'Own house rent-free lot without consent of owner',
 'Rent-free house and lot with consent of owner',
 'Rent-free house and lot without consent of owner',
 'Not Applicable',
 'Not Reported']

In [13]:
# Region -> province lookup table; parse_region() uses it to recognise which
# rows of a region are province headings.
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
# Accumulator for the flattened records: parse_city() appends one dict for every
# (tenure type, building type) pair it finds.
all_vals = []

In [15]:
def parse_region(df):
    """Split one region's rows into province blocks and parse each block.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single region. The region name is read from the second
        row (position 1) of the ``first_col`` column, the same position the
        driver loop uses.

    Returns
    -------
    None
        ``parse_province`` is called once per block; nothing is returned.

    Notes
    -----
    Reads the module-level ``first_col`` and ``rp_df``. The input frame is not
    modified; the work is done on a copy.
    """
    df = df.copy()
    # Positional lookup so the function does not depend on the caller having
    # reset the index labels.
    region_name = df[first_col].iloc[1]

    # Province names that the lookup table lists for this region.
    provinces = list(rp_df["province"].loc[rp_df["region"] == region_name].unique())

    # Every province heading starts a new block. Rows before the first heading
    # form block 0, which is handed to parse_province() like any other block.
    df["province_no"] = df[first_col].isin(provinces).cumsum()

    for province_no in df["province_no"].unique():
        province_df = df.loc[df["province_no"] == province_no]
        parse_province(province_df, region_name)

In [16]:
def parse_province(province_df, region_name):
    """Find the city/municipality headings in one province block and parse each.

    The province name is the first label of the block. Every other distinct,
    non-missing label that is not the region name, the province name,
    "Not Reported" or a tenure-status label is treated as a city heading and
    passed to ``parse_city`` together with the row position of its first
    occurrence.
    """
    frame = province_df.copy().reset_index()
    labels = frame[first_col]
    province_name = labels.iloc[0]

    not_a_city = [region_name, province_name, "Not Reported"]

    for label in labels.dropna().unique():
        if label in not_a_city or label in tenure_types:
            continue

        print(f">>> Parsing {label},{province_name} in {region_name}...")
        # Position of the first row carrying this heading; the index was reset
        # above, so labels and positions coincide.
        first_row = frame.index[labels == label].tolist()[0]

        parse_city(label.strip(), region_name, frame, first_row, province_name)

In [17]:
# Integers written with "." as the thousands separator, e.g. "2.646" or "1.382.724".
_DOT_GROUPED_INT = re.compile(r"\d{1,3}(?:\.\d{3})+")


def _parse_count(raw):
    """Convert one raw table cell to a float count; unparseable cells give NaN."""
    text = str(raw).strip()
    # Strip dots only when they really are thousands separators. Removing every
    # dot turned a float cell such as 268.0 ("268.0") into 2680.
    if _DOT_GROUPED_INT.fullmatch(text):
        text = text.replace(".", "")
    try:
        return float(text)
    except ValueError:
        # Placeholders such as "-" are not numbers.
        return float(np.nan)


def parse_city(city, region_name, province_df, idx, province_name):
    """Flatten one city's block of rows into records appended to ``all_vals``.

    Parameters
    ----------
    city : str
        City/municipality name stored in each record.
    region_name, province_name : object
        Stored unchanged in each record.
    province_df : pandas.DataFrame
        Province block containing the city's rows.
    idx : int
        Row position of the city heading inside ``province_df``. The ten rows
        starting there are taken as the city's block.

    Returns
    -------
    None
        One dict with the keys ``city``, ``region_name``, ``province``,
        ``tenure_type``, ``bldg_type`` and ``count`` is appended to the
        module-level ``all_vals`` for every (tenure type, building type) pair
        whose tenure label occurs in the block. Pairs whose tenure label is
        absent are skipped.

    Notes
    -----
    NOTE(review): a cell that pandas already parsed as a non-integral float
    (for example 1.5 read from "1.500") cannot be restored here; read the source
    file as text to avoid that.
    """
    city_df = province_df[idx:idx + 10]
    labels = city_df[first_col]

    for tenure_type in tenure_types:
        in_tenure_row = labels == tenure_type

        for bldg_type in bldg_types:
            try:
                # First row of the block carrying this tenure label.
                raw = city_df[bldg_type].loc[in_tenure_row].values[0]
            except IndexError:
                # The tenure label does not occur in this block.
                continue

            all_vals.append(
                {
                    "city": city,
                    "region_name": region_name,
                    "province": province_name,
                    "tenure_type": tenure_type.strip(),
                    "bldg_type": bldg_type.strip(),
                    "count": _parse_count(raw),
                }
            )

In [18]:
# Start from an empty accumulator so that re-running this cell does not append
# every record a second time.
all_vals.clear()

for region in regions:
    # Each section gets a fresh 0-based index before it is handed on.
    df_ = df.loc[df["group"] == region].reset_index()
    parse_region(df_)

>>> Parsing  CITY OF MANILA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MANDALUYONG, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MARIKINA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF PASIG, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  QUEZON CITY, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF SAN JUAN, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CALOOCAN CITY, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MALABON, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF NAVOTAS, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF VALENZUELA, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF LAS PIÑAS, METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing  CITY OF MAKATI, METROPOLITAN MANILA in 

In [19]:
# One row per collected record; exact duplicate rows are discarded.
final_df = pd.DataFrame(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,tenure_type,bldg_type,count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Single house,57658.0
1,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Duplex,27215.0
2,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Multi-unit residential,78569.0
3,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Commercial/ industrial/ agricultural,426.0
4,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Institutional living quarter,24.0
...,...,...,...,...,...,...
103014,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Multi-unit residential,
103015,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Commercial/ industrial/ agricultural,
103016,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Institutional living quarter,
103017,SIBUTU,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,Not Reported,Others,


In [20]:
# Preview the first 40 flattened records.
final_df.head(40)

Unnamed: 0,city,region_name,province,tenure_type,bldg_type,count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Single house,57658.0
1,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Duplex,27215.0
2,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Multi-unit residential,78569.0
3,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Commercial/ industrial/ agricultural,426.0
4,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Institutional living quarter,24.0
5,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Others,158.0
6,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Own or owner like possession of house and lot,Not Reported,354.0
7,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Rent house/room including lot,Single house,34840.0
8,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Rent house/room including lot,Duplex,21217.0
9,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,Rent house/room including lot,Multi-unit residential,130042.0


In [21]:
# Write the flattened table to DATASET_DEST. No `index` argument is passed, so
# the DataFrame index is written as a leading unnamed column.
final_df.to_csv(DATASET_DEST)