In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def replace_region(name):
    try:
        name = name.replace("MIMAROPA REGION", "REGION IV-B (MIMAROPA)")
        name = name.replace("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)")
        name = name.replace("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)")
        name = name.replace(
            "REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"
        )
        name = name.replace("REGION I - ILOCOS", "REGION I (Ilocos Region)")
        name = name.replace("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)")
        name = name.replace(
            "CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"
        )
        name = name.replace(
            "REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"
        )
        name = name.replace(
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        )
        name = name.replace("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)")
        name = name.replace(
            "REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"
        )
        name = name.replace("REGION XIII - CARAGA", "REGION XIII (Caraga)")
        name = name.replace(
            "REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"
        )
        name = name.replace(
            "REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"
        )
        name = name.replace("REGION V - BICOL", "REGION V (Bicol Region)")
        name = name.replace("REGION XI - DAVAO", "REGION XI (Davao Region)")
        name = name.replace("NATIONAL CAPITAL REGION", "NCR - National Capital Region")
        return name
    except:
        return name

In [3]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

DATASET = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-fuel-lighting-2015.csv",
)

REG_PROV = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "region-province.csv",
)

DATASET_DEST = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-fuel-lighting-2015-flattened.csv",
)

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [4]:
def strip_names(city):
    try:
        return city.strip()
    except AttributeError:
        return np.nan

In [5]:
df = pd.read_csv(DATASET)
df.head(20)

Unnamed: 0,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported
0,,,,,,,,,,,
1,NATIONAL CAPITAL REGION,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165
2,METROPOLITAN MANILA,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165
3,CITY OF MANILA,435.154,428.934,2.365,1.369,34,66.0,98.0,1.018,1.012,258
4,CITY OF MANDALUYONG,100.356,99.089,217.0,161,6,112.0,491.0,101,166,13
5,CITY OF MARIKINA,98.238,96.774,488.0,592,7,24.0,35.0,264,44,10
6,CITY OF PASIG,180.612,178.773,605.0,586,9,64.0,111.0,258,163,43
7,QUEZON CITY,683.044,671.386,3.265,3.687,99,249.0,247.0,1.605,604,1.902
8,CITY OF SAN JUAN,28.623,27.651,80.0,6,-,109.0,574.0,-,22,181
9,CALOOCAN CITY,367.878,359.640,4.856,17,3,898.0,611.0,1.767,83,3


In [6]:
first_col = "City/Municipality"	

In [7]:
df[first_col] = df[first_col].apply(strip_names)

In [8]:
# Group into regions
df["group"] = df.isnull().all(axis=1).cumsum()

In [9]:
regions = list(df["group"].unique())
regions

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [10]:
df.head(12)

Unnamed: 0,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported,group
0,,,,,,,,,,,,1
1,NATIONAL CAPITAL REGION,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165,1
2,METROPOLITAN MANILA,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165,1
3,CITY OF MANILA,435.154,428.934,2.365,1.369,34,66.0,98.0,1.018,1.012,258,1
4,CITY OF MANDALUYONG,100.356,99.089,217.0,161.0,6,112.0,491.0,101,166.0,13,1
5,CITY OF MARIKINA,98.238,96.774,488.0,592.0,7,24.0,35.0,264,44.0,10,1
6,CITY OF PASIG,180.612,178.773,605.0,586.0,9,64.0,111.0,258,163.0,43,1
7,QUEZON CITY,683.044,671.386,3.265,3.687,99,249.0,247.0,1.605,604.0,1.902,1
8,CITY OF SAN JUAN,28.623,27.651,80.0,6.0,-,109.0,574.0,-,22.0,181,1
9,CALOOCAN CITY,367.878,359.640,4.856,17.0,3,898.0,611.0,1.767,83.0,3,1


In [11]:
df[first_col] = df[
    first_col
].apply(replace_region)

In [12]:
lighting_sources = df.columns[2:11].tolist()
lighting_sources

[' Electricity ',
 ' Kerosene (Gaas) ',
 ' Liquified Petroleum Gas (LPG) ',
 ' Oil (vegetable animal and others) ',
 ' Solar panel ',
 ' Solar lamp ',
 ' Others ',
 ' None ',
 ' Not Reported ']

In [13]:
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
rp_df["province"] = rp_df["province"].apply(strip_names)

In [15]:
all_vals = []

In [16]:
def add_space(province):
    return str(province)+" "

In [17]:
def parse_region(df):
    df = df.copy()
    region_name = df[first_col].loc[1]
    unique_vals = df[first_col].dropna().unique()

    provinces = list(rp_df["province"].loc[rp_df["region"]==region_name].unique()) 
    
    df["province_no"] = df[first_col].isin(provinces).cumsum()
    df = df.loc[df["province_no"] != 0]
    
    province_no_list = df["province_no"].unique()
    
    for province in province_no_list:
            province_df = df.loc[df["province_no"]==province]
            parse_province(province_df, region_name)

In [18]:
def parse_province(province_df, region_name):
    province_df = province_df.copy()
    province_df = province_df.reset_index()
    
    province_name = province_df[first_col].iloc[0]
    unique_vals = province_df[first_col].dropna().unique()
    
    cities = [val for val in unique_vals if val not in [region_name, province_name, "Not Reported"]]
    
    for city in cities:
        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = (
            province_df.loc[province_df[first_col] == city].index
        ).tolist()[0]
        
        parse_city(city.strip(), region_name, province_df, idx, province_name)

In [19]:
def parse_city(city, region_name, province_df, idx, province_name):
    city_df = province_df[idx:idx+10]
    
    city_vals = {}
    city_vals["city"] = city
    city_vals["region_name"] = region_name
    city_vals["province"] = province_name

    for lighting_source in lighting_sources:        
        try:
            val = city_df[lighting_source].loc[city_df[first_col] == city]
            val = str(val.values[0]).strip()
            val = val.replace(".", "")
        except IndexError:
            continue

        try:
            city_vals[f"{lighting_source.strip()}_count"] = float(val)
        except:
            city_vals[f"{lighting_source.strip()}_count"] = float(np.nan)
                
        all_vals.append(city_vals)

In [20]:
df.head()

Unnamed: 0,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported,group
0,,,,,,,,,,,,1
1,NCR - National Capital Region,3.095.484,3.047.198,18.906,10.213,234.0,2.086,2.674,7.764,3.244,3.165,1
2,METROPOLITAN MANILA,3.095.484,3.047.198,18.906,10.213,234.0,2.086,2.674,7.764,3.244,3.165,1
3,CITY OF MANILA,435.154,428.934,2.365,1.369,34.0,66.0,98.0,1.018,1.012,258.0,1
4,CITY OF MANDALUYONG,100.356,99.089,217.0,161.0,6.0,112.0,491.0,101.0,166.0,13.0,1


In [21]:
for region in regions:
    df_ = df.loc[df["group"] == region].reset_index()
    region = df_.iloc[1][first_col]
    parse_region(df_)

>>> Parsing CITY OF MANILA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MANDALUYONG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MARIKINA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF PASIG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing QUEZON CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF SAN JUAN,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CALOOCAN CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MALABON,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF NAVOTAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF VALENZUELA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF LAS PIÑAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MAKATI,METROPOLITAN MANILA in NCR - National Capital R

In [22]:
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,428934.0,2365.0,1369.0,34.0,66.0,98.0,1018.0,1012.0,258.0
9,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,99089.0,217.0,161.0,6.0,112.0,491.0,101.0,166.0,13.0
18,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,96774.0,488.0,592.0,7.0,24.0,35.0,264.0,44.0,10.0
27,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,178773.0,605.0,586.0,9.0,64.0,111.0,258.0,163.0,43.0
36,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,671386.0,3265.0,3687.0,99.0,249.0,247.0,1605.0,604.0,1902.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13482,TANDUBAS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,2131.0,1675.0,28.0,1.0,371.0,637.0,2.0,61.0,
13491,TURTLE ISLANDS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,724.0,41.0,,,6.0,11.0,,,
13500,LANGUYAN,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,436.0,1838.0,509.0,12.0,958.0,2603.0,9.0,2.0,
13509,SAPA-SAPA,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,48.0,1330.0,55.0,7.0,539.0,3182.0,23.0,6.0,


In [23]:
final_df.head(40)

Unnamed: 0,city,region_name,province,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,428934.0,2365.0,1369.0,34.0,66.0,98.0,1018.0,1012.0,258.0
9,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,99089.0,217.0,161.0,6.0,112.0,491.0,101.0,166.0,13.0
18,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,96774.0,488.0,592.0,7.0,24.0,35.0,264.0,44.0,10.0
27,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,178773.0,605.0,586.0,9.0,64.0,111.0,258.0,163.0,43.0
36,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,671386.0,3265.0,3687.0,99.0,249.0,247.0,1605.0,604.0,1902.0
45,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,27651.0,80.0,6.0,,109.0,574.0,,22.0,181.0
54,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,359640.0,4856.0,17.0,3.0,898.0,611.0,1767.0,83.0,3.0
63,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,85004.0,784.0,1.0,3.0,15.0,40.0,286.0,58.0,
72,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,59566.0,553.0,540.0,3.0,2.0,13.0,157.0,70.0,
81,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,151738.0,1079.0,,,22.0,48.0,154.0,,


In [24]:
final_df["year"]=2015
final_df.head(10)

Unnamed: 0,city,region_name,province,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count,year
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,428934.0,2365.0,1369.0,34.0,66.0,98.0,1018.0,1012.0,258.0,2015
9,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,99089.0,217.0,161.0,6.0,112.0,491.0,101.0,166.0,13.0,2015
18,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,96774.0,488.0,592.0,7.0,24.0,35.0,264.0,44.0,10.0,2015
27,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,178773.0,605.0,586.0,9.0,64.0,111.0,258.0,163.0,43.0,2015
36,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,671386.0,3265.0,3687.0,99.0,249.0,247.0,1605.0,604.0,1902.0,2015
45,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,27651.0,80.0,6.0,,109.0,574.0,,22.0,181.0,2015
54,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,359640.0,4856.0,17.0,3.0,898.0,611.0,1767.0,83.0,3.0,2015
63,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,85004.0,784.0,1.0,3.0,15.0,40.0,286.0,58.0,,2015
72,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,59566.0,553.0,540.0,3.0,2.0,13.0,157.0,70.0,,2015
81,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,151738.0,1079.0,,,22.0,48.0,154.0,,,2015


In [25]:
final_df.to_csv(DATASET_DEST)