In [67]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [68]:
def replace_region(name):
    try:
        name = name.replace("MIMAROPA REGION", "REGION IV-B (MIMAROPA)")
        name = name.replace("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)")
        name = name.replace("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)")
        name = name.replace(
            "REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"
        )
        name = name.replace("REGION I - ILOCOS", "REGION I (Ilocos Region)")
        name = name.replace("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)")
        name = name.replace(
            "CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"
        )
        name = name.replace(
            "REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"
        )
        name = name.replace(
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        )
        name = name.replace("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)")
        name = name.replace(
            "REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"
        )
        name = name.replace("REGION XIII - CARAGA", "REGION XIII (Caraga)")
        name = name.replace(
            "REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"
        )
        name = name.replace(
            "REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"
        )
        name = name.replace("REGION V - BICOL", "REGION V (Bicol Region)")
        name = name.replace("REGION XI - DAVAO", "REGION XI (Davao Region)")
        name = name.replace("NATIONAL CAPITAL REGION", "NCR - National Capital Region")
        return name
    except:
        return name

In [69]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

DATASET = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-fuel-lighting-2015.csv",
)

REG_PROV = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "region-province.csv",
)

DATASET_DEST = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-fuel-lighting-2015-flattened.csv",
)

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [70]:
df = pd.read_csv(DATASET)
df.head(20)

Unnamed: 0,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported
0,,,,,,,,,,,
1,NATIONAL CAPITAL REGION,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165
2,METROPOLITAN MANILA,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165
3,CITY OF MANILA,435.154,428.934,2.365,1.369,34,66.0,98.0,1.018,1.012,258
4,CITY OF MANDALUYONG,100.356,99.089,217.0,161,6,112.0,491.0,101,166,13
5,CITY OF MARIKINA,98.238,96.774,488.0,592,7,24.0,35.0,264,44,10
6,CITY OF PASIG,180.612,178.773,605.0,586,9,64.0,111.0,258,163,43
7,QUEZON CITY,683.044,671.386,3.265,3.687,99,249.0,247.0,1.605,604,1.902
8,CITY OF SAN JUAN,28.623,27.651,80.0,6,-,109.0,574.0,-,22,181
9,CALOOCAN CITY,367.878,359.640,4.856,17,3,898.0,611.0,1.767,83,3


In [71]:
first_col = "City/Municipality"	

In [72]:
# Group into regions
df["group"] = df.isnull().all(axis=1).cumsum()

In [73]:
regions = list(df["group"].unique())
regions

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [74]:
df.head(12)

Unnamed: 0,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported,group
0,,,,,,,,,,,,1
1,NATIONAL CAPITAL REGION,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165,1
2,METROPOLITAN MANILA,3.095.484,3.047.198,18.906,10.213,234,2.086,2.674,7.764,3.244,3.165,1
3,CITY OF MANILA,435.154,428.934,2.365,1.369,34,66.0,98.0,1.018,1.012,258,1
4,CITY OF MANDALUYONG,100.356,99.089,217.0,161.0,6,112.0,491.0,101,166.0,13,1
5,CITY OF MARIKINA,98.238,96.774,488.0,592.0,7,24.0,35.0,264,44.0,10,1
6,CITY OF PASIG,180.612,178.773,605.0,586.0,9,64.0,111.0,258,163.0,43,1
7,QUEZON CITY,683.044,671.386,3.265,3.687,99,249.0,247.0,1.605,604.0,1.902,1
8,CITY OF SAN JUAN,28.623,27.651,80.0,6.0,-,109.0,574.0,-,22.0,181,1
9,CALOOCAN CITY,367.878,359.640,4.856,17.0,3,898.0,611.0,1.767,83.0,3,1


In [75]:
df[first_col] = df[
    first_col
].apply(replace_region)

In [76]:
lighting_sources = df.columns[2:11].tolist()
lighting_sources

[' Electricity ',
 ' Kerosene (Gaas) ',
 ' Liquified Petroleum Gas (LPG) ',
 ' Oil (vegetable animal and others) ',
 ' Solar panel ',
 ' Solar lamp ',
 ' Others ',
 ' None ',
 ' Not Reported ']

In [77]:
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [78]:
all_vals = []

In [79]:
def parse_region(df):
    df = df.copy()
    region_name = df[first_col].loc[2]
    unique_vals = df[first_col].dropna().unique()
    

    provinces = list(rp_df["province"].loc[rp_df["region"]==region_name].unique()) 
    
    df["province_no"] = df[first_col].isin(provinces).cumsum()
    
    province_no_list = df["province_no"].unique()
    
    for province in province_no_list:
            province_df = df.loc[df["province_no"]==province]
            parse_province(province_df, region_name)

In [80]:
def parse_province(province_df, region_name):
    province_df = province_df.copy()
    province_df = province_df.reset_index()
    
    province_name = province_df[first_col].iloc[1]
    print(province_name)
    unique_vals = province_df[first_col].dropna().unique()
    
    cities = [val for val in unique_vals if val not in [region_name, province_name, "Not Reported"]]
    
    for city in cities:
        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = (
            province_df.loc[province_df[first_col] == city].index
        ).tolist()[0]
        
        parse_city(city.strip(), region_name, province_df, idx, province_name)

In [81]:
def parse_city(city, region_name, province_df, idx, province_name):
    city_df = province_df[idx:idx+10]
    
    city_vals = {}
    city_vals["city"] = city
    city_vals["region_name"] = region_name
    city_vals["province"] = province_name

    for lighting_source in lighting_sources:        
        try:
            val = city_df[lighting_source].loc[city_df[first_col] == city]
            val = str(val.values[0]).strip()
            val = val.replace(".", "")
        except IndexError:
            continue

        try:
            city_vals[f"{lighting_source.strip()}_count"] = float(val)
        except:
            city_vals[f"{lighting_source.strip()}_count"] = float(np.nan)
                
        all_vals.append(city_vals)

In [87]:
provinces = list(rp_df["province"].loc[rp_df["region"]=="CAR - Cordillera Administrative Region"].unique()) 
provinces

[' ABRA',
 ' BENGUET (excluding Baguio City)',
 ' IFUGAO',
 ' KALINGA',
 ' MOUNTAIN PROVINCE',
 ' APAYAO']

In [91]:
df_["City/Municipality"].iloc[2]

' ABRA '

In [88]:
df_[first_col].isin(provinces).cumsum()

0     0
1     0
2     0
3     0
4     0
     ..
80    0
81    0
82    0
83    0
84    0
Name: City/Municipality, Length: 85, dtype: int64

In [85]:
df_["province_no"] = df_[first_col].isin(provinces).cumsum()

In [86]:
df_

Unnamed: 0,index,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported,group,province_no
0,20,,,,,,,,,,,,2,0
1,21,CAR - Cordillera Administrative Region,395.748,360.533,19.865,453,219,5.643,3.031,5.911,93,-,2,0
2,22,ABRA,52.929,46.491,4.009,22,7,612,444,1.339,5,-,2,0
3,23,BANGUED (Capital),11.024,10.491,466,-,1,2,31,33,-,-,2,0
4,24,BOLINEY,760,714,29,-,-,-,7,10,-,-,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,100,PARACELIS,5.834,4.929,617,7,-,175,103,-,3,-,2,0
81,101,SABANGAN,2.215,2.163,41,-,-,-,-,11,-,-,2,0
82,102,SADANGA,1.765,1.712,43,3,-,-,2,5,-,-,2,0
83,103,SAGADA,2.644,2.637,7,-,-,-,-,-,-,-,2,0


In [83]:
df_

Unnamed: 0,index,City/Municipality,Number of Households*,Electricity,Kerosene (Gaas),Liquified Petroleum Gas (LPG),Oil (vegetable animal and others),Solar panel,Solar lamp,Others,None,Not Reported,group
0,20,,,,,,,,,,,,2
1,21,CAR - Cordillera Administrative Region,395.748,360.533,19.865,453,219,5.643,3.031,5.911,93,-,2
2,22,ABRA,52.929,46.491,4.009,22,7,612,444,1.339,5,-,2
3,23,BANGUED (Capital),11.024,10.491,466,-,1,2,31,33,-,-,2
4,24,BOLINEY,760,714,29,-,-,-,7,10,-,-,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,100,PARACELIS,5.834,4.929,617,7,-,175,103,-,3,-,2
81,101,SABANGAN,2.215,2.163,41,-,-,-,-,11,-,-,2
82,102,SADANGA,1.765,1.712,43,3,-,-,2,5,-,-,2
83,103,SAGADA,2.644,2.637,7,-,-,-,-,-,-,-,2


In [82]:
df_ = df.loc[df["group"] == 2].reset_index()
region = df_.iloc[1][first_col]
parse_region(df_)

CAR - Cordillera Administrative Region
>>> Parsing  BANGUED (Capital) ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  BOLINEY ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  BUCAY ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  BUCLOC ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  DAGUIOMAN ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  DANGLAS ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  DOLORES ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LA PAZ ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LACUB ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LAGANGILANG ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LAGAYAN ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LANGIDEN ,CAR - Cordillera Administrative Region in  ABRA ...
>>> Parsing  LICUAN-BAAY (LICUAN) ,CAR - Cordillera Administra

In [58]:
region

'CAR - Cordillera Administrative Region'

In [59]:
for region in regions:
    df_ = df.loc[df["group"] == region].reset_index()
    region = df_.iloc[1][first_col]
    parse_region(df_)

>>> Parsing  METROPOLITAN MANILA, CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF MANDALUYONG , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF MARIKINA , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF PASIG , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  QUEZON CITY , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF SAN JUAN , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CALOOCAN CITY , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF MALABON , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF NAVOTAS , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF VALENZUELA , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF LAS PIÑAS , CITY OF MANILA  in NCR - National Capital Region...
>>> Parsing  CITY OF MAKATI , CITY OF MANILA  in NCR - National Capital Region...

In [60]:
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count
0,ILOCOS NORTE,REGION I (Ilocos Region),REGION I (Ilocos Region),137511.0,1433.0,14.0,12.0,22.0,38.0,243.0,63.0,
9,ADAMS,REGION I (Ilocos Region),REGION I (Ilocos Region),366.0,44.0,,,1.0,,,,
18,BACARRA,REGION I (Ilocos Region),REGION I (Ilocos Region),8202.0,36.0,2.0,1.0,2.0,1.0,20.0,5.0,
27,BADOC,REGION I (Ilocos Region),REGION I (Ilocos Region),7275.0,79.0,,,1.0,,12.0,8.0,
36,BANGUI,REGION I (Ilocos Region),REGION I (Ilocos Region),3529.0,15.0,,,2.0,3.0,19.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
13113,CAGDIANAO,REGION XIII (Caraga),REGION XIII (Caraga),3069.0,673.0,1.0,,5.0,14.0,10.0,5.0,
13122,DINAGAT,REGION XIII (Caraga),REGION XIII (Caraga),1788.0,287.0,110.0,,7.0,17.0,10.0,1.0,
13131,LIBJO (ALBOR),REGION XIII (Caraga),REGION XIII (Caraga),3558.0,512.0,17.0,,1.0,14.0,,1.0,
13140,SAN JOSE (CAPITAL),REGION XIII (Caraga),REGION XIII (Caraga),5071.0,616.0,10.0,2.0,13.0,27.0,21.0,5.0,


In [61]:
final_df.head(40)

Unnamed: 0,city,region_name,province,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count
0,ILOCOS NORTE,REGION I (Ilocos Region),REGION I (Ilocos Region),137511.0,1433.0,14.0,12.0,22.0,38.0,243.0,63.0,
9,ADAMS,REGION I (Ilocos Region),REGION I (Ilocos Region),366.0,44.0,,,1.0,,,,
18,BACARRA,REGION I (Ilocos Region),REGION I (Ilocos Region),8202.0,36.0,2.0,1.0,2.0,1.0,20.0,5.0,
27,BADOC,REGION I (Ilocos Region),REGION I (Ilocos Region),7275.0,79.0,,,1.0,,12.0,8.0,
36,BANGUI,REGION I (Ilocos Region),REGION I (Ilocos Region),3529.0,15.0,,,2.0,3.0,19.0,,
45,CITY OF BATAC,REGION I (Ilocos Region),REGION I (Ilocos Region),12713.0,65.0,3.0,,1.0,2.0,7.0,6.0,
54,BURGOS,REGION I (Ilocos Region),REGION I (Ilocos Region),2357.0,24.0,1.0,,,1.0,20.0,,
63,CARASI,REGION I (Ilocos Region),REGION I (Ilocos Region),293.0,11.0,,,,,,,
72,CURRIMAO,REGION I (Ilocos Region),REGION I (Ilocos Region),2741.0,53.0,,,,1.0,6.0,4.0,
81,DINGRAS,REGION I (Ilocos Region),REGION I (Ilocos Region),8678.0,55.0,,3.0,,3.0,12.0,6.0,


In [62]:
final_df["year"]=2015

In [63]:
# final_df.to_csv(DATASET_DEST)