In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def replace_region(name):
    try:
        name = name.replace("MIMAROPA REGION", "REGION IV-B (MIMAROPA)")
        name = name.replace("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)")
        name = name.replace("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)")
        name = name.replace(
            "REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"
        )
        name = name.replace("REGION I - ILOCOS", "REGION I (Ilocos Region)")
        name = name.replace("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)")
        name = name.replace(
            "CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"
        )
        name = name.replace(
            "REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"
        )
        name = name.replace(
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        )
        name = name.replace("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)")
        name = name.replace(
            "REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"
        )
        name = name.replace("REGION XIII - CARAGA", "REGION XIII (Caraga)")
        name = name.replace(
            "REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"
        )
        name = name.replace(
            "REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"
        )
        name = name.replace("REGION V - BICOL", "REGION V (Bicol Region)")
        name = name.replace("REGION XI - DAVAO", "REGION XI (Davao Region)")
        name = name.replace("NATIONAL CAPITAL REGION", "NCR - National Capital Region")
        return name
    except:
        return name

In [3]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

DATASET = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-water-supply-2015.csv",
)

REG_PROV = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "region-province.csv",
)

DATASET_DEST = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-water-supply-2015-flattened.csv",
)

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [4]:
def strip_names(city):
    try:
        return city.strip()
    except AttributeError:
        return np.nan

In [5]:
df = pd.read_csv(DATASET)
df.head(20)

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,...,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28
0,,,,,,,,,,,...,,,,,,,,,,
1,PHILIPPINES,22.969.666,6.023.891,2.988.858,1.370.172,2.784.478,432.498,1.010.631,1.001.955,335.642,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,NATIONAL CAPITAL REGION,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56,...,,,,,,,,,,
4,METROPOLITAN MANILA,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56,...,,,,,,,,,,
5,CITY OF MANILA,435.154,188.513,52.760,883,834,91,50,622,6,...,,,,,,,,,,
6,CITY OF MANDALUYONG,100.356,56.381,6.412,48,216,48,2,233,1,...,,,,,,,,,,
7,CITY OF MARIKINA,98.238,68.434,6.373,229,397,24,15,318,-,...,,,,,,,,,,
8,CITY OF PASIG,180.612,99.263,11.296,234,435,55,31,846,5,...,,,,,,,,,,
9,QUEZON CITY,683.044,401.143,90.313,1.591,2.165,325,337,1.576,23,...,,,,,,,,,,


In [6]:
first_col = "City/Municipality"	

In [7]:
df[first_col] = df[first_col].apply(strip_names)

In [8]:
# Group into regions
df["group"] = df.isnull().all(axis=1).cumsum()

In [9]:
regions = list(df["group"].unique())
regions

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [10]:
df.head(12)

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,...,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,group
0,,,,,,,,,,,...,,,,,,,,,,1
1,PHILIPPINES,22.969.666,6.023.891,2.988.858,1.370.172,2.784.478,432.498,1.010.631,1.001.955,335.642,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,2
3,NATIONAL CAPITAL REGION,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56,...,,,,,,,,,,2
4,METROPOLITAN MANILA,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56,...,,,,,,,,,,2
5,CITY OF MANILA,435.154,188.513,52.760,883,834,91,50,622,6,...,,,,,,,,,,2
6,CITY OF MANDALUYONG,100.356,56.381,6.412,48,216,48,2,233,1,...,,,,,,,,,,2
7,CITY OF MARIKINA,98.238,68.434,6.373,229,397,24,15,318,-,...,,,,,,,,,,2
8,CITY OF PASIG,180.612,99.263,11.296,234,435,55,31,846,5,...,,,,,,,,,,2
9,QUEZON CITY,683.044,401.143,90.313,1.591,2.165,325,337,1.576,23,...,,,,,,,,,,2


In [11]:
df[first_col] = df[
    first_col
].apply(replace_region)

In [12]:
water_supply = df.columns[2:15].tolist()
water_supply

['Own use faucet community water system',
 'Shared faucet community water system',
 'Own use tubed/piped deep well',
 'Shared tubed/piped deep well',
 'Tubed/piped shallow well',
 'Dug well',
 'Protected spring',
 'Unprotected spring',
 'Lake, river, rain and others',
 'Peddler',
 'Bottled water',
 'Others',
 'Not Reported']

In [13]:
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
rp_df["province"] = rp_df["province"].apply(strip_names)

In [15]:
all_vals = []

In [16]:
def parse_region(df):
    df = df.copy()
    region_name = df[first_col].loc[1]
    unique_vals = df[first_col].dropna().unique()

    provinces = list(rp_df["province"].loc[rp_df["region"]==region_name].unique()) 
    
    df["province_no"] = df[first_col].isin(provinces).cumsum()
    df = df.loc[df["province_no"] != 0]
    
    province_no_list = df["province_no"].unique()
    
    for province in province_no_list:
            province_df = df.loc[df["province_no"]==province]
            parse_province(province_df, region_name)

In [17]:
def parse_province(province_df, region_name):
    province_df = province_df.copy()
    province_df = province_df.reset_index()
    
    province_name = province_df[first_col].iloc[0]
    unique_vals = province_df[first_col].dropna().unique()
    
    cities = [val for val in unique_vals if val not in [region_name, province_name, "Not Reported"]]
    
    for city in cities:
        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = (
            province_df.loc[province_df[first_col] == city].index
        ).tolist()[0]
        
        parse_city(city.strip(), region_name, province_df, idx, province_name)

In [18]:
def parse_city(city, region_name, province_df, idx, province_name):
    city_df = province_df[idx:idx+10]
    
    city_vals = {}
    city_vals["city"] = city
    city_vals["region_name"] = region_name
    city_vals["province"] = province_name

    for supply in water_supply:        
        try:
            val = city_df[supply].loc[city_df[first_col] == city]
            val = str(val.values[0]).strip()
            val = val.replace(".", "")
        except IndexError:
            continue

        try:
            city_vals[f"{supply.strip()}_count"] = float(val)
        except:
            city_vals[f"{supply.strip()}_count"] = float(np.nan)
                
        all_vals.append(city_vals)

In [19]:
df.head()

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,...,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,group
0,,,,,,,,,,,...,,,,,,,,,,1
1,PHILIPPINES,22.969.666,6.023.891,2.988.858,1.370.172,2.784.478,432.498,1.010.631,1.001.955,335.642,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,2
3,NCR - National Capital Region,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56.0,...,,,,,,,,,,2
4,METROPOLITAN MANILA,3.095.484,1.448.079,339.318,11.577,17.820,1.259,1.510,7.769,56.0,...,,,,,,,,,,2


In [20]:
for region in regions:
    df_ = df.loc[df["group"] == region].reset_index()
    region = df_.iloc[1][first_col]
    parse_region(df_)

>>> Parsing CITY OF MANILA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MANDALUYONG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MARIKINA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF PASIG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing QUEZON CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF SAN JUAN,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CALOOCAN CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MALABON,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF NAVOTAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF VALENZUELA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF LAS PIÑAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MAKATI,METROPOLITAN MANILA in NCR - National Capital R

In [21]:
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,"Lake, river, rain and others_count",Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,188513.0,52760.0,883.0,834.0,91.0,50.0,622.0,6.0,29.0,11449.0,179153.0,764.0,
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,56381.0,6412.0,48.0,216.0,48.0,2.0,233.0,1.0,4.0,2720.0,33300.0,991.0,
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,68434.0,6373.0,229.0,397.0,24.0,15.0,318.0,,2.0,1380.0,20907.0,159.0,
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,99263.0,11296.0,234.0,435.0,55.0,31.0,846.0,5.0,10.0,2403.0,64530.0,1504.0,
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,401143.0,90313.0,1591.0,2165.0,325.0,337.0,1576.0,23.0,42.0,9656.0,170761.0,5112.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19474,TANDUBAS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,396.0,1453.0,6.0,6.0,10.0,3016.0,2.0,1.0,4.0,10.0,2.0,,
19487,TURTLE ISLANDS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,4.0,,3.0,20.0,8.0,727.0,1.0,,1.0,,18.0,,
19500,LANGUYAN,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,1062.0,258.0,73.0,56.0,85.0,3710.0,3.0,2.0,714.0,342.0,62.0,,
19513,SAPA-SAPA,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,15.0,13.0,114.0,840.0,15.0,2781.0,,1.0,1400.0,6.0,5.0,,


In [22]:
final_df.head(40)

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,"Lake, river, rain and others_count",Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,188513.0,52760.0,883.0,834.0,91.0,50.0,622.0,6.0,29.0,11449.0,179153.0,764.0,
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,56381.0,6412.0,48.0,216.0,48.0,2.0,233.0,1.0,4.0,2720.0,33300.0,991.0,
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,68434.0,6373.0,229.0,397.0,24.0,15.0,318.0,,2.0,1380.0,20907.0,159.0,
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,99263.0,11296.0,234.0,435.0,55.0,31.0,846.0,5.0,10.0,2403.0,64530.0,1504.0,
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,401143.0,90313.0,1591.0,2165.0,325.0,337.0,1576.0,23.0,42.0,9656.0,170761.0,5112.0,
65,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,15950.0,2444.0,,,,,,,,773.0,9456.0,,
78,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,212602.0,57637.0,3628.0,4241.0,247.0,483.0,14.0,,,3794.0,84570.0,662.0,
91,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,30322.0,11656.0,363.0,276.0,9.0,21.0,1.0,,,3377.0,40000.0,166.0,
104,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,19461.0,7915.0,50.0,321.0,18.0,4.0,268.0,,2.0,2788.0,29983.0,94.0,
117,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,72828.0,24330.0,868.0,2173.0,36.0,40.0,,,,3305.0,49366.0,95.0,


In [23]:
final_df["year"]=2015
final_df.head(10)

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,"Lake, river, rain and others_count",Peddler_count,Bottled water_count,Others_count,Not Reported_count,year
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,188513.0,52760.0,883.0,834.0,91.0,50.0,622.0,6.0,29.0,11449.0,179153.0,764.0,,2015
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,56381.0,6412.0,48.0,216.0,48.0,2.0,233.0,1.0,4.0,2720.0,33300.0,991.0,,2015
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,68434.0,6373.0,229.0,397.0,24.0,15.0,318.0,,2.0,1380.0,20907.0,159.0,,2015
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,99263.0,11296.0,234.0,435.0,55.0,31.0,846.0,5.0,10.0,2403.0,64530.0,1504.0,,2015
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,401143.0,90313.0,1591.0,2165.0,325.0,337.0,1576.0,23.0,42.0,9656.0,170761.0,5112.0,,2015
65,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,15950.0,2444.0,,,,,,,,773.0,9456.0,,,2015
78,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,212602.0,57637.0,3628.0,4241.0,247.0,483.0,14.0,,,3794.0,84570.0,662.0,,2015
91,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,30322.0,11656.0,363.0,276.0,9.0,21.0,1.0,,,3377.0,40000.0,166.0,,2015
104,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,19461.0,7915.0,50.0,321.0,18.0,4.0,268.0,,2.0,2788.0,29983.0,94.0,,2015
117,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,72828.0,24330.0,868.0,2173.0,36.0,40.0,,,,3305.0,49366.0,95.0,,2015


In [24]:
final_df.to_csv(DATASET_DEST)