In [1]:
import itertools
import math
import re
from pathlib import Path

import geopandas as gp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def replace_region(name):
    try:
        name = name.replace("MIMAROPA REGION", "REGION IV-B (MIMAROPA)")
        name = name.replace("REGION III - CENTRAL LUZON", "REGION III (Central Luzon)")
        name = name.replace("REGION II - CAGAYAN VALLEY", "REGION II (Cagayan Valley)")
        name = name.replace(
            "REGION VIII - EASTERN VISAYAS", "REGION VIII (Eastern Visayas)"
        )
        name = name.replace("REGION I - ILOCOS", "REGION I (Ilocos Region)")
        name = name.replace("REGION IV-A - CALABARZON", "REGION IV-A (CALABARZON)")
        name = name.replace(
            "CORDILLERA ADMINISTRATIVE REGION", "CAR - Cordillera Administrative Region"
        )
        name = name.replace(
            "REGION VI - WESTERN VISAYAS", "REGION VI (Western Visayas)"
        )
        name = name.replace(
            "AUTONOMOUS REGION IN MUSLIM MINDANAO",
            "BARMM - Bangsamoro Autonomous Region in Muslim Mindanao",
        )
        name = name.replace("REGION XII - SOCCSKSARGEN", "REGION XII (Soccsksargen)")
        name = name.replace(
            "REGION VII - CENTRAL VISAYAS", "REGION VII (Central Visayas)"
        )
        name = name.replace("REGION XIII - CARAGA", "REGION XIII (Caraga)")
        name = name.replace(
            "REGION IX - ZAMBOANGA PENINSULA", "REGION IX (Zamboanga Peninsula)"
        )
        name = name.replace(
            "REGION X - NORTHERN MINDANAO", "REGION X (Northern Mindanao)"
        )
        name = name.replace("REGION V - BICOL", "REGION V (Bicol Region)")
        name = name.replace("REGION XI - DAVAO", "REGION XI (Davao Region)")
        name = name.replace("NATIONAL CAPITAL REGION", "NCR - National Capital Region")
        return name
    except:
        return name

In [3]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]

DATASET = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-water-supply-cooking-2015.csv",
)

REG_PROV = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "region-province.csv",
)

DATASET_DEST = Path(
    PROJECTROOT,
    "data",
    "gathered-datasets",
    "housing-census",
    "housing-census-water-supply-cooking-2015-flattened.csv",
)

REF_DF = Path(PROJECTROOT, "data", "cleaned-datasets", "ph-shp-file", "ph-shp-file.shp")

In [4]:
def strip_names(city):
    try:
        return city.strip()
    except AttributeError:
        return np.nan

In [5]:
df = pd.read_csv(DATASET)
df.head(20)

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,Lake river rain and others,Peddler,Bottled water,Others,Not Reported
0,,,,,,,,,,,,,,,
1,NATIONAL CAPITAL REGION,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27,35,40.197,80.566,3.065,-
2,METROPOLITAN MANILA,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27,35,40.197,80.566,3.065,-
3,CITY OF MANILA,435.154,347.935,69.597,1.181,1.488,100,48,233,9,13,9.313,5.068,169,-
4,CITY OF MANDALUYONG,100.356,89.749,7.551,107,254,55,1,36,2,-,875,1.637,89,-
5,CITY OF MARIKINA,98.238,89.537,6.991,290,389,14,16,91,-,-,224,657.0,29,-
6,CITY OF PASIG,180.612,162.226,13.063,197,435,94,34,43,-,-,1.321,2.954,245,-
7,QUEZON CITY,683.044,561.729,102.919,2.375,3.006,334,343,293,4,17,4.548,6.572,904,-
8,CITY OF SAN JUAN,28.623,24.496,3.008,-,-,-,-,-,-,-,783,336.0,-,-
9,CALOOCAN CITY,367.878,283.142,66.524,5.172,5.595,191,290,15,-,-,2.740,3.974,235,-


In [6]:
first_col = "City/Municipality"	

In [7]:
df[first_col] = df[first_col].apply(strip_names)

In [8]:
# Group into regions
df["group"] = df.isnull().all(axis=1).cumsum()

In [9]:
regions = list(df["group"].unique())
regions

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [10]:
df.head(12)

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,Lake river rain and others,Peddler,Bottled water,Others,Not Reported,group
0,,,,,,,,,,,,,,,,1
1,NATIONAL CAPITAL REGION,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27,35,40.197,80.566,3.065,-,1
2,METROPOLITAN MANILA,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27,35,40.197,80.566,3.065,-,1
3,CITY OF MANILA,435.154,347.935,69.597,1.181,1.488,100,48,233,9,13,9.313,5.068,169,-,1
4,CITY OF MANDALUYONG,100.356,89.749,7.551,107,254,55,1,36,2,-,875.0,1.637,89,-,1
5,CITY OF MARIKINA,98.238,89.537,6.991,290,389,14,16,91,-,-,224.0,657.0,29,-,1
6,CITY OF PASIG,180.612,162.226,13.063,197,435,94,34,43,-,-,1.321,2.954,245,-,1
7,QUEZON CITY,683.044,561.729,102.919,2.375,3.006,334,343,293,4,17,4.548,6.572,904,-,1
8,CITY OF SAN JUAN,28.623,24.496,3.008,-,-,-,-,-,-,-,783.0,336.0,-,-,1
9,CALOOCAN CITY,367.878,283.142,66.524,5.172,5.595,191,290,15,-,-,2.74,3.974,235,-,1


In [11]:
df[first_col] = df[
    first_col
].apply(replace_region)

In [12]:
water_supply = df.columns[2:15].tolist()
water_supply

['Own use faucet community water system',
 'Shared faucet community water system',
 'Own use tubed/piped deep well',
 'Shared tubed/piped deep well',
 'Tubed/piped shallow well',
 'Dug well',
 'Protected spring',
 'Unprotected spring',
 'Lake river rain and others',
 'Peddler',
 'Bottled water',
 'Others',
 'Not Reported']

In [13]:
rp_df = pd.read_csv(REG_PROV)
rp_df.head()

Unnamed: 0,region,province
0,NCR - National Capital Region,METROPOLITAN MANILA
1,CAR - Cordillera Administrative Region,ABRA
2,CAR - Cordillera Administrative Region,BENGUET (excluding Baguio City)
3,CAR - Cordillera Administrative Region,IFUGAO
4,CAR - Cordillera Administrative Region,KALINGA


In [14]:
rp_df["province"] = rp_df["province"].apply(strip_names)

In [15]:
all_vals = []

In [16]:
def parse_region(df):
    df = df.copy()
    region_name = df[first_col].loc[1]
    unique_vals = df[first_col].dropna().unique()

    provinces = list(rp_df["province"].loc[rp_df["region"]==region_name].unique()) 
    
    df["province_no"] = df[first_col].isin(provinces).cumsum()
    df = df.loc[df["province_no"] != 0]
    
    province_no_list = df["province_no"].unique()
    
    for province in province_no_list:
            province_df = df.loc[df["province_no"]==province]
            parse_province(province_df, region_name)

In [17]:
def parse_province(province_df, region_name):
    province_df = province_df.copy()
    province_df = province_df.reset_index()
    
    province_name = province_df[first_col].iloc[0]
    unique_vals = province_df[first_col].dropna().unique()
    
    cities = [val for val in unique_vals if val not in [region_name, province_name, "Not Reported"]]
    
    for city in cities:
        print(f">>> Parsing {city},{province_name} in {region_name}...")
        idx = (
            province_df.loc[province_df[first_col] == city].index
        ).tolist()[0]
        
        parse_city(city.strip(), region_name, province_df, idx, province_name)

In [18]:
def parse_city(city, region_name, province_df, idx, province_name):
    city_df = province_df[idx:idx+10]
    
    city_vals = {}
    city_vals["city"] = city
    city_vals["region_name"] = region_name
    city_vals["province"] = province_name

    for supply in water_supply:        
        try:
            val = city_df[supply].loc[city_df[first_col] == city]
            val = str(val.values[0]).strip()
            val = val.replace(".", "")
        except IndexError:
            continue

        try:
            city_vals[f"{supply.strip()}_count"] = float(val)
        except:
            city_vals[f"{supply.strip()}_count"] = float(np.nan)
                
        all_vals.append(city_vals)

In [19]:
df.head()

Unnamed: 0,City/Municipality,Number of Households*,Own use faucet community water system,Shared faucet community water system,Own use tubed/piped deep well,Shared tubed/piped deep well,Tubed/piped shallow well,Dug well,Protected spring,Unprotected spring,Lake river rain and others,Peddler,Bottled water,Others,Not Reported,group
0,,,,,,,,,,,,,,,,1
1,NCR - National Capital Region,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27.0,35,40.197,80.566,3.065,-,1
2,METROPOLITAN MANILA,3.095.484,2.469.267,440.905,21.969,34.255,1.704,2.157,1.337,27.0,35,40.197,80.566,3.065,-,1
3,CITY OF MANILA,435.154,347.935,69.597,1.181,1.488,100.0,48.0,233.0,9.0,13,9.313,5.068,169.0,-,1
4,CITY OF MANDALUYONG,100.356,89.749,7.551,107.0,254.0,55.0,1.0,36.0,2.0,-,875.0,1.637,89.0,-,1


In [20]:
for region in regions:
    df_ = df.loc[df["group"] == region].reset_index()
    region = df_.iloc[1][first_col]
    parse_region(df_)

>>> Parsing CITY OF MANILA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MANDALUYONG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MARIKINA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF PASIG,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing QUEZON CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF SAN JUAN,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CALOOCAN CITY,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MALABON,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF NAVOTAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF VALENZUELA,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF LAS PIÑAS,METROPOLITAN MANILA in NCR - National Capital Region...
>>> Parsing CITY OF MAKATI,METROPOLITAN MANILA in NCR - National Capital R

In [21]:
final_df = pd.DataFrame.from_dict(all_vals).drop_duplicates()
final_df

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,Lake river rain and others_count,Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,347935.0,69597.0,1181.0,1488.0,100.0,48.0,233.0,9.0,13.0,9313.0,5068.0,169.0,
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,89749.0,7551.0,107.0,254.0,55.0,1.0,36.0,2.0,,875.0,1637.0,89.0,
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,89537.0,6991.0,290.0,389.0,14.0,16.0,91.0,,,224.0,657.0,29.0,
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,162226.0,13063.0,197.0,435.0,94.0,34.0,43.0,,,1321.0,2954.0,245.0,
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,561729.0,102919.0,2375.0,3006.0,334.0,343.0,293.0,4.0,17.0,4548.0,6572.0,904.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19474,TANDUBAS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,344.0,1551.0,6.0,7.0,9.0,2978.0,,1.0,2.0,8.0,,,
19487,TURTLE ISLANDS,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,3.0,,1.0,20.0,2.0,753.0,1.0,1.0,,,1.0,,
19500,LANGUYAN,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,1060.0,262.0,71.0,56.0,89.0,3956.0,2.0,2.0,703.0,161.0,5.0,,
19513,SAPA-SAPA,BARMM - Bangsamoro Autonomous Region in Muslim...,TAWI-TAWI,14.0,15.0,115.0,819.0,5.0,2852.0,1.0,1.0,1363.0,3.0,2.0,,


In [22]:
final_df.head(40)

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,Lake river rain and others_count,Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,347935.0,69597.0,1181.0,1488.0,100.0,48.0,233.0,9.0,13.0,9313.0,5068.0,169.0,
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,89749.0,7551.0,107.0,254.0,55.0,1.0,36.0,2.0,,875.0,1637.0,89.0,
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,89537.0,6991.0,290.0,389.0,14.0,16.0,91.0,,,224.0,657.0,29.0,
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,162226.0,13063.0,197.0,435.0,94.0,34.0,43.0,,,1321.0,2954.0,245.0,
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,561729.0,102919.0,2375.0,3006.0,334.0,343.0,293.0,4.0,17.0,4548.0,6572.0,904.0,
65,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,24496.0,3008.0,,,,,,,,783.0,336.0,,
78,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,283142.0,66524.0,5172.0,5595.0,191.0,290.0,15.0,,,2740.0,3974.0,235.0,
91,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,62428.0,19561.0,412.0,416.0,18.0,11.0,,,,1728.0,1575.0,42.0,
104,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,42529.0,13190.0,73.0,367.0,28.0,3.0,60.0,,1.0,3864.0,785.0,4.0,
117,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,111294.0,31136.0,2056.0,3234.0,33.0,309.0,,,,2352.0,2582.0,45.0,


In [23]:
final_df["year"]=2015
final_df.head(10)

Unnamed: 0,city,region_name,province,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,Lake river rain and others_count,Peddler_count,Bottled water_count,Others_count,Not Reported_count,year
0,CITY OF MANILA,NCR - National Capital Region,METROPOLITAN MANILA,347935.0,69597.0,1181.0,1488.0,100.0,48.0,233.0,9.0,13.0,9313.0,5068.0,169.0,,2015
13,CITY OF MANDALUYONG,NCR - National Capital Region,METROPOLITAN MANILA,89749.0,7551.0,107.0,254.0,55.0,1.0,36.0,2.0,,875.0,1637.0,89.0,,2015
26,CITY OF MARIKINA,NCR - National Capital Region,METROPOLITAN MANILA,89537.0,6991.0,290.0,389.0,14.0,16.0,91.0,,,224.0,657.0,29.0,,2015
39,CITY OF PASIG,NCR - National Capital Region,METROPOLITAN MANILA,162226.0,13063.0,197.0,435.0,94.0,34.0,43.0,,,1321.0,2954.0,245.0,,2015
52,QUEZON CITY,NCR - National Capital Region,METROPOLITAN MANILA,561729.0,102919.0,2375.0,3006.0,334.0,343.0,293.0,4.0,17.0,4548.0,6572.0,904.0,,2015
65,CITY OF SAN JUAN,NCR - National Capital Region,METROPOLITAN MANILA,24496.0,3008.0,,,,,,,,783.0,336.0,,,2015
78,CALOOCAN CITY,NCR - National Capital Region,METROPOLITAN MANILA,283142.0,66524.0,5172.0,5595.0,191.0,290.0,15.0,,,2740.0,3974.0,235.0,,2015
91,CITY OF MALABON,NCR - National Capital Region,METROPOLITAN MANILA,62428.0,19561.0,412.0,416.0,18.0,11.0,,,,1728.0,1575.0,42.0,,2015
104,CITY OF NAVOTAS,NCR - National Capital Region,METROPOLITAN MANILA,42529.0,13190.0,73.0,367.0,28.0,3.0,60.0,,1.0,3864.0,785.0,4.0,,2015
117,CITY OF VALENZUELA,NCR - National Capital Region,METROPOLITAN MANILA,111294.0,31136.0,2056.0,3234.0,33.0,309.0,,,,2352.0,2582.0,45.0,,2015


In [24]:
final_df.to_csv(DATASET_DEST)