In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import geopandas as gp
import re

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Setting up directories

WORKINGDIR = Path(os.getcwd())
PROJECTROOT = WORKINGDIR.parents[1]
CLEAN_DATASETS = Path(PROJECTROOT, "data", "cleaned-datasets")

# Reference df
COMPLETE_DATASET = Path(PROJECTROOT, "data", "cleaned-datasets","ph-shp-file", "ph-shp-file.shp")

# Health
HEALTH_FACILITIES_2019 = Path(CLEAN_DATASETS, "health-facilities-2019.csv")
HEALTH_FACILITIES_2021 = Path(CLEAN_DATASETS, "health-facilities-2021.csv")
HEALTH_PERSONNEL = Path(CLEAN_DATASETS, "health-personnel-2017.csv")

# Poverty
POVERTY_ESTIMATES = Path(CLEAN_DATASETS, "poverty-estimates-2009-2012-2015.csv")

# Vulnerable groups 
VULNERABLE_GRPS = Path(CLEAN_DATASETS, "vulnerable-grps-2016.csv")

# Housing datasets
TOILET_TYPES = Path(CLEAN_DATASETS, "toilet-types-2010.csv")
LIGHTING_SOURCE = Path(CLEAN_DATASETS, "housing-census-fuel-lighting-2015.csv")
HOUSETYPE = Path(CLEAN_DATASETS, "housing-census-housetype-2015.csv") # WIP
HOUSING_MATERIAL = Path(CLEAN_DATASETS, "housing-census-housing-material-2015.csv") # WIP
HOUSING_TENURE = Path(CLEAN_DATASETS, "housing-census-housing-tenure-2015.csv") # WIP
WATER_SUPPLY_COOKING = Path(CLEAN_DATASETS, "housing-census-water-supply-cooking-2015.csv")
WATER_SUPPLY_DRINKING = Path(CLEAN_DATASETS, "housing-census-water-supply-drinking-2015.csv")

# DTI 
DTI_DATASET = Path(CLEAN_DATASETS, "file-name-of-dti-dataset.csv")

## Import reference dataframe

In [3]:
ref_df = gp.read_file(COMPLETE_DATASET)
ref_df.head()

Unnamed: 0,name,city_munic,province,clean_idx,coords,geometry
0,Aborlan,Aborlan,Palawan,"aborlan, palawan","9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9..."
1,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro","13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373..."
2,Abucay,Abucay,Bataan,"abucay, bataan","14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696..."
3,Abulug,Abulug,Cagayan,"abulug, cagayan","18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ..."
4,Abuyog,Abuyog,Leyte,"abuyog, leyte","10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576..."


In [4]:
ref_df.drop(columns=["geometry", "coords"], inplace=True)
ref_df.head()

Unnamed: 0,name,city_munic,province,clean_idx
0,Aborlan,Aborlan,Palawan,"aborlan, palawan"
1,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro"
2,Abucay,Abucay,Bataan,"abucay, bataan"
3,Abulug,Abulug,Cagayan,"abulug, cagayan"
4,Abuyog,Abuyog,Leyte,"abuyog, leyte"


## Health

### Merging health facilities datasets

In [5]:
hf_2019 = pd.read_csv(HEALTH_FACILITIES_2019)
hf_2019.drop(columns=["Unnamed: 0"], inplace=True)
hf_2019.head()

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,DepEd Clinic,Drug Abuse Treatment and Rehabilitation Centers,General Clinic Laboratory,Hospital,Infirmary,Municipal Health Office,Provincial Health Office,Psychiatric Care Facility,Rural Health Unit,Social hygiene Clinic,year
0,Aborlan,0,0,18,0,0,0,0,0,0,1,0,0,0,1,0,2019
1,Abra De Ilog,0,0,9,0,0,0,0,0,0,1,0,0,0,1,0,2019
2,Abucay,0,0,11,1,0,0,0,0,0,0,0,0,0,1,0,2019
3,Abulug,0,0,8,0,0,0,0,0,1,0,0,0,0,1,0,2019
4,Abuyog,0,0,17,0,0,0,0,0,1,0,0,0,0,2,0,2019


In [6]:
hf_2021 = pd.read_csv(HEALTH_FACILITIES_2021)
hf_2021.drop(columns=["Unnamed: 0"], inplace=True)
hf_2021.head()

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,COVID-19 Testing Laboratory,DepEd Clinic,Dialysis Clinic,Drug Abuse Treatment and Rehabilitation Centers,...,General Clinic Laboratory,Hospital,Infirmary,Municipal Health Office,Provincial Health Office,Psychiatric Care Facility,Rural Health Unit,Social hygiene Clinic,Total,year
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,20.0,2021
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,11.0,2021
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13.0,2021
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,14.0,2021
4,Abuyog,0.0,0.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,26.0,2021


In [7]:
hf_2019.shape

(1634, 17)

In [8]:
hf_2021.shape

(1634, 21)

In [9]:
col_diff = [col for col in hf_2021.columns.tolist() if col not in hf_2019.columns.tolist()]
col_diff

['COVID-19 Testing Laboratory',
 'Dialysis Clinic',
 'Drug Testing Laboratory',
 'Total']

In [10]:
merged_hf = pd.concat([hf_2019.drop(columns=["year"]), hf_2021.drop(columns=["year"])], join='inner', ignore_index=True)
merged_hf.shape

(3268, 16)

In [11]:
merged_hf.columns

Index(['name', 'Ambulatory Surgical Clinic', 'Animal Bite Treatment Center',
       'Barangay Health Station', 'Birthing Home', 'City Health Office',
       'DepEd Clinic', 'Drug Abuse Treatment and Rehabilitation Centers',
       'General Clinic Laboratory', 'Hospital', 'Infirmary',
       'Municipal Health Office', 'Provincial Health Office',
       'Psychiatric Care Facility', 'Rural Health Unit',
       'Social hygiene Clinic'],
      dtype='object')

In [12]:
col_diff.remove("Total")
col_diff

['COVID-19 Testing Laboratory', 'Dialysis Clinic', 'Drug Testing Laboratory']

In [13]:
merged_hf

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,DepEd Clinic,Drug Abuse Treatment and Rehabilitation Centers,General Clinic Laboratory,Hospital,Infirmary,Municipal Health Office,Provincial Health Office,Psychiatric Care Facility,Rural Health Unit,Social hygiene Clinic
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Abuyog,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3263,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,13.0,3.0,0.0,0.0,0.0,17.0,1.0
3264,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3265,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3266,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
merged_hf = merged_hf.groupby(by="name").median().reset_index()
merged_hf

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,DepEd Clinic,Drug Abuse Treatment and Rehabilitation Centers,General Clinic Laboratory,Hospital,Infirmary,Municipal Health Office,Provincial Health Office,Psychiatric Care Facility,Rural Health Unit,Social hygiene Clinic
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,2.5,0.0,0.0,0.0,17.0,1.0
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
col_diff.append("name")
col_diff

['COVID-19 Testing Laboratory',
 'Dialysis Clinic',
 'Drug Testing Laboratory',
 'name']

In [16]:
hf_2021[col_diff]

Unnamed: 0,COVID-19 Testing Laboratory,Dialysis Clinic,Drug Testing Laboratory,name
0,0.0,0.0,0.0,Aborlan
1,0.0,0.0,0.0,Abra De Ilog
2,0.0,0.0,0.0,Abucay
3,0.0,0.0,0.0,Abulug
4,0.0,0.0,0.0,Abuyog
...,...,...,...,...
1629,4.0,0.0,0.0,Zamboanga
1630,0.0,0.0,0.0,Zamboanguita
1631,0.0,0.0,0.0,Zaragoza
1632,0.0,0.0,0.0,Zarraga


In [17]:
merged_hf = merged_hf.merge(hf_2021[col_diff], how="left", on="name")
merged_hf

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,DepEd Clinic,Drug Abuse Treatment and Rehabilitation Centers,General Clinic Laboratory,Hospital,Infirmary,Municipal Health Office,Provincial Health Office,Psychiatric Care Facility,Rural Health Unit,Social hygiene Clinic,COVID-19 Testing Laboratory,Dialysis Clinic,Drug Testing Laboratory
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,2.5,0.0,0.0,0.0,17.0,1.0,4.0,0.0,0.0
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
merged_hf.columns.tolist()

['name',
 'Ambulatory Surgical Clinic',
 'Animal Bite Treatment Center',
 'Barangay Health Station',
 'Birthing Home',
 'City Health Office',
 'DepEd Clinic',
 'Drug Abuse Treatment and Rehabilitation Centers',
 'General Clinic Laboratory',
 'Hospital',
 'Infirmary',
 'Municipal Health Office',
 'Provincial Health Office',
 'Psychiatric Care Facility',
 'Rural Health Unit',
 'Social hygiene Clinic',
 'COVID-19 Testing Laboratory',
 'Dialysis Clinic',
 'Drug Testing Laboratory']

### Health personnel

In [19]:
health_personnel_df = pd.read_csv(HEALTH_PERSONNEL)
health_personnel_df.drop(columns=["Unnamed: 0", "year"], inplace=True)
health_personnel_df.head()

Unnamed: 0,id,DOCTOR,NURSE,MIDWIFE,DENTIST,NUTRITIONI,PHARMACIST,OCCUPATION,MEDICAL TE,PHYSICAL T,RADIOLOGY,X-RAY TECH
0,Aborlan,20,30,7,1,0,0,0,1,0,0,0.0
1,Abra De Ilog,10,30,0,0,0,1,0,0,1,0,0.0
2,Abucay,20,10,5,1,0,0,0,4,0,0,0.0
3,Abulug,10,30,7,1,0,0,0,1,0,0,0.0
4,Abuyog,80,210,20,2,1,3,0,7,0,1,0.0


In [20]:
merged_df = merged_hf.merge(health_personnel_df, how="left", left_on="name", right_on="id")
merged_df

Unnamed: 0,name,Ambulatory Surgical Clinic,Animal Bite Treatment Center,Barangay Health Station,Birthing Home,City Health Office,DepEd Clinic,Drug Abuse Treatment and Rehabilitation Centers,General Clinic Laboratory,Hospital,...,NURSE,MIDWIFE,DENTIST,NUTRITIONI,PHARMACIST,OCCUPATION,MEDICAL TE,PHYSICAL T,RADIOLOGY,X-RAY TECH
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,30,7,1,0,0,0,1,0,0,0.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30,0,0,0,1,0,0,1,0,0.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,10,5,1,0,0,0,4,0,0,0.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,30,7,1,0,0,0,1,0,0,0.0
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,210,20,2,1,3,0,7,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,1365,394,11,30,51,0,118,20,54,10.0
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,40,7,0,0,0,0,1,0,0,0.0
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,30,10,1,0,0,0,1,0,0,0.0
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80,6,1,0,0,0,0,0,0,0.0


In [21]:
merged_df.drop(columns=["id"], inplace=True)

In [22]:
new_col_names = ["health_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "name" else col for col in merged_df.columns.tolist()]
new_col_names

['name',
 'health_ambulatory_surgical_clinic',
 'health_animal_bite_treatment_center',
 'health_barangay_health_station',
 'health_birthing_home',
 'health_city_health_office',
 'health_deped_clinic',
 'health_drug_abuse_treatment_and_rehabilitation_centers',
 'health_general_clinic_laboratory',
 'health_hospital',
 'health_infirmary',
 'health_municipal_health_office',
 'health_provincial_health_office',
 'health_psychiatric_care_facility',
 'health_rural_health_unit',
 'health_social_hygiene_clinic',
 'health_covid_19_testing_laboratory',
 'health_dialysis_clinic',
 'health_drug_testing_laboratory',
 'health_doctor',
 'health_nurse',
 'health_midwife',
 'health_dentist',
 'health_nutritioni',
 'health_pharmacist',
 'health_occupation',
 'health_medical_te',
 'health_physical_t',
 'health_radiology',
 'health_x_ray_tech']

In [23]:
merged_df.columns = new_col_names

In [24]:
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,health_nurse,health_midwife,health_dentist,health_nutritioni,health_pharmacist,health_occupation,health_medical_te,health_physical_t,health_radiology,health_x_ray_tech
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,30,7,1,0,0,0,1,0,0,0.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30,0,0,0,1,0,0,1,0,0.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,10,5,1,0,0,0,4,0,0,0.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,30,7,1,0,0,0,1,0,0,0.0
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,210,20,2,1,3,0,7,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,1365,394,11,30,51,0,118,20,54,10.0
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,40,7,0,0,0,0,1,0,0,0.0
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,30,10,1,0,0,0,1,0,0,0.0
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80,6,1,0,0,0,0,0,0,0.0


## Poverty

In [25]:
poverty_df = pd.read_csv(POVERTY_ESTIMATES)
poverty_df.drop(columns=["Unnamed: 0", "year"], inplace=True)
poverty_df.head()

Unnamed: 0,name,poverty-inc,cov,conf-int-lowerlimit,conf-int-upper-limit
0,Aborlan,23.4,17.7,16.6,30.2
1,Abra De Ilog,37.5,16.4,27.4,47.5
2,Abucay,7.0,29.5,3.6,10.4
3,Abulug,21.6,12.1,17.3,25.9
4,Abuyog,35.4,5.5,32.1,38.6


In [26]:
new_col_names = ["poverty_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "name" else col for col in poverty_df.columns.tolist()]
new_col_names

['name',
 'poverty_poverty_inc',
 'poverty_cov',
 'poverty_conf_int_lowerlimit',
 'poverty_conf_int_upper_limit']

In [27]:
poverty_df = poverty_df.groupby("name").median().reset_index()
poverty_df

Unnamed: 0,name,poverty-inc,cov,conf-int-lowerlimit,conf-int-upper-limit
0,Aborlan,22.4,12.4,17.4,26.9
1,Abra De Ilog,34.8,13.8,27.4,41.9
2,Abucay,7.0,25.8,3.6,10.4
3,Abulug,20.9,12.1,17.3,24.3
4,Abuyog,35.9,6.2,32.1,39.6
...,...,...,...,...,...
1629,Zamboanga,17.3,10.2,14.7,19.8
1630,Zamboanguita,39.1,11.6,31.0,46.5
1631,Zaragoza,18.2,13.6,15.0,20.9
1632,Zarraga,16.8,14.8,12.9,20.7


In [28]:
poverty_df.columns = new_col_names
poverty_df

Unnamed: 0,name,poverty_poverty_inc,poverty_cov,poverty_conf_int_lowerlimit,poverty_conf_int_upper_limit
0,Aborlan,22.4,12.4,17.4,26.9
1,Abra De Ilog,34.8,13.8,27.4,41.9
2,Abucay,7.0,25.8,3.6,10.4
3,Abulug,20.9,12.1,17.3,24.3
4,Abuyog,35.9,6.2,32.1,39.6
...,...,...,...,...,...
1629,Zamboanga,17.3,10.2,14.7,19.8
1630,Zamboanguita,39.1,11.6,31.0,46.5
1631,Zaragoza,18.2,13.6,15.0,20.9
1632,Zarraga,16.8,14.8,12.9,20.7


In [29]:
merged_df = merged_df.merge(poverty_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,health_pharmacist,health_occupation,health_medical_te,health_physical_t,health_radiology,health_x_ray_tech,poverty_poverty_inc,poverty_cov,poverty_conf_int_lowerlimit,poverty_conf_int_upper_limit
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0,0,1,0,0,0.0,22.4,12.4,17.4,26.9
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0.0,34.8,13.8,27.4,41.9
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,4,0,0,0.0,7.0,25.8,3.6,10.4
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0,0,1,0,0,0.0,20.9,12.1,17.3,24.3
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,3,0,7,0,1,0.0,35.9,6.2,32.1,39.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,51,0,118,20,54,10.0,17.3,10.2,14.7,19.8
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.0,39.1,11.6,31.0,46.5
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0.0,18.2,13.6,15.0,20.9
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.0,16.8,14.8,12.9,20.7


## Vulnerable groups 

In [30]:
vlnrble_grps = pd.read_csv(VULNERABLE_GRPS)
vlnrble_grps.drop(columns=["Unnamed: 0", "year"], inplace=True)
vlnrble_grps.head()

Unnamed: 0,id,Child Headed_Male,Child Headed_Female,Single Headed_Male,Single Headed_Female,Disability_Male,Disability_Female,Solo Parent_Male,Solo Parent_Female,Older_Male,Older_Female
0,Aborlan,8,3,580,17,1240,1040,620,1290,1990,1730
1,Abra De Ilog,26,1,430,9,980,890,1030,1610,2040,1900
2,Abucay,5,0,1460,6,330,180,110,530,340,400
3,Abulug,4,0,90,8,610,380,320,760,1170,1440
4,Abuyog,4,1,550,7,2010,2260,1840,3920,4270,4920


In [31]:
new_col_names = ["vulnerable_grps_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "id" else "name" for col in vlnrble_grps.columns.tolist()]
new_col_names

['name',
 'vulnerable_grps_child_headed_male',
 'vulnerable_grps_child_headed_female',
 'vulnerable_grps_single_headed_male',
 'vulnerable_grps_single_headed_female',
 'vulnerable_grps_disability_male',
 'vulnerable_grps_disability_female',
 'vulnerable_grps_solo_parent_male',
 'vulnerable_grps_solo_parent_female',
 'vulnerable_grps_older_male',
 'vulnerable_grps_older_female']

In [32]:
vlnrble_grps = vlnrble_grps.groupby("id").median().reset_index()
vlnrble_grps

Unnamed: 0,id,Child Headed_Male,Child Headed_Female,Single Headed_Male,Single Headed_Female,Disability_Male,Disability_Female,Solo Parent_Male,Solo Parent_Female,Older_Male,Older_Female
0,Aborlan,8.0,3.0,580.0,17.0,1240.0,1040.0,620.0,1290.0,1990.0,1730.0
1,Abra De Ilog,26.0,1.0,430.0,9.0,980.0,890.0,1030.0,1610.0,2040.0,1900.0
2,Abucay,5.0,0.0,1460.0,6.0,330.0,180.0,110.0,530.0,340.0,400.0
3,Abulug,4.0,0.0,90.0,8.0,610.0,380.0,320.0,760.0,1170.0,1440.0
4,Abuyog,4.0,1.0,550.0,7.0,2010.0,2260.0,1840.0,3920.0,4270.0,4920.0
...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,150.0,28.0,8910.0,273.0,1274.0,1029.0,2021.0,5769.0,3556.0,4332.0
1630,Zamboanguita,8.0,0.0,290.0,37.0,380.0,480.0,1150.0,3240.0,2580.0,3310.0
1631,Zaragoza,0.0,0.0,650.0,3.0,310.0,220.0,290.0,530.0,750.0,1000.0
1632,Zarraga,3.0,2.0,330.0,17.0,1140.0,1030.0,570.0,1360.0,1230.0,1850.0


In [33]:
vlnrble_grps.columns = new_col_names
vlnrble_grps.head()

Unnamed: 0,name,vulnerable_grps_child_headed_male,vulnerable_grps_child_headed_female,vulnerable_grps_single_headed_male,vulnerable_grps_single_headed_female,vulnerable_grps_disability_male,vulnerable_grps_disability_female,vulnerable_grps_solo_parent_male,vulnerable_grps_solo_parent_female,vulnerable_grps_older_male,vulnerable_grps_older_female
0,Aborlan,8.0,3.0,580.0,17.0,1240.0,1040.0,620.0,1290.0,1990.0,1730.0
1,Abra De Ilog,26.0,1.0,430.0,9.0,980.0,890.0,1030.0,1610.0,2040.0,1900.0
2,Abucay,5.0,0.0,1460.0,6.0,330.0,180.0,110.0,530.0,340.0,400.0
3,Abulug,4.0,0.0,90.0,8.0,610.0,380.0,320.0,760.0,1170.0,1440.0
4,Abuyog,4.0,1.0,550.0,7.0,2010.0,2260.0,1840.0,3920.0,4270.0,4920.0


In [34]:
merged_df = merged_df.merge(vlnrble_grps, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,vulnerable_grps_child_headed_male,vulnerable_grps_child_headed_female,vulnerable_grps_single_headed_male,vulnerable_grps_single_headed_female,vulnerable_grps_disability_male,vulnerable_grps_disability_female,vulnerable_grps_solo_parent_male,vulnerable_grps_solo_parent_female,vulnerable_grps_older_male,vulnerable_grps_older_female
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,8.0,3.0,580.0,17.0,1240.0,1040.0,620.0,1290.0,1990.0,1730.0
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26.0,1.0,430.0,9.0,980.0,890.0,1030.0,1610.0,2040.0,1900.0
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,1460.0,6.0,330.0,180.0,110.0,530.0,340.0,400.0
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,4.0,0.0,90.0,8.0,610.0,380.0,320.0,760.0,1170.0,1440.0
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,4.0,1.0,550.0,7.0,2010.0,2260.0,1840.0,3920.0,4270.0,4920.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,150.0,28.0,8910.0,273.0,1274.0,1029.0,2021.0,5769.0,3556.0,4332.0
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,290.0,37.0,380.0,480.0,1150.0,3240.0,2580.0,3310.0
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,650.0,3.0,310.0,220.0,290.0,530.0,750.0,1000.0
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,330.0,17.0,1140.0,1030.0,570.0,1360.0,1230.0,1850.0


## Housing datasets

### Toilet types

In [35]:
toilet_types = pd.read_csv(TOILET_TYPES)
toilet_types.drop(columns=["Unnamed: 0", "year"], inplace=True)
toilet_types.head()

Unnamed: 0,id,water-seal,closed-pit,open-pit,none
0,Aborlan,3025,2378,7270,8520
1,Abra De Ilog,3791,1730,2180,199
2,Abucay,8028,740,520,3060
3,Abulug,5556,1103,1130,550
4,Abuyog,9547,6080,3860,1722


In [36]:
new_col_names = ["toilet_type_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "id" else "name" for col in toilet_types.columns.tolist()]
new_col_names

['name',
 'toilet_type_water_seal',
 'toilet_type_closed_pit',
 'toilet_type_open_pit',
 'toilet_type_none']

In [37]:
toilet_types.columns = new_col_names

In [38]:
merged_df = merged_df.merge(toilet_types, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,vulnerable_grps_disability_male,vulnerable_grps_disability_female,vulnerable_grps_solo_parent_male,vulnerable_grps_solo_parent_female,vulnerable_grps_older_male,vulnerable_grps_older_female,toilet_type_water_seal,toilet_type_closed_pit,toilet_type_open_pit,toilet_type_none
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,1240.0,1040.0,620.0,1290.0,1990.0,1730.0,3025,2378,7270,8520
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,980.0,890.0,1030.0,1610.0,2040.0,1900.0,3791,1730,2180,199
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,330.0,180.0,110.0,530.0,340.0,400.0,8028,740,520,3060
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,610.0,380.0,320.0,760.0,1170.0,1440.0,5556,1103,1130,550
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,2010.0,2260.0,1840.0,3920.0,4270.0,4920.0,9547,6080,3860,1722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,1274.0,1029.0,2021.0,5769.0,3556.0,4332.0,12434,24132,17413,6039
1630,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,380.0,480.0,1150.0,3240.0,2580.0,3310.0,362,4540,3500,108
1631,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,310.0,220.0,290.0,530.0,750.0,1000.0,8765,5960,500,2150
1632,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1140.0,1030.0,570.0,1360.0,1230.0,1850.0,4642,2980,1020,110


### Lighting source

In [39]:
lighting_source_df = pd.read_csv(LIGHTING_SOURCE)
lighting_source_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
lighting_source_df.head()

Unnamed: 0,name,Electricity_count,Kerosene (Gaas)_count,Liquified Petroleum Gas (LPG)_count,Oil (vegetable animal and others)_count,Solar panel_count,Solar lamp_count,Others_count,None_count,Not Reported_count
0,Aborlan,5606.0,1436.0,,,669.0,472.0,111.0,,
1,Abra De Ilog,4773.0,1333.0,30.0,18.0,20.0,550.0,578.0,27.0,
2,Abucay,9256.0,165.0,4.0,,1.0,1.0,19.0,7.0,
3,Abulug,6729.0,643.0,3.0,1.0,8.0,18.0,3.0,,
4,Abuyog,11369.0,2062.0,7.0,,12.0,43.0,10.0,5.0,


In [40]:
new_col_names = ["light_source_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "name" else "name" for col in lighting_source_df.columns.tolist()]
new_col_names

['name',
 'light_source_electricity_count',
 'light_source_kerosene_(gaas)_count',
 'light_source_liquified_petroleum_gas_(lpg)_count',
 'light_source_oil_(vegetable_animal_and_others)_count',
 'light_source_solar_panel_count',
 'light_source_solar_lamp_count',
 'light_source_others_count',
 'light_source_none_count',
 'light_source_not_reported_count']

In [41]:
lighting_source_df.columns = new_col_names

In [42]:
merged_df = merged_df.merge(lighting_source_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,toilet_type_none,light_source_electricity_count,light_source_kerosene_(gaas)_count,light_source_liquified_petroleum_gas_(lpg)_count,light_source_oil_(vegetable_animal_and_others)_count,light_source_solar_panel_count,light_source_solar_lamp_count,light_source_others_count,light_source_none_count,light_source_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,8520,5606.0,1436.0,,,669.0,472.0,111.0,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,199,4773.0,1333.0,30.0,18.0,20.0,550.0,578.0,27.0,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3060,9256.0,165.0,4.0,,1.0,1.0,19.0,7.0,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,550,6729.0,643.0,3.0,1.0,8.0,18.0,3.0,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,1722,11369.0,2062.0,7.0,,12.0,43.0,10.0,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,6039,168368.0,17544.0,867.0,33.0,1115.0,1765.0,382.0,489.0,5.0
1632,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,108,4304.0,1951.0,,2.0,111.0,212.0,27.0,,
1633,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2150,11438.0,257.0,5.0,1.0,3.0,4.0,12.0,3.0,
1634,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,110,5544.0,296.0,1.0,,2.0,3.0,7.0,,


In [43]:
len(merged_df.name.unique())

1634

In [44]:
merged_df.drop_duplicates()

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,toilet_type_none,light_source_electricity_count,light_source_kerosene_(gaas)_count,light_source_liquified_petroleum_gas_(lpg)_count,light_source_oil_(vegetable_animal_and_others)_count,light_source_solar_panel_count,light_source_solar_lamp_count,light_source_others_count,light_source_none_count,light_source_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,8520,5606.0,1436.0,,,669.0,472.0,111.0,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,199,4773.0,1333.0,30.0,18.0,20.0,550.0,578.0,27.0,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3060,9256.0,165.0,4.0,,1.0,1.0,19.0,7.0,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,550,6729.0,643.0,3.0,1.0,8.0,18.0,3.0,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,1722,11369.0,2062.0,7.0,,12.0,43.0,10.0,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,6039,168368.0,17544.0,867.0,33.0,1115.0,1765.0,382.0,489.0,5.0
1632,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,108,4304.0,1951.0,,2.0,111.0,212.0,27.0,,
1633,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2150,11438.0,257.0,5.0,1.0,3.0,4.0,12.0,3.0,
1634,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,110,5544.0,296.0,1.0,,2.0,3.0,7.0,,


### Water supply for cooking

In [45]:
water_cooking_df = pd.read_csv(WATER_SUPPLY_COOKING)
water_cooking_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
water_cooking_df.head()

Unnamed: 0,name,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,Lake river rain and others_count,Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,Aborlan,2447.0,1564.0,1007.0,2060.0,413.0,524.0,150.0,13.0,101.0,4.0,11.0,,
1,Abra De Ilog,1282.0,461.0,917.0,2612.0,848.0,299.0,538.0,197.0,174.0,,,1.0,
2,Abucay,1146.0,1518.0,628.0,5777.0,33.0,4.0,129.0,,12.0,1.0,203.0,2.0,
3,Abulug,632.0,273.0,2274.0,1987.0,236.0,1929.0,,1.0,,,73.0,,
4,Abuyog,806.0,3322.0,947.0,3260.0,153.0,2408.0,1779.0,499.0,9.0,50.0,275.0,,


In [46]:
new_col_names = ["cooking_water_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "name" else "name" for col in water_cooking_df.columns.tolist()]
new_col_names

['name',
 'cooking_water_own_use_faucet_community_water_system_count',
 'cooking_water_shared_faucet_community_water_system_count',
 'cooking_water_own_use_tubed/piped_deep_well_count',
 'cooking_water_shared_tubed/piped_deep_well_count',
 'cooking_water_tubed/piped_shallow_well_count',
 'cooking_water_dug_well_count',
 'cooking_water_protected_spring_count',
 'cooking_water_unprotected_spring_count',
 'cooking_water_lake_river_rain_and_others_count',
 'cooking_water_peddler_count',
 'cooking_water_bottled_water_count',
 'cooking_water_others_count',
 'cooking_water_not_reported_count']

In [47]:
water_cooking_df.columns = new_col_names

In [48]:
merged_df = merged_df.merge(water_cooking_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,cooking_water_shared_tubed/piped_deep_well_count,cooking_water_tubed/piped_shallow_well_count,cooking_water_dug_well_count,cooking_water_protected_spring_count,cooking_water_unprotected_spring_count,cooking_water_lake_river_rain_and_others_count,cooking_water_peddler_count,cooking_water_bottled_water_count,cooking_water_others_count,cooking_water_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,2060.0,413.0,524.0,150.0,13.0,101.0,4.0,11.0,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2612.0,848.0,299.0,538.0,197.0,174.0,,,1.0,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5777.0,33.0,4.0,129.0,,12.0,1.0,203.0,2.0,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,1987.0,236.0,1929.0,,1.0,,,73.0,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,3260.0,153.0,2408.0,1779.0,499.0,9.0,50.0,275.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,14045.0,1419.0,14196.0,4591.0,443.0,1456.0,13796.0,3672.0,189.0,
1636,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1385.0,8.0,673.0,104.0,22.0,,83.0,40.0,3.0,
1637,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2791.0,302.0,,3.0,1.0,,,48.0,5.0,
1638,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1885.0,56.0,87.0,1.0,2.0,6.0,327.0,545.0,,


### Water supply for drinking

In [49]:
water_drinking_df = pd.read_csv(WATER_SUPPLY_COOKING)
water_drinking_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
water_drinking_df.head()

Unnamed: 0,name,Own use faucet community water system_count,Shared faucet community water system_count,Own use tubed/piped deep well_count,Shared tubed/piped deep well_count,Tubed/piped shallow well_count,Dug well_count,Protected spring_count,Unprotected spring_count,Lake river rain and others_count,Peddler_count,Bottled water_count,Others_count,Not Reported_count
0,Aborlan,2447.0,1564.0,1007.0,2060.0,413.0,524.0,150.0,13.0,101.0,4.0,11.0,,
1,Abra De Ilog,1282.0,461.0,917.0,2612.0,848.0,299.0,538.0,197.0,174.0,,,1.0,
2,Abucay,1146.0,1518.0,628.0,5777.0,33.0,4.0,129.0,,12.0,1.0,203.0,2.0,
3,Abulug,632.0,273.0,2274.0,1987.0,236.0,1929.0,,1.0,,,73.0,,
4,Abuyog,806.0,3322.0,947.0,3260.0,153.0,2408.0,1779.0,499.0,9.0,50.0,275.0,,


In [50]:
new_col_names = ["drinking_water_"+(col.lower().replace(" ", "_").replace("-", "_")) if col != "name" else "name" for col in water_drinking_df.columns.tolist()]
new_col_names

['name',
 'drinking_water_own_use_faucet_community_water_system_count',
 'drinking_water_shared_faucet_community_water_system_count',
 'drinking_water_own_use_tubed/piped_deep_well_count',
 'drinking_water_shared_tubed/piped_deep_well_count',
 'drinking_water_tubed/piped_shallow_well_count',
 'drinking_water_dug_well_count',
 'drinking_water_protected_spring_count',
 'drinking_water_unprotected_spring_count',
 'drinking_water_lake_river_rain_and_others_count',
 'drinking_water_peddler_count',
 'drinking_water_bottled_water_count',
 'drinking_water_others_count',
 'drinking_water_not_reported_count']

In [51]:
water_drinking_df.columns = new_col_names

In [52]:
merged_df = merged_df.merge(water_drinking_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,drinking_water_shared_tubed/piped_deep_well_count,drinking_water_tubed/piped_shallow_well_count,drinking_water_dug_well_count,drinking_water_protected_spring_count,drinking_water_unprotected_spring_count,drinking_water_lake_river_rain_and_others_count,drinking_water_peddler_count,drinking_water_bottled_water_count,drinking_water_others_count,drinking_water_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,2060.0,413.0,524.0,150.0,13.0,101.0,4.0,11.0,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2612.0,848.0,299.0,538.0,197.0,174.0,,,1.0,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5777.0,33.0,4.0,129.0,,12.0,1.0,203.0,2.0,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,1987.0,236.0,1929.0,,1.0,,,73.0,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,3260.0,153.0,2408.0,1779.0,499.0,9.0,50.0,275.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1643,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,14045.0,1419.0,14196.0,4591.0,443.0,1456.0,13796.0,3672.0,189.0,
1644,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1385.0,8.0,673.0,104.0,22.0,,83.0,40.0,3.0,
1645,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2791.0,302.0,,3.0,1.0,,,48.0,5.0,
1646,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1885.0,56.0,87.0,1.0,2.0,6.0,327.0,545.0,,


### House types

In [53]:
house_type_df = pd.read_csv(HOUSETYPE)
house_type_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
house_type_df = house_type_df.loc[~house_type_df["bldg_type"].isnull()]
house_type_df.head()

Unnamed: 0,name,bldg_type,Occupied Housing Units,Number of Households*,Household Population*,Average Household Size,Ratio of Households to Occupied Housing Units,Ratio of Household Population to Occupied Housing Units
0,Aborlan,Single house,7621.0,7644.0,33011.0,4.32,1.0,4.33
1,Aborlan,Duplex,274.0,276.0,1082.0,3.92,1.01,3.95
2,Aborlan,Multi-unit residential,358.0,360.0,855.0,2.38,1.01,2.39
3,Aborlan,Commercial/industrial/agricultural,5.0,5.0,23.0,4.6,1.0,4.6
4,Aborlan,Institutional living quarter,1.0,1.0,4.0,4.0,1.0,4.0


In [54]:
house_types = tuple(house_type_df.bldg_type.unique())
house_types

('Single house',
 'Duplex',
 'Multi-unit residential',
 'Commercial/industrial/agricultural',
 'Institutional living quarter',
 'Others',
 'Not Reported')

In [55]:
for i in range(len(house_types)):
    new_housetype_df = house_type_df.loc[house_type_df.bldg_type == house_types[i]].drop(columns=["bldg_type"])
    clean_house_type = house_types[i].lower().replace(" ", "_")
    
    new_col_names = [f"housetype_{clean_house_type}_"+(col.lower().replace(" ", "_").replace("-", "_")).replace("*", "") if col != "name" else "name" for col in new_housetype_df.columns.tolist()]

    new_housetype_df.columns = new_col_names
    
    if i != 0:
        merged_housetype_df = merged_housetype_df.merge(new_housetype_df, how="left", on="name")
    else:
        merged_housetype_df = new_housetype_df

In [56]:
merged_housetype_df

Unnamed: 0,name,housetype_single_house_occupied_housing_units,housetype_single_house_number_of_households,housetype_single_house_household_population,housetype_single_house_average_household_size,housetype_single_house_ratio_of_households_to_occupied_housing_units,housetype_single_house_ratio_of_household_population_to_occupied_housing_units,housetype_duplex_occupied_housing_units,housetype_duplex_number_of_households,housetype_duplex_household_population,...,housetype_others_household_population,housetype_others_average_household_size,housetype_others_ratio_of_households_to_occupied_housing_units,housetype_others_ratio_of_household_population_to_occupied_housing_units,housetype_not_reported_occupied_housing_units,housetype_not_reported_number_of_households,housetype_not_reported_household_population,housetype_not_reported_average_household_size,housetype_not_reported_ratio_of_households_to_occupied_housing_units,housetype_not_reported_ratio_of_household_population_to_occupied_housing_units
0,Aborlan,7621.0,7644.0,33011.0,4.32,1.00,4.33,274.0,276.0,1082.0,...,,,,,8.0,8.0,29.0,3.63,1.00,3.63
1,Abra De Ilog,7060.0,7198.0,30733.0,4.27,1.02,4.35,88.0,94.0,324.0,...,,,,,4.0,4.0,15.0,3.75,1.00,3.75
2,Abucay,7643.0,7819.0,33215.0,4.25,1.02,4.35,862.0,878.0,3579.0,...,,,,,,,,,,
3,Abulug,7055.0,7084.0,31184.0,4.40,1.00,4.42,243.0,243.0,991.0,...,,,,,2.0,3.0,11.0,3.67,1.50,5.50
4,Abuyog,12629.0,12826.0,54747.0,4.27,1.02,4.34,480.0,495.0,1962.0,...,,,,,9.0,10.0,48.0,4.80,1.11,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499,Zamboanga,151315.0,155680.0,699640.0,4.49,1.03,4.62,17182.0,17527.0,72517.0,...,105.0,4.2,1.0,4.2,1423.0,1452.0,5886.0,4.05,1.02,4.14
1500,Zamboanguita,6378.0,6384.0,26637.0,4.17,1.00,4.18,128.0,129.0,504.0,...,,,,,5.0,5.0,20.0,4.00,1.00,4.00
1501,Zaragoza,10493.0,11216.0,47223.0,4.21,1.07,4.50,344.0,356.0,1476.0,...,,,,,8.0,14.0,56.0,4.00,1.75,7.00
1502,Zarraga,5618.0,5623.0,24583.0,4.37,1.00,4.38,156.0,156.0,627.0,...,,,,,3.0,3.0,9.0,3.00,1.00,3.00


In [57]:
merged_df = merged_df.merge(merged_housetype_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,housetype_others_household_population,housetype_others_average_household_size,housetype_others_ratio_of_households_to_occupied_housing_units,housetype_others_ratio_of_household_population_to_occupied_housing_units,housetype_not_reported_occupied_housing_units,housetype_not_reported_number_of_households,housetype_not_reported_household_population,housetype_not_reported_average_household_size,housetype_not_reported_ratio_of_households_to_occupied_housing_units,housetype_not_reported_ratio_of_household_population_to_occupied_housing_units
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,,,,,8.0,8.0,29.0,3.63,1.00,3.63
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,4.0,4.0,15.0,3.75,1.00,3.75
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,,,,,2.0,3.0,11.0,3.67,1.50,5.50
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,,,,,9.0,10.0,48.0,4.80,1.11,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1643,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,105.0,4.2,1.0,4.2,1423.0,1452.0,5886.0,4.05,1.02,4.14
1644,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,5.0,5.0,20.0,4.00,1.00,4.00
1645,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,8.0,14.0,56.0,4.00,1.75,7.00
1646,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,3.0,3.0,9.0,3.00,1.00,3.00


### House material

In [58]:
def clean_string(str_, prefix):
    return (prefix + "_" + str_.lower().replace("/", "_").replace(" ", "_")).replace("__", "_")

In [59]:
house_material_df = pd.read_csv(HOUSING_MATERIAL)
house_material_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
house_material_df = house_material_df.loc[~house_material_df["outer_wall_materials"].isnull()].loc[~house_material_df["roof_materials"].isnull()]
house_material_df.head()

Unnamed: 0,name,outer_wall_materials,roof_materials,count
0,Aborlan,Concrete/brick/stone,Galvanized iron/aluminum,11.0
1,Aborlan,Concrete/brick/stone,Tile/concrete/clay tile,7.0
2,Aborlan,Concrete/brick/stone,Half galvanized iron and half concrete,11.0
3,Aborlan,Concrete/brick/stone,Bamboo/cogon/ nipa/anahaw,
4,Aborlan,Concrete/brick/stone,Asbestos,


In [60]:
house_material_df.outer_wall_materials = house_material_df.outer_wall_materials.apply(clean_string, prefix="outer_wall")
house_material_df.roof_materials = house_material_df.roof_materials.apply(clean_string, prefix="roof")

In [61]:
house_material_df

Unnamed: 0,name,outer_wall_materials,roof_materials,count
0,Aborlan,outer_wall_concrete_brick_stone,roof_galvanized_iron_aluminum,11.0
1,Aborlan,outer_wall_concrete_brick_stone,roof_tile_concrete_clay_tile,7.0
2,Aborlan,outer_wall_concrete_brick_stone,roof_half_galvanized_iron_and_half_concrete,11.0
3,Aborlan,outer_wall_concrete_brick_stone,roof_bamboo_cogon_nipa_anahaw,
4,Aborlan,outer_wall_concrete_brick_stone,roof_asbestos,
...,...,...,...,...
162452,Zumarraga,outer_wall_not_reported,roof_asbestos,
162453,Zumarraga,outer_wall_not_reported,roof_makeshift_salvaged_improvised_materials,
162454,Zumarraga,outer_wall_not_reported,roof_trapal,
162455,Zumarraga,outer_wall_not_reported,roof_others,


In [62]:
outer_wall_mats = tuple(house_material_df.outer_wall_materials.unique())
outer_wall_mats

('outer_wall_concrete_brick_stone',
 'outer_wall_wood',
 'outer_wall_half_concrete_brick_stone_and_half_wood',
 'outer_wall_galvanized_iron_aluminum',
 'outer_wall_bamboo_sawali_cogon_nipa',
 'outer_wall_asbestos',
 'outer_wall_glass',
 'outer_wall_makeshift_salvaged_improvised_materials',
 'outer_wall_trapal',
 'outer_wall_others',
 'outer_wall_no_walls',
 'outer_wall_not_reported')

In [63]:
roof_mats = tuple(house_material_df.roof_materials.unique())
roof_mats

('roof_galvanized_iron_aluminum',
 'roof_tile_concrete_clay_tile',
 'roof_half_galvanized_iron_and_half_concrete',
 'roof_bamboo_cogon_nipa_anahaw',
 'roof_asbestos',
 'roof_makeshift_salvaged_improvised_materials',
 'roof_trapal',
 'roof_others',
 'roof_not_reported')

In [64]:
mats_combo = [(outer_wall_mat, roof_mat) for outer_wall_mat in outer_wall_mats for roof_mat in roof_mats]

In [65]:
for (outer_wall_mat, roof_mat) in mats_combo:
    
    new_df = house_material_df[["name", "count"]].loc[house_material_df.outer_wall_materials == outer_wall_mat].loc[house_material_df.roof_materials == roof_mat]
    new_df = new_df.sort_values("name")
    
    new_vals = new_df["count"].tolist()
    new_vals = [float(val) for val in new_vals]
    
    new_df[f"house_mats_{outer_wall_mat}_{roof_mat}_count"] = new_vals
    
    if outer_wall_mat == mats_combo[0][0] and roof_mat == mats_combo[0][1]:
        merged_housemat_df = new_df
    else:
        merged_housemat_df[f"house_mats_{outer_wall_mat}_{roof_mat}_count"] = new_vals

In [66]:
merged_df = merged_df.merge(merged_housemat_df, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,house_mats_outer_wall_no_walls_roof_not_reported_count,house_mats_outer_wall_not_reported_roof_galvanized_iron_aluminum_count,house_mats_outer_wall_not_reported_roof_tile_concrete_clay_tile_count,house_mats_outer_wall_not_reported_roof_half_galvanized_iron_and_half_concrete_count,house_mats_outer_wall_not_reported_roof_bamboo_cogon_nipa_anahaw_count,house_mats_outer_wall_not_reported_roof_asbestos_count,house_mats_outer_wall_not_reported_roof_makeshift_salvaged_improvised_materials_count,house_mats_outer_wall_not_reported_roof_trapal_count,house_mats_outer_wall_not_reported_roof_others_count,house_mats_outer_wall_not_reported_roof_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,,,3.0,,,,,,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,,,,,,,,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,,,3.0,,,,,,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,,6.0,3.0,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,,4.0,41.0,,1.0,,,2.0,2.0,
1660,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1661,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,6.0,,,,,,2.0,
1662,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### House tenure

In [67]:
house_tenure_df = pd.read_csv(HOUSING_TENURE)
house_tenure_df.drop(columns=["Unnamed: 0", "year", "city"], inplace=True)
house_tenure_df = house_tenure_df.loc[~house_tenure_df["tenure_type"].isnull()].loc[~house_tenure_df["bldg_type"].isnull()]
house_tenure_df.head()

Unnamed: 0,name,tenure_type,bldg_type,count
0,Aborlan,Own or owner like possession of house and lot,Single house,4785.0
1,Aborlan,Own or owner like possession of house and lot,Duplex,118.0
2,Aborlan,Own or owner like possession of house and lot,Multi-unit residential,61.0
3,Aborlan,Own or owner like possession of house and lot,Commercial/ industrial/ agricultural,1.0
4,Aborlan,Own or owner like possession of house and lot,Institutional living quarter,


In [68]:
replace_tenure = {'Own or owner like possession of house and lot': "tenure_own_house_lot",
       'Rent house/room including lot': "tenure_rented_house_lot",
       'Own house rent lot': "tenure_own_house_rent_lot",
       'Own house rent-free lot with consent of owner': "tenure_own_house_borrowed_lot_w_consent",
       'Own house rent-free lot without consent of owner': "tenure_own_house_borrowed_lot_no_consent",
       'Rent-free house and lot with consent of owner': "tenure_rent_free_house_and_lot_w_consent",
       'Rent-free house and lot without consent of owner': "tenure_rent_free_house_and_lot_no_consent",
       'Not Applicable': "tenure_not_app",
       'Not Reported': "tenure_not_reported",}

In [69]:
replace_type = {'Single house':'type_single_house', 
'Duplex':'type_duplex', 
'Multi-unit residential':'type_multi-unit',
'Commercial/ industrial/ agricultural':'type_commercial_industrial_agricultural',
'Institutional living quarter':'type_institutional', 
'Others':'type_others', 
'Not Reported':'type_not_reported'}

In [70]:
house_tenure_df = house_tenure_df.replace({"tenure_type":replace_tenure}).replace({"bldg_type":replace_type})

In [71]:
house_tenure_df

Unnamed: 0,name,tenure_type,bldg_type,count
0,Aborlan,tenure_own_house_lot,type_single_house,4785.0
1,Aborlan,tenure_own_house_lot,type_duplex,118.0
2,Aborlan,tenure_own_house_lot,type_multi-unit,61.0
3,Aborlan,tenure_own_house_lot,type_commercial_industrial_agricultural,1.0
4,Aborlan,tenure_own_house_lot,type_institutional,
...,...,...,...,...
94665,Zumarraga,tenure_not_reported,type_multi-unit,
94666,Zumarraga,tenure_not_reported,type_commercial_industrial_agricultural,
94667,Zumarraga,tenure_not_reported,type_institutional,
94668,Zumarraga,tenure_not_reported,type_others,


In [72]:
tenure_types = tuple(house_tenure_df.tenure_type.unique())
tenure_types

('tenure_own_house_lot',
 'tenure_rented_house_lot',
 'tenure_own_house_rent_lot',
 'tenure_own_house_borrowed_lot_w_consent',
 'tenure_own_house_borrowed_lot_no_consent',
 'tenure_rent_free_house_and_lot_w_consent',
 'tenure_rent_free_house_and_lot_no_consent',
 'tenure_not_app',
 'tenure_not_reported')

In [73]:
bldg_types = tuple(house_tenure_df.bldg_type.unique())
bldg_types

('type_single_house',
 'type_duplex',
 'type_multi-unit',
 'type_commercial_industrial_agricultural',
 'type_institutional',
 'type_others',
 'type_not_reported')

In [74]:
tenure_combo = [(tenure_type, bldg_type) for tenure_type in tenure_types for bldg_type in bldg_types]
tenure_combo

[('tenure_own_house_lot', 'type_single_house'),
 ('tenure_own_house_lot', 'type_duplex'),
 ('tenure_own_house_lot', 'type_multi-unit'),
 ('tenure_own_house_lot', 'type_commercial_industrial_agricultural'),
 ('tenure_own_house_lot', 'type_institutional'),
 ('tenure_own_house_lot', 'type_others'),
 ('tenure_own_house_lot', 'type_not_reported'),
 ('tenure_rented_house_lot', 'type_single_house'),
 ('tenure_rented_house_lot', 'type_duplex'),
 ('tenure_rented_house_lot', 'type_multi-unit'),
 ('tenure_rented_house_lot', 'type_commercial_industrial_agricultural'),
 ('tenure_rented_house_lot', 'type_institutional'),
 ('tenure_rented_house_lot', 'type_others'),
 ('tenure_rented_house_lot', 'type_not_reported'),
 ('tenure_own_house_rent_lot', 'type_single_house'),
 ('tenure_own_house_rent_lot', 'type_duplex'),
 ('tenure_own_house_rent_lot', 'type_multi-unit'),
 ('tenure_own_house_rent_lot', 'type_commercial_industrial_agricultural'),
 ('tenure_own_house_rent_lot', 'type_institutional'),
 ('tenure

In [75]:
house_tenure_df

Unnamed: 0,name,tenure_type,bldg_type,count
0,Aborlan,tenure_own_house_lot,type_single_house,4785.0
1,Aborlan,tenure_own_house_lot,type_duplex,118.0
2,Aborlan,tenure_own_house_lot,type_multi-unit,61.0
3,Aborlan,tenure_own_house_lot,type_commercial_industrial_agricultural,1.0
4,Aborlan,tenure_own_house_lot,type_institutional,
...,...,...,...,...
94665,Zumarraga,tenure_not_reported,type_multi-unit,
94666,Zumarraga,tenure_not_reported,type_commercial_industrial_agricultural,
94667,Zumarraga,tenure_not_reported,type_institutional,
94668,Zumarraga,tenure_not_reported,type_others,


In [76]:
names = pd.DataFrame(house_tenure_df["name"].unique().tolist(),columns=["name"])
names

Unnamed: 0,name
0,Aborlan
1,Abra De Ilog
2,Abucay
3,Abulug
4,Abuyog
...,...
1494,Zamboanga
1495,Zamboanguita
1496,Zaragoza
1497,Zarraga


In [77]:
new_df = house_tenure_df[["name", "count"]].loc[house_tenure_df.tenure_type == "tenure_own_house_lot"].loc[house_tenure_df.bldg_type == "type_single_house"]
new_df

Unnamed: 0,name,count
0,Aborlan,4785.0
63,Abra De Ilog,3654.0
126,Abucay,5722.0
189,Abulug,5360.0
252,Abuyog,5398.0
...,...,...
94355,Zamboanga,79422.0
94418,Zamboanguita,4348.0
94481,Zaragoza,8473.0
94544,Zarraga,2612.0


In [78]:
complete = names.merge(new_df, on ="name", how="left")
complete

Unnamed: 0,name,count
0,Aborlan,4785.0
1,Abra De Ilog,3654.0
2,Abucay,5722.0
3,Abulug,5360.0
4,Abuyog,5398.0
...,...,...
1496,Zamboanga,79422.0
1497,Zamboanguita,4348.0
1498,Zaragoza,8473.0
1499,Zarraga,2612.0


In [79]:
for (tenure_type_, bldg_type_) in tenure_combo:
    new_df = house_tenure_df[["name", "count"]].loc[house_tenure_df.tenure_type == tenure_type_].loc[house_tenure_df.bldg_type == bldg_type_]
    
    complete = names.merge(new_df, on ="name", how="left")
    
    new_vals = complete["count"].tolist()
    new_vals = [float(val) for val in new_vals]
    
    if tenure_type_ == tenure_combo[0][0] and bldg_type_ == tenure_combo[0][1]:
        complete[f"tenure_type_{tenure_type_}_{bldg_type_}_count"] = new_vals
        merged_ = complete.copy()
    else:
        merged_[f"tenure_type_{tenure_type_}_{bldg_type_}_count"] = new_vals

In [80]:
merged_df = merged_df.merge(merged_, how="left", on="name")
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,tenure_type_tenure_not_app_type_institutional_count,tenure_type_tenure_not_app_type_others_count,tenure_type_tenure_not_app_type_not_reported_count,tenure_type_tenure_not_reported_type_single_house_count,tenure_type_tenure_not_reported_type_duplex_count,tenure_type_tenure_not_reported_type_multi-unit_count,tenure_type_tenure_not_reported_type_commercial_industrial_agricultural_count,tenure_type_tenure_not_reported_type_institutional_count,tenure_type_tenure_not_reported_type_others_count,tenure_type_tenure_not_reported_type_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,,,,,,,,,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,,,,,,,,,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,,1.0,,5.0,,,,,,
1692,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1693,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1694,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## DTI Dataset

## Final merging

In [81]:
merged_df.columns.tolist()

['name',
 'health_ambulatory_surgical_clinic',
 'health_animal_bite_treatment_center',
 'health_barangay_health_station',
 'health_birthing_home',
 'health_city_health_office',
 'health_deped_clinic',
 'health_drug_abuse_treatment_and_rehabilitation_centers',
 'health_general_clinic_laboratory',
 'health_hospital',
 'health_infirmary',
 'health_municipal_health_office',
 'health_provincial_health_office',
 'health_psychiatric_care_facility',
 'health_rural_health_unit',
 'health_social_hygiene_clinic',
 'health_covid_19_testing_laboratory',
 'health_dialysis_clinic',
 'health_drug_testing_laboratory',
 'health_doctor',
 'health_nurse',
 'health_midwife',
 'health_dentist',
 'health_nutritioni',
 'health_pharmacist',
 'health_occupation',
 'health_medical_te',
 'health_physical_t',
 'health_radiology',
 'health_x_ray_tech',
 'poverty_poverty_inc',
 'poverty_cov',
 'poverty_conf_int_lowerlimit',
 'poverty_conf_int_upper_limit',
 'vulnerable_grps_child_headed_male',
 'vulnerable_grps_chil

In [82]:
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,tenure_type_tenure_not_app_type_institutional_count,tenure_type_tenure_not_app_type_others_count,tenure_type_tenure_not_app_type_not_reported_count,tenure_type_tenure_not_reported_type_single_house_count,tenure_type_tenure_not_reported_type_duplex_count,tenure_type_tenure_not_reported_type_multi-unit_count,tenure_type_tenure_not_reported_type_commercial_industrial_agricultural_count,tenure_type_tenure_not_reported_type_institutional_count,tenure_type_tenure_not_reported_type_others_count,tenure_type_tenure_not_reported_type_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,,,,,,,,,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,,,,,,,,,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,,1.0,,5.0,,,,,,
1692,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1693,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1694,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [83]:
merged_df.shape

(1696, 298)

In [84]:
merged_df = merged_df.drop_duplicates(subset=['name'])

In [85]:
merged_df

Unnamed: 0,name,health_ambulatory_surgical_clinic,health_animal_bite_treatment_center,health_barangay_health_station,health_birthing_home,health_city_health_office,health_deped_clinic,health_drug_abuse_treatment_and_rehabilitation_centers,health_general_clinic_laboratory,health_hospital,...,tenure_type_tenure_not_app_type_institutional_count,tenure_type_tenure_not_app_type_others_count,tenure_type_tenure_not_app_type_not_reported_count,tenure_type_tenure_not_reported_type_single_house_count,tenure_type_tenure_not_reported_type_duplex_count,tenure_type_tenure_not_reported_type_multi-unit_count,tenure_type_tenure_not_reported_type_commercial_industrial_agricultural_count,tenure_type_tenure_not_reported_type_institutional_count,tenure_type_tenure_not_reported_type_others_count,tenure_type_tenure_not_reported_type_not_reported_count
0,Aborlan,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.5,...,,,,,,,,,,
1,Abra De Ilog,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Abucay,0.0,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Abulug,0.0,0.0,8.0,0.0,0.0,0.0,0.0,2.0,1.0,...,,,,,,,,,,
4,Abuyog,0.0,0.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691,Zamboanga,0.0,0.0,87.0,17.0,0.0,0.0,1.0,0.0,12.5,...,,1.0,,5.0,,,,,,
1692,Zamboanguita,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1693,Zaragoza,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1694,Zarraga,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [86]:
merged_df.to_csv(Path(PROJECTROOT, "data", "merged_df.csv"))