# Tanzania
This notebook is used to create the dataset for Tanzania. The dataset consists of multiple files. 
- hh_sec_a
- hh_sec_b
- hh_sec_c
- hh_sec_d
- hh_sec_e1
- hh_sec_e2
- hh_sec_f
- hh_sec_i
- hh_sec_j1
- hh_sec_k
- hh_sec_l
- hh_sec_m
- hh_sec_n
- hh_sec_o2
- hh_sec_p
- lf_sec_02
- ag_sec_02
- ag_sec_3a
- ag_sec_5a
- ag_sec_5b
- ag_sec_7a
- ag_sec_7b
- ag_sec_10
- GADM_level_0
- GADM_level_1
- GADM_level_2


In [1]:
import os
import sys
import pandas as pd
import numpy as np

sys.path.append("../../")  # Adds higher directory to python modules path.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
from core.ODEDataset import ODEDataset
from utils import common_modifiers, dwelling_modifiers, socio_modifiers, finance_modifiers, appliances_modifiers, \
    geospatial, energy_modifiers

import pathlib
from utils import constants


In [2]:
DB_ROOT = "../../playground/data/LSMS/Tanzania"
root = pathlib.Path(DB_ROOT)

ID_COL = "sdd_hhid"

# Clusters
Clusters are group of columns used together to create a new feature. 


In [3]:
Dwelling_quality_cluster = [
    "Dwelling_wall",
    "Dwelling_roof",
    "Dwelling_floor",
    "Dwelling_water",
    "Dwelling_toilet"
]

Expenditure_yearly_cluster = ["hh_c28_1",
                              "hh_c28_2",
                              "hh_c28_3",
                              "hh_c28_4",
                              "hh_c28_5",
                              "hh_c28_6",
                              "hh_c28_7",
                              "hh_c28_8",
                              "hh_l02a",
                              "hh_l03"

                              ]  # hh_sec_c, hh_sec_l
Expenditure_weekly_cluster = [
    "hh_f03",
    "hh_f05",
    "hh_f07",
    "hh_f09",
    "hh_f11",
    "hh_f13",
    "hh_f15",
    "hh_j04"
]  # hh_sec_f, hh_sec_j1
Expenditure_monthly_cluster = ["hh_d07",
                               "hh_d08",
                               "hh_d09",
                               ]  # hh_sec_d
Expenditure_multi_period_cluster = ["hh_k02"]  # hh_sec_k

## Categories
Map the original values to the standard values.


In [4]:
Education_level_original2final = {1: 'No schooling',
                                  11: 'Primary education', 12: 'Primary education',
                                  13: 'Primary education', 14: 'Primary education', 15: 'Primary education',
                                  16: 'Primary education', 17: 'Primary education',
                                  18: 'Primary education',
                                  19: 'Primary education',
                                  20: 'Primary education',
                                  21: 'Upper secondary education',
                                  22: 'Upper secondary education', 23: 'Upper secondary education',
                                  24: 'Upper secondary education', 25: 'Upper secondary education',
                                  31: 'Post-secondary non-tertiary education',
                                  32: 'Post-secondary non-tertiary education',
                                  33: 'Post-secondary non-tertiary education',
                                  34: 'Post-secondary non-tertiary education', 41: 'Bachelor\'s or equivalent level',
                                  42: 'Bachelor\'s or equivalent level', 43: 'Bachelor\'s or equivalent level',
                                  44: 'Master\'s or equivalent level', 45: 'Doctoral or equivalent level',
                                  2: 'Not elsewhere classified', }
Socio_status_original2final = {1: 'Employee', 2: 'Employer', 3: 'Own-account worker non-farm',
                               4: 'Contributing family worker', 5: 'Contributing family worker',
                               6: 'Own-account worker farm',
                               7: "Unemployed",
                               8: "Unemployed"
                               }
Connection_type_original2final = {1: 'National grid', 2: 'Local mini-grid', 3: 'Solar Home System', 7: np.nan}
Dwelling_quality_wall_original2final = {1: 0, 2: 0, 3: 0, 4: 0, 5: 1, 6: 1, 7: 0, }
Dwelling_quality_roof_original2final = {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_quality_floor_original2final = {1: 0, 2: 1, 3: 0, }
Dwelling_quality_toilet_original2final = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 0, }

Clean_fuel_original2final = {
    1: "No",
    2: "No",
    5: "No",
    6: "No",
    8: "No",
    3: "Yes",
    4: "Yes",
    7: "Yes",
}

Dwelling_toilet_1_original2final = {1: 1,
                                    2: 1, 3: 1, 5: 1,
                                    7: 1, 8: 1, 9: 1,
                                    10: 1, 4: 0,
                                    6: 0,
                                    11: 0,
                                    12: 0, }
Dwelling_toilet_2_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_toilet_3_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_toilet_4_original2final = {1: 1, 2: 1, 3: 1, 5: 1, 7: 1, 8: 1, 9: 1, 10: 1, 4: 0, 6: 0, 11: 0, 12: 0, }
Dwelling_toilet_5_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, 11: np.nan}
Dwelling_toilet_6_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, 11: np.nan}

## hh_sec_a
- Monthly_expenditure

In [12]:
hh_sec_a = ODEDataset("Tanzania/hh_sec_a")
hh_sec_a.from_csv(root.joinpath("hh_sec_a.csv")).group_by(ID_COL)
# Merge Helper Sections
# ---------------
helper_sections = ['hh_sec_c', 'hh_sec_d', 'hh_sec_f', 'hh_sec_i', 'hh_sec_j1', 'hh_sec_k', 'hh_sec_l', 'hh_sec_k']

for section_name in helper_sections:
    section = ODEDataset(section_name)
    section.from_csv(root.joinpath(f"{section_name}.csv")).group_by(ID_COL)
    main_cols = hh_sec_a.get_columns()
    sec_cols = [col for col in section.get_columns() if col not in main_cols]
    sec_cols.append(ID_COL)
    section = section.select(sec_cols)
    hh_sec_a = hh_sec_a.merge(section, on=ID_COL)

# Extract Drivers
# -------------------
hh_sec_a = hh_sec_a.new_feature("Monthly_expenditure", finance_modifiers.expenditure_multi_section(Expenditure_yearly_cluster, Expenditure_weekly_cluster, Expenditure_monthly_cluster, Expenditure_multi_period_cluster))


hh_sec_a.select([
    ID_COL, "Monthly_expenditure"
])

hh_sec_a.preview()


Unnamed: 0,sdd_hhid,Monthly_expenditure
0,0001-001-001,171233.333333
1,0001-001-003,76316.666667
2,0001-001-004,423750.0
3,0001-004-001,199066.666667
4,0001-004-002,48066.666667


# hh_sec_b 
contains: 

- Age_HHH
- Number_adults
- Years_of_HHH_in_community


In [6]:
hh_sec_b = ODEDataset("Tanzania/hh_sec_b")

hh_sec_b.from_csv(root.joinpath("hh_sec_b.csv")).group_by(ID_COL)

hh_sec_b = hh_sec_b.new_feature("Age_HHH", socio_modifiers.extract_age_of_head("hh_b04", "hh_b05", 1))

hh_sec_b = hh_sec_b.new_feature("Number_adults", socio_modifiers.extract_age_groups("hh_b05", "adults"))

# years_of_HHH_in_community(data,source,questionnaire,'hh_sec_b','hh_sec_b',hh,'hh_b05','hh_b26','hh_b04',1,'roster')

hh_sec_b = hh_sec_b.new_feature("Years_of_HHH_in_community",
                                socio_modifiers.get_years_of_hhh_in_community_roster('hh_b05', 'hh_b26', 'hh_b04', 1))
hh_sec_b = hh_sec_b.select([
    ID_COL, "Age_HHH", "Number_adults", "Years_of_HHH_in_community"
])
hh_sec_b.preview()

Unnamed: 0,sdd_hhid,Age_HHH,Number_adults,Years_of_HHH_in_community
0,0001-001-001,79.0,0.0,79.0
1,0001-001-003,36.0,0.0,36.0
2,0001-001-004,29.0,0.0,3.0
3,0001-004-001,29.0,0.0,29.0
4,0001-004-002,28.0,0.0,3.0


## hh_sec_c
- Education_level_HHH

In [7]:
hh_sec_c = ODEDataset("Tanzania/hh_sec_c")
hh_sec_c.from_csv(root.joinpath("hh_sec_c.csv")).group_by(ID_COL)

hh_sec_c = hh_sec_c.new_feature("Education_level_HHH_raw",
                                common_modifiers.categorize("hh_c07", Education_level_original2final))

hh_sec_c = hh_sec_c.new_feature("Education_level_HHH", lambda x: x['Education_level_HHH_raw'][0])

hh_sec_c = hh_sec_c.select([
    ID_COL, 'Education_level_HHH'

])
hh_sec_c.preview()

Unnamed: 0,sdd_hhid,Education_level_HHH
0,0001-001-001,
1,0001-001-003,Primary education
2,0001-001-004,Primary education
3,0001-004-001,Primary education
4,0001-004-002,Upper secondary education


# hh_sec_e1
- Socio_status_HHH

In [8]:
hh_sec_e1 = ODEDataset("Tanzania/hh_sec_e1")
hh_sec_e1.from_csv(root.joinpath("hh_sec_e1.csv")).group_by(ID_COL)

# Helper sections
#------------------
hh_sec_e2 = ODEDataset("Tanzania/hh_sec_e2")
hh_sec_e2.from_csv(root.joinpath("hh_sec_e2.csv")).group_by(ID_COL)
hh_sec_e2 = hh_sec_e2.select([
    ID_COL, "hh_e207_1",
])
hh_sec_b_helper = ODEDataset('Tanzania/hh_sec_b_helper')
hh_sec_b_helper = hh_sec_b_helper.from_csv(root.joinpath("hh_sec_b.csv")).group_by(ID_COL)

hh_sec_b_helper = hh_sec_b_helper.new_feature("HHH_relation_pos",
                                              socio_modifiers.extract_relation_pos("hh_b05", "sdd_indid", 1))

hh_sec_b_helper = hh_sec_b_helper.select([
    ID_COL, 'HHH_relation_pos'
])
# Merge Helper sections
#------------------

hh_sec_e1 = hh_sec_e1.merge(hh_sec_e2, on=ID_COL)
hh_sec_e1 = hh_sec_e1.merge(hh_sec_b_helper, on=ID_COL)

# Extract drivers
#------------------
hh_sec_e1 = hh_sec_e1.new_feature("Socio_status_HHH_raw",
                                  common_modifiers.categorize("hh_e207_1", Socio_status_original2final))

# socio_status_HHH(data,source,questionnaire,'Derived_variables','hh_sec_e1',hh,'hh_e31b_1','Socio_status_HHH_raw','-')
hh_sec_e1 = (hh_sec_e1
             .new_feature("Socio_status_HHH",
                          socio_modifiers.extract_socio_status_hhh("Tanzania", 'Socio_status_HHH_raw',
                                                                   'hh_e31b_1', '-')))

hh_sec_e1 = hh_sec_e1.select([
    ID_COL, "Socio_status_HHH"
])

hh_sec_e1.preview()

Unnamed: 0,sdd_hhid,Socio_status_HHH
0,0001-001-001,
1,0001-001-003,
2,0001-001-004,8.0
3,0001-004-001,
4,0001-004-002,


## hh_sec_n
contains: 
- HH_with_home_business

In [9]:
hh_sec_n = ODEDataset("Tanzania/hh_sec_n")
hh_sec_n.from_csv(root.joinpath("hh_sec_n.csv")).group_by(ID_COL)

# Helper sections
#------------------
hh_sec_e1_helper = ODEDataset("Tanzania/hh_sec_e1_helper")
hh_sec_e1_helper.from_csv(root.joinpath("hh_sec_e1.csv")).group_by(ID_COL)
hh_sec_e1_helper = hh_sec_e1_helper.select([ID_COL, "hh_e07", "hh_e09"])

# Merge Helper sections
#------------------
hh_sec_n = hh_sec_n.merge(hh_sec_e1_helper, on=ID_COL)

# Extract drivers
#------------------

hh_sec_n = hh_sec_n.new_feature('HH_with_home_business_raw',
                                socio_modifiers.extract_head_w_home_business_lsms('hh_e07', 'hh_e09',
                                                                                  'hh_n01a'))

hh_sec_n = hh_sec_n.new_feature("HH_with_home_business", lambda row: row["HH_with_home_business_raw"][0])

hh_sec_n = hh_sec_n.select([
    ID_COL, "HH_with_home_business"
])
hh_sec_n.preview()

Unnamed: 0,sdd_hhid,HH_with_home_business
0,0001-001-001,Yes
1,0001-001-003,No
2,0001-001-004,No
3,0001-004-001,Yes
4,0001-004-002,Yes


## hh_sec_i
- Clean_fuel
- Dwelling_quality_index

In [18]:
hh_sec_i = ODEDataset("Tanzania/hh_sec_i")
hh_sec_i.from_csv(root.joinpath("hh_sec_i.csv")).group_by(ID_COL)

hh_sec_i = hh_sec_i.new_feature("Clean_fuel", common_modifiers.categorize("hh_i16", Clean_fuel_original2final))
hh_sec_i = hh_sec_i.apply(common_modifiers.take("Clean_fuel", 0))

hh_sec_i = hh_sec_i.new_feature("Number_of_rooms", lambda row: row["hh_i07_1"][0])

hh_sec_i = hh_sec_i.new_feature("Connection_type",
                                common_modifiers.categorize("hh_i18", Connection_type_original2final)).apply(
    common_modifiers.take("Connection_type", 0))

# Dwelling Wall
# re_categorization(data,source,questionnaire,'hh_sec_i',hh,'hh_i08','-','-',map_cat,'Dwelling_quality_wall',clusters)
hh_sec_i = hh_sec_i.new_feature("Dwelling_wall",
                                common_modifiers.categorize("hh_i08", Dwelling_quality_wall_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_roof",
                                common_modifiers.categorize("hh_i09", Dwelling_quality_roof_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_floor",
                                common_modifiers.categorize("hh_i10", Dwelling_quality_floor_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_water",
                                common_modifiers.categorize("hh_i12", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_1",
                                common_modifiers.categorize("hh_i19", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_1", 0))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_2",
                                common_modifiers.categorize("hh_i24_1", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_2", 0))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_3",
                                common_modifiers.categorize("hh_i24_2", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_3", 0))

rainy_toilet_modifier = common_modifiers.take_one_with_value(
    ["Dwelling_toilet_1", "Dwelling_toilet_2", "Dwelling_toilet_3"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_rainy",
                                rainy_toilet_modifier)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_4",
                                common_modifiers.categorize("hh_i29", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_5",
                                common_modifiers.categorize("hh_i34_1", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_6",
                                common_modifiers.categorize("hh_i34_2", Dwelling_quality_toilet_original2final))

dry_toilet_modifier = common_modifiers.take_one_with_value(
    ["Dwelling_toilet_4", "Dwelling_toilet_5", "Dwelling_toilet_6", "Dwelling_toilet_3"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_dry",
                                dry_toilet_modifier)

# dwelling_toilet(data,source,questionnaire,'Derived_variables',hh,'Dwelling_toilet_rainy','Dwelling_toilet_dry','Dwelling_toilet_rainy','Dwelling_toilet_dry','not_list')

toilet_modifier = common_modifiers.take_one_with_value(["Dwelling_toilet_rainy", "Dwelling_toilet_dry"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_raw",
                                toilet_modifier)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet", lambda row: [row["Dwelling_toilet_raw"]])

hh_sec_i = hh_sec_i.new_feature("Dwelling_quality_index", dwelling_modifiers.dwelling_quality_index)
hh_sec_i = hh_sec_i.select([
    ID_COL, "Clean_fuel", "Number_of_rooms", "Dwelling_quality_index"
])
hh_sec_i.preview()

Unnamed: 0,sdd_hhid,Clean_fuel,Number_of_rooms,Dwelling_quality_index
0,0001-001-001,No,3.0,0.2
1,0001-001-003,No,3.0,
2,0001-001-004,No,1.0,0.8
3,0001-004-001,No,2.0,0.2
4,0001-004-002,No,1.0,


# express sections
contains:
- Number_of_rooms
- Dwelling_quality_index
- Years_of_HHH_in_community
- Measurement_age
- Hours_available_electricity
- Tariff_payment_frequency
- GADM_level_1
- GADM_level_2
- Climate_zone_lev_1
- Climate_zone_lev_2

In [80]:
express_sections = ODEDataset("Zambia/express sections")
express_sections.from_csv("../playground/data/ESMAP/zambia/express sections.csv", encoding='latin1'
                          ).group_by("HouseholdID")

mtf_a1 = ODEDataset("Nigeria/mtf_ng_hh_sec_a1 helper")

mtf_a1.from_csv("../playground/data/ESMAP/zambia/section a.csv", encoding='latin1'
                ).group_by("HouseholdID").select(["HouseholdID", "A4"])

express_sections = express_sections.merge(mtf_a1, "HouseholdID")
express_sections = express_sections.apply(common_modifiers.transform_list_int("B9"))
express_sections = express_sections.new_feature("Number_of_rooms", lambda x: x['B9'][0])

Transform the Dwelling Wall, Roof, Floor, Water and Toilet quality to a standard format. (From Float to Int)

In [81]:
dwelling_cols = [
    "B10",
    "B11",
    "B12",
    "B14",
    "B13_1",
    "A4",
    "B4",
    "B1"
]

for col in dwelling_cols:
    express_sections = express_sections.apply(
        common_modifiers.transform_list_int(col))




#### Dwelling Quality Index

In [82]:
express_sections = express_sections.new_feature('Dwelling_wall',
                                                common_modifiers.categorize('B10',
                                                                            Dwelling_Wall_Quality_original2final))

express_sections = express_sections.new_feature("Dwelling_roof",
                                                common_modifiers.categorize("B11",
                                                                            Dwelling_Roof_Quality_original2final))

express_sections = express_sections.new_feature("Dwelling_floor",
                                                common_modifiers.categorize("B12",
                                                                            Dwelling_Floor_Quality_original2final))

express_sections = express_sections.new_feature("Dwelling_water",
                                                common_modifiers.categorize("B14",
                                                                            Dwelling_Water_Quality_original2final))

express_sections = express_sections.new_feature("Dwelling_toilet",
                                                common_modifiers.categorize("B13_1",
                                                                            Dwelling_Water_Quality_original2final))
express_sections = express_sections.new_feature("Dwelling_quality_index", dwelling_modifiers.dwelling_quality_index)


In [83]:

express_sections = express_sections.new_feature("Years_of_HHH_in_community",
                                                socio_modifiers.get_years_of_hhh_in_community_multi_section("A4", "B4",
                                                                                                            "B1", 1))

section_c_solar_devices = ODEDataset("Zambia/section_c_solar_devices")
section_c_solar_devices.from_csv("../playground/data/ESMAP/zambia/section c solar devices.csv", encoding='latin1'
                                 ).group_by("HouseholdID")

express_sections = express_sections.merge(section_c_solar_devices, "HouseholdID")

connection_type_valid_answers = {constants.NATIONAL_GRID: 1, constants.LOCAL_MINI_GRID: 888,
                                 constants.SOLAR_HOME_SYSTEM: 1}

express_sections = express_sections.new_feature("Connection_type",
                                                energy_modifiers.get_connection_type('C2', 'C2',
                                                                                     'C138',
                                                                                     connection_type_valid_answers))

express_sections = express_sections.new_feature("Hours_available_electricity_not_filtered",
                                                energy_modifiers.get_hours_available_electricity('C28B',
                                                                                                 '-',
                                                                                                 'C164',
                                                                                                 'C140',
                                                                                                 3))

express_sections = express_sections.new_feature("Hours_available_electricity",
                                                energy_modifiers.filtering('Hours_available_electricity_not_filtered',
                                                                           888, -8))

express_sections = express_sections.new_feature("Measurement_age",
                                                socio_modifiers.measurement_age('C7', 'C46', 'C152', 'C140'))
express_sections = express_sections.new_feature("Tariff_payment_frequency_raw",
                                                common_modifiers.categorize("C13",
                                                                            Tariff_payment_frequency_original2final)).new_feature(
    "Tariff_payment_frequency",
    lambda x: x['Tariff_payment_frequency_raw'][0])





#### GIS Data

In [84]:

express_sections = express_sections.apply(common_modifiers.add_const_driver("GADM_level_0", "Zambia"))

express_sections = express_sections.new_feature("GADM_level_1_raw",
                                                common_modifiers.categorize("province", Province_raw_original2final))

express_sections = express_sections.new_feature("GADM_level_1", lambda x: x['GADM_level_1_raw'][0])

express_sections = express_sections.apply(common_modifiers.add_const_driver("GADM_level_2", np.nan))

gadm_level_1_df = pd.read_excel("../playground/data/ESMAP/zambia/GADM_level_1.xlsx")
express_sections = express_sections.new_feature("Climate_zone_lev_1",
                                                geospatial.gis_info_by_gadm_level('Climate_majority',
                                                                                  gadm_level_1_df,
                                                                                  'GADM_level_1',
                                                                                  ))
express_sections = express_sections.apply(common_modifiers.add_const_driver("Climate_zone_lev_2", np.nan))





In [85]:
express_sections = express_sections.select([
    "HouseholdID", "Dwelling_quality_index", "Years_of_HHH_in_community", "Measurement_age",
    "Hours_available_electricity", "Tariff_payment_frequency",
    "Climate_zone_lev_1", "Climate_zone_lev_2",
    "Number_of_rooms"
])

express_sections.preview(1000)

Unnamed: 0,HouseholdID,Dwelling_quality_index,Years_of_HHH_in_community,Measurement_age,Hours_available_electricity,Tariff_payment_frequency,Climate_zone_lev_1,Climate_zone_lev_2,Number_of_rooms
0,1016101,0.4,15.0,,,,11,,2.0
1,1016102,0.6,,,,,11,,3.0
2,1016103,0.6,,,,,11,,2.0
3,1016104,0.4,,,,,11,,2.0
4,1016105,0.8,,,,,11,,3.0
...,...,...,...,...,...,...,...,...,...
995,3067304,0.2,,,,,3,,3.0
996,3067305,0.2,,,,,3,,1.0
997,3067306,0.0,,,,,3,,2.0
998,3067307,0.2,15.0,,,,3,,2.0


# Section I 
contains:
- Monthly_expenditure

In [68]:
section_i = ODEDataset("Zambia/Section I")
section_i.from_csv("../playground/data/ESMAP/zambia/section i.csv",
                   encoding='latin1').group_by("HouseholdID")
section_l = ODEDataset("Zambia/Section L")
section_l.from_csv("../playground/data/ESMAP/zambia/section l.csv",
                   encoding='latin1').group_by("HouseholdID")

section_i = section_i.merge(section_l, "HouseholdID")

section_i = section_i.new_feature("Monthly_expenditure",
                                  finance_modifiers.expenditure_zambia())

# fuel_usage(data,source,questionnaire,'section i',hh,question,clusters)
section_i = section_i.new_feature("Fuel_usage",
                                  energy_modifiers.extract_fuel_usage(cooking_hrs_cluster))

section_i = section_i.new_feature("Clean_fuel",
                                  energy_modifiers.is_clean_fuel(constants.CLEAN_FUELS, 'I19A'))

section_i = section_i.select(["HouseholdID", "Monthly_expenditure", "Clean_fuel"])

section_i.preview()

Unnamed: 0,HouseholdID,Monthly_expenditure,Clean_fuel
0,1016101,1474.333333,No
1,1016102,656.666667,No
2,1016103,335.0,No
3,1016104,0.0,No
4,1016105,660.0,No


# Section M 
contains:
- Ownership_motorized_vehicle_all
- Ownership_small_livestock_all
- Ownership_large_livestock_all

In [69]:
section_m = ODEDataset("Zambia/Section M")
section_m.from_csv("../playground/data/ESMAP/zambia/section m.csv",
                   encoding='latin1').group_by("HouseholdID")

section_m = section_m.apply(
    common_modifiers.transform_list_int("mitem"))

section_m = section_m.new_feature("Ownership_motorized_vehicle_all",
                                  common_modifiers.categorize("mitem",
                                                              Ownership_motorized_vehicle_all_original2final))


def get_ownership_vehicle(row: pd.Series):
    if any(t == 1 for t in row['Ownership_motorized_vehicle_all']):
        result = 'Yes'
    else:
        result = 'No'
    return result


section_m = section_m.new_feature("Ownership_motorized_vehicle",
                                  common_modifiers.find_any(
                                      'Ownership_motorized_vehicle_all', 1,
                                      'Yes', 'No'))

section_m = section_m.new_feature("Ownership_small_livestock_all",
                                  common_modifiers.categorize("mitem",
                                                              Ownership_small_livestock_all_original2final))
section_m = section_m.new_feature("Ownership_small_livestock",
                                  common_modifiers.find_any(
                                      'Ownership_small_livestock_all', 1,
                                      'Yes', 'No'))

section_m = section_m.new_feature("Ownership_large_livestock_all",
                                  common_modifiers.categorize("mitem",
                                                              Ownership_large_livestock_all_original2final))

section_m = section_m.new_feature("Ownership_large_livestock",
                                  common_modifiers.find_any(
                                      'Ownership_large_livestock_all', 1,
                                      'Yes', 'No'))

section_m = section_m.select(
    ["HouseholdID", "Ownership_motorized_vehicle", "Ownership_small_livestock", "Ownership_large_livestock"])

section_m.preview()



Unnamed: 0,HouseholdID,Ownership_motorized_vehicle,Ownership_small_livestock,Ownership_large_livestock
0,1016101,Yes,Yes,Yes
1,1016102,Yes,Yes,Yes
2,1016103,Yes,Yes,Yes
3,1016104,Yes,Yes,Yes
4,1016105,Yes,Yes,Yes


# Section N
contains:
- Presence_phone_charger
- Presence_iron
- Presence_TV
- Presence_refrigerator/freezer
- Presence_radio/stereo
- Presence_DVD_player
- Presence_fan


In [70]:
section_n = ODEDataset("Zambia/Section N")
section_n.from_csv("../playground/data/ESMAP/zambia/section n.csv",
                   encoding='latin1').group_by("HouseholdID")

section_n = section_n.new_feature("Presence_smartphone_charger",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      25, 'nitem'))

section_n = section_n.new_feature("Presence_regular_phone_charger",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      26, "nitem"))

section_n = section_n.new_feature("Presence_phone_charger",
                                  common_modifiers.multi_unify_presence(
                                      ['Presence_smartphone_charger',
                                       'Presence_regular_phone_charger']))

section_n = section_n.new_feature("Presence_iron",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      11, "nitem"))

section_n = section_n.new_feature("Presence_freezer",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      15, "nitem"))

section_n = section_n.new_feature("Presence_refrigerator",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      9, "nitem"))

section_n = section_n.new_feature("Presence_refrigerator/freezer",
                                  common_modifiers.multi_unify_presence([
                                      'Presence_refrigerator',
                                      'Presence_freezer']))

section_n = section_n.new_feature("Presence_radio/stereo",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      6, 'nitem'))

section_n = section_n.new_feature("Presence_DVD_player",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      7, 'nitem'))

section_n = section_n.new_feature("Presence_fan",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      8, 'nitem'))

section_n = section_n.new_feature("Presence_black&white_TV",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      27, 'nitem'))

section_n = section_n.new_feature("Presence_color_TV",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B',
                                      28,
                                      'nitem'))

section_n = section_n.new_feature("Presence_flat_color_TV",
                                  appliances_modifiers.presence_appliances_long(
                                      'N1B', 29,
                                      'nitem'))

section_n = section_n.new_feature("Presence_TV",
                                  common_modifiers.multi_unify_presence(
                                      ['Presence_black&white_TV', 'Presence_color_TV',
                                       'Presence_flat_color_TV']))

section_n = section_n.select([
    "HouseholdID", "Presence_phone_charger", "Presence_iron", "Presence_TV", "Presence_refrigerator/freezer",
    "Presence_radio/stereo", "Presence_DVD_player", "Presence_fan"
])

section_n.preview()



Unnamed: 0,HouseholdID,Presence_phone_charger,Presence_iron,Presence_TV,Presence_refrigerator/freezer,Presence_radio/stereo,Presence_DVD_player,Presence_fan
0,1016101,0,0.0,0,0,1.0,0.0,0.0
1,1016102,0,0.0,0,0,0.0,0.0,0.0
2,1016103,0,0.0,0,0,0.0,0.0,0.0
3,1016104,1,0.0,0,0,0.0,0.0,0.0
4,1016105,0,0.0,0,0,0.0,0.0,0.0


# Merging all the datasets
- section_a
- express_sections
- section_i
- section_m
- section_n

In [71]:
Zambia = section_a.merge(express_sections, "HouseholdID")
Zambia = Zambia.merge(section_i, "HouseholdID")
Zambia = Zambia.merge(section_m, "HouseholdID")
Zambia = Zambia.merge(section_n, "HouseholdID")

Zambia.preview()

Unnamed: 0,HouseholdID,Age_HHH,Education_level_HHH,Number_adults,HH_with_home_business,Socio_status_HHH,Dwelling_quality_index,Years_of_HHH_in_community,Measurement_age,Hours_available_electricity,...,Ownership_motorized_vehicle,Ownership_small_livestock,Ownership_large_livestock,Presence_phone_charger,Presence_iron,Presence_TV,Presence_refrigerator/freezer,Presence_radio/stereo,Presence_DVD_player,Presence_fan
0,1016101,26.0,Primary education,1.078,No,Own-account worker non-farm,0.4,15.0,,,...,Yes,Yes,Yes,0,0.0,0,0,1.0,0.0,0.0
1,1016102,65.0,,3.234,No,Unemployed,0.6,,,,...,Yes,Yes,Yes,0,0.0,0,0,0.0,0.0,0.0
2,1016103,73.0,,1.617,No,Own-account worker farm,0.6,,,,...,Yes,Yes,Yes,0,0.0,0,0,0.0,0.0,0.0
3,1016104,31.0,,3.234,No,Own-account worker farm,0.4,,,,...,Yes,Yes,Yes,1,0.0,0,0,0.0,0.0,0.0
4,1016105,59.0,,3.234,No,Unemployed,0.8,,,,...,Yes,Yes,Yes,0,0.0,0,0,0.0,0.0,0.0


In [72]:
Zambia.to_csv("../playground/data/ESMAP/zambia/Zambia.csv")

<core.ODEDataset.ODEDataset at 0x76b7470f7b90>