# Tanzania
This notebook is used to create the dataset for Tanzania. The dataset consists of multiple files. 
- hh_sec_a
- hh_sec_b
- hh_sec_c
- hh_sec_d
- hh_sec_e1
- hh_sec_e2
- hh_sec_f
- hh_sec_i
- hh_sec_j1
- hh_sec_k
- hh_sec_l
- hh_sec_m
- hh_sec_n
- hh_sec_o2
- hh_sec_p
- lf_sec_02
- ag_sec_02
- ag_sec_3a
- ag_sec_5a
- ag_sec_5b
- ag_sec_7a
- ag_sec_7b
- ag_sec_10
- GADM_level_0
- GADM_level_1
- GADM_level_2


In [13]:
import os
import sys
import pandas as pd
import numpy as np

sys.path.append("../../")  # Adds higher directory to python modules path.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
from core.ODEDataset import ODEDataset
from utils import common_modifiers, dwelling_modifiers, socio_modifiers, finance_modifiers, appliances_modifiers, \
    geospatial, energy_modifiers, helpers

import pathlib
from utils import constants


In [14]:
DB_ROOT = "../../playground/data/LSMS/Tanzania"
root = pathlib.Path(DB_ROOT)

ID_COL = "sdd_hhid"

## Clusters
Clusters are group of columns used together to create a new feature. 


In [15]:
Dwelling_quality_cluster = [
    "Dwelling_wall",
    "Dwelling_roof",
    "Dwelling_floor",
    "Dwelling_water",
    "Dwelling_toilet"
]

Expenditure_yearly_cluster = ["hh_c28_1",
                              "hh_c28_2",
                              "hh_c28_3",
                              "hh_c28_4",
                              "hh_c28_5",
                              "hh_c28_6",
                              "hh_c28_7",
                              "hh_c28_8",
                              "hh_l02a",
                              "hh_l03"

                              ]  # hh_sec_c, hh_sec_l
Expenditure_weekly_cluster = [
    "hh_f03",
    "hh_f05",
    "hh_f07",
    "hh_f09",
    "hh_f11",
    "hh_f13",
    "hh_f15",
    "hh_j04"
]  # hh_sec_f, hh_sec_j1
Expenditure_monthly_cluster = ["hh_d07",
                               "hh_d08",
                               "hh_d09",
                               ]  # hh_sec_d
Expenditure_multi_period_cluster = ["hh_k02"]  # hh_sec_k

## Categories
Map the original values to the standard values.


In [16]:
Education_level_original2final = {1: 'No schooling',
                                  11: 'Primary education', 12: 'Primary education',
                                  13: 'Primary education', 14: 'Primary education', 15: 'Primary education',
                                  16: 'Primary education', 17: 'Primary education',
                                  18: 'Primary education',
                                  19: 'Primary education',
                                  20: 'Primary education',
                                  21: 'Upper secondary education',
                                  22: 'Upper secondary education', 23: 'Upper secondary education',
                                  24: 'Upper secondary education', 25: 'Upper secondary education',
                                  31: 'Post-secondary non-tertiary education',
                                  32: 'Post-secondary non-tertiary education',
                                  33: 'Post-secondary non-tertiary education',
                                  34: 'Post-secondary non-tertiary education', 41: 'Bachelor\'s or equivalent level',
                                  42: 'Bachelor\'s or equivalent level', 43: 'Bachelor\'s or equivalent level',
                                  44: 'Master\'s or equivalent level', 45: 'Doctoral or equivalent level',
                                  2: 'Not elsewhere classified', }
Socio_status_original2final = {1: 'Employee', 2: 'Employer', 3: 'Own-account worker non-farm',
                               4: 'Contributing family worker', 5: 'Contributing family worker',
                               6: 'Own-account worker farm',
                               7: "Unemployed",
                               8: "Unemployed"
                               }
Connection_type_original2final = {1: 'National grid', 2: 'Local mini-grid', 3: 'Solar Home System', 7: np.nan}
Dwelling_quality_wall_original2final = {1: 0, 2: 0, 3: 0, 4: 0, 5: 1, 6: 1, 7: 0, }
Dwelling_quality_roof_original2final = {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_quality_floor_original2final = {1: 0, 2: 1, 3: 0, }
Dwelling_quality_toilet_original2final = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 0, }

Clean_fuel_original2final = {
    1: "No",
    2: "No",
    5: "No",
    6: "No",
    8: "No",
    3: "Yes",
    4: "Yes",
    7: "Yes",
}

Dwelling_toilet_1_original2final = {1: 1,
                                    2: 1, 3: 1, 5: 1,
                                    7: 1, 8: 1, 9: 1,
                                    10: 1, 4: 0,
                                    6: 0,
                                    11: 0,
                                    12: 0, }
Dwelling_toilet_2_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_toilet_3_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, }
Dwelling_toilet_4_original2final = {1: 1, 2: 1, 3: 1, 5: 1, 7: 1, 8: 1, 9: 1, 10: 1, 4: 0, 6: 0, 11: 0, 12: 0, }
Dwelling_toilet_5_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, 11: np.nan}
Dwelling_toilet_6_original2final = {2: 1, 3: 1, 4: 1, 1: 0, 5: 0, 6: 0, 7: 0, 11: np.nan}

Region_original2final = {1: 'Dodoma', 2: 'Arusha', 3: 'Kilimanjaro', 4: 'Tanga', 5: 'Morogoro', 6: 'Pwani',
                         7: 'Dar es salaam', 8: 'Lindi', 9: 'Mtwara', 10: 'Ruvuma', 11: 'Iringa', 12: 'Mbeya',
                         13: 'Singida', 14: 'Tabora', 15: 'Rukwa', 16: 'Kigoma', 17: 'Shinyanga', 18: 'Kagera',
                         19: 'Mwanza', 20: 'Mara', 21: 'Manyara', 22: 'Njombe', 23: 'Katavi', 24: 'Simiyu', 25: 'Geita',
                         26: 'Mbeya', 51: 'Zanzibar North', 52: 'Zanzibar South', 53: 'Zanzibar West',
                         54: 'Pemba North', 55: 'Pemba South', }
District_original2final = {11: 'Kondoa', 12: 'Mpwapwa', 13: 'Kongwa', 15: 'Dodoma Urban', 17: 'Chemba', 21: 'Monduli',
                           22: 'Meru', 23: 'Arusha Urban', 24: 'Karatu', 31: 'Rombo', 32: 'MWANGA', 36: 'MOSHI URBAN',
                           41: 'LUSHOTO', 42: 'KOROGWE RURAL', 43: 'MUHEZA', 44: 'TANGA URBAN', 46: 'HANDENI',
                           47: 'KILINDI', 49: 'KOROGWE URBAN', 51: 'KILOSA', 52: 'MOROGORO RURAL', 53: 'KILOMBERO',
                           55: 'MOROGORO URBAN', 56: 'MVOMERO', 57: 'GAIRO', 61: 'BAGAMOYO', 62: 'KIBAHA RURAL',
                           63: 'KISARAWE', 64: 'MKURANGA', 65: 'RUFIJI', 67: 'KIBAHA URBAN', 71: 'KINONDONI',
                           72: 'ILALA', 73: 'TEMEKE', 81: 'KILWA', 82: 'LINDI RURAL', 83: 'NACHINGWEA', 85: 'RUANGWA',
                           86: 'LINDI URBAN', 91: 'MTWARA RURAL', 92: 'NEWALA', 93: 'MASASI RURAL', 94: 'TANDAHIMBA',
                           95: 'MTWARA MIKINDANI', 96: 'NANYUMBU', 97: 'MASASI URBAN', 102: 'SONGEA RURAL',
                           103: 'MBINGA', 104: 'SONGEA URBAN', 106: 'NYASA', 112: 'MUFINDI', 113: 'IRINGA URBAN',
                           115: 'MAFINGA', 121: 'CHUNYA', 122: 'MBEYA RURAL', 127: 'MBALALI', 128: 'MBEYA URBAN',
                           129: 'MOMBA', 131: 'IRAMBA', 132: 'SINGIDA RURAL', 134: 'SINGIDA URBAN', 141: 'NZEGA',
                           142: 'IGUNGA', 143: 'UYUI', 144: 'URAMBA', 145: 'SIKONGE', 146: 'TABORA URBAN',
                           147: 'KALIUA', 151: 'KALAMBO', 152: 'SUMBAWANGA RURAL', 153: 'NKASI',
                           154: 'SUMBAWANGA URBAN', 161: 'KIBONDO', 162: 'KASULU RURAL', 164: 'KIGOMA URBAN',
                           166: 'BUHIGWE', 173: 'SHINYANGA URBAN', 174: 'KAHAMA RURAL', 175: 'KAHAMA URBAN',
                           181: 'KARAGWE', 182: 'BUKOBA RURAL', 183: 'MULEBA', 184: 'BIHARAMULO', 185: 'NGARA',
                           186: 'BUKOBA URBAN', 187: 'MISSENYI', 188: 'KYERWA', 192: 'MAGU', 193: 'NYAMAGANA',
                           194: 'KWIMBA', 195: 'SENGEREMA', 196: 'ILEMELA', 197: 'MISUNGWI', 201: 'TARIME',
                           203: 'MUSOMA RURAL', 205: 'MUSOMA URBAN', 207: 'BUTIAMA', 211: 'BABATI RURAL',
                           214: 'SIMANJIRO', 223: 'MAKETE', 231: 'MPANDA URBAN', 242: 'ITILIMA', 244: 'MASWA',
                           251: 'GEITA', 252: 'NYANG\'HWALE', 253: 'MBOGWE', 254: 'BUKOMBE', 255: 'CHATO',
                           511: 'KASKAZINI ‘A’', 512: 'KASKAZINI ‘B’', 521: 'KATI', 522: 'KUSINI', 531: 'MAGHARIBI',
                           532: 'MJINI', 541: 'WETE', 542: 'MICHWEWENI', 552: 'MKOANI', }

## hh_sec_a
- Monthly_expenditure
- Climate_zone_lev_1
- Climate_zone_lev_2

In [17]:
hh_sec_a = ODEDataset("Tanzania/hh_sec_a")
hh_sec_a.from_csv(root.joinpath("hh_sec_a.csv")).group_by(ID_COL)
# Merge Helper Sections
# ---------------
helper_sections = ['hh_sec_c', 'hh_sec_d', 'hh_sec_f', 'hh_sec_i', 'hh_sec_j1', 'hh_sec_k', 'hh_sec_l', 'hh_sec_k']

for section_name in helper_sections:
    section = ODEDataset(section_name)
    section.from_csv(root.joinpath(f"{section_name}.csv")).group_by(ID_COL)
    main_cols = hh_sec_a.get_columns()
    sec_cols = [col for col in section.get_columns() if col not in main_cols]
    sec_cols.append(ID_COL)
    section = section.select(sec_cols)
    hh_sec_a = hh_sec_a.merge(section, on=ID_COL)

# Extract Drivers
# -------------------
hh_sec_a = hh_sec_a.new_feature("Monthly_expenditure",
                                finance_modifiers.expenditure_multi_section(Expenditure_yearly_cluster,
                                                                            Expenditure_weekly_cluster,
                                                                            Expenditure_monthly_cluster,
                                                                            Expenditure_multi_period_cluster))




In [18]:
hh_sec_a = hh_sec_a.apply(common_modifiers.add_const_driver("GADM_level_0", "Tanzania"))

gadm_level_1_df = pd.read_excel(root.joinpath("GADM_level_1.xlsx"))
gadm_level_2_df = pd.read_excel(root.joinpath("GADM_level_2.xlsx"))
gadm_level_0_df = pd.read_csv(root.joinpath("GADM_level_0.csv"))

hh_sec_a = hh_sec_a.new_feature("Region_number", common_modifiers.one_of_two('t0_region', 'hh_a01_1'))
# Region_number
hh_sec_a = hh_sec_a.new_feature("Region_raw", common_modifiers.categorize("Region_number", Region_original2final))

hh_sec_a = hh_sec_a.new_feature("District_number", common_modifiers.one_of_two('t0_district', 'hh_a02_1'))
# District_number
hh_sec_a = hh_sec_a.new_feature("District_raw", common_modifiers.categorize("District_number", District_original2final))

hh_sec_a = hh_sec_a.new_feature("GADM_raw", geospatial.get_gadm(
    gadm_level_1_df,
    gadm_level_2_df,
    gadm_level_0_df,
    "Region_raw", "District_raw", '', '',
    'Tanzania'
))

hh_sec_a = hh_sec_a.new_feature("GADM_level_1", lambda x: x["GADM_raw"][1])
hh_sec_a = hh_sec_a.new_feature("GADM_level_2", lambda x: x["GADM_raw"][2])

hh_sec_a = hh_sec_a.new_feature("Climate_zone_lev_1",
                                geospatial.gis_info_by_gadm_level('Climate_majority',
                                                                  gadm_level_1_df,
                                                                  'GADM_level_1',
                                                                  ))

hh_sec_a = hh_sec_a.new_feature("Climate_zone_lev_2",
                                geospatial.gis_info_by_gadm_level('Climate_majority', gadm_level_2_df, 'GADM_level_2'))




In [19]:
hh_sec_a = hh_sec_a.select([
    ID_COL,
    "Climate_zone_lev_2", "Climate_zone_lev_1", "Monthly_expenditure"
])

## hh_sec_b 
contains: 

- Age_HHH
- Number_adults
- Years_of_HHH_in_community


In [20]:
hh_sec_b = ODEDataset("Tanzania/hh_sec_b")

hh_sec_b.from_csv(root.joinpath("hh_sec_b.csv")).group_by(ID_COL)

hh_sec_b = hh_sec_b.new_feature("Age_HHH", socio_modifiers.extract_age_of_head("hh_b04", "hh_b05", 1))

hh_sec_b = hh_sec_b.new_feature("Number_adults", socio_modifiers.extract_age_groups("hh_b05", "adults"))

# years_of_HHH_in_community(data,source,questionnaire,'hh_sec_b','hh_sec_b',hh,'hh_b05','hh_b26','hh_b04',1,'roster')

hh_sec_b = hh_sec_b.new_feature("Years_of_HHH_in_community",
                                socio_modifiers.get_years_of_hhh_in_community_roster('hh_b05', 'hh_b26', 'hh_b04', 1))
hh_sec_b = hh_sec_b.select([
    ID_COL, "Age_HHH", "Number_adults", "Years_of_HHH_in_community"
])
hh_sec_b.preview()

Unnamed: 0,sdd_hhid,Age_HHH,Number_adults,Years_of_HHH_in_community
0,0001-001-001,79.0,0.0,79.0
1,0001-001-003,36.0,0.0,36.0
2,0001-001-004,29.0,0.0,3.0
3,0001-004-001,29.0,0.0,29.0
4,0001-004-002,28.0,0.0,3.0


## hh_sec_c
- Education_level_HHH

In [21]:
hh_sec_c = ODEDataset("Tanzania/hh_sec_c")
hh_sec_c.from_csv(root.joinpath("hh_sec_c.csv")).group_by(ID_COL)

hh_sec_c = hh_sec_c.new_feature("Education_level_HHH_raw",
                                common_modifiers.categorize("hh_c07", Education_level_original2final))

hh_sec_c = hh_sec_c.new_feature("Education_level_HHH", lambda x: x['Education_level_HHH_raw'][0])

hh_sec_c = hh_sec_c.select([
    ID_COL, 'Education_level_HHH'

])
hh_sec_c.preview()

Unnamed: 0,sdd_hhid,Education_level_HHH
0,0001-001-001,
1,0001-001-003,Primary education
2,0001-001-004,Primary education
3,0001-004-001,Primary education
4,0001-004-002,Upper secondary education


## hh_sec_e1
- Socio_status_HHH

In [22]:
hh_sec_e1 = ODEDataset("Tanzania/hh_sec_e1")
hh_sec_e1.from_csv(root.joinpath("hh_sec_e1.csv")).group_by(ID_COL)

# Helper sections
#------------------
hh_sec_e2 = ODEDataset("Tanzania/hh_sec_e2")
hh_sec_e2.from_csv(root.joinpath("hh_sec_e2.csv")).group_by(ID_COL)
hh_sec_e2 = hh_sec_e2.select([
    ID_COL, "hh_e207_1",
])
hh_sec_b_helper = ODEDataset('Tanzania/hh_sec_b_helper')
hh_sec_b_helper = hh_sec_b_helper.from_csv(root.joinpath("hh_sec_b.csv")).group_by(ID_COL)

hh_sec_b_helper = hh_sec_b_helper.new_feature("HHH_relation_pos",
                                              socio_modifiers.extract_relation_pos("hh_b05", "sdd_indid", 1))

hh_sec_b_helper = hh_sec_b_helper.select([
    ID_COL, 'HHH_relation_pos'
])
# Merge Helper sections
#------------------

hh_sec_e1 = hh_sec_e1.merge(hh_sec_e2, on=ID_COL)
hh_sec_e1 = hh_sec_e1.merge(hh_sec_b_helper, on=ID_COL)

# Extract drivers
#------------------
hh_sec_e1 = hh_sec_e1.new_feature("Socio_status_HHH",
                                  common_modifiers.categorize("hh_e207_1", Socio_status_original2final)).apply(common_modifiers.take("Socio_status_HHH", 0))

# socio_status_HHH(data,source,questionnaire,'Derived_variables','hh_sec_e1',hh,'hh_e31b_1','Socio_status_HHH_raw','-')
# hh_sec_e1 = (hh_sec_e1
#              .new_feature("Socio_status_HHH",
#                           socio_modifiers.extract_socio_status_hhh("Tanzania", 'Socio_status_HHH_raw',
#                                                                    'hh_e31b_1', '-')))

hh_sec_e1 = hh_sec_e1.select([
    ID_COL, "Socio_status_HHH"
])

hh_sec_e1.preview()

Unnamed: 0,sdd_hhid,Socio_status_HHH
0,0001-001-001,
1,0001-001-003,Own-account worker farm
2,0001-001-004,Employee
3,0001-004-001,Own-account worker non-farm
4,0001-004-002,Employee


## hh_sec_n
contains: 
- HH_with_home_business

In [23]:
hh_sec_n = ODEDataset("Tanzania/hh_sec_n")
hh_sec_n.from_csv(root.joinpath("hh_sec_n.csv")).group_by(ID_COL)

# Helper sections
#------------------
hh_sec_e1_helper = ODEDataset("Tanzania/hh_sec_e1_helper")
hh_sec_e1_helper.from_csv(root.joinpath("hh_sec_e1.csv")).group_by(ID_COL)
hh_sec_e1_helper = hh_sec_e1_helper.select([ID_COL, "hh_e07", "hh_e09"])

# Merge Helper sections
#------------------
hh_sec_n = hh_sec_n.merge(hh_sec_e1_helper, on=ID_COL)

# Extract drivers
#------------------

hh_sec_n = hh_sec_n.new_feature('HH_with_home_business_raw',
                                socio_modifiers.extract_head_w_home_business_lsms('hh_e07', 'hh_e09',
                                                                                  'hh_n01a'))

hh_sec_n = hh_sec_n.new_feature("HH_with_home_business", lambda row: row["HH_with_home_business_raw"][0])

hh_sec_n = hh_sec_n.select([
    ID_COL, "HH_with_home_business"
])
hh_sec_n.preview()

Unnamed: 0,sdd_hhid,HH_with_home_business
0,0001-001-001,Yes
1,0001-001-003,No
2,0001-001-004,No
3,0001-004-001,Yes
4,0001-004-002,Yes


## hh_sec_i
- Clean_fuel
- Dwelling_quality_index

In [24]:
hh_sec_i = ODEDataset("Tanzania/hh_sec_i")
hh_sec_i.from_csv(root.joinpath("hh_sec_i.csv")).group_by(ID_COL)

hh_sec_i = hh_sec_i.new_feature("Clean_fuel", common_modifiers.categorize("hh_i16", Clean_fuel_original2final))
hh_sec_i = hh_sec_i.apply(common_modifiers.take("Clean_fuel", 0))

hh_sec_i = hh_sec_i.new_feature("Number_of_rooms", lambda row: row["hh_i07_1"][0])

hh_sec_i = hh_sec_i.new_feature("Connection_type",
                                common_modifiers.categorize("hh_i18", Connection_type_original2final)).apply(
    common_modifiers.take("Connection_type", 0))

# Dwelling Wall
# re_categorization(data,source,questionnaire,'hh_sec_i',hh,'hh_i08','-','-',map_cat,'Dwelling_quality_wall',clusters)
hh_sec_i = hh_sec_i.new_feature("Dwelling_wall",
                                common_modifiers.categorize("hh_i08", Dwelling_quality_wall_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_roof",
                                common_modifiers.categorize("hh_i09", Dwelling_quality_roof_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_floor",
                                common_modifiers.categorize("hh_i10", Dwelling_quality_floor_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_water",
                                common_modifiers.categorize("hh_i12", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_1",
                                common_modifiers.categorize("hh_i19", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_1", 0))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_2",
                                common_modifiers.categorize("hh_i24_1", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_2", 0))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_3",
                                common_modifiers.categorize("hh_i24_2", Dwelling_quality_toilet_original2final)).apply(
    common_modifiers.take("Dwelling_toilet_3", 0))

rainy_toilet_modifier = common_modifiers.take_one_with_value(
    ["Dwelling_toilet_1", "Dwelling_toilet_2", "Dwelling_toilet_3"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_rainy",
                                rainy_toilet_modifier)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_4",
                                common_modifiers.categorize("hh_i29", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_5",
                                common_modifiers.categorize("hh_i34_1", Dwelling_quality_toilet_original2final))

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_6",
                                common_modifiers.categorize("hh_i34_2", Dwelling_quality_toilet_original2final))

dry_toilet_modifier = common_modifiers.take_one_with_value(
    ["Dwelling_toilet_4", "Dwelling_toilet_5", "Dwelling_toilet_6", "Dwelling_toilet_3"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_dry",
                                dry_toilet_modifier)

# dwelling_toilet(data,source,questionnaire,'Derived_variables',hh,'Dwelling_toilet_rainy','Dwelling_toilet_dry','Dwelling_toilet_rainy','Dwelling_toilet_dry','not_list')

toilet_modifier = common_modifiers.take_one_with_value(["Dwelling_toilet_rainy", "Dwelling_toilet_dry"], 1, np.nan)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet_raw",
                                toilet_modifier)

hh_sec_i = hh_sec_i.new_feature("Dwelling_toilet", lambda row: [row["Dwelling_toilet_raw"]])

hh_sec_i = hh_sec_i.new_feature("Dwelling_quality_index", dwelling_modifiers.dwelling_quality_index())
hh_sec_i = hh_sec_i.select([
    ID_COL, "Clean_fuel", "Number_of_rooms", "Dwelling_quality_index"
])
hh_sec_i.preview()

Unnamed: 0,sdd_hhid,Clean_fuel,Number_of_rooms,Dwelling_quality_index
0,0001-001-001,No,3.0,0.0
1,0001-001-003,No,3.0,0.0
2,0001-001-004,No,1.0,1.0
3,0001-004-001,No,2.0,0.0
4,0001-004-002,No,1.0,0.0


## Assets (hh_sec_m)
split into two sections: 
- hh_sec_m
- lf_sec_02
and contains: 
- Ownership_motorized_vehicle
- Ownership_small_livestock
- Ownership_large_livestock

### Ownership modifiers

In [25]:

def get_asset_tanzania_vehicle(row: pd.Series):
    helpers.assert_column_exists_in_row(row, 'hh_m01')
    checks = [row['hh_m01'][24],
              row['hh_m01'][25], ]
    for check in checks:
        if check == 1:
            return 'Yes'
    return 'No'


def get_asset_tanzania_livestock_small(row: pd.Series):
    helpers.assert_column_exists_in_row(row, 'lf02_01')
    checks = [row['lf02_01'][6],
              row['lf02_01'][7],
              row['lf02_01'][8],
              row['lf02_01'][12],
              row['lf02_01'][13]]
    for check in checks:
        if check == 1:
            return 'Yes'
    return 'No'


def get_asset_tanzania_livestock_large(row: pd.Series):
    helpers.assert_column_exists_in_row(row, 'lf02_01')
    checks = [row['lf02_01'][0],
              row['lf02_01'][1],
              row['lf02_01'][2],
              row['lf02_01'][3],
              row['lf02_01'][4],
              row['lf02_01'][5]
              ]

    for check in checks:
        if check == 1:
            return 'Yes'
    return 'No'



In [26]:
assets = ODEDataset("Tanzania/hh_sec_m")
assets.from_csv(root.joinpath("hh_sec_m.csv")).group_by(ID_COL)

lf_sec_02 = ODEDataset("Tanzania/lf_sec_02")
lf_sec_02.from_csv(root.joinpath("lf_sec_02.csv")).group_by(ID_COL)

lf_sec_02 = lf_sec_02.select([
    ID_COL, "lf02_01"
])

assets = assets.merge(lf_sec_02, on=ID_COL)

assets = assets.new_feature("Ownership_motorized_vehicle", get_asset_tanzania_vehicle)
assets = assets.new_feature("Ownership_small_livestock", get_asset_tanzania_livestock_small)
assets = assets.new_feature("Ownership_large_livestock", get_asset_tanzania_livestock_large)

assets = assets.select([
    ID_COL, "Ownership_motorized_vehicle", "Ownership_small_livestock", "Ownership_large_livestock"
])

assets.preview()

Unnamed: 0,sdd_hhid,Ownership_motorized_vehicle,Ownership_small_livestock,Ownership_large_livestock
0,0001-001-001,No,No,No
1,0001-001-003,No,No,No
2,0001-001-004,No,No,No
3,0001-004-001,No,Yes,Yes
4,0001-004-002,No,No,No


## Appliances
in Section M (hh_sec_m)
- Presence_phone_charger
- Presence_iron
- Presence_TV
- Presence_refrigerator/freezer
- Presence_radio/stereo
- Presence_DVD_player
- Presence_fan


In [27]:
appliances = ODEDataset("Tanzania/appliances")
appliances = appliances.from_csv(root.joinpath("hh_sec_m.csv")).group_by(ID_COL)

# charger appliances are not available in the dataset
appliances = appliances.apply(common_modifiers.add_const_driver("Presence_phone_charger", np.nan))
# iron appliances are not available in the dataset
appliances = appliances.apply(common_modifiers.add_const_driver("Presence_iron", np.nan))

appliances = appliances.new_feature("Presence_refrigerator/freezer",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        404, 'itemcode'))

appliances = appliances.new_feature("Presence_radio",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        401, 'itemcode'))

appliances = appliances.new_feature("Presence_stereo",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        423, 'itemcode'))

appliances = appliances.new_feature("Presence_radio/stereo",
                                    common_modifiers.multi_unify_presence(
                                        ['Presence_radio', 'Presence_stereo']))

appliances = appliances.new_feature("Presence_DVD_player",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        407, 'itemcode'))

appliances = appliances.new_feature("Presence_fan",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        438, 'itemcode'))

appliances = appliances.new_feature("Presence_TV",
                                    appliances_modifiers.presence_appliances_long(
                                        'hh_m01',
                                        406, 'itemcode'))

appliances = appliances.select([
    ID_COL, "Presence_phone_charger", "Presence_iron", "Presence_TV", "Presence_refrigerator/freezer",
    "Presence_radio/stereo", "Presence_DVD_player", "Presence_fan"
])

appliances.preview()



Unnamed: 0,sdd_hhid,Presence_phone_charger,Presence_iron,Presence_TV,Presence_refrigerator/freezer,Presence_radio/stereo,Presence_DVD_player,Presence_fan
0,0001-001-001,,,0.0,0.0,0,0.0,0.0
1,0001-001-003,,,0.0,0.0,0,0.0,0.0
2,0001-001-004,,,0.0,0.0,1,0.0,1.0
3,0001-004-001,,,0.0,0.0,0,0.0,0.0
4,0001-004-002,,,0.0,0.0,0,0.0,0.0


## Merging all the datasets


In [28]:
Tanzania = hh_sec_a.merge(hh_sec_b, ID_COL)
Tanzania = Tanzania.merge(hh_sec_c, ID_COL)
Tanzania = Tanzania.merge(hh_sec_e1, ID_COL)
Tanzania = Tanzania.merge(hh_sec_n, ID_COL)
Tanzania = Tanzania.merge(hh_sec_i, ID_COL)
Tanzania = Tanzania.merge(assets, ID_COL)
Tanzania = Tanzania.merge(appliances, ID_COL)

Tanzania = Tanzania.apply(common_modifiers.rename({
    ID_COL: "ID"
}))
Tanzania = Tanzania.apply(common_modifiers.add_const_driver_many({
   "Hours_available_electricity": np.nan,
    "Measurement_age": np.nan,
    "Tariff_payment_frequency": np.nan,
}))
Tanzania = Tanzania.select(["ID"] + constants.DRIVERS_LIST + constants.PRESENCE_LIST)
Tanzania.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Ownership_large_livestock,Clean_fuel,Tariff_payment_frequency,Presence_refrigerator/freezer,Presence_iron,Presence_fan,Presence_DVD_player,Presence_radio/stereo,Presence_phone_charger,Presence_TV
0,0001-001-001,79.0,0.0,,,171233.333333,3.0,6.0,6.0,79.0,...,No,No,,0.0,,0.0,0.0,0,,0.0
1,0001-001-003,36.0,0.0,,,76316.666667,3.0,6.0,6.0,36.0,...,No,No,,0.0,,0.0,0.0,0,,0.0
2,0001-001-004,3.0,1.0,,,423750.0,1.0,3.0,3.0,29.0,...,No,No,,0.0,,1.0,0.0,1,,0.0
3,0001-004-001,29.0,0.0,,,199066.666667,2.0,6.0,6.0,29.0,...,Yes,No,,0.0,,0.0,0.0,0,,0.0
4,0001-004-002,3.0,0.0,,,48066.666667,1.0,6.0,,28.0,...,No,No,,0.0,,0.0,0.0,0,,0.0


In [29]:
Tanzania.to_csv(root.joinpath("Tanzania.csv"))

<core.ODEDataset.ODEDataset at 0x7153c0bb4a50>