# Ethiopia dataset


In [1]:
import os
import pathlib
import sys

import numpy as np
import pandas as pd

# The project packages (`core`, `utils`) are resolved relative to this notebook, so the
# search path must be extended BEFORE any of them is imported; otherwise a fresh kernel
# fails on the first project import.
sys.path.append("../../")  # Adds higher directory to python modules path.
# Set before the project imports so that it is already in place if any of them loads TensorFlow.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

from core.ODEDataset import ODEDataset
from utils import common_modifiers, dwelling_modifiers, socio_modifiers, finance_modifiers, appliances_modifiers, \
    geospatial, energy_modifiers, helpers
from utils import constants
from utils.helpers import is_nan

## Constants

In [59]:
# Directory holding the per-section CSV files read by this notebook (relative to the notebook).
DB_ROOT = "../../playground/data/ESMAP/Ethiopia_L/csv_data"
root = pathlib.Path(DB_ROOT)
# Name given to the household identifier column (`HHID`) after renaming in every section.
ID_COL = "ID"
# Name of the derived roster column produced by `extract_household_position`
# (presumably the index of the household head inside the per-member lists — confirm).
HHH_Position = 'HHH_position'
# Not referenced by any cell visible in this notebook.
Survey_date = 2017
# Multiplier applied to Birr amounts to express them in US dollars.
# NOTE(review): presumably a 2017 exchange rate — confirm the source of the value.
BIRR_TO_DOLLAR = 0.044

## Categories

In [74]:
# Labels for the relation-to-head codes (presumably roster column A4, renamed to
# `Relation_HHH` — confirm). Not referenced by any cell visible in this notebook:
# the code `1` is passed as a literal when the household position is extracted.
Relation_original2final = {
    1: 'Head',
    2: 'Wife/Spouse',
    3: 'Child/adopted child',
    4: 'Grandchild',
    5: 'Niece/Nephew',
    6: 'Father/Mother',
    7: 'Sister/Brother',
    8: 'Son/Daughter-in-law',
    9: 'Brother/Sister-in-law',
    10: 'Father/Mother-in-law',
    11: 'Grandfather/mother',
    12: 'Other relative',
    13: 'Servant/servant’s relative',
    14: 'Other non-relative'
}
# Maps the raw education code of the household head (`Education_level_HHH_raw`)
# to the harmonised education label used for `Education_level_HHH`.
# NOTE(review): a dict lookup only matches the `np.nan` key for that very object
# (NaN never compares equal to NaN). In the roster preview, rows with a NaN raw code
# keep a NaN label, so this key does not appear to take effect — confirm how
# `common_modifiers.categorize` handles missing values.
Education_levels_original2final = {
    0: "No schooling",
    1: "No schooling",
    2: "Primary education",
    3: "Primary education",
    30: "Primary education",
    4: "Primary education",
    5: "Primary education",
    6: "Primary education",
    7: "Primary education",
    8: "Primary education",
    9: "Primary education",
    10: "Upper secondary education",
    11: "Upper secondary education",
    12: "Upper secondary education",
    13: "Upper secondary education",
    14: "Post-secondary non-tertiary education",
    15: "Post-secondary non-tertiary education",
    16: "Bachelor's or equivalent level",
    17: "Bachelor's or equivalent level",
    18: "Not elsewhere classified",
    19: "Not elsewhere classified",
    20: "Not elsewhere classified",
    np.nan: "No schooling",
    555: "Not elsewhere classified"
}

# Maps the raw occupation code of the household head (`Socio_status_HHH_raw`)
# to the harmonised label used for `Socio_status_HHH`.
# NOTE(review): the `np.nan` key only matches that exact object (NaN != NaN). In the
# roster preview a NaN raw code yields a NaN label, not 'Unemployed' — confirm how
# `common_modifiers.categorize` handles missing values.
Socio_status_original2final = {
    1: "Employee",
    2: "Employee",
    20: "Employee",
    3: "Own-account worker non-farm",
    4: "Own-account worker non-farm",
    5: "Own-account worker farm",
    6: "Own-account worker farm",
    7: "Contributing family worker",
    8: "Contributing family worker",
    9: "Other (not specified in Socio_status)",
    10: "Unemployed",
    11: "Unemployed",
    12: "Unemployed",
    13: "Unemployed",
    14: "Unemployed",
    15: "Unemployed",
    16: "Unemployed",
    19: "Other (not specified in Socio_status)",
    np.nan: 'Unemployed',
    21: "Other (not specified in Socio_status)",
    17: "Other (not specified in Socio_status)",
    18: "Other (not specified in Socio_status)",
    555: "Other (not specified in Socio_status)"
}
# 0/1 score per material code, passed to `dwelling_modifiers.get_quality` for columns
# B10/B10_other. The same table is also passed for B14 (`Dwelling_water`).
Wall_quality = {
    1: 0,  # "Wood and mud"
    2: 0,  # "Wood and thatch"
    3: 0,  # "Wood only"
    4: 0,  # "Stone only"
    5: 0,  # "Stone and mud"
    6: 1,  # "Stone and cement"
    7: 1,  # "Blocks, plastered with cement"
    8: 1,  # "Blocks, unplastered"
    9: 1,  # "Bricks"
    10: 0,  # "Mud bricks (traditional)"
    11: 1,  # "Steel"
    12: 0,  # "Cargo container"
    13: 0,  # "Parquet or polished wood"
    14: 0,  # "Chip wood"
    15: 0,  # "Corrugated iron sheet"
    16: 0,  # "Asbestos"
    17: 0,  # "Reed/bamboo"
    555: 0  # "Other, specify"
}
# 0/1 score per floor material code (columns B12/B12_other).
Floor_quality = {
    1: 0,  # "Mud/Dung"
    2: 0,  # "Reed/bamboo"
    3: 1,  # "Wood planks"
    4: 1,  # "Parquet or polished wood"
    5: 1,  # "Cement screed"
    6: 1,  # "Plastic tiles"
    7: 1,  # "Cement tiles"
    8: 1,  # "Brick tiles"
    9: 1,  # "Ceramic/Marble tiles"
    555: 0  # "Other, specify"
}
# 0/1 score per roof material code (columns B11/B11_other).
Roof_quality = {
    1: 0,  # "Wood and mud"
    2: 0,  # "Wood and thatch"
    3: 1,  # "Stone and Cement"
    4: 1,  # "Bricks"
    5: 0,  # "Corrugated iron sheet"
    6: 0,  # "Asbestos"
    7: 0,  # "Reed/bamboo"
    8: 0,  # "Plastic canvas"
    555: 0  # "Other, specify"
}
# 0/1 score per toilet code (columns B13/B13_other).
Toilet_quality = {
    1: 0,  # NOTE(review): label of code 1 unclear (the earlier comment described it as a missing value) — confirm against the questionnaire
    2: 1,  # "Flush to sewage"
    3: 1  # "Flush to septic tank"
}

# Fuel codes counted as clean; passed to `energy_modifiers.all_fuels_clean`
# together with the `Fuel_usage` column.
Clean_Fuels = [
    1,  # LPG/cooking gas
    4,  # Solar
    12,  # Electric
    13,  # Biogas
    14,  # Ethanol

]

# Fixed monthly fee…………………………...……….1
# Pay based on lights and appliances used………….....2
# Utility estimates consumption……………….………3
# Other, specify……………………………………….555
# No bill for electricity……………………………….111

# Maps the codes of column C17 to the labels stored in `Tariff_payment_frequency`.
# Only code 1 becomes "Monthly"; 111 becomes "No bill"; every other listed code is "Other".
Tariff_payment_frequency_original2final = {
    1: "Monthly",
    2: "Other",
    3: "Other",
    555: "Other",
    111: "No bill"
}

# Labels of the owned-item codes (presumably the codes found in column M1 — confirm).
# Not referenced by any cell visible in this notebook.
Item_ownership_category = {
    1: "Vehicle (Car, pickup truck, etc)",
    2: "Motorcycle",
    3: "Bicycle",
    4: "Motor boat",
    5: "Other boat",
    6: "Tractor",
    7: "Domestic water pump",
    8: "Ox/Cow/bull/calves",
    9: "Water buffalo/Camel",
    10: "Horse/donkey",
    11: "Sheep",
    12: "Goat",
    13: "Pig",
    14: "Rabbit",
    15: "Fish (Aquaculture)",
    16: "Other, specify",
}
# Item code -> 1 if the item counts as a motorized vehicle, 0 otherwise.
# NOTE(review): the key 555 has no counterpart in Item_ownership_category
# (which uses 16 for "Other, specify") — confirm which code the data uses.
Ownership_motorized_vehicle_original2final = {
    1: 1,
    3: 0,
    7: 0,
    2: 1,
    555: 0,
    6: 0
}

# Item code -> 1 for sheep/goat/pig (11, 12, 13), 0 for the other listed codes.
Ownership_small_livestock_original2final = {
    9: 0,
    8: 0,
    12: 1,
    10: 0,
    555: 0,
    13: 1,
    11: 1
}

# NOTE(review): identical to Ownership_small_livestock_original2final, so codes 8-10
# (ox/cow, buffalo/camel, horse/donkey) are scored 0 here as well. This looks like a
# copy-paste; confirm whether 8, 9 and 10 should map to 1 instead.
Ownership_large_livestock_original2final = {
    9: 0,
    8: 0,
    12: 1,
    10: 0,
    555: 0,
    13: 1,
    11: 1
}





## A HH Roster Section

- Age_HHH
- Education_level_HHH
- Socio_status_HHH
- Number_adults

In [4]:
# Section A (household roster): load the CSV, give the columns used later readable
# names, then group the rows by household id.
roster_column_names = {
    'HHID': ID_COL,
    "A4": 'Relation_HHH',
    "A5": 'Age',
    'A9': "Highest_Education",
    'A15': "Occupation",
}

roster = ODEDataset("Ethoipia/Section A_HHRoster")
roster.from_csv(root / "Section A_HHRoster.csv")
roster = roster.apply(common_modifiers.rename(roster_column_names))
# After grouping, the preview shows one row per household with per-member values as lists.
roster = roster.group_by(ID_COL)
roster.preview()

Unnamed: 0,ID,IndividualID,quest_id,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7,...,A12,A13,A14,Occupation,A16,A17,A18,A19,A20,EAID
0,1010200116005,"[101020011600501, 101020011600502, 10102001160...","[1, 1, 1]","[1, 1, 1]","[1, 1, 1]","[2, 2, 2]","[1, 1, 1]","[16, 16, 16]","[1, 1, 1]","[5, 5, 5]",...,"[6.0, 1.0, nan]","[1.0, 1.0, nan]","[3.0, 15.0, nan]","[20.0, nan, nan]","[12.0, nan, nan]","[24.0, nan, nan]","[1700.0, nan, nan]","[2.0, nan, nan]","[nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001..."
1,1010200116014,"[101020011601401, 101020011601402, 10102001160...","[2, 2, 2, 2]","[1, 1, 1, 1]","[1, 1, 1, 1]","[2, 2, 2, 2]","[1, 1, 1, 1]","[16, 16, 16, 16]","[1, 1, 1, 1]","[14, 14, 14, 14]",...,"[6.0, 1.0, nan, nan]","[1.0, 1.0, nan, nan]","[4.0, 15.0, nan, nan]","[2.0, nan, nan, nan]","[5.0, nan, nan, nan]","[24.0, nan, nan, nan]","[1000.0, nan, nan, nan]","[2.0, nan, nan, nan]","[nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001..."
2,1010200116023,"[101020011602301, 101020011602302, 10102001160...","[3, 3, 3]","[1, 1, 1]","[1, 1, 1]","[2, 2, 2]","[1, 1, 1]","[16, 16, 16]","[1, 1, 1]","[23, 23, 23]",...,"[6.0, 1.0, nan]","[1.0, 1.0, nan]","[4.0, 15.0, nan]","[20.0, nan, nan]","[12.0, nan, nan]","[26.0, nan, nan]","[1000.0, nan, nan]","[1.0, nan, nan]","[0.0, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001..."
3,1010200116032,"[101020011603201, 101020011603202, 10102001160...","[4, 4, 4, 4]","[1, 1, 1, 1]","[1, 1, 1, 1]","[2, 2, 2, 2]","[1, 1, 1, 1]","[16, 16, 16, 16]","[1, 1, 1, 1]","[32, 32, 32, 32]",...,"[6.0, 1.0, nan, nan]","[1.0, 1.0, nan, nan]","[4.0, 15.0, nan, nan]","[2.0, nan, nan, nan]","[6.0, nan, nan, nan]","[26.0, nan, nan, nan]","[1000.0, nan, nan, nan]","[2.0, nan, nan, nan]","[nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001..."
4,1010200116041,"[101020011604101, 101020011604102, 10102001160...","[5, 5, 5, 5, 5]","[1, 1, 1, 1, 1]","[1, 1, 1, 1, 1]","[2, 2, 2, 2, 2]","[1, 1, 1, 1, 1]","[16, 16, 16, 16, 16]","[1, 1, 1, 1, 1]","[41, 41, 41, 41, 41]",...,"[6.0, 1.0, 6.0, nan, nan]","[1.0, 1.0, 2.0, nan, nan]","[12.0, 3.0, nan, nan, nan]","[nan, 7.0, nan, nan, nan]","[nan, 12.0, nan, nan, nan]","[nan, 26.0, nan, nan, nan]","[nan, 500.0, nan, nan, nan]","[nan, 2.0, nan, nan, nan]","[nan, nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001..."


In [5]:
# Derive the household-head attributes and the adult count from the per-member lists.
# NOTE(review): in the preview below, households whose raw education / occupation code
# is NaN keep a NaN category, so the `np.nan` keys of the mapping tables do not appear
# to be applied — confirm how `categorize` treats missing values.
roster = (
    roster
    # Relation code 1 is passed to locate the household position (presumably the head's index).
    .new_feature(HHH_Position, socio_modifiers.extract_household_position('Relation_HHH', 1))
    .new_feature('Age_HHH', common_modifiers.extract_feature_by_position('Age', HHH_Position))
    # Education: raw code at the household position first, then the harmonised label.
    .new_feature('Education_level_HHH_raw',
                 common_modifiers.extract_feature_by_position('Highest_Education', HHH_Position))
    .new_feature('Education_level_HHH',
                 common_modifiers.categorize("Education_level_HHH_raw", Education_levels_original2final))
    # Occupation: same two-step pattern.
    .new_feature('Socio_status_HHH_raw',
                 common_modifiers.extract_feature_by_position('Occupation', HHH_Position))
    .new_feature('Socio_status_HHH',
                 common_modifiers.categorize("Socio_status_HHH_raw", Socio_status_original2final))
    .new_feature('Number_adults', socio_modifiers.extract_age_groups('Age', 'adults'))
)

roster.preview()


Unnamed: 0,ID,IndividualID,quest_id,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7,...,A19,A20,EAID,HHH_position,Age_HHH,Education_level_HHH_raw,Education_level_HHH,Socio_status_HHH_raw,Socio_status_HHH,Number_adults
0,1010200116005,"[101020011600501, 101020011600502, 10102001160...","[1, 1, 1]","[1, 1, 1]","[1, 1, 1]","[2, 2, 2]","[1, 1, 1]","[16, 16, 16]","[1, 1, 1]","[5, 5, 5]",...,"[2.0, nan, nan]","[nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001...",0,30,10.0,Upper secondary education,20.0,Employee,2
1,1010200116014,"[101020011601401, 101020011601402, 10102001160...","[2, 2, 2, 2]","[1, 1, 1, 1]","[1, 1, 1, 1]","[2, 2, 2, 2]","[1, 1, 1, 1]","[16, 16, 16, 16]","[1, 1, 1, 1]","[14, 14, 14, 14]",...,"[2.0, nan, nan, nan]","[nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001...",0,35,,,2.0,Employee,2
2,1010200116023,"[101020011602301, 101020011602302, 10102001160...","[3, 3, 3]","[1, 1, 1]","[1, 1, 1]","[2, 2, 2]","[1, 1, 1]","[16, 16, 16]","[1, 1, 1]","[23, 23, 23]",...,"[1.0, nan, nan]","[0.0, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001...",0,35,,,20.0,Employee,2
3,1010200116032,"[101020011603201, 101020011603202, 10102001160...","[4, 4, 4, 4]","[1, 1, 1, 1]","[1, 1, 1, 1]","[2, 2, 2, 2]","[1, 1, 1, 1]","[16, 16, 16, 16]","[1, 1, 1, 1]","[32, 32, 32, 32]",...,"[2.0, nan, nan, nan]","[nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001...",0,38,,,2.0,Employee,2
4,1010200116041,"[101020011604101, 101020011604102, 10102001160...","[5, 5, 5, 5, 5]","[1, 1, 1, 1, 1]","[1, 1, 1, 1, 1]","[2, 2, 2, 2, 2]","[1, 1, 1, 1, 1]","[16, 16, 16, 16, 16]","[1, 1, 1, 1, 1]","[41, 41, 41, 41, 41]",...,"[nan, 2.0, nan, nan, nan]","[nan, nan, nan, nan, nan]","[1010220100116.0, 1010220100116.0, 10102201001...",0,80,,,,,1


In [6]:
# Keep only the household id and the derived roster features.
roster_output_columns = [ID_COL, 'Number_adults', 'Age_HHH', 'Education_level_HHH', 'Socio_status_HHH']
roster = roster.select(roster_output_columns)

## Section B
- Number_of_rooms
- Years_of_HHH_in_community
- Dwelling_quality_index

In [7]:
# Section B: load, rename the columns used later, group by household id.
section_b_column_names = {
    'HHID': ID_COL,
    "B9": 'Number_of_rooms',
    'B4': 'Years_of_HHH_in_community',
}

Section_B = ODEDataset("Ethoipia/Section B")
Section_B.from_csv(root / "Section B.csv")
Section_B = Section_B.apply(common_modifiers.rename(section_b_column_names))
Section_B = Section_B.group_by(ID_COL)

# Apply `take(<column>, 0)` to the two renamed columns, in the same order as before.
for section_b_column in ('Number_of_rooms', 'Years_of_HHH_in_community'):
    Section_B = Section_B.apply(common_modifiers.take(section_b_column, 0))

Section_B.preview()

Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7,B1,B2,...,B17,B17_other,B18,B19,B19_a_other,B21,B22,B23,HI_Id,HI_1a
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[1],[1.0],...,[1.0],[nan],[2.0],[nan],[nan],[2.0],[nan],[nan],[05:35:04],[05:35:04]
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[1],[1.0],...,[1.0],[nan],[2.0],[nan],[nan],[2.0],[nan],[nan],[02:46:04],[02:46:04]
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[1],[1.0],...,[1.0],[nan],[2.0],[nan],[nan],[2.0],[nan],[nan],[03:38:55],[03:38:55]
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[1],[1.0],...,[1.0],[nan],[2.0],[nan],[nan],[2.0],[nan],[nan],[08:28:08],[08:28:08]
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[1],[1.0],...,[1.0],[nan],[2.0],[nan],[nan],[2.0],[nan],[nan],[02:31:15],[02:31:15]


### Dwelling Quality Index

In [8]:
# One quality feature per dwelling component: (feature name, source columns, score table).
# NOTE(review): 'Dwelling_water' is scored with Wall_quality; no water-specific table is
# defined in this notebook, so this looks like a copy-paste — confirm the intended mapping.
dwelling_quality_specs = [
    ('Dwelling_wall', ['B10', 'B10_other'], Wall_quality),
    ('Dwelling_roof', ['B11', 'B11_other'], Roof_quality),
    ('Dwelling_floor', ['B12', 'B12_other'], Floor_quality),
    ('Dwelling_toilet', ['B13', 'B13_other'], Toilet_quality),
    ('Dwelling_water', ['B14', 'B14_other'], Wall_quality),
]
for dwelling_feature, dwelling_columns, dwelling_scores in dwelling_quality_specs:
    Section_B = Section_B.new_feature(dwelling_feature,
                                      dwelling_modifiers.get_quality(dwelling_columns, dwelling_scores))

# The index is added after all component features above exist.
Section_B = Section_B.new_feature('Dwelling_quality_index', dwelling_modifiers.dwelling_quality_index())

Section_B.preview()

Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7,B1,B2,...,B22,B23,HI_Id,HI_1a,Dwelling_wall,Dwelling_roof,Dwelling_floor,Dwelling_toilet,Dwelling_water,Dwelling_quality_index
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[1],[1.0],...,[nan],[nan],[05:35:04],[05:35:04],0.0,0.0,0.5,0.0,0.0,0.0
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[1],[1.0],...,[nan],[nan],[02:46:04],[02:46:04],0.0,0.0,0.0,0.0,0.0,0.0
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[1],[1.0],...,[nan],[nan],[03:38:55],[03:38:55],0.0,0.0,0.0,0.0,0.0,0.0
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[1],[1.0],...,[nan],[nan],[08:28:08],[08:28:08],0.0,0.0,0.0,0.0,0.0,0.0
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[1],[1.0],...,[nan],[nan],[02:31:15],[02:31:15],0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Keep only the household id and the three Section B features.
section_b_output_columns = [
    ID_COL,
    'Number_of_rooms',
    'Years_of_HHH_in_community',
    'Dwelling_quality_index',
]
Section_B = Section_B.select(section_b_output_columns)

## Section C
Electricity connections


In [64]:
# Section C: load, rename the two grid-connection columns, group by household id.
section_c_column_names = {
    "HHID": ID_COL,
    "C2": "Connected_National_Grid",
    "C38": "Connected_Local_Grid",
}

Section_C = ODEDataset("Ethoipia/Section C")
Section_C.from_csv(root / "Section C.csv")
Section_C = Section_C.apply(common_modifiers.rename(section_c_column_names))
Section_C = Section_C.group_by(ID_COL)

Section_C.preview()



Unnamed: 0,ID,C1,Connected_National_Grid,C3,C4,C4_other,C5,C6_mm,C6_yy,C7,...,C118,C119_1,C119_1_other,C119_2,C119_2_other,C120,C121,C122,C123,C124
0,1010200116005,[1],[1],[nan],[nan],[nan],[nan],[nan],[nan],[3.0],...,[nan],[nan],[nan],[nan],[nan],[nan],[3.0],[0.0],[0.0],[1.0]
1,1010200116014,[1],[1],[nan],[nan],[nan],[nan],[nan],[nan],[1.0],...,[nan],[nan],[nan],[nan],[nan],[nan],[4.0],[nan],[nan],[nan]
2,1010200116023,[1],[1],[nan],[nan],[nan],[nan],[nan],[nan],[1.0],...,[nan],[nan],[nan],[nan],[nan],[nan],[4.0],[nan],[nan],[nan]
3,1010200116032,[1],[1],[nan],[nan],[nan],[nan],[nan],[nan],[6.0],...,[nan],[nan],[nan],[nan],[nan],[nan],[4.0],[nan],[nan],[nan]
4,1010200116041,[1],[1],[nan],[nan],[nan],[nan],[nan],[nan],[10.0],...,[nan],[nan],[nan],[nan],[nan],[nan],[3.0],[0.0],[0.0],[1.0]


In [69]:
# Value passed to `get_connection_type` for each connection kind.
connection_type_codes = {
    constants.NATIONAL_GRID: 1,
    constants.LOCAL_MINI_GRID: 1,
    constants.SOLAR_HOME_SYSTEM: 1,
}
# Source column per connection kind for each derived feature (None = no column given).
# NOTE(review): 'q62_wm' does not follow the C-prefixed naming of the other columns — confirm it exists.
hours_available_columns = {
    constants.NATIONAL_GRID: 'C25_wm',
    constants.LOCAL_MINI_GRID: 'q62_wm',
    constants.SOLAR_HOME_SYSTEM: None,
}
measurement_age_columns = {
    constants.NATIONAL_GRID: 'C7',
    constants.LOCAL_MINI_GRID: 'C41',
    constants.SOLAR_HOME_SYSTEM: None,
}

Section_C = (
    Section_C
    .new_feature('Connection_type',
                 energy_modifiers.get_connection_type('Connected_National_Grid',
                                                      'Connected_Local_Grid',
                                                      'C121',
                                                      connection_type_codes))
    .new_feature('Hours_available_electricity',
                 energy_modifiers.get_value_from_connection(hours_available_columns))
    .new_feature('Measurement_age',
                 energy_modifiers.get_value_from_connection(measurement_age_columns))
)


In [70]:
# Categorise column C17 into the tariff labels, then apply `take(..., 0)` to the new column.
tariff_categorizer = common_modifiers.categorize('C17', Tariff_payment_frequency_original2final)
Section_C = Section_C.new_feature('Tariff_payment_frequency', tariff_categorizer)
Section_C = Section_C.apply(common_modifiers.take('Tariff_payment_frequency', 0))

In [71]:
# Keep only the household id and the derived electricity features.
section_c_output_columns = [
    ID_COL,
    'Connection_type',
    'Hours_available_electricity',
    'Measurement_age',
    'Tariff_payment_frequency',
]
Section_C = Section_C.select(section_c_output_columns)
Section_C.preview()

Unnamed: 0,ID,Connection_type,Hours_available_electricity,Measurement_age,Tariff_payment_frequency
0,1010200116005,National grid,12.0,3.0,
1,1010200116014,National grid,12.0,1.0,Monthly
2,1010200116023,National grid,12.0,1.0,Monthly
3,1010200116032,National grid,12.0,6.0,Monthly
4,1010200116041,National grid,10.0,10.0,Monthly


## Section H
Fuel consumption


In [13]:
# Section H: load, rename the fuel column, group by household id.
section_h_column_names = {
    "HHID": ID_COL,
    "H2": "Fuel_usage",
}

Section_H = ODEDataset("Ethoipia/Section H")
Section_H.from_csv(root / "Section H.csv")
Section_H = Section_H.apply(common_modifiers.rename(section_h_column_names))
Section_H = Section_H.group_by(ID_COL)

Section_H.preview()

Unnamed: 0,ID,Fuel_usage,H3,H4,H5,H6,H7,H8,H9,H12,H13,H14,H14_unit,H15,C2
0,1010200116005,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 7.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 1.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 4.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 200.0, -88.0, nan, nan, nan, n...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1010200116014,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 7.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 1.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 5.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 160.0, -88.0, nan, nan, nan, n...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,1010200116023,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 1.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 4.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 150.0, -88.0, nan, nan, nan, n...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1010200116032,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 7.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 1.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 5.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 150.0, -88.0, nan, nan, nan, n...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1010200116041,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 1.0, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, 7.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 8.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 1.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 4.0, -88.0, nan, nan, nan, nan...","[nan, nan, nan, 200.0, -88.0, nan, nan, nan, n...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [14]:
# NOTE(review): in the Section H preview, `Fuel_usage` holds the same full list of fuel
# codes for every household and `Clean_fuel` is 0 in every previewed row. Presumably the
# flag should only look at fuels the household actually uses (column H3?) — confirm.
clean_fuel_flag = energy_modifiers.all_fuels_clean('Fuel_usage', Clean_Fuels)
Section_H = Section_H.new_feature('Clean_fuel', clean_fuel_flag)

# Display only: the selection is not assigned, so Section_H keeps all of its columns.
clean_fuel_columns = [ID_COL, 'Clean_fuel']
Section_H.select(clean_fuel_columns).preview()

Unnamed: 0,ID,Clean_fuel
0,1010200116005,0
1,1010200116014,0
2,1010200116023,0
3,1010200116032,0
4,1010200116041,0


## Expenditures

In [60]:
# Section L (expenditure): load, rename the household id, group by household id.
section_l_column_names = {"HHID": ID_COL}

Section_L = ODEDataset("Ethoipia/Section L")
Section_L.from_csv(root / "Section L_expenditure.csv")
Section_L = Section_L.apply(common_modifiers.rename(section_l_column_names))
Section_L = Section_L.group_by(ID_COL)

Section_L.preview(300)

Unnamed: 0,ID,Expenditure_item,L2
0,1010200116005,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 200, 20, 100, -888, -888, -888, -888]"
1,1010200116014,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 100, 10, 25, -888, 150, 15, -888]"
2,1010200116023,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 150, 30, 50, -888, -888, 15, -888]"
3,1010200116032,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 100, 15, 50, -888, 150, 20, -888]"
4,1010200116041,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 150, 25, 50, -888, -888, 20, -888]"
...,...,...,...
295,2070501103079,"[1, 2, 3, 4, 5, 6, 7, 8]","[0, 0, 20, -888, -888, -888, 0, -888]"
296,2070501103089,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, 60, 20, -888, -888, -888, -888, -888]"
297,2070501103103,"[1, 2, 3, 4, 5, 6, 7, 8]","[0, 60, 0, 0, 0, 500, 0, 0]"
298,2070501103106,"[1, 2, 3, 4, 5, 6, 7, 8]","[-888, -888, 25, 50, 0, -888, 60, 0]"


Expenditures are stored in two parallel list columns: `Expenditure_item` (the item code) and `L2` (the amount).
Since each household lists 8 items, we look up the amount of every item and sum them into a single `Monthly_expenditure` value per household.

In [61]:
def extract_expense(idx):
    """Build a row function returning the amount of the expenditure item listed at `idx`.

    The returned function reads two parallel lists from the row: 'Expenditure_item'
    (1-based item codes) and 'L2' (amounts). The item code found at position `idx`
    is converted to a 0-based index into 'L2'.

    Returns (from the inner function):
        - NaN if the item code is missing or points outside 'L2', or the amount is missing;
        - 0 if the amount is negative (the data uses negative values such as -888);
        - the amount otherwise.
    """

    def inner(row: pd.Series):
        item = row['Expenditure_item'][idx]
        amounts = row['L2']
        # `x == np.nan` is always False, so missing values must be detected with pd.isna;
        # without this guard a NaN item code would reach int() and raise ValueError.
        if pd.isna(item):
            return np.nan

        position = int(item) - 1  # item codes are 1-based
        if position < 0 or position >= len(amounts):
            return np.nan

        value = amounts[position]
        if pd.isna(value):
            return np.nan

        # Negative amounts are treated as "nothing spent".
        return 0 if value < 0 else value

    return inner


def calculate_monthly_expenses_ethiopia(row):
    """Return the household's total expenditure in dollars.

    Looks up the amount of every entry of 'Expenditure_item' with `extract_expense`,
    drops the NaN results, sums the rest, multiplies by 30 and converts the Birr
    total with BIRR_TO_DOLLAR.

    NOTE(review): the factor 30 assumes the recorded amounts are daily — confirm
    the reference period of column L2 in the questionnaire.
    """
    item_count = len(row['Expenditure_item'])
    looked_up = (extract_expense(position)(row) for position in range(item_count))
    total = sum(amount for amount in looked_up if not helpers.is_nan(amount))
    birr = total * 30  # 30 days
    return birr * BIRR_TO_DOLLAR

In [62]:

# Add the dollar total per household, then keep only the id and that total.
monthly_expenditure_columns = [ID_COL, 'Monthly_expenditure']
Section_L = (
    Section_L
    .new_feature("Monthly_expenditure", calculate_monthly_expenses_ethiopia)
    .select(monthly_expenditure_columns)
)

Section_L.preview()

Unnamed: 0,ID,Monthly_expenditure
0,1010200116005,422.4
1,1010200116014,396.0
2,1010200116023,323.4
3,1010200116032,442.2
4,1010200116041,323.4


## Climate and Geospatial Data

In [122]:
# Identification file: load, rename the household id, group by household id.
identification_column_names = {"HHID": ID_COL}

Identification = ODEDataset("Ethoipia/Identification")
Identification.from_csv(root / "Identification_gadm.csv")
Identification = Identification.apply(common_modifiers.rename(identification_column_names))
Identification = Identification.group_by(ID_COL)

Identification.preview()

Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7new,HI_14,eaid,A0,zone_csa,wereda_name,Region,Zone,Woreda
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo]
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo]
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo]
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo]
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[11/02/17],[1010220100116.0],[5],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo]


In [123]:
# Read the three GADM lookup tables (levels 1-3) from the data directory.
# Only the level 1 and level 2 tables are used by the cells visible in this notebook.
gadm_level_1_df, gadm_level_2_df, gadm_level_3_df = [
    pd.read_csv(root / gadm_file_name)
    for gadm_file_name in ('GADM_level_1.csv', 'GADM_level_2.csv', 'GADM_level_3.csv')
]



In [124]:
# Every household gets the same GADM level-0 value.
country_driver = common_modifiers.add_const_driver("GADM_level_0", "Ethiopia")
Identification = Identification.apply(country_driver)

In [125]:

def _first_entry_of(column):
    """Return a row function that yields the first element of the list stored in `column`."""
    return lambda household: household[column][0]


# GADM level feature <- first element of the corresponding grouped column.
for gadm_feature, gadm_source in (("GADM_level_1", 'Region'),
                                  ("GADM_level_2", 'Zone'),
                                  ("GADM_level_3", 'Woreda')):
    Identification = Identification.new_feature(gadm_feature, _first_entry_of(gadm_source))



Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7new,HI_14,eaid,A0,zone_csa,wereda_name,Region,Zone,Woreda,GADM_level_0,GADM_level_1,GADM_level_2,GADM_level_3
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[11/02/17],[1010220100116.0],[5],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo


In [126]:
# Show the first rows, now including the GADM_level_0..3 columns.
Identification.preview()

Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7new,HI_14,eaid,A0,zone_csa,wereda_name,Region,Zone,Woreda,GADM_level_0,GADM_level_1,GADM_level_2,GADM_level_3
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[10/02/17],[1010220100116.0],[3],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[10/02/17],[1010220100116.0],[4],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[11/02/17],[1010220100116.0],[5],[1],[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo


In [128]:

# One climate feature per GADM level: (feature name, lookup table, GADM column).
# NOTE(review): in the preview below `Climate_zone_lev_2` is empty for every shown row
# (their level-2 value is 'Addis Abeba' while level 1 is 'Tigray') — check the Zone
# values against the level-2 lookup table.
climate_zone_specs = (
    ("Climate_zone_lev_1", gadm_level_1_df, 'GADM_level_1'),
    ("Climate_zone_lev_2", gadm_level_2_df, 'GADM_level_2'),
)
for climate_feature, gadm_table, gadm_column in climate_zone_specs:
    climate_lookup = geospatial.gis_info_by_gadm_level('Climate_majority', gadm_table, gadm_column)
    Identification = Identification.new_feature(climate_feature, climate_lookup)

Identification.preview()

Unnamed: 0,ID,HI_1,HI_2,HI_3,HI_4,HI_5,HI_6,HI_7new,HI_14,eaid,...,wereda_name,Region,Zone,Woreda,GADM_level_0,GADM_level_1,GADM_level_2,GADM_level_3,Climate_zone_lev_1,Climate_zone_lev_2
0,1010200116005,[1],[1],[2],[1],[16],[1],[5],[10/02/17],[1010220100116.0],...,[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo,5.303,
1,1010200116014,[1],[1],[2],[1],[16],[1],[14],[10/02/17],[1010220100116.0],...,[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo,5.303,
2,1010200116023,[1],[1],[2],[1],[16],[1],[23],[10/02/17],[1010220100116.0],...,[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo,5.303,
3,1010200116032,[1],[1],[2],[1],[16],[1],[32],[10/02/17],[1010220100116.0],...,[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo,5.303,
4,1010200116041,[1],[1],[2],[1],[16],[1],[41],[11/02/17],[1010220100116.0],...,[LAELAY ADIABO],[Tigray],[Addis Abeba],[Ababo],Ethiopia,Tigray,Addis Abeba,Ababo,5.303,


In [132]:
# Keep only the household id and the two climate-zone features.
identification_output_columns = [ID_COL, 'Climate_zone_lev_1', "Climate_zone_lev_2"]
Identification = Identification.select(identification_output_columns)

## Household Ownership
- Ownership_large_livestock
- Ownership_motorized_vehicle
- Ownership_small_livestock


In [93]:
# Section M (ownership): load, rename the household id, group by household id.
section_m_column_names = {"HHID": ID_COL}

Section_M = ODEDataset("Ethoipia/Section M")
Section_M.from_csv(root / "Section M_M1_M16.csv")
Section_M = Section_M.apply(common_modifiers.rename(section_m_column_names))
Section_M = Section_M.group_by(ID_COL)

Section_M.preview()

Unnamed: 0,ID,M_a,M_b,M1
0,1010200116005,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,1010200116014,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,1010200116023,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
3,1010200116032,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,1010200116041,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


In [94]:
def from_ownership_get_vehicle(row: pd.Series, mapping=None):
    """Flag whether the household owns a motorized vehicle.

    Args:
        row: household row with two parallel lists, 'M1' (item codes) and
            'M_a' (amount owned per item).
        mapping: item code -> 0/1 flag. Defaults to
            Ownership_motorized_vehicle_original2final.

    Returns:
        1 if any owned item (amount > 0) is flagged 1 in `mapping`, 0 if the
        household only owns mapped items flagged 0, NaN if it owns no mapped item.
    """
    if mapping is None:
        mapping = Ownership_motorized_vehicle_original2final

    flags = [
        mapping[item]
        for item, amount in zip(row['M1'], row['M_a'])
        # pd.isna is required here: `item == np.nan` is always False.
        if not pd.isna(item) and item in mapping and amount > 0
    ]
    # max(): ownership holds if ANY owned item is flagged, regardless of list order.
    return max(flags) if flags else np.nan

In [95]:
def from_amount_get_livestock(min_amount, max_amount, category):
    """Build a row function flagging livestock ownership within an amount window.

    Args:
        min_amount: smallest amount (inclusive) for an item to be considered.
        max_amount: largest amount (inclusive) for an item to be considered.
        category: item code -> 0/1 flag.

    Returns:
        A function of a household row (parallel lists 'M1' = item codes and
        'M_a' = amounts) that returns 1 if any item whose amount lies in
        [min_amount, max_amount] is flagged 1 in `category`, 0 if only items
        flagged 0 qualify, and NaN if no mapped item qualifies.
    """

    def inner(row: pd.Series):
        flags = [
            category[item]
            for item, amount in zip(row['M1'], row['M_a'])
            # pd.isna is required here: `item == np.nan` is always False.
            # A NaN amount fails the range comparison and is skipped.
            if not pd.isna(item) and item in category and min_amount <= amount <= max_amount
        ]
        # max(): the flag must not depend on the order of the item list.
        return max(flags) if flags else np.nan

    return inner

In [96]:
# NOTE(review): the large and small livestock tables passed below are identical, so the
# two features differ only by the amount window (>= 10 versus 1-9) — confirm the intent.
large_livestock_flag = from_amount_get_livestock(10, np.inf, Ownership_large_livestock_original2final)
small_livestock_flag = from_amount_get_livestock(1, 9, Ownership_small_livestock_original2final)

section_m_output_columns = [
    ID_COL,
    'Ownership_large_livestock',
    'Ownership_motorized_vehicle',
    'Ownership_small_livestock',
]
Section_M = (
    Section_M
    .new_feature('Ownership_motorized_vehicle', from_ownership_get_vehicle)
    .new_feature('Ownership_large_livestock', large_livestock_flag)
    .new_feature('Ownership_small_livestock', small_livestock_flag)
    .select(section_m_output_columns)
)
Section_M.preview(100)

Unnamed: 0,ID,Ownership_large_livestock,Ownership_motorized_vehicle,Ownership_small_livestock
0,1010200116005,,,
1,1010200116014,,,
2,1010200116023,,,
3,1010200116032,,,
4,1010200116041,,,
...,...,...,...,...
95,1021200418123,,,
96,1030300103004,,0.0,
97,1030300103017,,,
98,1030300103030,,,
