In [1]:
import os
import sys

# Make the directory the notebook is run from importable.
sys.path.append("./")  # "./" is the current working directory, not a parent one; presumably where `core` and `utils` live — confirm
# NOTE(review): presumably consumed by TensorFlow's native logging, where '1' should hide INFO-level messages — confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'


In [2]:
from core.ODEDataset import ODEDataset
import pandas as pd

# Opt in to pandas' future "no silent downcasting" behaviour (see the pandas options docs).
# It is set here, before the project's cleaning helpers are imported on the next line.
pd.set_option('future.no_silent_downcasting', True)
from utils import cleaning, constants, helpers


In [3]:
# Input CSV, given relative to the directory the notebook is run from.
filepath = "./playground/data/esmap_lsms-tanzania.csv"
# Sentinel written in place of missing numeric values; the *_missing flags are built against it.
missing_value = -1


In [4]:
# Load the raw CSV, then keep only the ID plus the driver and presence columns.
dataset = ODEDataset("combined")
dataset.from_csv(filepath)  # return value is not reassigned; used for its effect on `dataset`

columns_to_keep = ["ID"] + constants.DRIVERS_LIST + constants.PRESENCE_LIST
dataset = dataset.select(columns_to_keep)


# Drivers

## Years_of_HHH_in_community

In [5]:
# Fill gaps with the sentinel, cast to int, flag the sentinel rows, then min-max scale.
# The flag is built before scaling, so it is compared against the unscaled sentinel.
dataset = (
    dataset
    .apply(cleaning.fillna("Years_of_HHH_in_community", missing_value))
    .apply(cleaning.astype("Years_of_HHH_in_community", int))
    .new_feature(
        "Years_of_HHH_in_community_missing",
        cleaning.add_missing_flag("Years_of_HHH_in_community", missing_value),
    )
    .apply(cleaning.min_max_normalize("Years_of_HHH_in_community"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Years_of_HHH_in_community", "Years_of_HHH_in_community_missing"]
].head()

Unnamed: 0,Years_of_HHH_in_community,Years_of_HHH_in_community_missing
0,0.0,1
1,0.0,1
2,0.023622,0
3,0.023622,0
4,0.017998,0


## Dwelling_quality_index


In [6]:

# Same recipe as the other numeric drivers: sentinel fill, int cast, missing flag, min-max scaling.
dataset = (
    dataset
    .apply(cleaning.fillna("Dwelling_quality_index", missing_value))
    .apply(cleaning.astype("Dwelling_quality_index", int))
    .new_feature(
        "Dwelling_quality_index_missing",
        cleaning.add_missing_flag("Dwelling_quality_index", missing_value),
    )
    .apply(cleaning.min_max_normalize("Dwelling_quality_index"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Dwelling_quality_index", "Dwelling_quality_index_missing"]
].head()

Unnamed: 0,Dwelling_quality_index,Dwelling_quality_index_missing
0,0.125,0
1,0.125,0
2,0.125,0
3,0.125,0
4,0.125,0


## Hours_available_electricity

In [7]:
# Sentinel fill, int cast, missing flag, then min-max scaling.
# NOTE(review): in the recorded output the flagged rows scale to 0.007812 rather than 0.0.
# That suggests the column holds values below the sentinel (another placeholder code?) —
# confirm against the raw data before trusting the scaled range.
dataset = (
    dataset
    .apply(cleaning.fillna("Hours_available_electricity", missing_value))
    .apply(cleaning.astype("Hours_available_electricity", int))
    .new_feature(
        "Hours_available_electricity_missing",
        cleaning.add_missing_flag("Hours_available_electricity", missing_value),
    )
    .apply(cleaning.min_max_normalize("Hours_available_electricity"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Hours_available_electricity", "Hours_available_electricity_missing"]
].head()


Unnamed: 0,Hours_available_electricity,Hours_available_electricity_missing
0,0.007812,1
1,0.007812,1
2,0.007812,1
3,0.007812,1
4,0.007812,1


## Measurement_age

In [8]:
# Gaps and the -99 code are both mapped to the sentinel before the float cast;
# the flag is then derived and the column min-max scaled.
dataset = (
    dataset
    .apply(cleaning.fillna("Measurement_age", missing_value))
    .apply(cleaning.replace_value("Measurement_age", -99, missing_value))
    .apply(cleaning.astype("Measurement_age", float))
    .new_feature(
        "Measurement_age_missing",
        cleaning.add_missing_flag("Measurement_age", missing_value),
    )
    .apply(cleaning.min_max_normalize("Measurement_age"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Measurement_age", "Measurement_age_missing"]
].head()


Unnamed: 0,Measurement_age,Measurement_age_missing
0,0.0,1
1,0.0,1
2,0.0,1
3,0.0,1
4,0.0,1


## Monthly_expenditure

In [9]:
# Sentinel fill and float cast first; the -99 code is folded into the sentinel afterwards,
# then the flag is derived and the column min-max scaled.
dataset = (
    dataset
    .apply(cleaning.fillna("Monthly_expenditure", missing_value))
    .apply(cleaning.astype("Monthly_expenditure", float))
    .apply(cleaning.replace_value("Monthly_expenditure", -99, missing_value))
    .new_feature(
        "Monthly_expenditure_missing",
        cleaning.add_missing_flag("Monthly_expenditure", missing_value),
    )
    .apply(cleaning.min_max_normalize("Monthly_expenditure"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Monthly_expenditure", "Monthly_expenditure_missing"]
].head()

Unnamed: 0,Monthly_expenditure,Monthly_expenditure_missing
0,0.003872,0
1,0.003872,0
2,0.003981,0
3,0.003981,0
4,0.0,1


## Number_of_rooms

In [10]:
# Sentinel fill, int cast, missing flag, then min-max scaling.
dataset = (
    dataset
    .apply(cleaning.fillna("Number_of_rooms", missing_value))
    .apply(cleaning.astype("Number_of_rooms", int))
    .new_feature(
        "Number_of_rooms_missing",
        cleaning.add_missing_flag("Number_of_rooms", missing_value),
    )
    .apply(cleaning.min_max_normalize("Number_of_rooms"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Number_of_rooms", "Number_of_rooms_missing"]
].head()



Unnamed: 0,Number_of_rooms,Number_of_rooms_missing
0,0.086957,0
1,0.086957,0
2,0.086957,0
3,0.086957,0
4,0.065217,0


## Climate_zone_lev_1

In [11]:
# Sentinel fill, float cast, missing flag, then min-max scaling.
dataset = (
    dataset
    .apply(cleaning.fillna("Climate_zone_lev_1", missing_value))
    .apply(cleaning.astype("Climate_zone_lev_1", float))
    .new_feature(
        "Climate_zone_lev_1_missing",
        cleaning.add_missing_flag("Climate_zone_lev_1", missing_value),
    )
    .apply(cleaning.min_max_normalize("Climate_zone_lev_1"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Climate_zone_lev_1", "Climate_zone_lev_1_missing"]
].head()



Unnamed: 0,Climate_zone_lev_1,Climate_zone_lev_1_missing
0,0.188229,0
1,0.188229,0
2,0.188229,0
3,0.188229,0
4,0.188229,0


## Climate_zone_lev_2

In [12]:
# Sentinel fill, float cast, missing flag, then min-max scaling.
dataset = (
    dataset
    .apply(cleaning.fillna("Climate_zone_lev_2", missing_value))
    .apply(cleaning.astype("Climate_zone_lev_2", float))
    .new_feature(
        "Climate_zone_lev_2_missing",
        cleaning.add_missing_flag("Climate_zone_lev_2", missing_value),
    )
    .apply(cleaning.min_max_normalize("Climate_zone_lev_2"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Climate_zone_lev_2", "Climate_zone_lev_2_missing"]
].head()



Unnamed: 0,Climate_zone_lev_2,Climate_zone_lev_2_missing
0,0.0,1
1,0.0,1
2,0.185917,0
3,0.185917,0
4,0.185917,0


## Age_HHH

In [13]:
# Sentinel fill, int cast, missing flag, then min-max scaling.
dataset = (
    dataset
    .apply(cleaning.fillna("Age_HHH", missing_value))
    .apply(cleaning.astype("Age_HHH", int))
    .new_feature(
        "Age_HHH_missing",
        cleaning.add_missing_flag("Age_HHH", missing_value),
    )
    .apply(cleaning.min_max_normalize("Age_HHH"))
)

# Show the scaled column next to its missing flag.
dataset.to_dataframe()[
    ["Age_HHH", "Age_HHH_missing"]
].head()


Unnamed: 0,Age_HHH,Age_HHH_missing
0,0.0,1
1,0.0,1
2,0.0,1
3,0.0,1
4,0.0,1


## HH_with_home_business

In [14]:
# Categorical driver: gaps get the explicit 'missing' label and the column is stored as str.
dataset = (
    dataset
    .apply(cleaning.fillna("HH_with_home_business", 'missing'))
    .apply(cleaning.astype("HH_with_home_business", str))
)

# Category frequencies after cleaning.
dataset.value_counts("HH_with_home_business")

HH_with_home_business
No         4750
missing    4474
Yes        1494
Name: count, dtype: int64

## Education_level_HHH

In [15]:

# Gaps are folded into the "Other" category.
fill_education_gaps = cleaning.fillna("Education_level_HHH", "Other")
dataset = dataset.apply(fill_education_gaps)

# Category frequencies after cleaning.
education_levels = dataset.to_dataframe()["Education_level_HHH"]
education_levels.value_counts()

Education_level_HHH
Other                                    3619
Primary education                        3502
Upper secondary education                1273
Lower secondary education                 963
Post-secondary non-tertiary education     469
Bachelor's or equivalent level            404
No schooling                              245
Short-cycle tertiary education            117
Doctoral or equivalent level               57
Not elsewhere classified                   56
Secondary education                        10
Master's or equivalent level                3
Name: count, dtype: int64

## Socio_status_HHH

In [16]:
# Gaps first receive the long "Other (not specified in Socio_status)" label,
# and that label is then shortened to "Other".
dataset = (
    dataset
    .apply(cleaning.fillna("Socio_status_HHH", "Other (not specified in Socio_status)"))
    .apply(cleaning.replace_value("Socio_status_HHH", "Other (not specified in Socio_status)", "Other"))
)

# Category frequencies after cleaning.
socio_status = dataset.to_dataframe()["Socio_status_HHH"]
socio_status.value_counts()

Socio_status_HHH
Own-account worker farm              1994
Employee                             1932
Other                                1821
Unemployed                           1679
Employee non-farm                    1097
Own-account worker non-farm          1062
Contributing family worker            501
Worker not classifiable by status     405
Employee farm                         187
Employer                               40
Name: count, dtype: int64

## Number_adults


In [17]:
# Sentinel fill, int cast and missing flag, using the shared `missing_value` sentinel
# instead of a hard-coded -1 so this cell cannot drift from the other numeric drivers.
# NOTE(review): unlike the other numeric drivers this column is not min-max scaled — confirm
# that is intentional.
# NOTE(review): the recorded head shows values such as 34, 59 and 57, which look more like ages
# than household adult counts — verify the source column mapping.
dataset = (
    dataset
    .apply(cleaning.fillna("Number_adults", missing_value))
    .apply(cleaning.astype("Number_adults", int))
    .new_feature(
        "Number_adults_missing",
        cleaning.add_missing_flag("Number_adults", missing_value),
    )
)

# Show the column next to its missing flag.
dataset.to_dataframe()[["Number_adults", "Number_adults_missing"]].head()


Unnamed: 0,Number_adults,Number_adults_missing
0,34,0
1,34,0
2,59,0
3,59,0
4,57,0


## Ownership_motorized_vehicle


In [18]:
# Map the "1.0"/"0.0" string codes to Yes/No, label gaps as 'missing', and store as str.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_motorized_vehicle", "1.0", 'Yes'))
    .apply(cleaning.replace_value("Ownership_motorized_vehicle", "0.0", 'No'))
    .apply(cleaning.fillna("Ownership_motorized_vehicle", 'missing'))
    .apply(cleaning.astype("Ownership_motorized_vehicle", str))
)

# Category frequencies after cleaning.
dataset.value_counts("Ownership_motorized_vehicle")

Ownership_motorized_vehicle
Yes        4982
missing    4216
No         1520
Name: count, dtype: int64

## Ownership_small_livestock


In [19]:
# Map the "1.0"/"0.0" string codes to Yes/No, label gaps as 'missing', and store as str.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_small_livestock", "1.0", 'Yes'))
    .apply(cleaning.replace_value("Ownership_small_livestock", "0.0", 'No'))
    .apply(cleaning.fillna("Ownership_small_livestock", 'missing'))
    .apply(cleaning.astype("Ownership_small_livestock", str))
)

# Category frequencies after cleaning.
dataset.value_counts("Ownership_small_livestock")

Ownership_small_livestock
Yes        5102
No         2876
missing    2740
Name: count, dtype: int64

## Ownership_large_livestock


In [20]:
# Map the "1.0"/"0.0" string codes to Yes/No, label gaps as 'missing', and store as str.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_large_livestock", "1.0", 'Yes'))
    .apply(cleaning.replace_value("Ownership_large_livestock", "0.0", 'No'))
    .apply(cleaning.fillna("Ownership_large_livestock", 'missing'))
    .apply(cleaning.astype("Ownership_large_livestock", str))
)

# Category frequencies after cleaning.
dataset.value_counts("Ownership_large_livestock")

Ownership_large_livestock
Yes        5070
missing    4231
No         1417
Name: count, dtype: int64

## Clean_fuel


In [21]:
# Map the "0"/"1" string codes to No/Yes, label gaps as 'missing', and store as str.
dataset = (
    dataset
    .apply(cleaning.replace_value("Clean_fuel", "0", 'No'))
    .apply(cleaning.replace_value("Clean_fuel", "1", 'Yes'))
    .apply(cleaning.fillna("Clean_fuel", 'missing'))
    .apply(cleaning.astype("Clean_fuel", str))
)

# Category frequencies after cleaning.
dataset.value_counts("Clean_fuel")

Clean_fuel
No         10002
Yes          596
missing      120
Name: count, dtype: int64

## Tariff_payment_frequency

In [22]:
# Gaps are folded into the "Other" category.
fill_tariff_gaps = cleaning.fillna("Tariff_payment_frequency", "Other")
dataset = dataset.apply(fill_tariff_gaps)

# Category frequencies after cleaning.
tariff_frequency = dataset.to_dataframe()["Tariff_payment_frequency"]
tariff_frequency.value_counts()

Tariff_payment_frequency
Other             8856
Monthly           1576
Every 2 weeks      194
Weekly              49
No bill             38
Every 6 months       4
Every 3 months       1
Name: count, dtype: int64

# Appliances


In [23]:
# Appliance-presence columns cleaned in this cell.
# NOTE(review): this local list shadows the idea of `constants.PRESENCE_LIST`, which was used
# to select columns earlier; any presence column present there but absent here is left
# unfilled — confirm the two lists are meant to differ.
PRESENCE_LIST = [
    'Presence_refrigerator/freezer', 'Presence_iron', 'Presence_fan',
    'Presence_DVD_player', 'Presence_radio/stereo',
    'Presence_phone_charger',
    'Presence_TV'
]

for p in PRESENCE_LIST:
    # Use the shared sentinel rather than a hard-coded -1.
    dataset = dataset.apply(cleaning.fillna(p, missing_value))
    value_counts = dataset.value_counts(p)
    # Assumes `dataset.value_counts` returns a pandas Series, as the recorded outputs suggest.
    # `.loc` forces a label lookup: a bare `value_counts[-1]` can be resolved positionally
    # (i.e. as "last row") on some index types. A column with no missing values has no
    # sentinel label at all, so report 0 instead of raising KeyError.
    if missing_value in value_counts.index:
        missing_count = value_counts.loc[missing_value]
    else:
        missing_count = 0
    print("Missing values for", p, missing_count)

Missing values for Presence_refrigerator/freezer 697
Missing values for Presence_iron 1976
Missing values for Presence_fan 794
Missing values for Presence_DVD_player 794
Missing values for Presence_radio/stereo 792
Missing values for Presence_phone_charger 1184
Missing values for Presence_TV 2


In [24]:
# Final look at the cleaned dataset before it is written out in the next cell.
dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Years_of_HHH_in_community_missing,Dwelling_quality_index_missing,Hours_available_electricity_missing,Measurement_age_missing,Monthly_expenditure_missing,Number_of_rooms_missing,Climate_zone_lev_1_missing,Climate_zone_lev_2_missing,Age_HHH_missing,Number_adults_missing
0,1001_3,0.0,0.125,0.007812,0.0,0.003872,0.086957,0.188229,0.0,0.0,...,1,0,1,1,0,0,0,1,1,0
1,1001_3,0.0,0.125,0.007812,0.0,0.003872,0.086957,0.188229,0.0,0.0,...,1,0,1,1,0,0,0,1,1,0
2,1026_3,0.023622,0.125,0.007812,0.0,0.003981,0.086957,0.188229,0.185917,0.0,...,0,0,1,1,0,0,0,0,1,0
3,1026_3,0.023622,0.125,0.007812,0.0,0.003981,0.086957,0.188229,0.185917,0.0,...,0,0,1,1,0,0,0,0,1,0
4,1027_2,0.017998,0.125,0.007812,0.0,0.0,0.065217,0.188229,0.185917,0.0,...,0,0,1,1,1,0,0,0,1,0


In [25]:
# Write the cleaned dataset to CSV (path relative to the directory the notebook is run from).
# The call is the cell's last expression, so the notebook echoes its return value
# (the ODEDataset repr shown in the recorded output).
dataset.to_csv("./playground/data/dataset_normalized.csv")

<core.ODEDataset.ODEDataset at 0x7206863af390>