In [1]:
import os
import sys
# The project-local `core` and `utils` packages imported in the next cell are resolved
# relative to the working directory (assumes the notebook is launched from the repo root — TODO confirm).
sys.path.append("./")  # Adds the current working directory to the module search path.
# Presumably lowers the verbosity of TensorFlow's native (C++) logging for any downstream
# import that loads it; TensorFlow itself is not imported in the visible cells — verify it is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'


In [2]:
from core.ODEDataset import ODEDataset
import pandas as pd
# Opt in to pandas' future behaviour where fillna/replace-style operations stop silently
# downcasting object-dtype results; the explicit astype calls in the cleaning cells set dtypes instead.
pd.set_option('future.no_silent_downcasting', True)
from utils import cleaning, constants, helpers


In [3]:
# Input CSV to clean. The path is relative, so the notebook must be run from the
# directory that contains `playground/`.
filepath = "./playground/data/esmap_lsms-tanzania.csv"


In [4]:
# Load the raw CSV into an ODEDataset named "combined", then keep only the ID
# column plus the driver and appliance-presence columns listed in `constants`.
dataset = ODEDataset("combined")
dataset.from_csv(filepath)

columns_to_keep = ["ID"] + constants.DRIVERS_LIST + constants.PRESENCE_LIST
dataset = dataset.select(columns_to_keep)


# Drivers

## Years_of_HHH_in_community

In [5]:
# Missing entries become the -1 sentinel first, then the column is cast to int
# (presumably so the integer cast never sees a NaN).
dataset = (
    dataset
    .apply(cleaning.fillna("Years_of_HHH_in_community", -1))
    .apply(cleaning.astype("Years_of_HHH_in_community", int))
)
# Frequency table as a sanity check; -1 rows are the formerly-missing ones.
dataset.value_counts("Years_of_HHH_in_community")

Years_of_HHH_in_community
-1     3580
 1      515
 20     392
 10     386
 2      382
       ... 
 86       1
 81       1
 90       1
 98       1
 89       1
Name: count, Length: 92, dtype: int64

## Dwelling_quality_index


In [6]:
# Sentinel-fill (-1) followed by an integer cast, applied as one pipeline.
dataset = (
    dataset
    .apply(cleaning.fillna("Dwelling_quality_index", -1))
    .apply(cleaning.astype("Dwelling_quality_index", int))
)
# Show the distribution of index values, including the -1 sentinel.
dataset.value_counts("Dwelling_quality_index")

Dwelling_quality_index
 0    9020
 1    1332
-1     204
 5      56
 4      51
 6      23
 7      20
 2       6
 3       6
Name: count, dtype: int64

## Hours_available_electricity

In [7]:
# 1) fill gaps with the -1 sentinel, 2) cast to int, 3) add a companion
# "<column>_missing" feature derived from the -1 sentinel.
dataset = (
    dataset
    .apply(cleaning.fillna("Hours_available_electricity", -1))
    .apply(cleaning.astype("Hours_available_electricity", int))
    .new_feature(
        "Hours_available_electricity_missing",
        cleaning.add_missing_flag("Hours_available_electricity", -1),
    )
)

# Peek at the value next to its flag to confirm they line up.
hours_preview_columns = ["Hours_available_electricity", "Hours_available_electricity_missing"]
dataset.to_dataframe()[hours_preview_columns].head()


Unnamed: 0,Hours_available_electricity,Hours_available_electricity_missing
0,-1,1
1,-1,1
2,-1,1
3,-1,1
4,-1,1


## Measurement_age

In [8]:
# Normalise both kinds of "missing" (NaN and the -99 code) to the -1 sentinel,
# cast to float, then derive a missing-value flag from the sentinel.
# NOTE(review): the frequency table contains a single 888.0, far above every other
# value — possibly another "unknown" code that should also map to -1; confirm with the data dictionary.
dataset = (
    dataset
    .apply(cleaning.fillna("Measurement_age", -1))
    .apply(cleaning.replace_value("Measurement_age", -99, -1))
    .apply(cleaning.astype("Measurement_age", float))
    .new_feature(
        "Measurement_age_missing",
        cleaning.add_missing_flag("Measurement_age", -1),
    )
)
dataset.value_counts("Measurement_age")



Measurement_age
-1.0      6356
 1.0       864
 3.0       573
 4.0       496
 2.0       474
          ... 
 59.0        1
 54.0        1
 49.0        1
 46.0        1
 888.0       1
Name: count, Length: 62, dtype: int64

## Monthly_expenditure

In [9]:
# Fill NaN with -1, cast to float, then fold the -99 code into the same -1 sentinel.
dataset = (
    dataset
    .apply(cleaning.fillna("Monthly_expenditure", -1))
    .apply(cleaning.astype("Monthly_expenditure", float))
    .apply(cleaning.replace_value("Monthly_expenditure", -99, -1))
)

# Distribution check; -1 collects every missing/unknown record.
dataset.value_counts("Monthly_expenditure")

Monthly_expenditure
-1.000000       628
 0.000000       389
 66.000000       96
 132.000000      80
 26.400000       70
               ... 
 2384.666667      1
 2648.833333      1
 2619.166667      1
 1149.166667      1
 8333.333333      1
Name: count, Length: 5236, dtype: int64

## Number_of_rooms

In [10]:
# Room counts: missing -> -1, then store as integers.
dataset = (
    dataset
    .apply(cleaning.fillna("Number_of_rooms", -1))
    .apply(cleaning.astype("Number_of_rooms", int))
)
dataset.value_counts("Number_of_rooms")


Number_of_rooms
 2     3364
 3     2610
 1     2455
 4     1092
 5      455
-1      281
 6      214
 7       71
 12      57
 8       57
 9       18
 0       17
 10      11
 13       6
 11       4
 15       2
 14       2
 16       1
 45       1
Name: count, dtype: int64

## Climate_zone_lev_1

In [11]:
# Kept as float (the column holds non-integer values, see the table below);
# missing entries are replaced by the -1 sentinel beforehand.
dataset = (
    dataset
    .apply(cleaning.fillna("Climate_zone_lev_1", -1))
    .apply(cleaning.astype("Climate_zone_lev_1", float))
)
dataset.value_counts("Climate_zone_lev_1")



Climate_zone_lev_1
 11.000000    2976
 3.000000     1987
 8.731500     1089
 6.850500      982
 10.163250     707
 14.643750     624
 6.000000      356
 9.000000      223
 5.303000      204
 15.000000     195
 15.779750     192
 14.463750     180
 14.370750     132
 2.000000      127
 4.000000      124
 12.000000     120
 1.000000      110
 11.739750     104
 11.753500      96
 12.134000      60
 10.854250      48
 18.890500      27
 16.195750      23
-1.000000       14
 20.065250       5
 13.762500       3
 18.240000       3
 19.306000       2
 16.407250       2
 20.250750       1
 19.993023       1
 17.910250       1
Name: count, dtype: int64

## Climate_zone_lev_2

In [12]:
# Sentinel-fill, float cast, and a companion missing-value flag in one pipeline.
dataset = (
    dataset
    .apply(cleaning.fillna("Climate_zone_lev_2", -1))
    .apply(cleaning.astype("Climate_zone_lev_2", float))
    .new_feature(
        "Climate_zone_lev_2_missing",
        cleaning.add_missing_flag("Climate_zone_lev_2", -1),
    )
)

# Show the column beside its flag for the first rows.
climate_preview_columns = ["Climate_zone_lev_2", "Climate_zone_lev_2_missing"]
dataset.to_dataframe()[climate_preview_columns].head()



Unnamed: 0,Climate_zone_lev_2,Climate_zone_lev_2_missing
0,-1.0,1
1,-1.0,1
2,3.0,0
3,3.0,0
4,3.0,0


## Age_HHH

In [13]:
# Missing entries -> -1 sentinel, then integer dtype.
dataset = (
    dataset
    .apply(cleaning.fillna("Age_HHH", -1))
    .apply(cleaning.astype("Age_HHH", int))
)
dataset.value_counts("Age_HHH")

Age_HHH
-1     1450
 35     557
 40     479
 45     446
 30     391
       ... 
 91       2
 94       1
 15       1
 96       1
 98       1
Name: count, Length: 84, dtype: int64

## HH_with_home_business

In [14]:
# Inspection only: unlike the other driver columns, no fillna/encoding step is applied here,
# so the column keeps its raw "Yes"/"No" labels.
# NOTE(review): the counts below sum to fewer rows than the other columns report, so
# missing values appear to remain in this column — confirm that is intended.
dataset.value_counts("HH_with_home_business")

HH_with_home_business
No     4750
Yes    1494
Name: count, dtype: int64

## Education_level_HHH

In [15]:
# Encode the education labels with the project's `categorize` cleaner
# (the table below shows the result as integer codes).
encode_education = cleaning.categorize("Education_level_HHH")
dataset = dataset.apply(encode_education)
dataset.value_counts("Education_level_HHH")

Education_level_HHH
1     3502
2     1273
3      963
4      469
5      404
6      245
7      117
8       57
9       56
10      10
11       3
Name: count, dtype: int64

## Socio_status_HHH

In [16]:
# Same treatment as the education column: run the `categorize` cleaner and
# inspect the resulting code frequencies.
encode_socio_status = cleaning.categorize("Socio_status_HHH")
dataset = dataset.apply(encode_socio_status)
dataset.value_counts("Socio_status_HHH")

Socio_status_HHH
1     1994
2     1932
3     1679
4     1097
5     1062
6      501
7      407
8      405
9      187
10      40
Name: count, dtype: int64

## Number_adults


In [17]:
# Adult head-count: fill gaps with -1 and store as integers.
dataset = (
    dataset
    .apply(cleaning.fillna("Number_adults", -1))
    .apply(cleaning.astype("Number_adults", int))
)
dataset.value_counts("Number_adults")

Number_adults
2     3136
1     1694
3     1606
0     1351
4      953
      ... 
19       2
59       2
17       1
57       1
15       1
Name: count, Length: 61, dtype: int64

## Ownership_motorized_vehicle


In [18]:
# The raw column mixes "Yes"/"No" with stringified floats; map each spelling to
# 1.0 / 0.0 in the same order as before, cast to float, then mark what is still
# missing with the -1 sentinel.
for raw_label, numeric_code in (("Yes", 1.0), ("No", 0.0), ("1.0", 1.0), ("0.0", 0.0)):
    dataset = dataset.apply(
        cleaning.replace_value("Ownership_motorized_vehicle", raw_label, numeric_code)
    )
dataset = (
    dataset
    .apply(cleaning.astype("Ownership_motorized_vehicle", float))
    .apply(cleaning.fillna("Ownership_motorized_vehicle", -1))
)
dataset.value_counts("Ownership_motorized_vehicle")

Ownership_motorized_vehicle
 1.0    4982
-1.0    4216
 0.0    1520
Name: count, dtype: int64

## Ownership_small_livestock


In [19]:
# Label -> numeric code, applied in insertion order (Yes, No, "1.0", "0.0").
small_livestock_codes = {"Yes": 1.0, "No": 0.0, "1.0": 1.0, "0.0": 0.0}
for raw_label, numeric_code in small_livestock_codes.items():
    dataset = dataset.apply(
        cleaning.replace_value("Ownership_small_livestock", raw_label, numeric_code)
    )

# Cast once every label is numeric, then fill the remaining gaps with -1.
dataset = dataset.apply(cleaning.astype("Ownership_small_livestock", float))
dataset = dataset.apply(cleaning.fillna("Ownership_small_livestock", -1))
dataset.value_counts("Ownership_small_livestock")

Ownership_small_livestock
 1.0    5102
 0.0    2876
-1.0    2740
Name: count, dtype: int64

## Ownership_large_livestock


In [20]:
# One pipeline: unify the "Yes"/"No"/"1.0"/"0.0" spellings to 1.0 / 0.0,
# cast to float, and replace whatever is still missing with -1.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_large_livestock", "Yes", 1.0))
    .apply(cleaning.replace_value("Ownership_large_livestock", "No", 0.0))
    .apply(cleaning.replace_value("Ownership_large_livestock", "1.0", 1.0))
    .apply(cleaning.replace_value("Ownership_large_livestock", "0.0", 0.0))
    .apply(cleaning.astype("Ownership_large_livestock", float))
    .apply(cleaning.fillna("Ownership_large_livestock", -1))
)
dataset.value_counts("Ownership_large_livestock")

Ownership_large_livestock
 1.0    5070
-1.0    4231
 0.0    1417
Name: count, dtype: int64

## Clean_fuel


In [21]:
# Clean_fuel arrives as "0"/"1" strings as well as "Yes"/"No"; translate each
# spelling to a float code (same order as before), cast, then sentinel-fill.
for raw_label, numeric_code in (("0", 0.0), ("1", 1.0), ("Yes", 1.0), ("No", 0.0)):
    dataset = dataset.apply(cleaning.replace_value("Clean_fuel", raw_label, numeric_code))
dataset = (
    dataset
    .apply(cleaning.astype("Clean_fuel", float))
    .apply(cleaning.fillna("Clean_fuel", -1))
)
dataset.value_counts("Clean_fuel")



Clean_fuel
 0.0    10002
 1.0      596
-1.0      120
Name: count, dtype: int64

## Tariff_payment_frequency

In [22]:
# Missing payment frequencies are labelled "Other" before the column goes
# through the `categorize` cleaner.
dataset = (
    dataset
    .apply(cleaning.fillna("Tariff_payment_frequency", "Other"))
    .apply(cleaning.categorize("Tariff_payment_frequency"))
)

# Appliances


In [23]:
# Appliance-presence columns cleaned in this notebook.
# NOTE(review): the column selection at the top uses `constants.PRESENCE_LIST`; this local
# list may be a subset of it — confirm every selected presence column is meant to be covered here.
PRESENCE_LIST = [
    'Presence_refrigerator/freezer', 'Presence_iron', 'Presence_fan',
    'Presence_DVD_player', 'Presence_radio/stereo',
    'Presence_phone_charger',
    'Presence_TV'
]

# Replace missing presence values with the -1 sentinel and report how many rows were affected.
for presence_column in PRESENCE_LIST:
    dataset = dataset.apply(cleaning.fillna(presence_column, -1))
    presence_counts = dataset.value_counts(presence_column)
    # `.get(-1, 0)` instead of `[-1]`: a column with no missing values has no -1 entry,
    # and direct indexing would abort the whole loop with a KeyError.
    print("Missing values for", presence_column, presence_counts.get(-1, 0))

Missing values for Presence_refrigerator/freezer 697
Missing values for Presence_iron 1976
Missing values for Presence_fan 794
Missing values for Presence_DVD_player 794
Missing values for Presence_radio/stereo 792
Missing values for Presence_phone_charger 1184
Missing values for Presence_TV 2


In [24]:
# Final look at the cleaned table, including the three `*_missing` flag columns
# created in the Hours_available_electricity, Measurement_age and Climate_zone_lev_2 cells.
dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Presence_refrigerator/freezer,Presence_iron,Presence_fan,Presence_DVD_player,Presence_radio/stereo,Presence_phone_charger,Presence_TV,Hours_available_electricity_missing,Measurement_age_missing,Climate_zone_lev_2_missing
0,1001_3,-1,0,-1,-1.0,63600.0,3,3.0,-1.0,-1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1
1,1001_3,-1,0,-1,-1.0,63600.0,3,3.0,-1.0,-1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1
2,1026_3,20,0,-1,-1.0,65400.0,3,3.0,3.0,-1,...,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1,1,0
3,1026_3,20,0,-1,-1.0,65400.0,3,3.0,3.0,-1,...,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1,1,0
4,1027_2,15,0,-1,-1.0,-1.0,2,3.0,3.0,-1,...,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1,1,0


In [25]:
# Persist the cleaned dataset next to the raw input. `to_csv` returns an ODEDataset object
# (see the repr previously shown as this cell's output); the trailing semicolon keeps that
# uninformative repr out of the notebook.
dataset.to_csv("./playground/data/combined_dataset_cleaned.csv");

<core.ODEDataset.ODEDataset at 0x7eadd0666a90>