In [1]:
import os
import sys

sys.path.append("./")  # Adds the current working directory (not a parent directory) to the module search path.
# Set before any later import reads it; presumably lowers TensorFlow's C++ log
# verbosity -- TODO confirm, TensorFlow is not imported in the visible cells.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'


In [2]:
from core.ODEDataset import ODEDataset
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Opt in to the pandas "future" behaviour named by the option: no silent downcasting.
pd.set_option('future.no_silent_downcasting', True)
from utils import cleaning, constants


In [3]:
# Path of the combined CSV that is loaded into the ODEDataset below.
src_path = "./data/combined-datasets.csv"
# Sentinel for a missing entry -- presumably meant as the fill value for missing
# appliance-presence flags; verify against the "Appliances" section.
missing_value = -1


In [4]:
# Load the combined CSV and keep only the ID, the driver columns and the
# appliance-presence columns.
dataset = ODEDataset("combined")
dataset.from_csv(src_path)
dataset = dataset.select(["ID", *constants.DRIVERS_LIST, *constants.PRESENCE_LIST])


In [5]:
# import numpy as np
# from matplotlib import pyplot as plt

# df = dataset.to_dataframe()

# original = df['Years_of_HHH_in_community']

# df['Years_of_HHH_in_community'] = df['Years_of_HHH_in_community'].interpolate()
# mean = df['Years_of_HHH_in_community'].mean()
# df['Years_of_HHH_in_community'] = df['Years_of_HHH_in_community'].fillna(mean)
# inerpolated = df['Years_of_HHH_in_community']
# # Create x-values for plotting
# x = range(len(df))

# plt.figure(figsize=(20, 15))
# plt.scatter(x, original, label='Before Interpolation', marker='o', color='r')
# plt.scatter(x, inerpolated, label='After Interpolation', marker='+', color='b')

# plt.legend()
# plt.show()

# Drivers

## Years_of_HHH_in_community

In [6]:

# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize
# (step semantics are those of the `cleaning` helpers).
dataset = (
    dataset
    .apply(cleaning.astype("Years_of_HHH_in_community", np.float32))
    .apply(cleaning.interpolate("Years_of_HHH_in_community"))
    .apply(cleaning.fillna_with_mean("Years_of_HHH_in_community"))
    .apply(cleaning.standardize("Years_of_HHH_in_community"))
)
dataset.to_dataframe()[["Years_of_HHH_in_community"]].head()

Unnamed: 0,Years_of_HHH_in_community
0,-0.47616
1,-0.47616
2,-0.477477
3,-0.477477
4,-0.483108


## Dwelling_quality_index


In [7]:

# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Dwelling_quality_index", np.float32))
    .apply(cleaning.interpolate("Dwelling_quality_index"))
    .apply(cleaning.fillna_with_mean("Dwelling_quality_index"))
    .apply(cleaning.standardize("Dwelling_quality_index"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Dwelling_quality_index")

Dwelling_quality_index
-0.500000    5222
-0.364865    1332
-0.391892    1112
-0.418919     930
-0.445946     720
             ... 
-0.475430       1
-0.480344       1
-0.485258       1
-0.490172       1
 0.013514       1
Name: count, Length: 62, dtype: int64

## Hours_available_electricity

In [8]:


# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Hours_available_electricity", np.float32))
    .apply(cleaning.interpolate("Hours_available_electricity"))
    .apply(cleaning.fillna_with_mean("Hours_available_electricity"))
    .apply(cleaning.standardize("Hours_available_electricity"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Hours_available_electricity")

Hours_available_electricity
-0.464286    2753
-0.468750    1817
-0.477679     516
-0.491071     346
-0.473214     255
             ... 
-0.464379       1
-0.464410       1
-0.464441       1
-0.464472       1
 0.013393       1
Name: count, Length: 1889, dtype: int64

## Measurement_age

In [9]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Measurement_age", np.float32))
    .apply(cleaning.interpolate("Measurement_age"))
    .apply(cleaning.fillna_with_mean("Measurement_age"))
    .apply(cleaning.standardize("Measurement_age"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Measurement_age")

Measurement_age
-0.398683    1372
-0.395643     927
-0.396657     902
-0.397670     599
-0.389564     231
             ... 
-0.396518       1
-0.396564       1
-0.396610       1
-0.396698       1
-0.389058       1
Name: count, Length: 3403, dtype: int64

## Monthly_expenditure

In [10]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Monthly_expenditure", np.float32))
    .apply(cleaning.interpolate("Monthly_expenditure"))
    .apply(cleaning.fillna_with_mean("Monthly_expenditure"))
    .apply(cleaning.standardize("Monthly_expenditure"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Monthly_expenditure")

Monthly_expenditure
-0.500000    389
-0.499996     96
-0.499992     81
-0.499998     70
-0.499995     63
            ... 
-0.499942      1
-0.499822      1
-0.499887      1
-0.499881      1
-0.499493      1
Name: count, Length: 5081, dtype: int64

## Number_of_rooms

In [11]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Number_of_rooms", np.float32))
    .apply(cleaning.interpolate("Number_of_rooms"))
    .apply(cleaning.fillna_with_mean("Number_of_rooms"))
    .apply(cleaning.standardize("Number_of_rooms"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Number_of_rooms")


Number_of_rooms
-0.455556    3377
-0.433333    2650
-0.477778    2456
-0.411111    1101
-0.388889     455
             ... 
-0.451587       1
-0.452381       1
-0.453175       1
-0.453968       1
-0.388109       1
Name: count, Length: 175, dtype: int64

## Climate_zone_lev_1

In [12]:

# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Climate_zone_lev_1", np.float32))
    .apply(cleaning.interpolate("Climate_zone_lev_1"))
    .apply(cleaning.fillna_with_mean("Climate_zone_lev_1"))
    .apply(cleaning.standardize("Climate_zone_lev_1"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Climate_zone_lev_1")



Climate_zone_lev_1
 0.019460    2976
-0.396108    1987
-0.098379    1089
-0.196090     982
-0.024006     707
 0.208739     624
-0.240270     356
-0.084432     223
-0.276476     204
 0.227244     195
 0.267749     192
 0.199388     180
 0.194557     132
-0.448054     129
-0.344162     124
 0.071406     120
-0.500000     110
 0.057887     104
 0.058602      96
 0.078367      60
 0.011889      48
 0.429340      27
 0.289359      23
 0.490364       5
 0.395550       3
 0.162961       3
 0.450924       2
 0.300345       2
-0.474027       2
 0.500000       1
-0.478356       1
-0.456712       1
-0.461040       1
-0.465369       1
-0.469698       1
-0.495671       1
-0.482685       1
-0.487013       1
-0.491342       1
 0.378420       1
 0.486612       1
-0.452383       1
Name: count, dtype: int64

## Climate_zone_lev_2

In [13]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Climate_zone_lev_2", np.float32))
    .apply(cleaning.interpolate("Climate_zone_lev_2"))
    .apply(cleaning.fillna_with_mean("Climate_zone_lev_2"))
    .apply(cleaning.standardize("Climate_zone_lev_2"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Climate_zone_lev_2")



Climate_zone_lev_2
-0.397514    1095
 0.063671     319
-0.243786     211
-0.090058     191
-0.448757     182
             ... 
-0.292938       1
-0.292981       1
-0.293023       1
-0.293066       1
-0.434116       1
Name: count, Length: 7849, dtype: int64

## Age_HHH

In [14]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Age_HHH", np.float32))
    .apply(cleaning.interpolate("Age_HHH"))
    .apply(cleaning.fillna_with_mean("Age_HHH"))
    .apply(cleaning.standardize("Age_HHH"))
)
# Show the distribution of the rescaled values.
dataset.value_counts("Age_HHH")




Age_HHH
-0.157647    1274
-0.259036     557
-0.198795     480
-0.138554     447
-0.319277     391
             ... 
 0.475904       1
-0.144578       1
-0.156626       1
-0.289157       1
-0.100402       1
Name: count, Length: 193, dtype: int64

## HH_with_home_business

In [15]:
def interpolate_and_fillna_with_mean_and_standarize(dataset, column):
    """Run the numeric cleaning pipeline on `column` and on its derived columns.

    A dataset column is processed when its name is exactly `column` or starts
    with ``f"{column}_"`` -- the naming visible in the notebook previews for
    one-hot columns (e.g. ``HH_with_home_business_Yes``). Matching on the
    prefix instead of on any substring keeps unrelated columns that merely
    contain `column` somewhere in their name from being cast and rescaled.

    For every matching column, in the order returned by
    ``dataset.get_columns()``, the following steps are applied:
    ``cleaning.astype(np.float32)``, ``cleaning.interpolate``,
    ``cleaning.fillna_with_mean`` and ``cleaning.standardize``.

    Args:
        dataset: Object exposing ``get_columns()`` and ``apply(step)``, where
            ``apply`` returns the dataset to continue with.
        column: Base column name whose derived columns should be cleaned.

    Returns:
        The dataset returned by the last ``apply`` call, or `dataset` itself
        when no column matches.
    """
    # NOTE: the misspelling "standarize" in the function name is kept on
    # purpose so existing callers keep working.
    derived_prefix = f"{column}_"
    targets = [
        name
        for name in dataset.get_columns()
        if name == column or name.startswith(derived_prefix)
    ]

    for name in targets:
        dataset = dataset.apply(cleaning.astype(name, np.float32))
        dataset = dataset.apply(cleaning.interpolate(name))
        dataset = dataset.apply(cleaning.fillna_with_mean(name))
        dataset = dataset.apply(cleaning.standardize(name))

    return dataset
    

In [16]:
# Stringify the flag (missing values end up in their own "nan" category),
# then expand it into one-hot columns.
dataset = (
    dataset
    .apply(cleaning.astype("HH_with_home_business", str))
    .apply(cleaning.add_one_hot_encoding("HH_with_home_business"))
)

# Rescale every derived column, then discard the indicator for missing values.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "HH_with_home_business")
dataset = dataset.drop_columns(["HH_with_home_business_nan"])

dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Tariff_payment_frequency,Presence_refrigerator/freezer,Presence_iron,Presence_fan,Presence_DVD_player,Presence_radio/stereo,Presence_phone_charger,Presence_TV,HH_with_home_business_No,HH_with_home_business_Yes
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.5,0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.5,0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,,,,,,,0.0,0.0,0.5,-0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,,,,,,,0.0,0.0,0.5,-0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,,,,,,,0.0,0.0,-0.5,0.5


## Education_level_HHH

In [17]:
# Missing education levels are labelled "Other" before the categories are merged.
dataset = dataset.apply(cleaning.fillna("Education_level_HHH", "Other"))

# Coarse education group -> raw labels folded into it.
education_groups = {
    "Not classified": ("Other", "Not elsewhere classified"),
    "No schooling": ("No schooling",),
    "Primary": ("Primary education",),
    "Secondary": (
        "Upper secondary education",
        "Secondary education",
        "Lower secondary education",
    ),
    "Tertiary": (
        "Post-secondary non-tertiary education",
        "Bachelor's or equivalent level",
        "Short-cycle tertiary education",
        "Doctoral or equivalent level",
        "Master's or equivalent level",
    ),
}
# Invert the table into the raw-label -> group lookup passed to merge_categories.
merged_categories = {
    raw_label: group
    for group, raw_labels in education_groups.items()
    for raw_label in raw_labels
}

dataset = dataset.new_feature(
    "Education_level_HHH",
    cleaning.merge_categories("Education_level_HHH", merged_categories),
)
dataset = dataset.apply(cleaning.add_one_hot_encoding("Education_level_HHH"))

# Rescale every one-hot column derived from the merged feature.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Education_level_HHH")

## Socio_status_HHH

In [18]:
# Missing statuses get the long "Other (...)" label, which is then shortened to
# "Other" together with any rows that already carried the long label.
dataset = (
    dataset
    .apply(cleaning.fillna("Socio_status_HHH", "Other (not specified in Socio_status)"))
    .apply(cleaning.replace_value("Socio_status_HHH", "Other (not specified in Socio_status)", "Other"))
)

# Raw status -> coarse group (Worker / Unemployed / Other / Employer).
merged_categories = {
    "Own-account worker farm": "Worker",
    "Employee": "Worker",
    "Unemployed": "Unemployed",
    "Other": "Other",
    "Employee non-farm": "Worker",
    "Own-account worker non-farm": "Worker",
    "Contributing family worker": "Worker",
    "Worker not classifiable by status": "Worker",
    "Employee farm": "Worker",
    "Employer": "Employer",
}

dataset = dataset.new_feature(
    "Socio_status_HHH",
    cleaning.merge_categories("Socio_status_HHH", merged_categories),
)
dataset = dataset.apply(cleaning.add_one_hot_encoding("Socio_status_HHH"))

# Rescale every one-hot column derived from the merged feature.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Socio_status_HHH")


 ## Number_adults


In [19]:
# Numeric driver pipeline: float32 cast -> interpolate -> mean fill -> standardize.
dataset = (
    dataset
    .apply(cleaning.astype("Number_adults", np.float32))
    .apply(cleaning.interpolate("Number_adults"))
    .apply(cleaning.fillna_with_mean("Number_adults"))
    .apply(cleaning.standardize("Number_adults"))
)




## Ownership_motorized_vehicle


In [20]:
# Map the "1.0"/"0.0" codes to Yes/No, stringify, then one-hot encode.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_motorized_vehicle", "1.0", "Yes"))
    .apply(cleaning.replace_value("Ownership_motorized_vehicle", "0.0", "No"))
    .apply(cleaning.astype("Ownership_motorized_vehicle", str))
    .apply(cleaning.add_one_hot_encoding("Ownership_motorized_vehicle"))
)

# Rescale every derived column, then discard the indicator for missing values.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Ownership_motorized_vehicle")
dataset = dataset.drop_columns(["Ownership_motorized_vehicle_nan"])

dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Education_level_HHH_Not classified,Education_level_HHH_Primary,Education_level_HHH_Secondary,Education_level_HHH_Tertiary,Socio_status_HHH_Employer,Socio_status_HHH_Other,Socio_status_HHH_Unemployed,Socio_status_HHH_Worker,Ownership_motorized_vehicle_No,Ownership_motorized_vehicle_Yes
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,0.5,-0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,0.5,-0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,-0.5,0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,-0.5,0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5


## Ownership_small_livestock


In [21]:
# Map the "1.0"/"0.0" codes to Yes/No, stringify, then one-hot encode.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_small_livestock", "1.0", "Yes"))
    .apply(cleaning.replace_value("Ownership_small_livestock", "0.0", "No"))
    .apply(cleaning.astype("Ownership_small_livestock", str))
    .apply(cleaning.add_one_hot_encoding("Ownership_small_livestock"))
)

# Rescale every derived column, then discard the indicator for missing values.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Ownership_small_livestock")
dataset = dataset.drop_columns(["Ownership_small_livestock_nan"])

dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Education_level_HHH_Secondary,Education_level_HHH_Tertiary,Socio_status_HHH_Employer,Socio_status_HHH_Other,Socio_status_HHH_Unemployed,Socio_status_HHH_Worker,Ownership_motorized_vehicle_No,Ownership_motorized_vehicle_Yes,Ownership_small_livestock_No,Ownership_small_livestock_Yes
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,0.5,-0.5,-0.5,0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,0.5,-0.5,-0.5,0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5,0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5,0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5


## Ownership_large_livestock


In [22]:
# Map the "1.0"/"0.0" codes to Yes/No, stringify, then one-hot encode.
dataset = (
    dataset
    .apply(cleaning.replace_value("Ownership_large_livestock", "1.0", "Yes"))
    .apply(cleaning.replace_value("Ownership_large_livestock", "0.0", "No"))
    .apply(cleaning.astype("Ownership_large_livestock", str))
    .apply(cleaning.add_one_hot_encoding("Ownership_large_livestock"))
)

# Rescale every derived column, then discard the indicator for missing values.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Ownership_large_livestock")
dataset = dataset.drop_columns(["Ownership_large_livestock_nan"])

dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Socio_status_HHH_Employer,Socio_status_HHH_Other,Socio_status_HHH_Unemployed,Socio_status_HHH_Worker,Ownership_motorized_vehicle_No,Ownership_motorized_vehicle_Yes,Ownership_small_livestock_No,Ownership_small_livestock_Yes,Ownership_large_livestock_No,Ownership_large_livestock_Yes
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,-0.5,0.5,0.5,-0.5,-0.5,0.5,0.5,-0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,-0.5,0.5,0.5,-0.5,-0.5,0.5,0.5,-0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5,-0.5,0.5


## Clean_fuel


In [23]:
# Map the "0"/"1" codes to No/Yes, stringify, then one-hot encode.
dataset = (
    dataset
    .apply(cleaning.replace_value("Clean_fuel", "0", "No"))
    .apply(cleaning.replace_value("Clean_fuel", "1", "Yes"))
    .apply(cleaning.astype("Clean_fuel", str))
    .apply(cleaning.add_one_hot_encoding("Clean_fuel"))
)

# Rescale every derived column, then discard the indicator for missing values.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Clean_fuel")
dataset = dataset.drop_columns(["Clean_fuel_nan"])


## Tariff_payment_frequency

In [24]:
# Missing payment frequencies are labelled "not_classified" before merging.
dataset = dataset.apply(cleaning.fillna("Tariff_payment_frequency", "not_classified"))

# Raw frequency -> coarse group ("Weekly or More", "Monthly or More", "other").
merged_categories = {
    "not_classified": "other",
    "Other": "other",
    "Monthly": "Monthly or More",
    "Every 2 weeks": "Weekly or More",
    "Weekly": "Weekly or More",
    "No bill": "other",
    "Every 6 months": "Monthly or More",
    "Every 3 months": "Monthly or More",
}

dataset = dataset.new_feature(
    "Tariff_payment_frequency",
    cleaning.merge_categories("Tariff_payment_frequency", merged_categories),
)
dataset = dataset.apply(cleaning.add_one_hot_encoding("Tariff_payment_frequency"))

# Rescale every one-hot column derived from the merged feature.
dataset = interpolate_and_fillna_with_mean_and_standarize(dataset, "Tariff_payment_frequency")

In [25]:
# Inspect the dataset once all driver columns have been processed.
dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Ownership_motorized_vehicle_Yes,Ownership_small_livestock_No,Ownership_small_livestock_Yes,Ownership_large_livestock_No,Ownership_large_livestock_Yes,Clean_fuel_No,Clean_fuel_Yes,Tariff_payment_frequency_Monthly or More,Tariff_payment_frequency_Weekly or More,Tariff_payment_frequency_other
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5


# Appliances


In [26]:
# Appliance-presence flag columns handled here and summarised per split later on.
# NOTE(review): this shadows nothing but duplicates constants.PRESENCE_LIST, which
# was used when selecting columns -- confirm the two lists agree.
PRESENCE_LIST = [
    'Presence_refrigerator/freezer', 'Presence_iron', 'Presence_fan',
    'Presence_DVD_player', 'Presence_radio/stereo',
    'Presence_phone_charger',
    'Presence_TV'
]

for p in PRESENCE_LIST:
    # Mark missing flags with the configured sentinel (`missing_value`, set in
    # the configuration cell) instead of a hard-coded literal.
    dataset = dataset.apply(cleaning.fillna(p, missing_value))
    value_counts = dataset.value_counts(p)
    # `.get` reports 0 instead of raising KeyError when a column has no missing
    # entries (value_counts looks like a pandas Series in the printed outputs).
    print("Missing values for", p, value_counts.get(missing_value, 0))

Missing values for Presence_refrigerator/freezer 697
Missing values for Presence_iron 1976
Missing values for Presence_fan 794
Missing values for Presence_DVD_player 794
Missing values for Presence_radio/stereo 792
Missing values for Presence_phone_charger 1184
Missing values for Presence_TV 2


In [27]:
# Inspect the dataset after the appliance-presence columns were filled.
dataset.preview()

Unnamed: 0,ID,Years_of_HHH_in_community,Dwelling_quality_index,Hours_available_electricity,Measurement_age,Monthly_expenditure,Number_of_rooms,Climate_zone_lev_1,Climate_zone_lev_2,Age_HHH,...,Ownership_motorized_vehicle_Yes,Ownership_small_livestock_No,Ownership_small_livestock_Yes,Ownership_large_livestock_No,Ownership_large_livestock_Yes,Clean_fuel_No,Clean_fuel_Yes,Tariff_payment_frequency_Monthly or More,Tariff_payment_frequency_Weekly or More,Tariff_payment_frequency_other
0,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,0.5
1,1001_3,-0.47616,-0.5,-0.464103,-0.394716,-0.496128,-0.433333,-0.396108,-0.155227,-0.157647,...,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,-0.5,-0.5,0.5
2,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5
3,1026_3,-0.477477,-0.5,-0.464103,-0.394716,-0.496019,-0.433333,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5
4,1027_2,-0.483108,-0.5,-0.464103,-0.394716,-0.497578,-0.455556,-0.396108,-0.397514,-0.157647,...,0.5,-0.5,0.5,-0.5,0.5,0.5,-0.5,-0.5,-0.5,0.5


# Save the dataset

In [28]:
from sklearn.model_selection import train_test_split

# Drop the identifier, then split row-wise: 60 % train, with the remaining 40 %
# halved into test and validation (about 20 % each). The fixed seed keeps the
# split reproducible.
# NOTE(review): the previews show the same ID on several rows; because ID is
# dropped and rows are shuffled individually, rows sharing an ID may land in
# different splits -- confirm this is acceptable (possible leakage).
df = dataset.to_dataframe().drop(columns=["ID"])
train_df, temp_df = train_test_split(df, random_state=412, test_size=0.4)
test_df, val_df = train_test_split(temp_df, random_state=412, test_size=0.5)

In [29]:
# Print the per-value counts of every appliance-presence column in the training split.
print("Training")
for p in PRESENCE_LIST:
    value_counts = train_df.value_counts(p)
    print( value_counts)



Training
Presence_refrigerator/freezer
 0.0    5207
 1.0     814
-1.0     409
Name: count, dtype: int64
Presence_iron
 0.0    4051
 1.0    1221
-1.0    1158
Name: count, dtype: int64
Presence_fan
 0.0    4511
 1.0    1453
-1.0     466
Name: count, dtype: int64
Presence_DVD_player
 0.0    5266
 1.0     698
-1.0     466
Name: count, dtype: int64
Presence_radio/stereo
 0.0    4546
 1.0    1419
-1.0     465
Name: count, dtype: int64
Presence_phone_charger
 0.0    4495
 1.0    1242
-1.0     693
Name: count, dtype: int64
Presence_TV
 0.0    5296
 1.0    1133
-1.0       1
Name: count, dtype: int64


In [30]:
# Print the per-value counts of every appliance-presence column in the validation split.
print("Validation")
for p in PRESENCE_LIST:
    value_counts = val_df.value_counts(p)
    print(value_counts)



Validation
Presence_refrigerator/freezer
 0.0    1736
 1.0     286
-1.0     122
Name: count, dtype: int64
Presence_iron
 0.0    1322
 1.0     420
-1.0     402
Name: count, dtype: int64
Presence_fan
 0.0    1506
 1.0     500
-1.0     138
Name: count, dtype: int64
Presence_DVD_player
 0.0    1758
 1.0     248
-1.0     138
Name: count, dtype: int64
Presence_radio/stereo
 0.0    1535
 1.0     471
-1.0     138
Name: count, dtype: int64
Presence_phone_charger
 0.0    1442
 1.0     438
-1.0     264
Name: count, dtype: int64
Presence_TV
0.0    1735
1.0     409
Name: count, dtype: int64


In [31]:
# Print the per-value counts of every appliance-presence column in the test split.
print("Testing")
for p in PRESENCE_LIST:
    value_counts = test_df.value_counts(p)
    print(value_counts)


Testing
Presence_refrigerator/freezer
 0.0    1707
 1.0     271
-1.0     166
Name: count, dtype: int64
Presence_iron
 0.0    1319
-1.0     416
 1.0     409
Name: count, dtype: int64
Presence_fan
 0.0    1448
 1.0     506
-1.0     190
Name: count, dtype: int64
Presence_DVD_player
 0.0    1742
 1.0     212
-1.0     190
Name: count, dtype: int64
Presence_radio/stereo
 0.0    1487
 1.0     468
-1.0     189
Name: count, dtype: int64
Presence_phone_charger
 0.0    1517
 1.0     400
-1.0     227
Name: count, dtype: int64
Presence_TV
 0.0    1778
 1.0     365
-1.0       1
Name: count, dtype: int64


In [32]:
# Write each split to its own CSV, without the index column.
for split_df, out_path in (
    (train_df, "./data/normalized_train_set.csv"),
    (val_df, "./data/normalized_val_set.csv"),
    (test_df, "./data/normalized_test_set.csv"),
):
    split_df.to_csv(out_path, index=False)