In [2]:
import pandas as pd
import numpy as np
import json
import category_encoders as ce

In [18]:
# Load the annotated post-connection household survey and derive
# per-respondent counts from the multiple-selection columns.
cols_post = [
    'end_date', 'renewvia_id', 'interviewed_before',
    'connection_period', 'country', 'gender', 'age',
    'occupation_change', 'household_income_change',
    'avg_household_income', 'household_headcount_change',
    'girls_schooling_change', 'boys_schooling_change',
    'school_performance_change',
    'girls_unschooled_reasons', 'boys_unschooled_reasons',
    'household_business_owners',
    'business_type', 'business_recent', 'business_from_minigrid',
    'business_use_minigrid', 'power_sources',
    'power_sources_usage', 'power_sources_primary',
    'appliances_count', 'appliances_count_change', 'cellphones_count',
    'appliances_type', 'appliances_type_addition',
    'light_hours_current', 'light_primary_sources', 'kerosene_lamp_usage_change',
    'kerosene_lamps_count', 'kerosene_lamp_usage_time',
    'kerosene_lamps_cost', 'cooking_energy_sources',
    'cooking_fuel_collection_time',
    'cooking_fuel_responsible', 'cooking_energy_cost',
    'applicances_charging_sources',
    'applicances_charging_cost', 'community_lights', 'home_exterior_lights',
    'exterior_lights_minigrid', 'feel_safe_dark',
    'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
    'feel_safer_with_minigrid',
    'phone_charge_location',
    'phone_charge_frequency', 'phone_charge_cost',
    'phone_charge_travel_distance', 'water_source', 'clean_drinking_water',
    'clean_drinking_water_source', 'community_clean_water_source',
    'water_collection_travel_distance', 'water_collection_time',
    'water_collection_responsible', 'avg_person_age_water_collection',
    'water_cost', 'clinic_travel_distance',
    'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access', 'better_access_health_minigrid',
    'minigrid_access_life_improvement',
    'minigrid_access_productivity_improvement',
    'minigrid_effect_on_household_female',
    'customerAccountNumber', 'tariff'
]

df = pd.read_csv("datasets_clean/hs_post_annotated.csv", usecols=cols_post)
# Standardize case. astype(str) also turns every missing cell into the
# literal string 'nan'; the derived counts below treat that value as
# "no answer" instead of counting it as a selection.
df = df.apply(lambda x: x.astype(str).str.lower())

# Recode the option labels of the two business questions to yes/no.
# The result is assigned back rather than calling replace(inplace=True) on
# df[col]: that chained form operates on a temporary Series and leaves df
# unchanged when pandas Copy-on-Write is active.
option_labels = {'option 1': 'yes', 'option 2': 'no'}
for col in ('business_use_minigrid', 'business_from_minigrid'):
    df[col] = df[col].replace(option_labels)


def count_selections(answer):
    """Count the entries of a ';'-separated multiple-selection answer.

    Parameters
    ----------
    answer : str
        Lower-cased cell value, e.g. 'adult male;adult female;'.

    Returns
    -------
    int
        0 for an empty answer or for 'nan' (a missing cell after the string
        conversion above). Otherwise the last character is dropped (expected
        to be the trailing ';') and the ';'-separated pieces are counted.
    """
    if answer in ("", "nan"):
        return 0
    return len(answer[:-1].split(";"))


# Count of multiple selection columns
df['household_business_owners'] = df['household_business_owners'].map(
    lambda x: x.replace("none;", ""))
df['business_owners_count'] = df['household_business_owners'].map(count_selections)
# 1 when any selected owner category mentions 'female', else 0
df['business_owners_female'] = df['household_business_owners'].map(
    lambda x: int('female' in x))

# Spaces are turned into ';' so the answer can be counted like the
# ';'-separated columns.
df['appliances_type_addition'] = df['appliances_type_addition'].map(
    lambda x: x.replace(" ", ";"))
df['appliances_addition_count'] = df['appliances_type_addition'].map(count_selections)

df.head()

Unnamed: 0,renewvia_id,interviewed_before,connection_period,country,age,gender,occupation_change,household_income_change,avg_household_income,household_headcount_change,...,better_access_health_minigrid,minigrid_access_life_improvement,minigrid_access_productivity_improvement,minigrid_effect_on_household_female,end_date,customerAccountNumber,tariff,business_owners_count,business_owners_female,appliances_addition_count
0,131230,no,12-24 months,kenya,52.0,male,yes,"yes, it has increased",2000.0,"yes, it's increased",...,yes,yes,happy together,easy working,2022-07-20,131230.0,residential,1,0,3
1,131543,no,12-24 months,kenya,30.0,female,no,"no, it is the same",200.0,no,...,yes,"no, it's the same",increased an change life,not available,2022-07-20,131543.0,residential,0,0,1
2,131273,yes,over 36 months,kenya,33.0,male,yes,"yes, it has increased",300.0,"yes, it's increased",...,yes,yes,eating early,no,2022-07-20,131273.0,residential,0,0,3
3,131140,no,12-24 months,kenya,30.0,female,yes,"yes, it has increased",200.0,no,...,yes,"no, it's the same",drinking cold water,no affected any body,2022-07-20,131140.0,residential,0,0,2
4,131117,yes,12-24 months,kenya,32.0,female,yes,"no, it is the same",1.0,no,...,yes,"no, it's the same",very safe,not available,2022-07-20,131117.0,residential,0,0,1


In [19]:
# Categorical answer columns whose text is normalised into snake_case-like
# tokens by the loop below. Each column is listed exactly once
# ('water_source' previously appeared twice, causing a redundant second pass).
cols_replace = [
    'interviewed_before',
    'connection_period',
    'household_income_change',
    'household_headcount_change',
    'girls_schooling_change',
    'boys_schooling_change',
    'school_performance_change',
    'power_sources_usage',
    'kerosene_lamp_usage_change',
    'kerosene_lamp_usage_time',
    'kerosene_lamps_cost',
    'feel_unsafe_reasons',
    'applicances_charging_sources',
    'cooking_fuel_collection_time',
    'cooking_energy_sources',
    'cooking_energy_cost',
    'phone_charge_frequency',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'avg_person_age_water_collection',
    'water_cost',
    'clinic_travel_distance',
    'phone_charge_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'minigrid_access_life_improvement',
    'tariff',
]

def replace_str(string, chars_dict):
    """Apply a batch of substring substitutions to ``string``.

    Parameters
    ----------
    string : str
        Text to clean.
    chars_dict : dict
        Maps each replacement text to an iterable of substrings that should
        be replaced by it, e.g. ``{'_': [' ', '-'], '': [',', '.']}``.

    Returns
    -------
    str
        The text after every substitution has been applied. Substitutions
        run sequentially in dict order and, within a key, in list order, so
        earlier replacements affect what later ones can match.
    """
    cleaned = string
    for replacement in chars_dict:
        for target in chars_dict[replacement]:
            cleaned = cleaned.replace(target, replacement)
    return cleaned

# Normalise the answer text of every column in cols_replace: separators
# become '_' and the listed punctuation is removed.
char_replacements = {
    '_': ['- ', ' (', ' ', '-'],
    '': ['.)', ',', ")", '/', 'u\n', "'", ".", ";"],
}
for col in cols_replace:
    df[col] = df[col].apply(
        lambda x: replace_str(x, chars_dict=char_replacements))
    
    

In [20]:
# Numeric conversion
# Turn placeholder strings into real missing values ('nan' is what the
# earlier astype(str) produced for empty cells).
df = df.replace(['nan', 'uninditifed', 'personal'], np.nan)
cols_num_change = ['avg_household_income',
                   'light_hours_current', 'age']
# Substrings stripped from the numeric columns, applied in list order.
# 'years' must precede 'year': otherwise "30 years" is reduced to "30s".
# Variants with a leading space are not needed because ' ' is removed first.
rep_map_num = {'': [",", "naira", " ", "n",
                    'hours', 'hrs', 'about',
                    'years', 'year'],
               }

df['kerosene_lamps_count'] = df['kerosene_lamps_count'].replace(['none', 'no', 'o'],
                                                                np.nan)
for col in cols_num_change:
    # Missing values (NaN floats) are passed through untouched.
    df[col] = df[col].apply(lambda x: replace_str(x, rep_map_num)
                            if isinstance(x, str) else x)

In [21]:
# Convert to numeric columns
cols_num = ['avg_household_income', 'appliances_count',
            'cellphones_count', 'light_hours_current',
            'kerosene_lamps_count', 'appliances_addition_count',
            'business_owners_count', 'business_owners_female']

# Range answers for lighting hours are collapsed to their lower bound so
# the column can be cast to a number.
light_hours_ranges = {"8-12": "8", "3-4": "3"}

for col in cols_num:
    if col == 'light_hours_current':
        # Assign the result back: replace(inplace=True) on df[col] acts on a
        # temporary Series under pandas Copy-on-Write and would leave the
        # ranges in place, making the float cast below fail.
        df[col] = df[col].replace(light_hours_ranges)
    # float first so NaN is accepted, then nullable integer
    df[col] = df[col].astype(float).astype('Int64')

# Tidy a few leftover category labels (assigned back for the same reason).
df['business_use_minigrid'] = df['business_use_minigrid'].replace('yes ', 'yes')
df['interviewed_before'] = df['interviewed_before'].replace('not_sure', np.nan)
df['clean_drinking_water_source'] = df['clean_drinking_water_source'].replace(
    'clean_community_source_', 'clean_community_source')
df.head()

Unnamed: 0,renewvia_id,interviewed_before,connection_period,country,age,gender,occupation_change,household_income_change,avg_household_income,household_headcount_change,...,better_access_health_minigrid,minigrid_access_life_improvement,minigrid_access_productivity_improvement,minigrid_effect_on_household_female,end_date,customerAccountNumber,tariff,business_owners_count,business_owners_female,appliances_addition_count
0,131230,no,12_24_months,kenya,52.0,male,yes,yes_it_has_increased,2000,yes_its_increased,...,yes,yes,happy together,easy working,2022-07-20,131230.0,residential,1,0,3
1,131543,no,12_24_months,kenya,30.0,female,no,no_it_is_the_same,200,no,...,yes,no_its_the_same,increased an change life,not available,2022-07-20,131543.0,residential,0,0,1
2,131273,yes,over_36_months,kenya,33.0,male,yes,yes_it_has_increased,300,yes_its_increased,...,yes,yes,eating early,no,2022-07-20,131273.0,residential,0,0,3
3,131140,no,12_24_months,kenya,30.0,female,yes,yes_it_has_increased,200,no,...,yes,no_its_the_same,drinking cold water,no affected any body,2022-07-20,131140.0,residential,0,0,2
4,131117,yes,12_24_months,kenya,32.0,female,yes,no_it_is_the_same,1,no,...,yes,no_its_the_same,very safe,not available,2022-07-20,131117.0,residential,0,0,1


In [22]:
# Print the distinct values of every column that has fewer than 10 of them,
# as a quick check of the cleaned category labels.
for col in df.columns:
    distinct_values = df[col].unique()
    if len(distinct_values) >= 10:
        continue
    print(col, distinct_values)

interviewed_before ['no' 'yes' nan]
connection_period ['12_24_months' 'over_36_months' '24_36_months' '6_12_months' '1_3_months'
 '3_6_months' nan]
country ['kenya' 'nigeria' nan]
gender ['male' 'female' nan]
occupation_change ['yes' 'no' nan]
household_income_change ['yes_it_has_increased' 'no_it_is_the_same' 'yes_it_has_decreased' nan]
household_headcount_change ['yes_its_increased' 'no' nan 'yes_its_decreased'
 'no_it_has_stayed_the_same']
girls_schooling_change ['no_its_the_same' 'yes_its_increased' 'yes_its_decreased' nan]
boys_schooling_change ['no_its_the_same' 'yes_its_increased' nan 'yes_its_decreased']
school_performance_change ['yes_its_gotten_better' 'no_its_the_same' nan 'yes_its_gotten_worse']
boys_unschooled_reasons ['they all attend school' 'not old enough' 'other' 'household chores'
 'too old' nan 'lacking money for school fees' 'childcare'
 'health reasons']
household_business_owners ['adult male;' '' 'adult female;' nan 'adult male;adult female;'
 'adult female;adult

In [27]:

# cats_map collects one {"col", "mapping"} entry per encoded column;
# main_cols collects the names of all columns handed to the encoder.
cats_map = list()
main_cols = []
# Inplace encoding of binary variables
cols_bin_inplace = [
    'interviewed_before', 'occupation_change',
    'business_recent', 'business_from_minigrid',
    'business_use_minigrid',
    'appliances_count_change',
    'community_lights', 'home_exterior_lights',
    'exterior_lights_minigrid', 'clean_drinking_water',
    'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access',
    'better_access_health_minigrid'
]

for col in cols_bin_inplace:
    # 'option 1' / 'option 2' follow the recoding used when the survey is
    # loaded (option 1 -> yes, option 2 -> no), so any label that was not
    # recoded still lands on the same side as its yes/no equivalent.
    cats_map.append({"col": col, "mapping": {'no': 0, 'yes': 1,
                                             'option 2': 0,
                                             'option 1': 1}})

main_cols.extend(cols_bin_inplace)
# Map encoding values for 'switch' changes (decrease, neutral, increase)
cols_ord_change = [
    'household_income_change', 'girls_schooling_change',
    'boys_schooling_change', 'school_performance_change',
    'kerosene_lamp_usage_change', 'minigrid_access_life_improvement'
]

main_cols.extend(cols_ord_change)

# Substrings that decide the direction of a change answer. They are checked
# in this order (neutral, decrease, increase); the first match wins.
neutral_markers = ['no', 'same']
# 'worse' covers answers such as 'yes_its_gotten_worse', which otherwise
# match nothing and would be encoded as missing.
decrease_markers = ['lost', 'decrease', 'worse']
increase_markers = ['still', 'add', 'better', 'increase', 'greater']

# Creating the mapping for each categorical variable with ordinality:
# -1 = decrease, 0 = no change, 1 = increase. Labels matching no marker are
# left out of the mapping.
for col in cols_ord_change:
    val_ord = dict()
    for cat in df[col].unique():
        cat = str(cat)
        if cat == 'nan':
            # missing answers get no mapping entry
            continue
        if any(marker in cat for marker in neutral_markers):
            val_ord[cat] = 0
        elif any(marker in cat for marker in decrease_markers):
            val_ord[cat] = -1
        elif cat == 'yes' or any(marker in cat for marker in increase_markers):
            val_ord[cat] = 1

    cats_map.append({"col": col, "mapping": val_ord})

# Map encoding for variables with scale with max > 1.
# Each list gives the labels of one column in ascending order; a label's
# code is its 1-based position in the list.
cols_ord_scale = {
    'kerosene_lamps_cost': ['nothing', '0_200_nkes', '200_600_nkes',
                            '600_1000_nkes', '1000_1400_nkes',
                            '1400_nkes_and_above'],
    'cooking_fuel_collection_time': ['no_need_to_collect_fuel',
                                     'less_than_1_hour', '1_2_hours',
                                     '3_5_hours', 'greater_than_5_hours'],
    'cooking_energy_cost': ['nothing_we_use_minigrid_power',
                            '0_1000_nkes', '1000_1500_nkes',
                            '1500_2000_nkes', '2000_3000_nkes',
                            '3000_4000_nkes'],
    'feel_safe_dark': ['very_unsafe', 'somewhat_unsafe',
                       'neither_safe_nor_unsafe',
                       'somewhat_safe', 'very_safe'],
    'feel_safe_if_exterior_lights': ['very_unsafe', 'somewhat_unsafe',
                                     'neither_safe_nor_unsafe',
                                     'somewhat_safe', 'very_safe'],
    'water_source': ['dirty_water_source_pond_contaminated_well_etc',
                     'clear_water_source_fresh_spring_lake_etc',
                     'community_well_or_pump',
                     'at_home_tap'],
    'clean_drinking_water_source': ["boiled_water",
                                    "bottled_water",
                                    "treated__filtered_water",
                                    "clean_community_source"],
    'water_collection_travel_distance': ['no_need_to_travel_at_home_water_supply',
                                         'less_than_1_km', '1_2_km',
                                         '2_5_km', '5_10_km',
                                         'greater_than_10_km'],
    'water_collection_time': ['less_than_1_hour', '1_2_hours',
                              '2_3_hours', '3_4_hours',
                              'greater_than_4_hours'],
    'water_cost': ['i_dont_pay_its_free', '0_500_nkes',
                   '500_3000_nkes', '3000_5000_nkes',
                   '5000_nkes_and_above'],
    'clinic_travel_distance': ['less_than_1_km', 'between_1_2_km',
                               'between_2_3_km', 'between_3_5_km',
                               'greater_than_5_km'],
    'phone_charge_cost': ['nothing_we_use_minigrid_power', '0_100_nkes', '100_500_nkes',
                          '500_750_nkes', '750_1000_nkes',
                          '1000_nkes_and_above'],
}

main_cols.extend(list(cols_ord_scale.keys()))
for col, values in cols_ord_scale.items():
    # enumerate(start=1) yields the 1-based rank directly
    val_ord = {val: rank for rank, val in enumerate(values, start=1)}
    cats_map.append({"col": col, "mapping": val_ord})
    
# Ordinal encoding of the selected columns using the explicit mappings
# collected in cats_map; labels without a mapping and missing values are
# returned as NaN.
df_subset = df[main_cols]
enc_ord = ce.OrdinalEncoder(mapping=cats_map,
                            handle_unknown="return_nan",
                            handle_missing="return_nan",
                            return_df=True)

df_subset = enc_ord.fit_transform(df_subset)

# Write the survey with the encoded columns substituted in. Previously the
# untouched `df` was written, so the "encoded" file contained none of the
# encoding computed above. `df` itself is left unmodified.
df_encoded = df.copy()
df_encoded[main_cols] = df_subset
df_encoded.to_csv("data_encoded/household_post_survey_encoded.csv")

df_subset

Unnamed: 0,interviewed_before,occupation_change,business_recent,business_from_minigrid,business_use_minigrid,appliances_count_change,community_lights,home_exterior_lights,exterior_lights_minigrid,clean_drinking_water,...,cooking_energy_cost,feel_safe_dark,feel_safe_if_exterior_lights,water_source,clean_drinking_water_source,water_collection_travel_distance,water_collection_time,water_cost,clinic_travel_distance,phone_charge_cost
0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,5.0,5.0,3.0,4.0,2.0,1.0,1.0,,3.0
1,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,3.0,5.0,5.0,4.0,,3.0,1.0,1.0,2.0,
2,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,4.0,5.0,5.0,4.0,,2.0,1.0,3.0,3.0,2.0
3,0.0,1.0,0.0,,0.0,1.0,0.0,1.0,1.0,1.0,...,3.0,1.0,5.0,3.0,4.0,2.0,1.0,1.0,,2.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,5.0,4.0,4.0,4.0,2.0,1.0,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,1.0,0.0,1.0,,0.0,0.0,1.0,1.0,1.0,1.0,...,4.0,1.0,5.0,3.0,4.0,2.0,1.0,1.0,2.0,1.0
2599,1.0,0.0,0.0,,0.0,0.0,1.0,0.0,,1.0,...,5.0,1.0,5.0,3.0,4.0,2.0,1.0,1.0,2.0,1.0
2600,1.0,0.0,0.0,,0.0,1.0,1.0,0.0,,1.0,...,6.0,1.0,4.0,3.0,4.0,2.0,1.0,1.0,3.0,1.0
2601,1.0,0.0,0.0,,0.0,1.0,1.0,0.0,,1.0,...,6.0,1.0,5.0,3.0,4.0,2.0,1.0,1.0,1.0,1.0
