In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Transform into ratio
filter_cols_post = [
"renewvia_account_number", "interviewed_before",
'occupation_change', 'houlsehold_income_change',
'avg_monthly_household_income', 'female_schooling_change',
'male_schooling_change', 'school_performance_change',
'household_business_owners', 
'business_recent', 'business_from_minigrid',
'business_use_minigrid', 'electronics_count',
'electronics_count_change', 'cellphones_count',
'light_hours_current', "kerosene_lamp_usage_change",
'kerosene_lamp_usage_count', 'kerosene_lamp_usage_cost',
'cooking_fuel_collection_time', 'cooking_energy_monthly_cost',
'community_lights', 'home_exterior_lights',
'exterior_lights_minigrid', 'feel_safe_dark',
'feel_safe_if_exterior_lights', 
'phone_charge_monthly_cost', 'water_source',
'clean_drinking_water', 'clean_drinking_water_source',
'water_collection_travel_distance',
'water_collection_time', 'water_monthly_cost',
'clinic_travel_distance', 'clinic_electricity_access_minigrid',
'clinic_refrigeration_access', 'better_access_health_minigrid',
'minigrid_access_life_improvement',
]


hs_post_com = pd.read_csv("initial_clean/household_post_connection_commcare.csv", 
                          usecols=filter_cols_post
                         )
hs_post_ms = pd.read_csv("initial_clean/household_post_connection_ms_form.csv", 
                        usecols=filter_cols_post, 
                         encoding = "ISO-8859-1")
# hs_post_com.head()
df = pd.concat([hs_post_com, hs_post_ms])
df.rename(columns={'renewvia_account_number':'renewvia_id'},inplace=True)
df.head()

Unnamed: 0,renewvia_id,interviewed_before,occupation_change,houlsehold_income_change,avg_monthly_household_income,female_schooling_change,male_schooling_change,school_performance_change,household_business_owners,business_recent,...,clean_drinking_water,clean_drinking_water_source,water_collection_travel_distance,water_collection_time,water_monthly_cost,clinic_travel_distance,clinic_electricity_access_minigrid,clinic_refrigeration_access,better_access_health_minigrid,minigrid_access_life_improvement
0,,,,,,,,,,,...,,,,,,,,,,
1,521168.0,no,no,yes_it_has_decreased,"20,000 naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,,,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,no_its_the_same
2,521039.0,yes,yes,yes_it_has_increased,50000,yes_its_increased,yes_its_increased,yes_its_gotten_better,adult_male,yes,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
3,521055.0,no,no,no_it_is_the_same,40000,no_its_the_same,no_its_the_same,no_its_the_same,,,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
4,521090.0,no,no,yes_it_has_increased,"20,000 Naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,adult_female,yes,...,yes,clean_community_source,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-5_km,yes,yes,yes,yes


In [3]:

# No. of business owners
df['business_owners_count'] = df['household_business_owners'].map(lambda x: len(x.split(";")) 
                                                                  if type(x) == str else 0)
df['business_owners_female'] = df['household_business_owners'].map(lambda x: 1 
                                                                            if (type(x) == str) 
                                                                            and ('adult_female' in x) 
                                                                            else 0)

# standardize case
df = df.apply(lambda x: x.astype(str).str.lower())

cols_change = [
    'interviewed_before',
     'houlsehold_income_change',
     'female_schooling_change',
     'male_schooling_change',
     'school_performance_change',
    'kerosene_lamp_usage_change',
    'kerosene_lamp_usage_cost',
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'community_lights','water_source',
     'clean_drinking_water_source',
     'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
    'phone_charge_monthly_cost',
     'feel_safe_dark',
     'feel_safe_if_exterior_lights',
    'minigrid_access_life_improvement'
]

def replace_str(string, replace_chars = {'_': ['- ',' (', ' ', '-'],
                                        '': ['.)', ',', '/', 'u\n', 
                                             "'", "."],
                                        }):
    for new, old in replace_chars.items():
        for c in old:
            string = string.replace(c, new)

    return string

for col in cols_change:
    df[col] = df[col].apply(lambda x: replace_str(x))

In [4]:
# replace values
replace_mapping = {
    'interviewed_before': ['not_sure', np.nan],
    'business_from_minigrid': [['maybe','option 1','option 2'],
                                [np.nan, 'yes', 'no']],
    
    'kerosene_lamp_usage_count' : [['none', 'no', 'o',], 
                                    ['0', '0', '0']],
    
    'kerosene_lamp_usage_cost' : [['copy_1_of_nkes', 'copy_2_of_nkes', 
                                    'copy_3_of_nkes','nkes', 
                                   'nkes_and_above'], 
                                ['nothing', '0_200_nkes', 
                                '200_600_nkes', '1000_1400_nkes',
                                '1400_nkes_and_above']],
    
    'cooking_fuel_collection_time': [['copy_1_of_hours','hours'],
                                     ['less_than_1_hour',  '3_5_hours',]],
    
    'cooking_energy_monthly_cost': [['copy_1_of_nkes', 'copy_2_of_nkes', 
                                      'copy_3_of_nkes', 'copy_4_of_nkes',
                                     'nkes', 
                                     'nothing__we_use_minigrid_power',
                                    'nothing___we_use_minigrid_power'], 
                                    ['0_1000_nkes', '1000_1500_nkes', 
                                    '1500_2000_nkes', '2000_3000_nkes', 
                                     '3000_4000_nkes',
                                    'nothing_we_use_minigrid_power',
                                    'nothing_we_use_minigrid_power']],
    
    'phone_charge_monthly_cost': [['copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes', 
                                   'nkes', 'n_kes_and_above', '1000_n_and_above',
                                   'nothing__we_use_minigrid_power'
                                  ], 
                                    ['0_100_nkes','100_500_nkes', '500_750_nkes',
                                     '750_1000_nkes',
                                     '1000_nkes_and_above', '1000_nkes_and_above',
                                    'nothing_we_use_minigrid_power']],
    
    'water_collection_travel_distance': [['copy_1_of_km','km', 
                                          'no_need_to_travel___at_home_water_supply',
                                         'no_need_to_travel__at_home_water_supply'],
                                         ['less_than_1_km', '5_10_km',
                                         'no_need_to_travel_at_home_water_supply',
                                         'no_need_to_travel_at_home_water_supply']],
    
    'water_collection_time': [['copy_1_of_hours', 'copy_2_of_hours',
                               'hours'],
                             ['less_than_1_hour','1_2_hours', 
                              '3_4_hours',]],
    
    'water_monthly_cost': [['copy_1_of_nkes', 'copy_2_of_nkes', 
                            'nkes', 'nkes_and_above'], 
                          ['i_dont_pay_its_free', '0_500_nkes', 
                          '3000_5000_nkes', '5000_nkes_and_above',]],
    'clinic_travel_distance': ['less_than_1km', 'less_than_1_km']  
}

for col, mapping in replace_mapping.items():
    # print(col, mapping)
    df[col] = df[col].replace(mapping[0], mapping[1])

In [5]:
df.replace('nan', np.nan, inplace=True)
cols_num_change = ['avg_monthly_household_income', 'light_hours_current']
rep_map_num =  {'': [",", "naira", " ", "n",
                    'hours', 'hrs', 'about'],
                }

for col in cols_num_change:
    df[col] = df[col].apply(lambda x: replace_str(x, rep_map_num) 
                            if type(x) == str else np.nan)
    
# Convert to numeric columns
cols_num = ['avg_monthly_household_income',  'electronics_count', 
           'cellphones_count', 'light_hours_current', 
            'kerosene_lamp_usage_count',
           'business_owners_count', 'business_owners_female']

for col in cols_num:
    df[col] = df[col].astype(float).astype('Int64')
    
df.head()
df.to_csv("household_post_survey_clean.csv")

In [6]:
# Encoding binary variables
cols_cat_bin = [
'interviewed_before', 'occupation_change',  
'business_recent', 'business_from_minigrid',
'business_use_minigrid', 
'community_lights', 'home_exterior_lights',
'exterior_lights_minigrid', 
'clean_drinking_water', 
'clinic_electricity_access_minigrid',
'clinic_refrigeration_access', 
'better_access_health_minigrid',
'business_owners_female'
               ]

#Binary Encoding
enc_bin = ce.BinaryEncoder(cols=cols_cat_bin, 
                            handle_unknown="return_nan",
                            handle_missing="return_nan", 
                            return_df=True)

df = enc_bin.fit_transform(df)

In [8]:
cols_ord = [
 'houlsehold_income_change',
 'female_schooling_change',
 'male_schooling_change',
 'school_performance_change',
 'kerosene_lamp_usage_change',
'minigrid_access_life_improvement'
]
# Creating the mapping for each categroical variable with ordinality
cats_ord_map = list()
for col in cols_ord:
    val_ord = dict()
    cats = list(df[col].unique())
    for cat in cats:
        cat = str(cat)
        if cat != 'nan':
            if any(map(cat.__contains__, ['no', 'same'])):
                val_ord[cat] = 0
            elif any(map(cat.__contains__, ['lost', 'decrease'])):
                val_ord[cat] = -1
            elif any(map(cat.__contains__, ['still', 'add', 
                                            'better', 'increase',
                                           'greater'])):
                val_ord[cat] = 1

    cats_ord_map.append({"col":col, "mapping": val_ord})

cols_ord_map = {
'kerosene_lamp_usage_cost': ['nothing', '0_200_nkes', '200_600_nkes', 
                             '600_1000_nkes', '1000_1400_nkes',
                            '1400_nkes_and_above', ],
'cooking_fuel_collection_time': ['no_need_to_collect_fuel',
                                 'less_than_1_hour', '1_2_hours', 
                                 '3_5_hours', 'greater_than_5_hours', ],
'cooking_energy_monthly_cost':  ['nothing_we_use_minigrid_power',
                                 '0_1000_nkes', '1000_1500_nkes', 
                                 '1500_2000_nkes','2000_3000_nkes', 
                                 '3000_4000_nkes'],
'feel_safe_dark': ['very_unsafe', 'somewhat_unsafe', 
                  'neither_safe_nor_unsafe', 
                   'somewhat_safe','very_safe',],
'feel_safe_if_exterior_lights': ['very_unsafe', 'somewhat_unsafe', 
                                  'neither_safe_nor_unsafe', 
                                 'somewhat_safe','very_safe',], 
'water_source': ['dirty_water_source_pond_contaminated_well_etc', 
                 'clear_water_source_fresh_spring_lake_etc', 
                 'community_well_or_pump',
                 'at_home_tap', ],    
'water_collection_travel_distance': ['no_need_to_travel_at_home_water_supply',
                                     'less_than_1_km', '1_2_km', 
                                      '2_5_km', '5_10_km',
                                      'greater_than_10_km', ],
'water_collection_time':  ['less_than_1_hour', '1_2_hours', 
                           '2_3_hours', '3_4_hours', 
                           'greater_than_4_hours', ],
'water_monthly_cost':  ['i_dont_pay_its_free', '0_500_nkes',  
                        '500_3000_nkes', '3000_5000_nkes',
                        '5000_nkes_and_above',],
'clinic_travel_distance':  ['less_than_1_km', 'between_1_2_km', 
                            'between_2_3_km', 'between_3_5_km',
                           'greater_than_5_km', ],
'phone_charge_monthly_cost': ['nothing_we_use_minigrid_power', '0_100_nkes', '100_500_nkes',
                              '500_750_nkes',  '750_1000_nkes',  
                             '1000_nkes_and_above'],
'minigrid_access_life_improvement': ['no_its_the_same', 'yes']
}

for col, values in cols_ord_map.items():
    val_ord = dict()
    rng = len(values)
    for idx, val in enumerate(values):
        val_ord[val] = idx+1
    cats_ord_map.append({"col":col, "mapping": val_ord})
    
    
cats_ord_map

[{'col': 'houlsehold_income_change',
  'mapping': {'yes_it_has_decreased': -1,
   'yes_it_has_increased': 1,
   'no_it_is_the_same': 0}},
 {'col': 'female_schooling_change',
  'mapping': {'no_its_the_same': 0,
   'yes_its_increased': 1,
   'yes_its_decreased': -1}},
 {'col': 'male_schooling_change',
  'mapping': {'no_its_the_same': 0,
   'yes_its_increased': 1,
   'yes_its_decreased': -1}},
 {'col': 'school_performance_change',
  'mapping': {'yes_its_gotten_better': 1, 'no_its_the_same': 0}},
 {'col': 'kerosene_lamp_usage_change',
  'mapping': {'yes_its_decreased': -1,
   'no_its_the_same': 0,
   'yes_its_increased': 1}},
 {'col': 'minigrid_access_life_improvement',
  'mapping': {'no_its_the_same': 0}},
 {'col': 'kerosene_lamp_usage_cost',
  'mapping': {'nothing': 1,
   '0_200_nkes': 2,
   '200_600_nkes': 3,
   '600_1000_nkes': 4,
   '1000_1400_nkes': 5,
   '1400_nkes_and_above': 6}},
 {'col': 'cooking_fuel_collection_time',
  'mapping': {'no_need_to_collect_fuel': 1,
   'less_than_1_h

In [None]:
# #Ordinal Encoding
# enc_ord = ce.OrdinalEncoder(mapping=cats_ord_map, 
#                             handle_unknown="return_nan",
#                             handle_missing="return_nan",
#                             return_df=True)

# df = enc_ord.fit_transform(df)
# # print(df.info())
# # df.head()
# df.to_csv("household_post_survey_encoded.csv")