In [1]:
import pandas as pd
import numpy as np
import json
import category_encoders as ce

In [2]:
# Load the post-connection household surveys (CommCare and MS Forms exports)
# and stack them into a single frame.
#
# The names below are passed verbatim as `usecols`, so they have to match the
# CSV headers exactly (including the 'houlsehold_income_change' spelling).
filter_cols_post = [
    "renewvia_id", "interviewed_before",
    'occupation_change', 'houlsehold_income_change',
    'avg_monthly_household_income', 'female_schooling_change',
    'male_schooling_change', 'school_performance_change',
    'household_business_owners',
    'business_recent', 'business_from_minigrid',
    'business_use_minigrid', 'electronics_count',
    'electronics_count_change', 'cellphones_count',
    'appliances_addition',
    'light_hours_current', "kerosene_lamp_usage_change",
    'kerosene_lamp_usage_count', 'kerosene_lamp_usage_cost',
    'cooking_fuel_collection_time', 'cooking_energy_monthly_cost',
    'community_lights', 'home_exterior_lights',
    'exterior_lights_minigrid', 'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'phone_charge_monthly_cost', 'water_source',
    'clean_drinking_water', 'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time', 'water_monthly_cost',
    'clinic_travel_distance', 'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access', 'better_access_health_minigrid',
    'minigrid_access_life_improvement',
]

hs_post_com = pd.read_csv(
    "data_clean/household_post_connection_commcare.csv",
    usecols=filter_cols_post,
)
# The MS Forms export is read with an explicit ISO-8859-1 encoding.
hs_post_ms = pd.read_csv(
    "data_clean/household_post_connection_ms_form.csv",
    usecols=filter_cols_post,
    encoding="ISO-8859-1",
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source keeps its own 0-based labels, so the stacked frame (and the CSVs
# written from it) would contain duplicate row labels.
df = pd.concat([hs_post_com, hs_post_ms], ignore_index=True)
df.head()

Unnamed: 0,renewvia_id,interviewed_before,occupation_change,houlsehold_income_change,avg_monthly_household_income,female_schooling_change,male_schooling_change,school_performance_change,household_business_owners,business_recent,...,clean_drinking_water,clean_drinking_water_source,water_collection_travel_distance,water_collection_time,water_monthly_cost,clinic_travel_distance,clinic_electricity_access_minigrid,clinic_refrigeration_access,better_access_health_minigrid,minigrid_access_life_improvement
0,,,,,,,,,,,...,,,,,,,,,,
1,521168.0,no,no,yes_it_has_decreased,"20,000 naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,,,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,no_its_the_same
2,521039.0,yes,yes,yes_it_has_increased,50000,yes_its_increased,yes_its_increased,yes_its_gotten_better,adult_male,yes,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
3,521055.0,no,no,no_it_is_the_same,40000,no_its_the_same,no_its_the_same,no_its_the_same,,,...,yes,treated__filtered_water,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
4,521090.0,no,no,yes_it_has_increased,"20,000 Naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,adult_female,yes,...,yes,clean_community_source,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-5_km,yes,yes,yes,yes


In [3]:
# Map missing or mistyped answers using the per-column lookup stored in JSON.
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

for entry in col_answers_map['mapping']:
    name = entry['col_new_name']
    if name not in df.columns:
        continue
    # Assign the result back instead of calling `df[name].replace(..., inplace=True)`:
    # the chained in-place form mutates a temporary Series and leaves `df`
    # untouched when pandas Copy-on-Write is active.
    # NOTE(review): values listed under 'formatted_answers' are replaced by the
    # matching 'original_answers' entries - confirm this direction against the
    # JSON file.
    df[name] = df[name].replace(entry['formatted_answers'],
                                entry['original_answers'])


# Standardize case. Every column is cast to text first; on pandas versions
# where astype(str) stringifies NaN, missing answers become the text 'nan'.
df = df.apply(lambda x: x.astype(str).str.lower())


def _count_selections(answer):
    """Return how many ';'-separated options a multi-select answer holds.

    Missing answers (non-strings, empty text, or the literal 'nan' produced by
    the string cast above) count as 0. Empty fragments such as the one in
    'a;;b' are ignored.
    """
    if not isinstance(answer, str) or answer.strip() in ('', 'nan'):
        return 0
    return sum(1 for option in answer.split(";") if option.strip())


# Count of multiple selection columns
df['business_owners_count'] = df['household_business_owners'].map(_count_selections)
df['business_owners_female'] = df['household_business_owners'].map(
    lambda x: 1 if isinstance(x, str) and 'adult_female' in x else 0
)

# Normalise space-separated selections to ';' before counting. This needs the
# substring-level `.str.replace`; `Series.replace(" ", ";")` would only touch
# cells whose entire value is a single space.
df['appliances_addition'] = df['appliances_addition'].str.replace(" ", ";", regex=False)
df['appliances_addition_count'] = df['appliances_addition'].map(_count_selections)

In [4]:
# Categorical answer columns whose free-form punctuation and spacing get
# normalised into snake_case-style labels.
cols_replace = [
    'interviewed_before',
    'houlsehold_income_change',
    'female_schooling_change',
    'male_schooling_change',
    'school_performance_change',
    'kerosene_lamp_usage_change',
    'kerosene_lamp_usage_cost',
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
    'phone_charge_monthly_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'minigrid_access_life_improvement',
]

def replace_str(string, chars_dict):
    """Substitute groups of substrings inside ``string``.

    Parameters
    ----------
    string : str
        Text to clean. Any non-string value (e.g. NaN or None) is returned
        unchanged instead of raising ``AttributeError``.
    chars_dict : dict
        Maps each replacement text to an iterable of substrings that should be
        replaced by it. Keys are processed in dict order and the substrings of
        each key in their listed order, so earlier replacements can affect
        what later ones match.

    Returns
    -------
    str
        The text after all substitutions (or the untouched input when it is
        not a string).
    """
    if not isinstance(string, str):
        return string

    for replacement, targets in chars_dict.items():
        for target in targets:
            string = string.replace(target, replacement)

    return string

# Substitution rules applied to every answer in `cols_replace`: the first group
# of substrings becomes '_', the second group is deleted. Groups and their
# entries are applied in the order written here.
# NOTE(review): 'u\n' (letter u followed by a newline) is kept as-is; confirm
# it is not a leftover from a unicode-literal prefix.
answer_cleanup_rules = {
    '_': ['- ', ' (', ' ', '-'],
    '': ['.)', ',', '/', 'u\n', "'", "."],
}

for column in cols_replace:
    df[column] = df[column].apply(
        lambda answer: replace_str(answer, chars_dict=answer_cleanup_rules)
    )

In [5]:
# Numeric conversion
# Turn the literal text 'nan' back into real missing values.
df = df.replace('nan', np.nan)

# Free-text numeric columns that need unit words / separators stripped first.
cols_num_change = ['avg_monthly_household_income',
                   'light_hours_current']
# Everything listed here is deleted from the text, in this order. "naira" is
# listed before the bare "n" so the whole word is removed first; after that
# every remaining letter 'n' is stripped as well.
rep_map_num = {'': [",", "naira", " ", "n",
                    'hours', 'hrs', 'about'],
               }

# Non-numeric "none"-style answers for the lamp count are treated as missing.
df['kerosene_lamp_usage_count'] = df['kerosene_lamp_usage_count'].replace(
    ['none', 'no', 'o'], [np.nan, np.nan, np.nan])

for col in cols_num_change:
    # Only strings are cleaned; missing values (NaN) pass through untouched.
    df[col] = df[col].apply(lambda x: replace_str(x, rep_map_num)
                            if isinstance(x, str) else x)

# Convert to numeric columns
cols_num = ['avg_monthly_household_income', 'electronics_count',
            'cellphones_count', 'light_hours_current',
            'kerosene_lamp_usage_count',
            'business_owners_count', 'business_owners_female']

for col in cols_num:
    # Going through float first lets text such as '3.0' parse; 'Int64' is the
    # nullable integer dtype, so missing values survive the cast.
    # NOTE(review): any leftover non-numeric text or fractional value makes
    # this raise rather than being coerced - presumably intentional.
    df[col] = df[col].astype(float).astype('Int64')

# Final one-off label fixes. The results are assigned back explicitly:
# `df[col].replace(..., inplace=True)` modifies a temporary Series and does not
# update `df` when pandas Copy-on-Write is active.
df['business_use_minigrid'] = df['business_use_minigrid'].replace('yes ', 'yes')
df['interviewed_before'] = df['interviewed_before'].replace('not_sure', np.nan)
df['clean_drinking_water_source'] = df['clean_drinking_water_source'].replace(
    'clean_community_source_', 'clean_community_source')

In [6]:
# The raw multi-select text columns are no longer needed once their derived
# count / indicator columns exist.
df = df.drop(columns=['household_business_owners', 'appliances_addition'])

In [7]:
# Persist the cleaned (not yet encoded) survey; the row index is written too.
df.to_csv("data_clean/household_post_survey_clean.csv", index=True)

In [8]:
# Yes/no questions: encoded in place as no -> 0, yes -> 1.
cols_bin_inplace = [
    'interviewed_before', 'occupation_change',
    'business_recent', 'business_from_minigrid',
    'business_use_minigrid', 'electronics_count_change',
    'community_lights', 'home_exterior_lights',
    'exterior_lights_minigrid', 'clean_drinking_water',
    'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access',
    'better_access_health_minigrid',
]

# `cats_map` collects one {"col": ..., "mapping": ...} entry per encoded
# column; it starts with the binary columns, each with its own mapping dict.
cats_map = [
    {"col": binary_col, "mapping": {'no': 0, 'yes': 1}}
    for binary_col in cols_bin_inplace
]
    
# Questions answered with a direction of change are encoded as
# -1 (decrease), 0 (no change) or 1 (increase).
cols_ord_change = [
    'houlsehold_income_change', 'female_schooling_change',
    'male_schooling_change', 'school_performance_change',
    'kerosene_lamp_usage_change', 'minigrid_access_life_improvement',
]


def _change_score(label):
    """Classify an answer label by substring; return 0, -1, 1 or None.

    The checks run in a fixed order, so a label containing 'no' or 'same' is
    scored 0 even if it also mentions a decrease or an increase. Labels that
    match none of the rules return None and get no mapping entry.
    NOTE(review): labels such as '..._worse' match no rule, and 'no' is a
    plain substring test (it also hits e.g. 'not...') - confirm this is the
    intended coverage.
    """
    if 'no' in label or 'same' in label:
        return 0
    if 'lost' in label or 'decrease' in label:
        return -1
    positive_markers = ('still', 'add', 'better', 'increase', 'greater')
    if label == 'yes' or any(marker in label for marker in positive_markers):
        return 1
    return None


# Build the ordinal mapping of each categorical change column from the labels
# actually present in the data.
for col in cols_ord_change:
    val_ord = {}
    for raw_label in df[col].unique():
        label = str(raw_label)
        if label == 'nan':
            continue  # missing values get no mapping entry
        score = _change_score(label)
        if score is not None:
            val_ord[label] = score

    cats_map.append({"col": col, "mapping": val_ord})

# Ordered answer scales: for each column the labels are listed in the order
# they are ranked, and a label's code is its 1-based position in that list.
# NOTE(review): labels must equal the cleaned answers exactly; anything not
# listed here has no code in the mapping.
cols_ord_scale = {
    'kerosene_lamp_usage_cost': [
        'nothing', '0_200_nkes', '200_600_nkes',
        '600_1000_nkes', '1000_1400_nkes',
        '1400_nkes_and_above',
    ],
    'cooking_fuel_collection_time': [
        'no_need_to_collect_fuel',
        'less_than_1_hour', '1_2_hours',
        '3_5_hours', 'greater_than_5_hours',
    ],
    'cooking_energy_monthly_cost': [
        'nothing_we_use_minigrid_power',
        '0_1000_nkes', '1000_1500_nkes',
        '1500_2000_nkes', '2000_3000_nkes',
        '3000_4000_nkes',
    ],
    'feel_safe_dark': [
        'very_unsafe', 'somewhat_unsafe',
        'neither_safe_nor_unsafe',
        'somewhat_safe', 'very_safe',
    ],
    'feel_safe_if_exterior_lights': [
        'very_unsafe', 'somewhat_unsafe',
        'neither_safe_nor_unsafe',
        'somewhat_safe', 'very_safe',
    ],
    'water_source': [
        'dirty_water_source_pond_contaminated_well_etc',
        'clear_water_source_fresh_spring_lake_etc',
        'community_well_or_pump',
        'at_home_tap',
    ],
    'clean_drinking_water_source': [
        "boiled_water",
        "bottled_water",
        "treated__filtered_water",
        "clean_community_source",
    ],
    'water_collection_travel_distance': [
        'no_need_to_travel_at_home_water_supply',
        'less_than_1_km', '1_2_km',
        '2_5_km', '5_10_km',
        'greater_than_10_km',
    ],
    'water_collection_time': [
        'less_than_1_hour', '1_2_hours',
        '2_3_hours', '3_4_hours',
        'greater_than_4_hours',
    ],
    'water_monthly_cost': [
        'i_dont_pay_its_free', '0_500_nkes',
        '500_3000_nkes', '3000_5000_nkes',
        '5000_nkes_and_above',
    ],
    'clinic_travel_distance': [
        'less_than_1_km', 'between_1_2_km',
        'between_2_3_km', 'between_3_5_km',
        'greater_than_5_km',
    ],
    'phone_charge_monthly_cost': [
        'nothing_we_use_minigrid_power', '0_100_nkes', '100_500_nkes',
        '500_750_nkes', '750_1000_nkes',
        '1000_nkes_and_above',
    ],
}

for col, values in cols_ord_scale.items():
    val_ord = {label: rank for rank, label in enumerate(values, start=1)}
    cats_map.append({"col": col, "mapping": val_ord})

In [9]:
# Ordinal encoding: apply the hand-built per-column mappings in `cats_map`.
# Both unknown and missing values are configured to come back as NaN, and the
# encoder is asked to return a DataFrame.
enc_ord = ce.OrdinalEncoder(
    mapping=cats_map,
    handle_missing="return_nan",
    handle_unknown="return_nan",
    return_df=True,
)

# `df` is overwritten with its encoded version.
df = enc_ord.fit_transform(df)

In [10]:
# Persist the encoded survey; the row index is written too.
df.to_csv("data_encoded/household_post_survey_encoded.csv", index=True)