In [2]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [5]:
# Survey columns to read from the annotated pre-survey CSV (passed to `usecols`).
# Names are kept exactly as spelled in the file, including 'employement_type'
# and the 'applicances_*' columns.
cols_pre = [
    "connection_period",
    "renewvia_id",
    "country",
    "nigeria_community",
    "kenya_community",
    "age",
    "gender",
    "occupation",
    "primary_provider",
    "primary_provider_occupation",
    "employement_type",
    "avg_household_income",
    "household_headcount",
    "adult_headcount",
    "girls_headcount",
    "boys_headcount",
    "girls_age",
    "boys_age",
    "girls_schooling",
    "girls_unschooled_reasons",
    "boys_schooling",
    "boys_unschooled_reasons",
    "household_business_owners",
    "minigrid_signup_primary_reason",
    "minigrid_signup_secondary_reason",
    "power_sources",
    "power_sources_usage",
    "power_sources_primary",
    "appliances_count",
    "cellphones_count",
    "appliances_type",
    "appliances_explain",
    "appliances_addition_type",
    "light_hours_current",
    "light_primary_sources",
    "kerosene_lamps_count",
    "kerosene_lamp_usage_time",
    "kerosene_lamps_cost",
    "cooking_energy_source",
    "cooking_fuel_collection_time",
    "cooking_fuel_responsible",
    "cooking_energy_cost",
    "applicances_charging_sources",
    "applicances_charging_cost",
    "feel_safe_dark",
    "community_lights",
    "home_exterior_lights",
    "feel_safe_if_exterior_lights",
    "feel_unsafe_reasons",
    "phone_charge_location",
    "phone_charge_frequency",
    "phone_charge_cost",
    "phone_charge_travel_distance",
    "water_source",
    "clean_drinking_water",
    "clean_drinking_water_source",
    "clean_water_source",
    "water_collection_travel_distance",
    "water_collection_time",
    "water_collection_responsible",
    "avg_person_age_water_collection",
    "water_cost",
    "other_household_activities",
    "clinic_travel_distance",
    "clinic_electricity_access",
    "clinic_refrigeration_access",
    "end_date",
    "customerAccountNumber",
    "tariff",
    "occupation_secondary_provider",
    "cooking_energy_sources",
    "community_clean_water_source",
]

# Read only the listed columns, then turn cells holding the literal text
# 'nan' into real missing values so pandas treats them as NaN.
df = pd.read_csv(
    "datasets_clean/hs_pre_annotated.csv",
    usecols=cols_pre,
).replace("nan", np.nan)

# Preview the first rows.
df.head()

Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,other_household_activities,clinic_travel_distance,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source
0,,501121,Nigeria,Akipelai,,,,,,,...,,,,,2021-09-28,501121,Residential,,,
1,3-6_months,570063,Nigeria,Balep,,45,Male,Farming,Yes,,...,Other food processing,less than 1 km,no,No,2021-10-24,570063,Residential,,,
2,3-6_months,570028,Nigeria,Balep,,52 years,Male,Education,Yes,,...,Processing ugali/gari,less than 1 km,no,No,2021-10-24,570028,Residential,,,
3,3-6_months,570097,Nigeria,Balep,,27 years,Male,Education,No,Farming,...,Processing palm oil,less than 1 km,no,No,2021-10-25,570097,Residential,,,
4,,570046,Nigeria,Balep,,30,Male,Education,Yes,Education,...,Other food processing,less than 1 km,no,No,2021-10-26,570046,Residential,,,


In [3]:
# --- Features derived from `household_business_owners` -----------------------
def _owner_count(answer):
    """Return how many ';'-separated entries `answer` has; 0 if it is not a str."""
    if type(answer) == str:
        return len(answer.split(";"))
    return 0


def _has_female_owner(answer):
    """Return 1 when `answer` is a str containing 'adult_female', otherwise 0."""
    if type(answer) != str:
        return 0
    return 1 if 'adult_female' in answer else 0


_owners = df['household_business_owners']
df['business_owners_count'] = _owners.map(_owner_count)
# The substring check is case-sensitive and runs before the lower-casing below.
df['business_owners_female'] = _owners.map(_has_female_owner)


# --- Standardise case --------------------------------------------------------
def _as_lowercase_text(column):
    """Cast one column to str and lower-case every value."""
    return column.astype(str).str.lower()


# Every column (including the two counters just added) becomes text here.
# NOTE(review): in pandas versions where astype(str) stringifies missing
# values, NaN turns into the literal 'nan' from this point on - confirm that
# downstream steps expect that.
df = df.apply(_as_lowercase_text)

# Free-text columns whose values are normalised by `replace_str`.
# NOTE(review): the '*_monthly_cost' names are not in the `cols_pre` list used
# when loading the CSV - confirm the column names against the data.
cols_change = [
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'applicances_charging_monthly_cost',
    'phone_charge_monthly_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
]

def replace_str(string):
    """Normalise a free-text answer into an underscore-separated token.

    Substitutions are applied in a fixed order:
      1. '- ', ' (', ' ' and '-' each become '_';
      2. '.)', ',', '/', 'u' followed by a newline, "'" and '.' are removed.

    Parameters
    ----------
    string : str or any
        The value to clean. Anything that is not a ``str`` (for example a
        float NaN or ``None`` coming from a DataFrame cell) is returned
        unchanged instead of raising ``AttributeError``.

    Returns
    -------
    str or any
        The cleaned string, or the original object for non-string input.
    """
    # Missing values in pandas are floats/None, which have no `.replace`.
    if not isinstance(string, str):
        return string

    # Order matters: multi-character patterns ('- ', ' (', '.)') must run
    # before the single characters they contain (' ', '-', '.').
    substitutions = (
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),
        ("'", ''),
        ('.', ''),
    )
    for old, new in substitutions:
        string = string.replace(old, new)

    return string

# Run `replace_str` over every value of each column listed in `cols_change`.
for name in cols_change:
    df[name] = df[name].apply(replace_str)
    


In [4]:
# Value corrections per column, stored as  column -> [to_replace, value].
# For most columns both members are equal-length lists matched by position;
# two columns use a single old/new string pair instead.
replace_mapping = {
    'cooking_fuel_collection_time': [
        ['copy_1_of_hours', 'hours'],
        ['less_than_1_hour', '3_5_hours'],
    ],

    'cooking_energy_monthly_cost': [
        ['0_1000_knes', 'copy_1_of_nkes',
         '1000_1500_knes', 'copy_2_of_nkes',
         '1500_2000_knes', 'copy_3_of_nkes',
         '2000_3000_knes', 'copy_4_of_nkes',
         'nkes'],
        ['0_1000_nkes', '0_1000_nkes',
         '1000_1500_nkes', '1000_1500_nkes',
         '1500_2000_nkes', '1500_2000_nkes',
         '2000_3000_nkes', '2000_3000_nkes',
         '3000_4000_nkes'],
    ],

    'applicances_charging_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes',
         'copy_3_of_nkes', 'copy_4_of_nkes',
         'nkes'],
        ['0_150_nkes', '150_1000_nkes',
         '1000_3000_nkes', '3000_4000_nkes',
         '4000_6000_nkes'],
    ],

    'phone_charge_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes',
         'nkes', 'nkes_and_above', '1000_n_and_above'],
        ['0_100_nkes', '100_500_nkes', '500_750_nkes',
         '750_1000_nkes', '1000_nkes_and_above', '1000_nkes_and_above'],
    ],

    'community_lights': [
        ['street_lights', 'no_none', 'other'],
        ['yes', 'no', 'no'],
    ],

    # Single old -> new pair (plain strings rather than lists).
    'clean_drinking_water_source': [
        'clean_community_source_',
        'clean_community_source',
    ],

    'water_collection_travel_distance': [
        ['copy_1_of_km', 'copy_2_of_km', 'km'],
        ['less_than_1_km', '1_2_km', '5_10_km'],
    ],

    'water_collection_time': [
        ['copy_1_of_hours', 'hours'],
        ['less_than_1_hour', '3_4_hours'],
    ],

    'water_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes',
         'nkes', 'nkes_and_above'],
        ['i_dont_pay_its_free', '0_500_nkes',
         '3000_5000_nkes', '5000_nkes_and_above'],
    ],

    # Single old -> new pair (plain strings rather than lists).
    'clinic_travel_distance': [
        'less_than_1km',
        'less_than_1_km',
    ],
}

# Apply each column's corrections; values not listed are left as they are.
for column, (to_replace, value) in replace_mapping.items():
    df[column] = df[column].replace(to_replace, value)

# Save the cleaned frame. The DataFrame index is written as the first,
# unnamed CSV column because `index=False` is not passed.
df.to_csv("data_clean/household_pre_survey_clean.csv")

In [5]:
# Convert count-like columns to nullable integers.
# NOTE(review): some of these names (e.g. 'avg_monthly_household_income',
# 'female_schooling', 'male_schooling', 'electronics_count',
# 'kerosene_lamp_usage_count') are not in the `cols_pre` list used when
# loading the CSV - confirm the column names against the data.
cols_num = ['avg_monthly_household_income', 'household_headcount',
            'female_schooling', 'male_schooling', 'electronics_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamp_usage_count',
            'business_owners_count', 'business_owners_female']

for col in cols_num:
    # Going through float first parses numeric text (including 'nan');
    # 'Int64' is pandas' nullable integer dtype, so missing values survive.
    df[col] = df[col].astype(float).astype('Int64')


# Columns to binary-encode.
cols_cat_bin = ['community_lights', 'home_exterior_lights',
                'clinic_electricity_access',
                'clinic_refrigeration_access',
                ]

# The earlier case standardisation casts every column with astype(str), which
# can leave missing answers as the literal text 'nan'. The encoder would treat
# that text as an ordinary category, so handle_missing="return_nan" would never
# trigger. Convert it back to a real missing value first.
df[cols_cat_bin] = df[cols_cat_bin].replace('nan', np.nan)

# Binary encoding; missing or unseen values yield NaN in the encoded columns.
enc_bin = ce.BinaryEncoder(cols=cols_cat_bin,
                           handle_unknown="return_nan",
                           handle_missing="return_nan",
                           return_df=True)

df = enc_bin.fit_transform(df)

# NOTE(review): a commented-out `cols_cat_nom = ['cooking_energy_sources',
# 'phone_charge_location']` was left here - presumably nominal encoding of
# those columns is still pending; confirm.

In [6]:
# Ordered answer levels per column, listed from lowest to highest rank.
# NOTE(review): the '*_monthly_cost' names are not in the `cols_pre` list used
# when loading the CSV - confirm the column names against the data.
cols_ord_map = {
    'cooking_fuel_collection_time': [
        'less_than_1_hour', '1_2_hours', '3_5_hours', 'greater_than_5_hours',
    ],
    'cooking_energy_monthly_cost': [
        '0_1000_nkes', '1000_1500_nkes', '1500_2000_nkes',
        '2000_3000_nkes', '3000_4000_nkes',
    ],
    'applicances_charging_monthly_cost': [
        '0_150_nkes', '150_1000_nkes', '1000_3000_nkes',
        '3000_4000_nkes', '4000_6000_nkes',
    ],
    'feel_safe_dark': [
        'very_unsafe', 'somewhat_unsafe', 'neither_safe_nor_unsafe',
        'somewhat_safe', 'very_safe',
    ],
    'feel_safe_if_exterior_lights': [
        'very_unsafe', 'somewhat_unsafe', 'neither_safe_nor_unsafe',
        'somewhat_safe', 'very_safe',
    ],
    'water_source': [
        'dirty_water_source_pond_contaminated_well_etc',
        'clear_water_source_fresh_spring_lake_etc',
        'community_well_or_pump',
        'at_home_tap',
    ],
    'water_collection_travel_distance': [
        'less_than_1_km', '1_2_km', '2_5_km', '5_10_km', 'greater_than_10_km',
    ],
    'water_collection_time': [
        'less_than_1_hour', '1_2_hours', '2_3_hours', '3_4_hours',
        'greater_than_4_hours',
    ],
    'water_monthly_cost': [
        'i_dont_pay_its_free', '0_500_nkes', '500_3000_nkes',
        '3000_5000_nkes', '5000_nkes_and_above',
    ],
    'clinic_travel_distance': [
        'less_than_1_km', 'between_1_2_km', 'between_2_3_km',
        'between_3_5_km', 'greater_than_5_km',
    ],
    'phone_charge_monthly_cost': [
        '0_100_nkes', '100_500_nkes', '500_750_nkes', '750_1000_nkes',
        '1000_nkes_and_above',
    ],
}

# One {"col", "mapping"} entry per column; each level is mapped to its
# 1-based position in the list above.
cats_ord_map = [
    {
        "col": column,
        "mapping": {level: rank for rank, level in enumerate(levels, start=1)},
    }
    for column, levels in cols_ord_map.items()
]

# Ordinal encoding with the explicit mapping; the encoder is configured to
# return NaN for unknown and missing values.
enc_ord = ce.OrdinalEncoder(
    mapping=cats_ord_map,
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)
df = enc_ord.fit_transform(df)

# The raw owners column is dropped before the encoded frame is saved.
df = df.drop(columns=['household_business_owners'])
df.to_csv("data_encoded/household_pre_survey_encoded.csv")