In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Transform into ratio
cols_pre = [
'connection_period',
'renewvia_id', 'country',
'nigeria_community', 'kenya_community', 'age', 'gender',
'occupation', 'primary_provider', 'primary_provider_occupation',
'employement_type', 'avg_household_income', 'household_headcount',
'adult_headcount', 'girls_headcount', 'boys_headcount', 'girls_age',
'boys_age', 'girls_schooling', 'girls_unschooled_reasons',
'boys_schooling',
'boys_unschooled_reasons', 
'household_business_owners', 'minigrid_signup_primary_reason',
'minigrid_signup_secondary_reason', 'power_sources',
'power_sources_usage',
'power_sources_primary',
'appliances_count', 'cellphones_count', 'appliances_type',
'appliances_explain', 'appliances_addition_type',
'light_hours_current',
'light_primary_sources', 'kerosene_lamps_count',
'kerosene_lamp_usage_time', 'kerosene_lamps_cost',
'cooking_energy_source', 
'cooking_fuel_collection_time', 'cooking_fuel_responsible',
'cooking_energy_cost', 'applicances_charging_sources',
'applicances_charging_cost',
'feel_safe_dark', 'community_lights', 'home_exterior_lights',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'phone_charge_location',
'phone_charge_frequency',
'phone_charge_cost', 'phone_charge_travel_distance', 'water_source',
'clean_drinking_water', 'clean_drinking_water_source',
'clean_water_source', 'water_collection_travel_distance',
'water_collection_time', 'water_collection_responsible',
'avg_person_age_water_collection', 'water_cost',
'other_household_activities', 'clinic_travel_distance',
'clinic_electricity_access',
'clinic_refrigeration_access', 'end_date', 'customerAccountNumber',
'tariff', 
'occupation_secondary_provider',
'cooking_energy_sources', 'community_clean_water_source'
]

#Importing the datasets
df = pd.read_csv("datasets_clean/hs_pre_annotated.csv", usecols=cols_pre)
df.replace('nan', np.nan, inplace=True)
# standardize case
df = df.apply(lambda x: x.astype(str).str.lower())

# Count of multiple selection columns
df['household_business_owners'] = df['household_business_owners'].map(lambda x: x.replace("none;", ""))
df['business_owners_count'] = df['household_business_owners'].map(lambda x: len(x[:-1].split(";")) 
                                                                  if x != ""
                                                                  else 0)
df['business_owners_female'] = df['household_business_owners'].map(lambda x: 1 
                                                                            if x != ""
                                                                            and ('female' in x) 
                                                                            else 0)

df.head()

Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source,business_owners_count,business_owners_female
0,,501121,nigeria,akipelai,,,,,,,...,,,2021-09-28,501121,residential,,,,1,0
1,3-6_months,570063,nigeria,balep,,45,male,farming,yes,,...,no,no,2021-10-24,570063,residential,,,,1,1
2,3-6_months,570028,nigeria,balep,,52 years,male,education,yes,,...,no,no,2021-10-24,570028,residential,,,,1,1
3,3-6_months,570097,nigeria,balep,,27 years,male,education,no,farming,...,no,no,2021-10-25,570097,residential,,,,1,0
4,,570046,nigeria,balep,,30,male,education,yes,education,...,no,no,2021-10-26,570046,residential,,,,1,0


In [3]:
# news = []
# for col in list(df.columns):
#     uq = df[col].unique()
#     if len(uq) < 10:
#         print(col, uq)
#         news.append(col)

# news

In [4]:
cols_change = [
    'connection_period',
 'country',
 'nigeria_community',
 'gender',
 'primary_provider',
 'employement_type',
 'boys_unschooled_reasons',
 'household_business_owners',
 'power_sources_usage',
 'kerosene_lamp_usage_time',
 'kerosene_lamps_cost',
 'cooking_energy_source',
 'cooking_fuel_collection_time',
 'cooking_energy_cost',
 'applicances_charging_sources',
 'feel_safe_dark',
 'community_lights',
 'home_exterior_lights',
 'feel_safe_if_exterior_lights',
 'feel_unsafe_reasons',
 'phone_charge_location',
 'phone_charge_frequency',
 'phone_charge_travel_distance',
 'water_source',
 'clean_drinking_water',
 'clean_drinking_water_source',
 'water_collection_travel_distance',
 'water_collection_time',
 'water_collection_responsible',
 'avg_person_age_water_collection',
 'water_cost',
 'clinic_travel_distance',
 'clinic_electricity_access',
 'clinic_refrigeration_access',
 'tariff',
 'cooking_energy_sources',]

def replace_str(string):
    replace_chars = {'_': ['- ',' (', ' ', '-'],
                    '': ['.)', ',', '/', 'u\n', "'", "."]
                    }

    for new, old in replace_chars.items():
        for c in old:
            string = string.replace(c, new)

    return string

for col in cols_change:
    df[col] = df[col].apply(lambda x: replace_str(x))
    
df.head()

Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source,business_owners_count,business_owners_female
0,,501121,nigeria,akipelai,,,,,,,...,,,2021-09-28,501121,residential,,,,1,0
1,3_6_months,570063,nigeria,balep,,45,male,farming,yes,,...,no,no,2021-10-24,570063,residential,,,,1,1
2,3_6_months,570028,nigeria,balep,,52 years,male,education,yes,,...,no,no,2021-10-24,570028,residential,,,,1,1
3,3_6_months,570097,nigeria,balep,,27 years,male,education,no,farming,...,no,no,2021-10-25,570097,residential,,,,1,0
4,,570046,nigeria,balep,,30,male,education,yes,education,...,no,no,2021-10-26,570046,residential,,,,1,0


In [5]:
# # replace values
# replace_mapping = {
#     'cooking_fuel_collection_time': [['copy_1_of_hours','hours'],
#                                      ['less_than_1_hour',  '3_5_hours',]],
    
#     'cooking_energy_monthly_cost' : [['0_1000_knes', 'copy_1_of_nkes', 
#                                       '1000_1500_knes', 'copy_2_of_nkes', 
#                                       '1500_2000_knes', 'copy_3_of_nkes',
#                                       '2000_3000_knes','copy_4_of_nkes',
#                                      'nkes', ], 
#                                     ['0_1000_nkes', '0_1000_nkes', 
#                                      '1000_1500_nkes', '1000_1500_nkes',
#                                     '1500_2000_nkes', '1500_2000_nkes', 
#                                     '2000_3000_nkes', '2000_3000_nkes',
#                                     '3000_4000_nkes']],
    
#     'applicances_charging_monthly_cost' : [['copy_1_of_nkes', 'copy_2_of_nkes', 
#                                           'copy_3_of_nkes','copy_4_of_nkes',
#                                            'nkes',], 
#                                             ['0_150_nkes', '150_1000_nkes', 
#                                             '1000_3000_nkes', '3000_4000_nkes',
#                                             '4000_6000_nkes']],
    
#     'phone_charge_monthly_cost': [['copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes', 
#                                    'nkes', 'nkes_and_above', '1000_n_and_above', 
#                                   ], 
#                                     ['0_100_nkes','100_500_nkes', '500_750_nkes',
#                                      '750_1000_nkes',
#                                      '1000_nkes_and_above', '1000_nkes_and_above']],
    
#     'community_lights': [['street_lights', 'no_none', 'other'], 
#                         ['yes', 'no', 'no']],
    
#     'clean_drinking_water_source': ['clean_community_source_','clean_community_source',],
    
#     'water_collection_travel_distance': [['copy_1_of_km','copy_2_of_km','km', ],
#                                          ['less_than_1_km',  '1_2_km', '5_10_km']],
    
#     'water_collection_time': [['copy_1_of_hours','hours'],
#                              ['less_than_1_hour',  '3_4_hours',]],
    
#     'water_monthly_cost': [['copy_1_of_nkes', 'copy_2_of_nkes', 
#                             'nkes', 'nkes_and_above'], 
#                           ['i_dont_pay_its_free', '0_500_nkes', 
#                           '3000_5000_nkes', '5000_nkes_and_above',]],
#     'clinic_travel_distance': ['less_than_1km', 'less_than_1_km']
    
# }

# for col, mapping in replace_mapping.items():
#     df[col] = df[col].replace(mapping[0], mapping[1])

# # df.head()
# # df.to_csv("data_clean/household_pre_survey_clean.csv")

In [6]:
# Convert to numeric columns
cats_map = list()
main_cols = ['renewvia_id', ]
cols_num = ['avg_household_income', 'household_headcount', 
            'girls_schooling', 'boys_schooling', 'appliances_count', 
           'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
           'business_owners_count', 'business_owners_female']

main_cols.extend(cols_num)
for col in cols_num:
    # print(col)
    # df[col] = df[col].fillna(-1)
    df[col] = df[col].astype(float).astype('Int64')
    

# Encoding binary variables
cols_bin = ['community_lights', 'home_exterior_lights', 
                'clinic_electricity_access', 
                'clinic_refrigeration_access',
                ]

main_cols.extend(cols_bin)
#Binary Encoding
for col in cols_bin:
    cats_map.append({"col":col, "mapping": {'no':0, 'yes':1, 
                                            'option 1': 0, 
                                           'option 2': 1}})

# cols_cat_nom = ['cooking_energy_sources', 'phone_charge_location']

In [7]:
cols_ord_scale = {
'cooking_fuel_collection_time': ['less_than_1_hour', '1_2_hours', 
                                 '3_5_hours', 'greater_than_5_hours', ],
'cooking_energy_cost':  ['0_1000_nkes', '1000_1500_nkes', 
                                 '1500_2000_nkes','2000_3000_nkes', 
                                 '3000_4000_nkes'],
'applicances_charging_cost': ['0_150_nkes', '150_1000_nkes', 
                                     '1000_3000_nkes', '3000_4000_nkes', 
                                    '4000_6000_nkes'],
'feel_safe_dark': ['very_unsafe', 'somewhat_unsafe', 
                  'neither_safe_nor_unsafe', 
                   'somewhat_safe','very_safe',],
'feel_safe_if_exterior_lights': ['very_unsafe', 'somewhat_unsafe', 
                                  'neither_safe_nor_unsafe', 
                                 'somewhat_safe','very_safe',], 
'water_source': ['dirty_water_source_pond_contaminated_well_etc', 
                 'clear_water_source_fresh_spring_lake_etc', 
                 'community_well_or_pump',
                 'at_home_tap', ],    
'water_collection_travel_distance': ['less_than_1_km', '1_2_km', 
                                      '2_5_km', '5_10_km',
                                      'greater_than_10_km', ],
'water_collection_time':  ['less_than_1_hour', '1_2_hours', 
                           '2_3_hours', '3_4_hours', 
                           'greater_than_4_hours', ],
'water_cost':  ['i_dont_pay_its_free', '0_500_nkes',  
                        '500_3000_nkes', '3000_5000_nkes',
                        '5000_nkes_and_above',],
'clinic_travel_distance':  ['less_than_1_km', 'between_1_2_km', 
                            'between_2_3_km', 'between_3_5_km',
                           'greater_than_5_km', ],
'phone_charge_cost': ['0_100_nkes', '100_500_nkes',
                              '500_750_nkes',  '750_1000_nkes',  
                             '1000_nkes_and_above'],
}

main_cols.extend(list(cols_ord_scale.keys()))

for col, values in cols_ord_scale.items():
    val_ord = dict()
    rng = len(values)
    for idx, val in enumerate(values):
        val_ord[val] = idx+1
    cats_map.append({"col":col, "mapping": val_ord})

    
#Ordinal Encoding
df_subset = df[main_cols]
enc_ord = ce.OrdinalEncoder(mapping=cats_map, 
                            handle_unknown="return_nan",
                            handle_missing="return_nan",
                            return_df=True)

# df_subset
df_subset = enc_ord.fit_transform(df_subset)
# df_subset
# # df.drop(columns=['household_business_owners'], inplace=True)
df_subset.to_csv("datasets_encoded/hs_subset_pre_survey_encoded.csv")