In [16]:
import pandas as pd
import numpy as np
import json
import collections
import re
import category_encoders as ce

In [67]:
# Columns to read from the annotated survey CSV (passed as `usecols` when the
# file is loaded). Names are kept exactly as spelled here, including
# 'employement_type' and the 'applicances_*' columns.
cols_pre = [
    'connection_period',
    'renewvia_id',
    'country',
    'nigeria_community',
    'kenya_community',
    'age',
    'gender',
    'occupation',
    'primary_provider',
    'primary_provider_occupation',
    'employement_type',
    'avg_household_income',
    'household_headcount',
    'adult_headcount',
    'girls_headcount',
    'boys_headcount',
    'girls_age',
    'boys_age',
    'girls_schooling',
    'girls_unschooled_reasons',
    'boys_schooling',
    'boys_unschooled_reasons',
    'household_business_owners',
    'minigrid_signup_primary_reason',
    'minigrid_signup_secondary_reason',
    'power_sources',
    'power_sources_usage',
    'power_sources_primary',
    'appliances_count',
    'cellphones_count',
    'appliances_type',
    'appliances_explain',
    'appliances_addition_type',
    'light_hours_current',
    'light_primary_sources',
    'kerosene_lamps_count',
    'kerosene_lamp_usage_time',
    'kerosene_lamps_cost',
    'cooking_energy_source',
    'cooking_fuel_collection_time',
    'cooking_fuel_responsible',
    'cooking_energy_cost',
    'applicances_charging_sources',
    'applicances_charging_cost',
    'feel_safe_dark',
    'community_lights',
    'home_exterior_lights',
    'feel_safe_if_exterior_lights',
    'feel_unsafe_reasons',
    'phone_charge_location',
    'phone_charge_frequency',
    'phone_charge_cost',
    'phone_charge_travel_distance',
    'water_source',
    'clean_drinking_water',
    'clean_drinking_water_source',
    'clean_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_collection_responsible',
    'avg_person_age_water_collection',
    'water_cost',
    'other_household_activities',
    'clinic_travel_distance',
    'clinic_electricity_access',
    'clinic_refrigeration_access',
    'end_date',
    'customerAccountNumber',
    'tariff',
    'occupation_secondary_provider',
    'cooking_energy_sources',
    'community_clean_water_source',
]

# Load the annotated survey export, restricted to the columns in cols_pre.
df = pd.read_csv("datasets_clean/initial_annotated.csv", usecols=cols_pre)
print(len(df))

# Standardize case. astype(str) renders every missing cell as the literal
# string 'nan', so from here on the frame is all-string and 'nan' is the
# missing-value marker. (A prior replace of 'nan' with np.nan was undone by
# this very step, so it has been dropped.)
df = df.apply(lambda x: x.astype(str).str.lower())

# Count of multiple selection columns.
# Answers are ';'-separated; the "none;" token names no owner, so remove it.
df['household_business_owners'] = df['household_business_owners'].map(
    lambda x: x.replace("none;", ""))


def _owner_tokens(answer):
    """Split a ';'-separated owners answer into its non-empty tokens.

    Both an empty answer and a missing answer (the string 'nan') yield [].
    """
    if answer == 'nan':
        return []
    return [token for token in answer.split(";") if token]


owner_tokens = df['household_business_owners'].map(_owner_tokens)
# A missing answer now counts as 0 owners (it used to be counted as 1 because
# the 'nan' placeholder was split like a real answer); this agrees with
# business_owners_female, which is 0 for missing answers.
df['business_owners_count'] = owner_tokens.map(len)
df['business_owners_female'] = owner_tokens.map(
    lambda tokens: int(any('female' in token for token in tokens)))

# Assign the result back instead of calling replace(inplace=True) on the
# selected column: that chained in-place form is not guaranteed to update df
# (it does not under pandas Copy-on-Write).
df['avg_person_age_water_collection'] = (
    df['avg_person_age_water_collection']
    .replace('15 years or older', '15 -18 years old')
)
df.head()

3952


Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source,business_owners_count,business_owners_female
0,,501121.0,nigeria,akipelai,,,,,,,...,,,2021-09-28,501121.0,residential,,,,1,0
1,3-6_months,570063.0,nigeria,balep,,45,male,farming,yes,,...,no,no,2021-10-24,570063.0,residential,,,,1,1
2,3-6_months,570028.0,nigeria,balep,,52 years,male,education,yes,,...,no,no,2021-10-24,570028.0,residential,,,,1,1
3,3-6_months,,nigeria,balep,,23 years,male,farming,yes,education,...,no,no,2021-10-24,,,,,,1,1
4,3-6_months,570097.0,nigeria,balep,,27 years,male,education,no,farming,...,no,no,2021-10-25,570097.0,residential,,,,1,0


In [68]:
# news = []
# for col in list(df.columns):
#     uq = df[col].unique()
#     if len(uq) < 10:
#         print(col, uq)
#         news.append(col)

# # news

In [69]:
# Distinct values currently stored in connection_period.
pd.unique(df['connection_period'])

array(['nan', '3-6_months', '6-12 months', 'choice5', 'choice6',
       'over 36 months', '12-24 months', '1-3 months', '3-6 months',
       '24-36 months'], dtype=object)

In [62]:
# Columns whose text values are normalized with replace_str.
cols_change = [
    'connection_period', 'country', 'nigeria_community', 'gender',
    'primary_provider', 'employement_type', 'boys_unschooled_reasons',
    'household_business_owners', 'power_sources_usage',
    'kerosene_lamp_usage_time', 'kerosene_lamps_cost',
    'cooking_energy_source', 'cooking_fuel_collection_time',
    'cooking_energy_cost', 'applicances_charging_sources',
    'feel_safe_dark', 'community_lights', 'home_exterior_lights',
    'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
    'phone_charge_location', 'phone_charge_frequency',
    'phone_charge_travel_distance',
    'water_source', 'clean_drinking_water', 'clean_drinking_water_source',
    'water_collection_travel_distance', 'water_collection_time',
    'water_collection_responsible', 'avg_person_age_water_collection',
    'water_cost',
    'clinic_travel_distance', 'clinic_electricity_access',
    'clinic_refrigeration_access',
    'tariff', 'cooking_energy_sources',
]

def replace_str(string):
    """Normalize an answer string into an underscore-separated token.

    Applies a fixed sequence of literal substring replacements, in order:
    separators (spaces, hyphens, an opening parenthesis preceded by a space)
    become '_', one pass collapses '__' to '_', and then punctuation is
    deleted.

    Args:
        string: the text to normalize (must support ``str.replace``).

    Returns:
        The normalized string.
    """
    # Order matters: each replacement runs on the output of the previous one.
    substitutions = (
        # separators -> underscore
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        # single, non-overlapping pass: '___' ends up as '__', and underscores
        # left adjacent by the deletions below are not collapsed
        ("__", '_'),
        # punctuation -> removed
        (")", ''),
        (";", ''),
        # NOTE(review): cannot match here, every ')' was removed two steps up
        ('.)', ''),
        (',', ''),
        ('/', ''),
        # NOTE(review): removes a 'u' followed by a newline; possibly a typo
        # for a plain newline -- confirm before changing
        ('u\n', ''),
        ("'", ''),
        (".", ''),
    )

    result = string
    for old, new in substitutions:
        result = result.replace(old, new)
    return result

# Normalize, value by value, the text of every column listed in cols_change.
for col in cols_change:
    df[col] = df[col].map(replace_str)

df.head()

Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source,business_owners_count,business_owners_female
0,,501121.0,nigeria,akipelai,,,,,,,...,,,2021-09-28,501121.0,residential,,,,1,0
1,3_6_months,570063.0,nigeria,balep,,45,male,farming,yes,,...,no,no,2021-10-24,570063.0,residential,,,,1,1
2,3_6_months,570028.0,nigeria,balep,,52 years,male,education,yes,,...,no,no,2021-10-24,570028.0,residential,,,,1,1
3,3_6_months,,nigeria,balep,,23 years,male,farming,yes,education,...,no,no,2021-10-24,,,,,,1,1
4,3_6_months,570097.0,nigeria,balep,,27 years,male,education,no,farming,...,no,no,2021-10-25,570097.0,residential,,,,1,0


In [63]:
# Convert to numeric columns
cats_map = list()
main_cols = ['renewvia_id', ]
cols_num = ['avg_household_income', 'household_headcount',
            'girls_schooling', 'boys_schooling', 'appliances_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
            'business_owners_count', 'business_owners_female']

main_cols.extend(cols_num)
for col in cols_num:
    # Keep only the first run of digits in each value; values without any
    # digit (including the 'nan' placeholder) become <NA>.
    # The pattern is a raw string so that \d is not an invalid string escape.
    # NOTE(review): for answers with several numbers (e.g. a range or a
    # thousands separator) only the leading digits are kept -- confirm that
    # this is intended for avg_household_income.
    digits = df[col].astype(str).str.extract(r'(\d+)', expand=False)
    # Go through float first so missing values survive the cast to the
    # nullable integer dtype.
    df[col] = digits.astype(float).astype('Int64')
    

# Binary columns and the 0/1 encoding registered for each of them.
cols_bin = [
    'community_lights',
    'home_exterior_lights',
    'clinic_electricity_access',
    'clinic_refrigeration_access',
]
main_cols.extend(cols_bin)

# One entry per binary column, each carrying its own copy of the mapping.
cats_map.extend(
    {"col": col, "mapping": {'no': 0, 'yes': 1, 'option 1': 0, 'option 2': 1}}
    for col in cols_bin
)

# cols_cat_nom = ['cooking_energy_sources', 'phone_charge_location']

In [64]:
cols = list(df.columns)
# mapping missing or mistyped answers
# JSON text is UTF-8, so read it as such instead of relying on the platform's
# default locale encoding.
with open('col_answers_mapping.json', 'r', encoding='utf-8') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['col_answers_map']
n_col = len(answers)

# Build one {"col": ..., "mapping": ...} entry per described column that is
# present in df and has a non-empty answer mapping.
c_map = []
for answer in answers:
    col = answer['new_name']
    mapping = answer['mapping']
    if col not in cols or mapping == {}:
        continue

    # Normalize the mapped answers the same way the dataframe text is
    # normalized (lower-case + replace_str).
    new_vals = [replace_str(v.lower()) for v in mapping.values()]

    if "yes" in new_vals and "no" in new_vals:
        # Yes/no question: fixed binary encoding.
        codes = {'no': 0, 'yes': 1, 'option 1': 0, 'option 2': 1}
    else:
        # Otherwise number the answers 1..n in the order they appear in the
        # JSON mapping. A value that occurs more than once keeps the code of
        # its last occurrence, so codes are not necessarily contiguous.
        codes = {val: idx for idx, val in enumerate(new_vals, start=1)}

    c_map.append({"col": col, "mapping": codes})
            
# c_map

In [65]:
# Replace the normalized answers with their numeric codes, column by column.
for entry in c_map:
    target = entry['col']
    df[target] = df[target].replace(entry['mapping'])


# Print every column with fewer than 10 distinct values and remember its name.
news = []
for col in df.columns:
    distinct = df[col].unique()
    if len(distinct) >= 10:
        continue
    print(col, distinct)
    news.append(col)

connection_period ['nan' 3 2 'choice5' 'choice6' 7 5 '1_3_months' 6]
country [1 'nan' 2]
nigeria_community [1 5 6 9 4 'nan' 2 3 7]
gender ['nan' 2 1]
primary_provider ['nan' 1 0]
employement_type ['nan' 1 3 4 5 2]
boys_unschooled_reasons ['nan' 1 8 3 7 4 6 2 5]
household_business_owners ['nan' 3 2 '' 'adult_maleadult_female' 'adult_femaleadult_male']
power_sources_usage ['nan' 3 2 4 1]
kerosene_lamp_usage_time ['nan' 2 1 3 4]
kerosene_lamps_cost ['nan' 3 2 1 5 4]
cooking_energy_source ['nan' 3 2 6 1 4 5]
cooking_fuel_collection_time ['nan' 2 1 3 4]
cooking_energy_cost ['nan' 1 5 2 3 4]
applicances_charging_sources ['nan' 2 1 3 4 5 6]
feel_safe_dark ['nan' 5 4 3 1 2]
community_lights ['nan' 2 1 3 'yes' 'no']
home_exterior_lights ['nan' 0 1]
feel_safe_if_exterior_lights ['nan' 1 2 4 5 3]
feel_unsafe_reasons ['nan' 2 1 3 5 4
 'unsafe_travel_to_obtain_water_supplies_and_charging_phones']
phone_charge_location ['nan' 1 3 2 4]
phone_charge_frequency ['nan' 1 2 3 4]
phone_charge_travel_distan

In [46]:
# cols_ord_scale = {
# 'cooking_fuel_collection_time': ['less_than_1_hour', '1_2_hours', 
#                                  '3_5_hours', 'greater_than_5_hours', ],
# 'cooking_energy_cost':  ['0_1000_nkes', '1000_1500_nkes', 
#                                  '1500_2000_nkes','2000_3000_nkes', 
#                                  '3000_4000_nkes'],
# 'applicances_charging_cost': ['0_150_nkes', '150_1000_nkes', 
#                                      '1000_3000_nkes', '3000_4000_nkes', 
#                                     '4000_6000_nkes'],
# 'feel_safe_dark': ['very_unsafe', 'somewhat_unsafe', 
#                   'neither_safe_nor_unsafe', 
#                    'somewhat_safe','very_safe',],
# 'feel_safe_if_exterior_lights': ['very_unsafe', 'somewhat_unsafe', 
#                                   'neither_safe_nor_unsafe', 
#                                  'somewhat_safe','very_safe',], 
# 'water_source': ['dirty_water_source_pond_contaminated_well_etc', 
#                  'clear_water_source_fresh_spring_lake_etc', 
#                  'community_well_or_pump',
#                  'at_home_tap', ],    
# 'water_collection_travel_distance': ['less_than_1_km', '1_2_km', 
#                                       '2_5_km', '5_10_km',
#                                       'greater_than_10_km', ],
# 'water_collection_time':  ['less_than_1_hour', '1_2_hours', 
#                            '2_3_hours', '3_4_hours', 
#                            'greater_than_4_hours', ],
# 'water_cost':  ['i_dont_pay_its_free', '0_500_nkes',  
#                         '500_3000_nkes', '3000_5000_nkes',
#                         '5000_nkes_and_above',],
# 'clinic_travel_distance':  ['less_than_1_km', 'between_1_2_km', 
#                             'between_2_3_km', 'between_3_5_km',
#                            'greater_than_5_km', ],
# 'phone_charge_cost': ['0_100_nkes', '100_500_nkes',
#                               '500_750_nkes',  '750_1000_nkes',  
#                              '1000_nkes_and_above'],
# }

# main_cols.extend(list(cols_ord_scale.keys()))

# for col, values in cols_ord_scale.items():
#     val_ord = dict()
#     rng = len(values)
#     for idx, val in enumerate(values):
#         val_ord[val] = idx+1
#     cats_map.append({"col":col, "mapping": val_ord})

    
#Ordinal Encoding
# df_subset = df[main_cols]
# enc_ord = ce.OrdinalEncoder(mapping=c_map, 
#                             handle_unknown="return_nan",
#                             handle_missing="return_nan",
#                             return_df=True)

# # df_subset
# enc_ord.fit_transform(df)
# # df.drop(columns=['household_business_owners'], inplace=True)
# df_subset.to_csv("datasets_encoded/hs_subset_pre_survey_encoded.csv")

Unnamed: 0,connection_period,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,primary_provider,primary_provider_occupation,...,clinic_electricity_access,clinic_refrigeration_access,end_date,customerAccountNumber,tariff,occupation_secondary_provider,cooking_energy_sources,community_clean_water_source,business_owners_count,business_owners_female
0,,501121,1.0,,,,,,,,...,,,2021-09-28,501121.0,residential,,,,1,0
1,,570063,1.0,,,45,2.0,,,,...,0.0,0.0,2021-10-24,570063.0,residential,,,,1,1
2,,570028,1.0,,,52 years,2.0,8.0,,,...,0.0,0.0,2021-10-24,570028.0,residential,,,,1,1
3,,,1.0,,,23 years,2.0,,,,...,0.0,0.0,2021-10-24,,,,,,1,1
4,,570097,1.0,,,27 years,2.0,8.0,,,...,0.0,0.0,2021-10-25,570097.0,residential,,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3947,,,2.0,,,33.0,1.0,,,,...,1.0,1.0,2022-12-24,,,,firewood,,0,0
3948,,,2.0,,,29.0,2.0,,,,...,1.0,1.0,2022-12-24,,,,firewood,,0,0
3949,,,1.0,,,36.0,2.0,,,,...,,,2021-07-24,,,,firewood,,0,0
3950,,,2.0,,,29.0,2.0,,,,...,1.0,1.0,2022-12-24,,,,charcoal,,0,0
