In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Transform into ratio
filter_cols = [
'renewvia_id',
'country',
'community',
'connection_period',
'business_type',
'operation_status',
'electricity_sources_non_minigrid',
'appliances_addition',
'kerosene_usage_change',
'diesel_usage_change',
'operations_hours_change',
'hours_increase',
'clean_drinking_water_access',
'new_prod_serv_add',
'workforce_change',
'workforce_change_female',
'weekly_monthly_earnings',
'health_offering_change',
'school_attendance_change',
'school_attendance_performance',
]

# Importing the commecial dataset
df = pd.read_excel("data_clean/commercial_post_connection_merged.xlsx", 
                   usecols=filter_cols)
df.head()

Unnamed: 0,renewvia_id,country,community,connection_period,business_type,operation_status,electricity_sources_non_minigrid,appliances_addition,kerosene_usage_change,diesel_usage_change,operations_hours_change,hours_increase,clean_drinking_water_access,new_prod_serv_add,workforce_change,workforce_change_female,weekly_monthly_earnings,health_offering_change,school_attendance_change,school_attendance_performance
0,131206,Kenya,Kalobeyei Settlement,Longer than 2 years,Shop,,"No, just Renewvia Minigrid",Cold storage - refrigerator or freezer;Compute...,"Yes, it has decreased","Yes, it has decreased","Yes, they have increased",1-2 hours daily,"Yes, it has increased",Yes,"No, the number has remained the same",,"Yes, they have increased",None of these;,"No, we have not noticed a change","No, there has not been a noticable change"
1,131231,Kenya,Kalobeyei Settlement,Longer than 2 years,Other business,,"No, just Renewvia Minigrid",Cold storage - refrigerator or freezer;Exterio...,"Yes, it has decreased","Yes, it has decreased","Yes, they have increased",1-2 hours daily,"Yes, it has increased",Yes,"Yes, we have added workers",,"Yes, they have increased",None of these;,"No, we have not noticed a change","No, there has not been a noticable change"
2,131542,Kenya,Kalobeyei Settlement,Longer than 2 years,Shop,,"No, just Renewvia Minigrid",Cold storage - refrigerator or freezer;,"Yes, it has decreased","Yes, it has decreased","Yes, they have increased",1-2 hours daily,"Yes, it has increased",Yes,"No, the number has remained the same",,"Yes, they have increased",None of these;,"No, we have not noticed a change","No, there has not been a noticable change"
3,131220,Kenya,Kalobeyei Settlement,Longer than 2 years,Shop,,"No, just Renewvia Minigrid",Cold storage - refrigerator or freezer;Compute...,"Yes, it has increased","Yes, it has increased","Yes, they have increased",2-4 hours daily,"Yes, it has increased",Yes,"Yes, we have added workers",,"Yes, they have increased",None of these;,"No, we have not noticed a change","No, there has not been a noticable change"
4,131485,Kenya,Kalobeyei Settlement,Longer than 2 years,Shop,,"No, just Renewvia Minigrid",No additional appliances;,"Yes, it has decreased","Yes, it has decreased","Yes, they have increased",1-2 hours daily,"Yes, it has increased",No,"No, the number has remained the same",,"Yes, they have increased",None of these;,"No, we have not noticed a change","No, there has not been a noticable change"


In [3]:
# standardize case
df = df.apply(lambda x: x.astype(str).str.lower())

cols_change = [
'community',
 'connection_period',
 'business_type',
 'operation_status',
 'electricity_sources_non_minigrid',
 'appliances_addition',
 'kerosene_usage_change',
 'diesel_usage_change',
 'operations_hours_change',
 'hours_increase',
 'clean_drinking_water_access',
 'new_prod_serv_add',
 'workforce_change',
 'weekly_monthly_earnings',
 'health_offering_change',
 'school_attendance_change',
 'school_attendance_performance'
]

def replace_str(string):
    replace_chars = {'_': ['- ',' (', ' ', '-'],
                    '': ['.)', ',', '/', 'u\n', "'", "."]
                    }

    for new, old in replace_chars.items():
        for c in old:
            string = string.replace(c, new)

    return string

for col in cols_change:
    df[col] = df[col].apply(lambda x: replace_str(x))
    
    
# replace values
replace_mapping = {
    'operation_status': ['no_it_closed', 'no_its_closed'],
    'connection_period': [['years', 'months'], ['nan', 'nan']],
    
    'hours_increase': [['copy_1_of_hours_daily', 'hours_daily'],
                       ['1_2_hours_daily', 'nan']],
    
    'school_attendance_performance': ['no_there_has_not_been_a_noticable_change',
                            'no_there_has_not_been_a_noticeable_change'],
}

for col, mapping in replace_mapping.items():
    # print(col, mapping)
    df[col] = df[col].replace(mapping[0], mapping[1])
    

df["workforce_change_female"] = df["workforce_change_female"].astype(float).astype('Int64')

# No. of additional appliances
df['appliances_addition_count'] = df['appliances_addition'].map(lambda x: len(x.split(";")) 
                                                                  if type(x) == str else 0)
# No. of new services offered by clinic
df['health_offering_change_count'] = df['health_offering_change'].map(lambda x: len(x.split(";")) 
                                                                  if type(x) == str else 0)

df.replace('nan', np.nan, inplace=True)
# Drop list of appliances and services
df.drop(['appliances_addition', 'health_offering_change'], axis=1, inplace=True)
df.to_csv('data_clean/commercial_post_connection_clean.csv')
df.head()

Unnamed: 0,renewvia_id,country,community,connection_period,business_type,operation_status,electricity_sources_non_minigrid,kerosene_usage_change,diesel_usage_change,operations_hours_change,hours_increase,clean_drinking_water_access,new_prod_serv_add,workforce_change,workforce_change_female,weekly_monthly_earnings,school_attendance_change,school_attendance_performance,appliances_addition_count,health_offering_change_count
0,131206,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,1_2_hours_daily,yes_it_has_increased,yes,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change,4,2
1,131231,kenya,kalobeyei_settlement,longer_than_2_years,other_business,,no_just_renewvia_minigrid,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,1_2_hours_daily,yes_it_has_increased,yes,yes_we_have_added_workers,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change,5,2
2,131542,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,1_2_hours_daily,yes_it_has_increased,yes,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change,2,2
3,131220,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,yes_it_has_increased,yes_it_has_increased,yes_they_have_increased,2_4_hours_daily,yes_it_has_increased,yes,yes_we_have_added_workers,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change,6,2
4,131485,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,1_2_hours_daily,yes_it_has_increased,no,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change,2,2


In [4]:
cols_ord = [
"operation_status",
"kerosene_usage_change",
"diesel_usage_change",
"operations_hours_change",
"clean_drinking_water_access",
"workforce_change",
"weekly_monthly_earnings",
"school_attendance_change",
"school_attendance_performance",
]

# Creating the mapping for each categroical variable with ordinality
cats_ord_map = list()
for col in cols_ord:
    val_ord = dict()
    cats = list(df[col].unique())
    for cat in cats:
        cat = str(cat)
        if cat != 'nan':
            if any(map(cat.__contains__, ['no', 'same'])):
                val_ord[cat] = 0
            elif any(map(cat.__contains__, ['lost', 'decrease'])):
                val_ord[cat] = -1
            elif any(map(cat.__contains__, ['still', 'add', 
                                            'better', 'increase',
                                           'greater'])):
                val_ord[cat] = 1

    cats_ord_map.append({"col":col, "mapping": val_ord})

In [5]:
#Binary Encoding
# enc_bin = ce.BinaryEncoder(cols =["new_prod_serv_add"], 
#                             handle_unknown="return_nan",
#                             handle_missing="return_nan", 
#                             return_df=True)
# df = enc_bin.fit_transform(df)

#Ordinal Encoding
enc_ord = ce.OrdinalEncoder(mapping=cats_ord_map, 
                            handle_unknown="return_nan",
                            handle_missing="return_nan",
                            return_df=True)

df = enc_ord.fit_transform(df)
df.to_csv("data_encoded/commercial_post_survey_encoded.csv")
df.head()

Unnamed: 0,renewvia_id,country,community,connection_period,business_type,operation_status,electricity_sources_non_minigrid,kerosene_usage_change,diesel_usage_change,operations_hours_change,hours_increase,clean_drinking_water_access,new_prod_serv_add,workforce_change,workforce_change_female,weekly_monthly_earnings,school_attendance_change,school_attendance_performance,appliances_addition_count,health_offering_change_count
0,131206,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,-1.0,-1.0,1.0,1_2_hours_daily,1.0,yes,0.0,,1.0,0.0,0.0,4,2
1,131231,kenya,kalobeyei_settlement,longer_than_2_years,other_business,,no_just_renewvia_minigrid,-1.0,-1.0,1.0,1_2_hours_daily,1.0,yes,1.0,,1.0,0.0,0.0,5,2
2,131542,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,-1.0,-1.0,1.0,1_2_hours_daily,1.0,yes,0.0,,1.0,0.0,0.0,2,2
3,131220,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,1.0,1.0,1.0,2_4_hours_daily,1.0,yes,1.0,,1.0,0.0,0.0,6,2
4,131485,kenya,kalobeyei_settlement,longer_than_2_years,shop,,no_just_renewvia_minigrid,-1.0,-1.0,1.0,1_2_hours_daily,1.0,no,0.0,,1.0,0.0,0.0,2,2
