In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Columns to read from the pre-connection household survey exports.
# Entries that are commented out are deliberately not loaded.
filter_cols_pre = [
    "renewvia_id", "avg_monthly_household_income",
    "household_headcount", "female_schooling",
    "male_schooling", "household_business_owners",
    "electronics_count", "cellphones_count",
    "light_hours_current", "kerosene_lamp_usage_count",
    "cooking_energy_sources", "cooking_fuel_collection_time",
    "cooking_energy_monthly_cost", "applicances_charging_monthly_cost",
    "feel_safe_dark", "community_lights", "home_exterior_lights",
    "feel_safe_if_exterior_lights",
    # "phone_charge_location",
    "phone_charge_monthly_cost", "water_source",
    "clean_drinking_water", "clean_drinking_water_source",
    "water_collection_travel_distance", "water_collection_time",
    "water_monthly_cost", "clinic_travel_distance",
    "clinic_electricity_access",
    # "clinic_open_hours",
    "clinic_refrigeration_access",
]

# Import the two pre-connection exports; only the columns above are read.
hs_pre_com = pd.read_csv("datasets_clean/household_pre_connection_commcare.csv",
                         usecols=filter_cols_pre)
hs_pre_ms = pd.read_csv("datasets_clean/household_pre_connection_ms_form.csv",
                        usecols=filter_cols_pre, encoding="ISO-8859-1")

# Stack both exports into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each file keeps its own row labels, so labels repeat
# and a label no longer identifies a single row.
df = pd.concat([hs_pre_com, hs_pre_ms], ignore_index=True)

# Treat the literal text 'nan' as a missing value (reassign rather than mutate
# in place, so re-running the cell always starts from the freshly loaded data).
df = df.replace('nan', np.nan)
df.head()

In [None]:
# No. of business owners
def _owner_tokens(value):
    """Split a ';'-separated business-owner answer into normalised tokens.

    Non-text values (e.g. NaN for an unanswered question) give an empty list.
    Each token is stripped, lowercased and has spaces turned into underscores,
    because this cell runs before the notebook's case/spacing clean-up and the
    raw answers may therefore differ in case or spacing between sources.
    Empty tokens (e.g. the one produced by a trailing ';') are dropped so they
    are not counted as an owner.
    """
    if not isinstance(value, str):
        return []
    normalised = (part.strip().lower().replace(' ', '_') for part in value.split(";"))
    return [token for token in normalised if token]


owners_raw = df['household_business_owners']

# Number of owners listed in the answer (0 when the answer is missing).
# NOTE(review): a literal answer such as 'none' would still count as one owner
# here, exactly as before -- confirm whether that value occurs in this column.
df['business_owners_count'] = owners_raw.map(lambda value: len(_owner_tokens(value)))

# 1 when any listed owner mentions 'adult_female', otherwise 0.
df['business_owners_female'] = owners_raw.map(
    lambda value: int(any('adult_female' in token for token in _owner_tokens(value)))
)

# Normalise letter case: cast every column to text, then lowercase it.
df = df.astype(str).apply(lambda column: column.str.lower())

# Columns whose text values are cleaned with replace_str.
cols_change = [
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'applicances_charging_monthly_cost',
    'phone_charge_monthly_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
]

def replace_str(string):
    """Turn a free-text answer into an underscore-separated token.

    Separators ('- ', ' (', ' ', '-') become '_', then punctuation
    ('.)', ',', '/', 'u' followed by a newline, "'", '.') is removed.
    The substitutions run in the order listed below, which matters:
    multi-character patterns are handled before the single characters
    they contain.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    substitutions = (
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),
        ("'", ''),
        ('.', ''),
    )

    cleaned = string
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)
    return cleaned

# Clean the text of every column listed in cols_change.
for col in cols_change:
    df[col] = df[col].apply(replace_str)

# The case-standardisation step casts every column with astype(str), which
# turns each missing cell into the literal text 'nan'. Convert those back to
# real NaN so that missing answers are not treated as a category of their own
# by later steps (the encoders are configured with handle_missing="return_nan").
df = df.replace('nan', np.nan)
    


In [4]:
# Value harmonisation table.
# Each column maps to a two-element list [old, new]: either two parallel lists
# (old values and their replacements, matched by position) or two plain strings
# for a single substitution.
replace_mapping = {
    'cooking_fuel_collection_time': [
        ['copy_1_of_hours', 'hours'],
        ['less_than_1_hour', '3_5_hours'],
    ],

    # Includes the misspelt 'knes' variants, which are mapped to 'nkes'.
    'cooking_energy_monthly_cost': [
        [
            '0_1000_knes', 'copy_1_of_nkes',
            '1000_1500_knes', 'copy_2_of_nkes',
            '1500_2000_knes', 'copy_3_of_nkes',
            '2000_3000_knes', 'copy_4_of_nkes',
            'nkes',
        ],
        [
            '0_1000_nkes', '0_1000_nkes',
            '1000_1500_nkes', '1000_1500_nkes',
            '1500_2000_nkes', '1500_2000_nkes',
            '2000_3000_nkes', '2000_3000_nkes',
            '3000_4000_nkes',
        ],
    ],

    'applicances_charging_monthly_cost': [
        [
            'copy_1_of_nkes', 'copy_2_of_nkes',
            'copy_3_of_nkes', 'copy_4_of_nkes',
            'nkes',
        ],
        [
            '0_150_nkes', '150_1000_nkes',
            '1000_3000_nkes', '3000_4000_nkes',
            '4000_6000_nkes',
        ],
    ],

    'phone_charge_monthly_cost': [
        [
            'copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes',
            'nkes', 'nkes_and_above', '1000_n_and_above',
        ],
        [
            '0_100_nkes', '100_500_nkes', '500_750_nkes',
            '750_1000_nkes',
            '1000_nkes_and_above', '1000_nkes_and_above',
        ],
    ],

    'community_lights': [
        ['street_lights', 'no_none', 'other'],
        ['yes', 'no', 'no'],
    ],

    'clean_drinking_water_source': ['clean_community_source_', 'clean_community_source'],

    'water_collection_travel_distance': [
        ['copy_1_of_km', 'copy_2_of_km', 'km'],
        ['less_than_1_km', '1_2_km', '5_10_km'],
    ],

    'water_collection_time': [
        ['copy_1_of_hours', 'hours'],
        ['less_than_1_hour', '3_4_hours'],
    ],

    'water_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes', 'nkes', 'nkes_and_above'],
        ['i_dont_pay_its_free', '0_500_nkes', '3000_5000_nkes', '5000_nkes_and_above'],
    ],

    'clinic_travel_distance': ['less_than_1km', 'less_than_1_km'],
}

# Apply each column's substitutions; values not listed are left untouched.
for col, (old_values, new_values) in replace_mapping.items():
    df[col] = df[col].replace(old_values, new_values)

df.head()

Unnamed: 0,renewvia_id,avg_monthly_household_income,household_headcount,female_schooling,male_schooling,household_business_owners,electronics_count,cellphones_count,light_hours_current,kerosene_lamp_usage_count,...,clean_drinking_water,clean_drinking_water_source,water_collection_travel_distance,water_collection_time,water_monthly_cost,clinic_travel_distance,clinic_electricity_access,clinic_refrigeration_access,business_owners_count,business_owners_female
0,501121.0,,,,,,,,,,...,,,,,,,,,0,0
1,570063.0,27000.0,3.0,,1.0,adult_female,5.0,3.0,5.0,,...,yes,bottled_water,less_than_1_km,less_than_1_hour,3000_5000_nkes,less_than_1_km,no,no,1,1
2,570028.0,30000.0,9.0,2.0,4.0,adult_female,5.0,8.0,5.0,,...,yes,boiled_water,less_than_1_km,,i_dont_pay_its_free,less_than_1_km,no,no,1,1
3,,30000.0,2.0,,,adult_female,4.0,2.0,5.0,,...,yes,clean_community_source,less_than_1_km,less_than_1_hour,i_dont_pay_its_free,less_than_1_km,no,no,1,1
4,570097.0,38000.0,7.0,2.0,3.0,,5.0,7.0,6.0,,...,yes,bottled_water,5_10_km,less_than_1_hour,i_dont_pay_its_free,less_than_1_km,no,no,0,0


In [6]:
# Columns holding counts/amounts that should be nullable integers.
cols_num = [
    'avg_monthly_household_income',
    'household_headcount',
    'female_schooling',
    'male_schooling',
    'electronics_count',
    'cellphones_count',
    'light_hours_current',
    'kerosene_lamp_usage_count',
    'business_owners_count',
    'business_owners_female',
]

# Cast in two steps, float first and then pandas' nullable 'Int64' dtype,
# so missing values survive as <NA> in an integer column.
df = (
    df
    .astype({name: float for name in cols_num})
    .astype({name: 'Int64' for name in cols_num})
)

# Two-valued columns to binary-encode. 'business_owners_female' appears in
# both lists: it is cast to Int64 above and then encoded here as well.
cols_cat_bin = [
    'community_lights',
    'home_exterior_lights',
    'clinic_electricity_access',
    'clinic_refrigeration_access',
    'business_owners_female',
]

# Binary encoding; unknown and missing values are returned as NaN.
enc_bin = ce.BinaryEncoder(
    cols=cols_cat_bin,
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)

df = enc_bin.fit_transform(df)

# Nominal columns are not encoded yet:
# cols_cat_nom = ['cooking_energy_sources', 'phone_charge_location']

In [8]:
# Ordered categories per column, listed from lowest to highest rank.
# NOTE(review): 'phone_charge_monthly_cost' also holds cost brackets but is not
# listed here -- confirm whether leaving it un-encoded is intentional.
cols_ord_map = {
    'cooking_fuel_collection_time': [
        'less_than_1_hour',
        '1_2_hours',
        '3_5_hours',
        'greater_than_5_hours',
    ],
    'cooking_energy_monthly_cost': [
        '0_1000_nkes',
        '1000_1500_nkes',
        '1500_2000_nkes',
        '2000_3000_nkes',
        '3000_4000_nkes',
    ],
    'applicances_charging_monthly_cost': [
        '0_150_nkes',
        '150_1000_nkes',
        '1000_3000_nkes',
        '3000_4000_nkes',
        '4000_6000_nkes',
    ],
    'feel_safe_dark': [
        'very_unsafe',
        'somewhat_unsafe',
        'neither_safe_nor_unsafe',
        'somewhat_safe',
        'very_safe',
    ],
    'feel_safe_if_exterior_lights': [
        'very_unsafe',
        'somewhat_unsafe',
        'neither_safe_nor_unsafe',
        'somewhat_safe',
        'very_safe',
    ],
    'water_source': [
        'dirty_water_source_pond_contaminated_well_etc',
        'clear_water_source_fresh_spring_lake_etc',
        'community_well_or_pump',
        'at_home_tap',
    ],
    'water_collection_travel_distance': [
        'less_than_1_km',
        '1_2_km',
        '2_5_km',
        '5_10_km',
        'greater_than_10_km',
    ],
    'water_collection_time': [
        'less_than_1_hour',
        '1_2_hours',
        '2_3_hours',
        '3_4_hours',
        'greater_than_4_hours',
    ],
    'water_monthly_cost': [
        'i_dont_pay_its_free',
        '0_500_nkes',
        '500_3000_nkes',
        '3000_5000_nkes',
        '5000_nkes_and_above',
    ],
    'clinic_travel_distance': [
        'less_than_1_km',
        'between_1_2_km',
        'between_2_3_km',
        'between_3_5_km',
        'greater_than_5_km',
    ],
}

# Build the encoder's mapping argument: one {"col", "mapping"} entry per
# column, with each category ranked 1..k in the order listed above.
cats_ord_map = [
    {
        "col": column_name,
        "mapping": {label: rank for rank, label in enumerate(levels, start=1)},
    }
    for column_name, levels in cols_ord_map.items()
]

# Ordinal encoding; unknown and missing values are returned as NaN.
enc_ord = ce.OrdinalEncoder(
    mapping=cats_ord_map,
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)

df = enc_ord.fit_transform(df)
df.head()

Unnamed: 0,renewvia_id,avg_monthly_household_income,household_headcount,female_schooling,male_schooling,household_business_owners,electronics_count,cellphones_count,light_hours_current,kerosene_lamp_usage_count,...,water_collection_time,water_monthly_cost,clinic_travel_distance,clinic_electricity_access_0,clinic_electricity_access_1,clinic_refrigeration_access_0,clinic_refrigeration_access_1,business_owners_count,business_owners_female_0,business_owners_female_1
0,501121.0,,,,,,,,,,...,,,,,,,,0,0.0,1.0
1,570063.0,27000.0,3.0,,1.0,adult_female,5.0,3.0,5.0,,...,,,,0.0,1.0,0.0,1.0,1,1.0,0.0
2,570028.0,30000.0,9.0,2.0,4.0,adult_female,5.0,8.0,5.0,,...,,,,0.0,1.0,0.0,1.0,1,1.0,0.0
3,,30000.0,2.0,,,adult_female,4.0,2.0,5.0,,...,,,,0.0,1.0,0.0,1.0,1,1.0,0.0
4,570097.0,38000.0,7.0,2.0,3.0,,5.0,7.0,6.0,,...,,,,0.0,1.0,0.0,1.0,0,0.0,1.0


In [20]:
# Columns to read from the post-connection household survey exports.
filter_cols_post = [
    "renewvia_account_number", "interviewed_before",
    'occupation_change', 'houlsehold_income_change',
    'avg_monthly_household_income', 'female_schooling_change',
    'male_schooling_change', 'school_performance_change',
    'household_business_owner',
    'business_recent', 'business_from_minigrid',
    'business_use_minigrid', 'electronics_count',
    'electronics_count_change', 'cellphones_count',
    'cellphones_count_change', 'appliances_count_addition',
    'light_hours_current', "kerosene_lamp_usage_change",
    'kerosene_lamp_usage_count', 'kerosene_lamp_usage_cost',
    'cooking_fuel_collection_time', 'cooking_energy_monthly_cost',
    'community_lights', 'home_exterior_lights',
    'exterior_lights_minigrid', 'feel_safe_dark',
    'feel_safe_if_exterior_lights', 'phone_charge_frequency',
    'phone_charge_monthly_cost', 'water_source',
    'clean_drinking_water', 'clean_drinking_water_source',
    'community_clean_water_source', 'water_collection_travel_distance',
    'water_collection_time', 'water_monthly_cost',
    'clinic_travel_distance', 'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access', 'better_access_health_minigrid',
    'minigrid_access_life_improvement',
]

# Import the two post-connection exports; only the columns above are read.
hs_post_com = pd.read_csv("datasets_clean/household_post_connection_commcare.csv",
                          usecols=filter_cols_post)
hs_post_ms = pd.read_csv("datasets_clean/household_post_connection_ms_form.csv",
                         usecols=filter_cols_post,
                         encoding="ISO-8859-1")

# Stack both exports, rename the account column to 'renewvia_id', and treat the
# literal text 'nan' as missing. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each file keeps its own row labels, so labels repeat
# and a label no longer identifies a single row. The chain reassigns df rather
# than mutating it in place, so re-running the cell is repeatable.
df = (
    pd.concat([hs_post_com, hs_post_ms], ignore_index=True)
    .rename(columns={'renewvia_account_number': 'renewvia_id'})
    .replace('nan', np.nan)
)
df.head()

Unnamed: 0,renewvia_id,interviewed_before,occupation_change,houlsehold_income_change,avg_monthly_household_income,female_schooling_change,male_schooling_change,school_performance_change,household_business_owner,business_recent,...,clean_drinking_water_source,community_clean_water_source,water_collection_travel_distance,water_collection_time,water_monthly_cost,clinic_travel_distance,clinic_electricity_access_minigrid,clinic_refrigeration_access,better_access_health_minigrid,minigrid_access_life_improvement
0,,,,,,,,,,,...,,,,,,,,,,
1,521168.0,no,no,yes_it_has_decreased,"20,000 naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,none,,...,treated__filtered_water,Tap,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,no_its_the_same
2,521039.0,yes,yes,yes_it_has_increased,50000,yes_its_increased,yes_its_increased,yes_its_gotten_better,adult_male,yes,...,treated__filtered_water,Community Bore-hole,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
3,521055.0,no,no,no_it_is_the_same,40000,no_its_the_same,no_its_the_same,no_its_the_same,none,,...,treated__filtered_water,community bore-hole,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-3_km,no,no,yes,yes
4,521090.0,no,no,yes_it_has_increased,"20,000 Naira",no_its_the_same,no_its_the_same,yes_its_gotten_better,adult_female,yes,...,clean_community_source,Well,no_need_to_travel_-_at_home_water_supply,less_than_1_hour,i_dont_pay_its_free,between_2-5_km,yes,yes,yes,yes


In [21]:
# Print the distinct values of the columns at positions 1-9 to inspect how
# answers are spelled before cleaning.
for col in df.columns[1:10]:
    print("\t ", col, list(df[col].unique()))


	  interviewed_before [nan, 'no', 'yes', 'not_sure', 'No', 'Yes', 'Not sure']
	  occupation_change [nan, 'no', 'yes', 'Yes', 'No']
	  houlsehold_income_change [nan, 'yes_it_has_decreased', 'yes_it_has_increased', 'no_it_is_the_same', 'Yes, it has increased', 'No, it is the same', 'Yes, it has decreased']
	  avg_monthly_household_income [nan, '20,000 naira', '50000', '40000', '20,000 Naira', '30,000 Naira', '10,000 Naira', '15,000 Naira', '5,000 Naira', '7,000 Naira', '30,000', '40,000 Naira', '80,000 naira', '30,000 naira', '5,000 naira', '200,000 naira', '40,000 naira', '35,000 naira', '45,000 naira', '50,000 naira', '15,000 naira', '40, 000 Naira', '10,000', '4,000', '3,000', '5,000', '20,000', '40,000', '10,000N', '20,000N', '15,000N', '100,000N', '30,000N', '5,000N', '40,000N', '50,000N', 'Personal', 'personal', '25,000N', '8,000N', '60,000N', '6,000N', '25,000 naira', '10,000 naira', '300,000 naira', '80,000', '44,000N', '35,000N', '33,000N', '100,000', '20,000.00', '20,000  naira

In [None]:
# standardize case: cast every column to text and lowercase it
df = df.apply(lambda x: x.astype(str).str.lower())

# Columns whose text values are cleaned with replace_str.
# 'applicances_charging_monthly_cost' is intentionally absent: it is not among
# the columns loaded for the post-connection frame (filter_cols_post), so
# indexing df with it would raise a KeyError.
cols_change = [
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'phone_charge_monthly_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
]

def replace_str(string):
    """Turn a free-text answer into an underscore-separated token.

    Separators ('- ', ' (', ' ', '-') become '_', then punctuation
    ('.)', ',', '/', 'u' followed by a newline, "'", '.') is removed.
    The substitutions run in the order listed below, which matters:
    multi-character patterns are handled before the single characters
    they contain.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    substitutions = (
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),
        ("'", ''),
        ('.', ''),
    )

    cleaned = string
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)
    return cleaned

# Clean the text of every column listed in cols_change.
for col in cols_change:
    df[col] = df[col].apply(replace_str)

# The case-standardisation step casts every column with astype(str), which
# turns each missing cell into the literal text 'nan'. Convert those back to
# real NaN so that missing answers are not mistaken for a real answer later on.
df = df.replace('nan', np.nan)
    


In [None]:
 # '': [ 'skip',
 # 'it is opening throughout the week',
 # 'opening outpatients service all time',
 # "i don't know ",
 # 'havw no idea ',
 # 'everyday ',
 # 'same time opening outpatients service only',
 # "i don't know about it",
 # " i don't know",
 # 'all time opening outpatients service',
 # "i don't know about it because am not working there",
 # 'not know well',
 # "i don't know about it maybe those work there they knowing better than me",
 # 'all time opening',
 # 'no idea ',
 # 'all time opening outpatients service only',
 # 'boiled',
 #     ]

In [None]:
# ['12 hours',
#  '8am',
#  '12 yours',
#  '8pm',
#  '9',
#  '7am',
#  '10',
#  '8',
#  '10hrs',
#  '7am-4pm',
#  '8am-10pm',
#  '8am-5pm',
#  '8am-6pm',
#  '7am-5pm',
#  '7am-6pm',
#  '10hrd',
#  'rain catchment',
#  '7am-4am',
#  '8am - 10pm',
#  '7am - 7pm',
#  '7am-7pm',
#  '7:30am-5pm',
#  'u\n7am-5:30pm',
#  '12',
#  '7',
#   '0',
#  '5',
#  '4',
#   '2',
#   '9-aug',
#  '8am-6am',
#  '6am-4pm',
#  '4-jul',
#  '6',
#  '7am-6am',
#  '8am-8pm',
#  '6amp',
#  '8amm',
#  '8am to 5pm',
#  '8am/5pm',
#  '2am/8pm',
#  '8hrs',
#  '8hours',
#  '8.00am-5.00pm',
#  '8am/pm',
#  'open all times ',
#  '24 hours ',
#  '24',
#  'all times ',
#  "i don't know",
#  'all time',
#  'no',
#  '3',
#  '23',
#  '11',
#  '1',
#  '8 am to 6 pm',
#  '8:am',
#  '8:00 am',
#  '8:00am',
#  '8am to 6pm',
#  '8am  to 6pm',
#  '8:00m',
#  '8;00am',
#  '8am to 6pm ',
#  '8:30am ',
#  '24hrs',
#  '16',
#  '18',
#  'tap',
#  'morning to noon',
#  '13',
#  '22',
#  'from 6am to 6 pm',
#  'from six to six',
#  '12hours',
#  '8am to 4pm',
#  '12hrs',
#  '12hours a day',
#  '0600hrs',
#  '12hrs a day',
#  '8: am',
#  '8am to ypm',
#  '8am5pm',
#  '8am/5pm ',
#  '24 hrs',
#  'not known ',
#  'all time ',
#  'o',
#  '8-jul',
#  '10-sep',
#  'borehole',
#  'e',
#  'w',
#  'no ',
#  '25',
#  '6am to 4pm',
#  '12 hrs',
#  '22hrs',
#  '12hours a day ',
#  '6hrs a day',
#  '8hrs a day',
#  '¹2',
#  ',1',
#  '14',
#  '8:30am']