In [1]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Columns to keep when reading the pre-connection household survey CSVs;
# this list is handed to pd.read_csv(..., usecols=filter_cols) further down.
# Names must match the CSV headers exactly (including the existing
# "applicances_..." spelling), so do not "correct" them here.
filter_cols = [
    # identifier / household profile
    "renewvia_id",
    "avg_monthly_household_income",
    "household_headcount",
    "female_schooling",
    "male_schooling",
    "household_business_owners",
    # devices / lighting
    "electronics_count",
    "cellphones_count",
    "light_hours_current",
    "kerosene_lamp_usage_count",
    # cooking / charging costs
    "cooking_energy_sources",
    "cooking_fuel_collection_time",
    "cooking_energy_monthly_cost",
    "applicances_charging_monthly_cost",
    # safety / exterior lighting
    "feel_safe_dark",
    "community_lights",
    "home_exterior_lights",
    "feel_safe_if_exterior_lights",
    # phone charging
    "phone_charge_location",
    "phone_charge_monthly_cost",
    # water
    "water_source",
    "clean_drinking_water",
    "clean_drinking_water_source",
    "water_collection_travel_distance",
    "water_collection_time",
    "water_monthly_cost",
    # clinic
    "clinic_travel_distance",
    "clinic_electricity_access",
    # "clinic_open_hours",   (left disabled, as in the original list)
    "clinic_refrigeration_access",
]
# hs_pre.to_csv("datasets_clean/household_pre_merged.csv")

#Set outliers aside
# df.drop(df[df['household_headcount'] >= 20].index, inplace=True)
# df.drop(df[df['electronics_count'] >= 30].index, inplace=True)

In [3]:
# Read both pre-connection household survey exports, restricted to the
# columns named in filter_cols.
hs_pre_com = pd.read_csv(
    "datasets_clean/household_pre_connection_commcare.csv",
    usecols=filter_cols,
)
# This export is decoded as ISO-8859-1 rather than the pandas default.
hs_pre_ms = pd.read_csv(
    "datasets_clean/household_pre_connection_ms_form.csv",
    usecols=filter_cols,
    encoding="ISO-8859-1",
)

# ignore_index=True renumbers the stacked rows 0..n-1. Without it both source
# frames keep their own 0-based labels, so the combined index contains
# duplicates and any label-based selection/drop would touch rows from both files.
df = pd.concat([hs_pre_com, hs_pre_ms], ignore_index=True)

# No. of business owners: number of ";"-separated entries in the cell;
# non-string cells (e.g. NaN for a missing answer) count as 0.
df['business_owners_count'] = df['household_business_owners'].map(
    lambda owners: len(owners.split(";")) if isinstance(owners, str) else 0
)

# standardize case
# NOTE(review): this casts EVERY column to str before lower-casing, so numeric
# columns (including business_owners_count) become strings and missing values
# become the literal text 'nan'. The string cast is required by replace_str
# below; confirm downstream code converts numeric columns back as needed.
df = df.apply(lambda x: x.astype(str).str.lower())

# Categorical columns whose answer text is normalised with replace_str.
cols_change = [
    'cooking_fuel_collection_time',
    'cooking_energy_monthly_cost',
    'applicances_charging_monthly_cost',
    'phone_charge_monthly_cost',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'community_lights',
    'water_source',
    'clean_drinking_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_monthly_cost',
    'clinic_travel_distance',
]

def replace_str(string):
    """Turn a free-text answer into an underscore-separated token.

    The substitutions below are applied one after another, in the order
    listed: separator fragments are first turned into "_", then punctuation
    fragments are removed. Order matters (e.g. "- " must be handled before
    the single space and the single "-").

    Runs of separators are not collapsed and the text is not stripped, so
    "a - b" becomes "a__b" and a trailing space becomes a trailing "_".

    Args:
        string: the text to normalise; must be a ``str``.

    Returns:
        The normalised ``str``.
    """
    substitutions = (
        # separators -> underscore
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        # punctuation / stray fragments -> removed
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),
        ("'", ''),
        ('.', ''),
    )

    cleaned = string
    for fragment, replacement in substitutions:
        cleaned = cleaned.replace(fragment, replacement)
    return cleaned

# Normalise the answer text of every column listed in cols_change,
# passing each cell through replace_str.
for col in cols_change:
    df[col] = df[col].map(replace_str)

In [9]:
# replace values
# Per-column table of answer labels to rename. Every entry is a two-item list
# [old, new] in one of two shapes:
#   * two strings           -> the single value `old` is renamed to `new`
#   * two equal-length lists -> old[i] is renamed to new[i]
# Note that the same 'copy_N_of_nkes' token maps to a different range in
# different columns, so the table has to stay keyed by column.
replace_mapping = {
    'cooking_fuel_collection_time': ['copy_1_of_hours', 'less_than_1_hour'],

    'cooking_energy_monthly_cost': [
        [
            '0_1000_knes', 'copy_1_of_nkes',
            '1000_1500_knes', 'copy_2_of_nkes',
            '1500_2000_knes', 'copy_3_of_nkes',
            '2000_3000_knes', 'copy_4_of_nkes',
        ],
        [
            '0_1000_nkes', '0_1000_nkes',
            '1000_1500_nkes', '1000_1500_nkes',
            '1500_2000_nkes', '1500_2000_nkes',
            '2000_3000_nkes', '2000_3000_nkes',
        ],
    ],

    'applicances_charging_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes', 'copy_4_of_nkes'],
        ['0_150_nkes', '150_1000_nkes', '1000_3000_nkes', '3000_4000_nkes'],
    ],

    'phone_charge_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes', 'copy_3_of_nkes',
         'nkes_and_above', '1000_n_and_above'],
        ['0_100_nkes', '100_500_nkes', '500_750_nkes',
         '1000_nkes_and_above', '1000_nkes_and_above'],
    ],

    'community_lights': [
        ['street_lights', 'no_none', 'other'],
        ['yes', 'no', 'no'],
    ],

    # drops the trailing underscore on this label
    'clean_drinking_water_source': ['clean_community_source_', 'clean_community_source'],

    'water_collection_travel_distance': [
        ['copy_1_of_km', 'copy_2_of_km'],
        ['less_than_1_km', '1_2_km'],
    ],

    'water_collection_time': ['copy_1_of_hours', 'less_than_1_hour'],

    'water_monthly_cost': [
        ['copy_1_of_nkes', 'copy_2_of_nkes', 'nkes_and_above'],
        ['i_dont_pay_its_free', '0_500_nkes', '5000_nkes_and_above'],
    ],

    'clinic_travel_distance': ['less_than_1km', 'less_than_1_km'],
}

# Rename the listed answer labels column by column. Each table entry unpacks
# into (old, new); Series.replace accepts either a scalar pair or two
# equal-length lists, which covers both shapes used in replace_mapping.
for col, (old_labels, new_labels) in replace_mapping.items():
    df[col] = df[col].replace(old_labels, new_labels)

# list(replace_mapping.keys())
                           
# Inspection aid: list the distinct values of every column except the first
# one in the frame (presumably the renewvia_id identifier — confirm column order).
for col in df.columns[1:]:
    print("\t-", col, list(df[col].unique()))

# df.head()

	- avg_monthly_household_income ['nan', '27000.0', '30000.0', '38000.0', '15000.0', '40000.0', '35000.0', '52000.0', '81000.0', '95000.0', '45000.0', '20000.0', '50000.0', '26000.0', '70000.0', '8000.0', '39000.0', '200000.0', '60000.0', '90000.0', '80000.0', '130000.0', '100000.0', '65000.0', '10000.0', '85000.0', '150000.0', '28000.0', '25000.0', '120000.0', '12000.0', '56000.0', '48000.0', '36000.0', '24000.0', '18000.0', '6000.0', '42000.0', '43000.0', '55000.0', '14000.0', '78000.0', '75000.0', '250000.0', '29000.0', '49000.0', '86000.0', '34000.0', '16000.0', '22000.0', '68000.0', '33000.0', '53000.0', '82000.0', '59000.0', '83000.0', '98000.0', '63000.0', '160000.0', '62000.0', '140000.0', '58000.0', '72000.0', '17000.0', '51000.0', '47000.0', '2000.0', '152000.0', '2.0', '57000.0', '32.0', '3.0', '23.0', '300000.0', '40.0', '32000.0', '92000.0', '170000.0', '26.0', '124000.0', '220000.0', '180000.0', '34.0', '5000.0', '145000.0', '400000.0', '60500.0', '23000.0', '1.0', '20.0',

In [5]:
 # '': [ 'skip',
 # 'it is opening throughout the week',
 # 'opening outpatients service all time',
 # "i don't know ",
 # 'havw no idea ',
 # 'everyday ',
 # 'same time opening outpatients service only',
 # "i don't know about it",
 # " i don't know",
 # 'all time opening outpatients service',
 # "i don't know about it because am not working there",
 # 'not know well',
 # "i don't know about it maybe those work there they knowing better than me",
 # 'all time opening',
 # 'no idea ',
 # 'all time opening outpatients service only',
 # 'boiled',
 #     ]

In [6]:
# ['12 hours',
#  '8am',
#  '12 yours',
#  '8pm',
#  '9',
#  '7am',
#  '10',
#  '8',
#  '10hrs',
#  '7am-4pm',
#  '8am-10pm',
#  '8am-5pm',
#  '8am-6pm',
#  '7am-5pm',
#  '7am-6pm',
#  '10hrd',
#  'rain catchment',
#  '7am-4am',
#  '8am - 10pm',
#  '7am - 7pm',
#  '7am-7pm',
#  '7:30am-5pm',
#  'u\n7am-5:30pm',
#  '12',
#  '7',
#   '0',
#  '5',
#  '4',
#   '2',
#   '9-aug',
#  '8am-6am',
#  '6am-4pm',
#  '4-jul',
#  '6',
#  '7am-6am',
#  '8am-8pm',
#  '6amp',
#  '8amm',
#  '8am to 5pm',
#  '8am/5pm',
#  '2am/8pm',
#  '8hrs',
#  '8hours',
#  '8.00am-5.00pm',
#  '8am/pm',
#  'open all times ',
#  '24 hours ',
#  '24',
#  'all times ',
#  "i don't know",
#  'all time',
#  'no',
#  '3',
#  '23',
#  '11',
#  '1',
#  '8 am to 6 pm',
#  '8:am',
#  '8:00 am',
#  '8:00am',
#  '8am to 6pm',
#  '8am  to 6pm',
#  '8:00m',
#  '8;00am',
#  '8am to 6pm ',
#  '8:30am ',
#  '24hrs',
#  '16',
#  '18',
#  'tap',
#  'morning to noon',
#  '13',
#  '22',
#  'from 6am to 6 pm',
#  'from six to six',
#  '12hours',
#  '8am to 4pm',
#  '12hrs',
#  '12hours a day',
#  '0600hrs',
#  '12hrs a day',
#  '8: am',
#  '8am to ypm',
#  '8am5pm',
#  '8am/5pm ',
#  '24 hrs',
#  'not known ',
#  'all time ',
#  'o',
#  '8-jul',
#  '10-sep',
#  'borehole',
#  'e',
#  'w',
#  'no ',
#  '25',
#  '6am to 4pm',
#  '12 hrs',
#  '22hrs',
#  '12hours a day ',
#  '6hrs a day',
#  '8hrs a day',
#  '¹2',
#  ',1',
#  '14',
#  '8:30am']