In [1]:
import pandas as pd
import numpy as np
import json
import category_encoders as ce

In [2]:
# Load the annotated post-connection household survey.
# Only the columns named in `cols_post` are read from the CSV.
cols_post = [
    'end_date',
    'renewvia_id',
    'interviewed_before',
    'connection_period',
    'country',
    'gender',
    'age',
    'occupation_change',
    'household_income_change',
    'avg_household_income',
    'household_headcount_change',
    'girls_schooling_change',
    'boys_schooling_change',
    'school_performance_change',
    'girls_unschooled_reasons',
    'boys_unschooled_reasons',
    'household_business_owners',
    'business_type',
    'business_recent',
    'business_from_minigrid',
    'business_use_minigrid',
    'power_sources',
    'power_sources_usage',
    'power_sources_primary',
    'appliances_count',
    'appliances_count_change',
    'cellphones_count',
    'appliances_type',
    'appliances_type_addition',
    'light_hours_current',
    'light_primary_sources',
    'kerosene_lamp_usage_change',
    'kerosene_lamps_count',
    'kerosene_lamp_usage_time',
    'kerosene_lamps_cost',
    'cooking_energy_sources',
    'cooking_fuel_collection_time',
    'cooking_fuel_responsible',
    'cooking_energy_cost',
    'applicances_charging_sources',
    'applicances_charging_cost',
    'community_lights',
    'home_exterior_lights',
    'exterior_lights_minigrid',
    'feel_safe_dark',
    'feel_safe_if_exterior_lights',
    'feel_unsafe_reasons',
    'feel_safer_with_minigrid',
    'phone_charge_location',
    'phone_charge_frequency',
    'phone_charge_cost',
    'phone_charge_travel_distance',
    'water_source',
    'clean_drinking_water',
    'clean_drinking_water_source',
    'community_clean_water_source',
    'water_collection_travel_distance',
    'water_collection_time',
    'water_collection_responsible',
    'avg_person_age_water_collection',
    'water_cost',
    'clinic_travel_distance',
    'clinic_electricity_access_minigrid',
    'clinic_refrigeration_access',
    'better_access_health_minigrid',
    'minigrid_access_life_improvement',
    'minigrid_access_productivity_improvement',
    'minigrid_effect_on_household_female',
    'customerAccountNumber',
    'tariff',
]

df = pd.read_csv("datasets_annotated/hs_post_annotated.csv", usecols=cols_post)


def _as_lowercase_text(column):
    """Cast one column to str and lower-case every value in it."""
    return column.astype(str).str.lower()


# Standardize case. Every column is cast to text first, so missing values
# become the literal string 'nan' from this point on.
df = df.apply(_as_lowercase_text)

# Count of multiple selection columns


def _count_selections(answer):
    """Count the ';'-separated selections in one multi-select answer.

    The last character is dropped before splitting, exactly as the original
    counting code did (answers are expected to end with a trailing ';').

    Missing answers count as zero. Because every column was cast to str
    earlier in this cell, a missing answer arrives here as the string 'nan';
    real non-string values and the empty string are treated the same way.
    """
    if not isinstance(answer, str) or answer in ("", "nan"):
        return 0
    return len(answer[:-1].split(";"))


# Recode the placeholder labels 'option 1' / 'option 2' as yes / no.
# The result is assigned back to the column: calling replace(inplace=True) on
# df[col] is chained assignment, which under pandas Copy-on-Write only
# modifies a temporary and leaves df unchanged.
for option_col in ('business_use_minigrid', 'business_from_minigrid'):
    df[option_col] = df[option_col].replace({'option 1': 'yes', 'option 2': 'no'})

# Drop the "none;" marker so that "nobody owns a business" becomes an empty answer.
df['household_business_owners'] = df['household_business_owners'].map(
    lambda x: x.replace("none;", ""))
df['business_owners_count'] = df['household_business_owners'].map(_count_selections)
# 1 when any selected owner label contains 'female', otherwise 0.
df['business_owners_female'] = df['household_business_owners'].map(
    lambda x: 1 if 'female' in x else 0)

# Spaces are turned into ';' so the same separator-based count can be used.
df['appliances_type_addition'] = df['appliances_type_addition'].map(
    lambda x: x.replace(" ", ";"))
df['appliances_addition_count'] = df['appliances_type_addition'].map(_count_selections)

# Harmonize answer labels that were spelled differently across survey versions.
# Each column maps {label as found: label to use}.
label_fixes = {
    'connection_period': {'1-3 months': '0-3 months'},
    'avg_person_age_water_collection': {'15 years or older': '15 -18 years old'},
    'clinic_travel_distance': {'less than 1km': 'less than 1 km'},
}
for c, fixes in label_fixes.items():
    # Assign back instead of df[c].replace(..., inplace=True): the chained
    # in-place form does not update df under pandas Copy-on-Write.
    df[c] = df[c].replace(fixes)

# Remove trailing whitespace from these answer columns.
trim_cols = ['clean_drinking_water_source', 'phone_charge_travel_distance']
for c in trim_cols:
    df[c] = df[c].str.rstrip()
    
# Columns handled as numeric: the cleaning loop in the next cell converts
# each of them to nullable integers.
cols_num = [
    'avg_household_income',
    'appliances_count',
    'appliances_addition_count',
    'cellphones_count',
    'light_hours_current',
    'kerosene_lamps_count',
    'business_owners_count',
    'business_owners_female',
]

# Preview the first rows of the numeric columns.
df.loc[:, cols_num].head()

Unnamed: 0,avg_household_income,appliances_count,appliances_addition_count,cellphones_count,light_hours_current,kerosene_lamps_count,business_owners_count,business_owners_female
0,2000.0,1.0,3,2.0,24.0,0,1,0
1,200.0,,1,2.0,200.0,24,0,0
2,300.0,3.0,3,1.0,2.0,0,0,0
3,200.0,3.0,2,2.0,4.0,1,0,0
4,1.0,24.0,1,3.0,24.0,1,0,0


In [3]:
# Turn the literal 'nan' strings back into real missing values across the whole frame.
df.replace(to_replace='nan', value=np.nan, inplace=True)
# Substrings removed outright, applied in this order.
# '.)' must come before ')': once every ')' has been removed, '.)' can no
# longer match and the '.' would be left behind.
# NOTE(review): the original list had 'ikom' and 'u\n' on adjacent lines with
# no comma between them, which Python concatenates into the single token
# 'ikomu\n'. That effective behaviour is kept here; confirm whether two
# separate tokens ('ikom', 'u\n') were intended.
_REMOVED_TOKENS = ('.)', ')', ',', '/', 'ikomu\n', "'", ".0", ' years')

# Substrings replaced by a single underscore, applied in this order after the
# removals above. The final "__" collapses a doubled underscore produced by
# the earlier substitutions.
_UNDERSCORED_TOKENS = ('- ', ' (', ' ', '-', "__")


def replace_str(value):
    """Normalize one answer string into an identifier-like label.

    Punctuation and filler tokens are removed, then spaces, hyphens and
    opening parentheses are replaced by underscores. Trailing whitespace is
    stripped after every substitution.

    Parameters
    ----------
    value : object
        A single cell value.

    Returns
    -------
    object
        The cleaned string when `value` is a str; any other value (NaN,
        numbers, None) is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    for new, tokens in (('', _REMOVED_TOKENS), ('_', _UNDERSCORED_TOKENS)):
        for token in tokens:
            value = value.replace(token, new).rstrip()

    return value

# Clean the text of every column, then convert the numeric columns to
# nullable integers.
for col in list(df.columns):
    df[col] = df[col].apply(replace_str)
    if col not in cols_num:
        continue

    if col == 'light_hours_current':
        # Ranges are reduced to their lower bound. replace_str has already
        # turned '-' into '_', so the cleaned spellings are listed alongside
        # the raw ones. The result is assigned back because
        # df[col].replace(..., inplace=True) is chained assignment and does
        # not update df under pandas Copy-on-Write.
        df[col] = df[col].replace({"8-12": "8", "3-4": "3",
                                   "8_12": "8", "3_4": "3"})

    # Keep only the first run of digits in each value; values without any
    # digit (including missing ones) become NaN. The pattern is a raw string
    # because '\d' in a normal string literal is an invalid escape sequence.
    df[col] = df[col].astype(str)
    df[col] = df[col].str.extract(r'(\d+)', expand=False)
    # Go through float first so NaN survives, then to the nullable Int64 dtype.
    df[col] = df[col].astype(float).astype('Int64')

# df[cols_num].head()
df.to_csv("datasets_clean/hs_post_clean.csv")

In [4]:
# Encoding values
df_enc = df.copy()
cols = list(df.columns)

# mapping missing or mistyped answers
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['col_answers_map']
n_col = len(answers)

# Codes for yes/no questions.
# The option keys are written with an underscore because every cell has been
# passed through replace_str, which turns spaces into '_'; the previous keys
# 'option 1' / 'option 2' could never match.
# Option 1 is coded as yes and option 2 as no, matching the recoding applied
# to the business_*_minigrid columns when the data is loaded.
# NOTE(review): the previous dict coded 'option 1' as 0 and 'option 2' as 1,
# the opposite of that recoding - confirm against the survey form.
YES_NO_CODES = {'no': 0, 'yes': 1, 'option_1': 1, 'option_2': 0}


def _direction_code(cat):
    """Return the direction of a change/improvement answer.

    0 for a neutral answer, -1 for a decrease, 1 for an increase, and None
    when no keyword matches. Keywords are matched as substrings and checked
    in the order neutral, down, up.

    NOTE(review): the substring 'no' also occurs inside words such as 'not'
    or 'know', so any label containing them is coded as neutral.
    """
    neutral = ('no', 'same', 'stay')
    down = ('lost', 'decrease', 'decreased', 'decreases')
    up = ('add', 'better', 'increase', 'increased', 'increases', 'greater')

    if any(word in cat for word in neutral):
        return 0
    if any(word in cat for word in down):
        return -1
    if cat == 'yes' or any(word in cat for word in up):
        return 1
    return None


for answer in answers:
    col = answer['new_name']
    mapping = answer['mapping']

    # Skip columns that are absent from the data, have no answer mapping, or
    # are the multi-select business-owner column.
    if col not in cols or mapping == {} or col == 'household_business_owners':
        continue

    # Clean the canonical answers the same way the data cells were cleaned.
    new_vals = [replace_str(v.lower()) for v in mapping.values()]

    if "yes" in new_vals and "no" in new_vals:
        col_mapping = dict(YES_NO_CODES)
    elif any(tag in col for tag in ("_change", "_improvement")):
        # Directional answers: -1 / 0 / 1. Answers without a recognised
        # keyword are left unencoded.
        col_mapping = {}
        for cat in new_vals:
            cat = str(cat)
            if cat == 'nan':
                continue
            code = _direction_code(cat)
            if code is not None:
                col_mapping[cat] = code
    else:
        # Any other question: number the answers 1..n in mapping order.
        col_mapping = {val: idx + 1 for idx, val in enumerate(new_vals)}

    if col_mapping:
        # Assign back instead of df_enc[col].replace(..., inplace=True): the
        # chained in-place form does not update df_enc under pandas
        # Copy-on-Write.
        df_enc[col] = df_enc[col].replace(col_mapping)

# df_enc.head()
df_enc.to_csv('datasets_encoded/hs_post_encoded.csv')