In [1]:
import pandas as pd
import numpy as np
import json
import category_encoders as ce
import matplotlib.pyplot as plt

In [2]:
# Survey columns to load from the annotated post-connection dataset
cols_post = [
'end', 'renewvia_id', 'interviewed_before',
    "nigeria_community","kenya_community",
'connection_period', 'country', 'gender', 'age',
'occupation_change', 'household_income_change', 
'avg_household_income', 'household_headcount_change',
'girls_schooling_change','boys_schooling_change', 
'school_performance_change',
'girls_unschooled_reasons', 'boys_unschooled_reasons',
'household_business_owners',
'business_type', 'business_recent', 'business_from_minigrid',
'business_use_minigrid', 'power_sources',
'power_sources_usage', 'power_sources_primary',
'appliances_count', 'appliances_count_change', 'cellphones_count',
'appliances_type', 'appliances_type_addition',
'light_hours_current', 'light_primary_sources', 'kerosene_lamp_usage_change',
'kerosene_lamps_count', 'kerosene_lamp_usage_time',
'kerosene_lamps_cost', 'cooking_energy_sources',
'cooking_fuel_collection_time',
'cooking_fuel_responsible', 'cooking_energy_cost',
'applicances_charging_sources', 
'applicances_charging_cost', 'community_lights', 'home_exterior_lights',
'exterior_lights_minigrid', 'feel_safe_dark',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'feel_safer_with_minigrid', 'phone_charge_location', 
'phone_charge_frequency', 'phone_charge_cost',
'phone_charge_travel_distance', 'water_source', 'clean_drinking_water',
'clean_drinking_water_source', 'community_clean_water_source',
'water_collection_travel_distance', 'water_collection_time',
'water_collection_responsible', 'avg_person_age_water_collection',
'water_cost', 'clinic_travel_distance',
'clinic_electricity_access_minigrid', 
'clinic_refrigeration_access', 'better_access_health_minigrid',
'minigrid_access_life_improvement',
'minigrid_access_productivity_improvement',
'minigrid_effect_on_household_female', 
'customerAccountNumber', 'tariff'
]

data = pd.read_csv("datasets_annotated/post_annotated.csv", usecols=cols_post)
print(data.shape)

# Standardize case: every value becomes a lower-case string, so missing
# values turn into the literal placeholder "nan" from here on.
data = data.apply(lambda x: x.astype(str).str.lower())

# Keep identified customers only. The explicit copy makes the column
# assignments below act on an independent frame rather than on a slice.
data = data[~data['renewvia_id'].isin(["no meter", "nan", ""])].copy()

# Normalise the ID *before* de-duplicating so that "141201" and "141201.0"
# are recognised as the same customer. Only a trailing ".0" (float
# formatting) is stripped; a ".0" inside an ID is left alone.
data['renewvia_id'] = (data['renewvia_id']
                       .str.replace(r"\.0$", "", regex=True)
                       .str.upper())

# Merge the two country-specific community columns. Only a value that is
# exactly the "nan" placeholder is blanked, so community names that merely
# contain the letters "nan" are preserved.
data["community"] = (data["nigeria_community"].replace("nan", "")
                     + data["kenya_community"].replace("nan", ""))

# Household income: take the first (optionally signed) number in the answer,
# allowing any number of thousands separators, then drop the separators.
data["avg_household_income"] = pd.to_numeric(
    data["avg_household_income"]
    .str.extract(r'([-+]?\d[\d,]*)', expand=False)
    .str.replace(",", "", regex=False)
)

# Remove duplicates: after the descending sort, drop_duplicates keeps the
# row with the highest reported income for each ID (rows without an income
# sort last); sort_index then restores the original row order.
df = (data.sort_values('avg_household_income', ascending=False)
          .drop_duplicates('renewvia_id')
          .sort_index())
print(df.shape)
df.head()

(2603, 72)
(2304, 73)


Unnamed: 0,end,renewvia_id,interviewed_before,connection_period,country,nigeria_community,kenya_community,age,gender,occupation_change,...,clinic_travel_distance,clinic_electricity_access_minigrid,clinic_refrigeration_access,better_access_health_minigrid,minigrid_access_life_improvement,minigrid_access_productivity_improvement,minigrid_effect_on_household_female,customerAccountNumber,tariff,community
0,2022-07-28 10:31:53,141201,not sure,24-36 months,kenya,,kalobeyei town,,female,yes,...,,,,,,,,141201.0,residential,kalobeyei town
1,2022-08-06 03:38:46,161147,not sure,24-36 months,kenya,,oyamo,,male,no,...,,,,,,,,161147.0,residential,oyamo
2,2022-08-03 09:54:38,ND0043,no,3-6 months,kenya,,ndeda,,male,yes,...,,,,,,,,,,ndeda
3,2022-07-23 05:29:51,131104,no,12-24 months,kenya,,kalobeyei settlement,24.0,male,yes,...,between 2-3 km,yes,yes,yes,"no, it's the same",no,no,131104.0,commercial,kalobeyei settlement
4,2022-07-22 05:13:00,131110,no,12-24 months,kenya,,kalobeyei settlement,45.0,male,no,...,less than 1km,yes,yes,yes,"no, it's the same",,no female affected,131110.0,residential,kalobeyei settlement


In [3]:
# Count of multiple selection columns
def _count_choices(answer, sep=";"):
    """Return how many non-empty `sep`-separated choices `answer` contains.

    Non-string values and the "nan" placeholder (what a missing value becomes
    once a column has been cast to str) count as 0 choices. A trailing
    separator does not add a choice.
    """
    if not isinstance(answer, str) or answer == "nan":
        return 0
    return sum(1 for part in answer.split(sep) if part.strip())


# Assign the result back instead of calling replace(..., inplace=True) on
# df[col]: the in-place form mutates a temporary under pandas Copy-on-Write
# and would silently leave df unchanged.
option_to_answer = {'option 1': 'yes', 'option 2': 'no'}
for c in ('business_use_minigrid', 'business_from_minigrid'):
    df[c] = df[c].replace(option_to_answer)

# Business owners: drop the "none;" choice, then count the remaining choices.
df['household_business_owners'] = (df['household_business_owners']
                                   .str.replace("none;", "", regex=False))
df['business_owners_count'] = df['household_business_owners'].map(_count_choices)
df['business_owners_binary'] = (df['business_owners_count'] >= 1).astype(int)
df['business_owners_female'] = (df['household_business_owners']
                                .str.contains('female', regex=False)
                                .astype(int))

# Added appliances: spaces are turned into ";" so every word is one choice.
df['appliances_type_addition'] = (df['appliances_type_addition']
                                  .str.replace(" ", ";", regex=False))
df['appliances_addition_count'] = df['appliances_type_addition'].map(_count_choices)

# Harmonise answer labels that were spelled differently across survey versions
label_fixes = {
    'connection_period': {'1-3 months': '0-3 months'},
    'avg_person_age_water_collection': {'15 years or older': '15 -18 years old'},
    'clinic_travel_distance': {'less than 1km': 'less than 1 km'},
    'water_cost': {"i don't pay, it's free.": "i don't pay, it's free"},
}
for c, fixes in label_fixes.items():
    df[c] = df[c].replace(fixes)

# Strip trailing whitespace from free-text-like answers
trim_cols = ['clean_drinking_water_source', 
             'phone_charge_travel_distance']
for c in trim_cols:
    df[c] = df[c].str.rstrip()

In [4]:
# Turn the "nan" string placeholder back into a real missing value
df = df.replace(to_replace='nan', value=np.nan)
def replace_str(value):
    """Normalise an answer string into an underscore-separated token.

    First a fixed list of substrings is deleted, then a fixed list of
    separators is turned into "_". Trailing whitespace is stripped after
    every single substitution, and the substitutions run in list order.

    Parameters
    ----------
    value : any
        The cell value to clean.

    Returns
    -------
    The cleaned string when `value` is a `str`; any other value (NaN,
    numbers, None, ...) is returned unchanged.
    """
    # Only plain strings are cleaned; everything else passes straight through.
    if type(value) is not str:
        return value

    # Substrings that are removed outright
    removals = (")", '.)', ',', '/', "ikom",
                'u\n', "'", ".0",
                "years", ' years', "dead",
                "_years", "000", "00", "*")
    # Separators that become a single underscore
    to_underscore = ('- ', ' (', ' ', '-', "__")

    for replacement, targets in (('', removals), ('_', to_underscore)):
        for target in targets:
            value = value.replace(target, replacement).rstrip()

    return value

    
# Columns that are reduced to a nullable integer (first run of digits)
cols_num = ['appliances_count', 
            'appliances_addition_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
            'business_owners_count', 'business_owners_female']

# Every column is cleaned except the customer ID and the already-numeric income.
# NOTE(review): replace_str also deletes ".0", "00" and "000" inside
# identifier-like columns such as customerAccountNumber — confirm that is intended.
cols_replace = [c for c in df.columns
                if c not in ("renewvia_id", "avg_household_income")]

for col in cols_replace:
    df[col] = df[col].map(replace_str)
    if col in cols_num:
        # Ranges such as "8-12" or "3-4" in light_hours_current arrive here as
        # "8_12" / "3_4" (replace_str turns "-" into "_"), so taking the first
        # run of digits yields the lower bound. The former explicit
        # {"8-12": "8", "3-4": "3"} replacement could never match and was a
        # chained in-place call, so it has been dropped.
        df[col] = (df[col].astype(str)
                          .str.extract(r'(\d+)', expand=False)
                          .astype(float)
                          .astype('Int64'))

df[cols_num].head()


Unnamed: 0,appliances_count,appliances_addition_count,cellphones_count,light_hours_current,kerosene_lamps_count,business_owners_count,business_owners_female
0,,5,,,0,1,0
1,,2,,,0,0,0
2,,2,,,2,1,0
3,3.0,4,2.0,12.0,0,2,1
4,2.0,3,3.0,6.0,0,0,0


In [5]:
# Write the cleaned frame to CSV (the row index is written as the first column)
clean_csv_path = "datasets_clean/post_clean.csv"
df.to_csv(clean_csv_path)

In [6]:
# Columns kept for the encoded dataset
cols_focus = [
'renewvia_id', 'community',
'connection_period', 'country', 'gender', 'age',
'household_income_change', 
'avg_household_income', 
'girls_schooling_change','boys_schooling_change', 
'school_performance_change',
'business_type', 'business_recent', 'business_from_minigrid',
'business_use_minigrid', 
'light_hours_current', 'light_primary_sources', 'kerosene_lamp_usage_change',
'kerosene_lamps_count', 'kerosene_lamp_usage_time',
'kerosene_lamps_cost', 'cooking_energy_sources',
'cooking_fuel_collection_time',
'cooking_fuel_responsible', 'cooking_energy_cost',
'community_lights', 'home_exterior_lights',
'exterior_lights_minigrid', 'feel_safe_dark',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'feel_safer_with_minigrid',
'water_source', 'clean_drinking_water',
'clean_drinking_water_source',
'water_collection_travel_distance', 'water_collection_time',
'water_collection_responsible', 'avg_person_age_water_collection',
'water_cost', 'clinic_travel_distance',
'clinic_electricity_access_minigrid', 
'clinic_refrigeration_access', 'better_access_health_minigrid',
'minigrid_access_life_improvement',
'customerAccountNumber', 'tariff',
'business_owners_binary', 'business_owners_female',   
]

# Encoding values
df_enc = df[cols_focus].copy()

# Load the per-column answer mapping (raw answer -> canonical answer)
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['col_answers_map']
n_col = len(answers)

# Keywords that classify a "_change" / "_improvement" answer by direction.
# They are tested as substrings, in the order neutral -> down -> up.
neutral = ['no', 'same', 'stay']
down = ['lost', 'decrease', 'decreased', 'decreases']
up = ['add', 'better', 'increase',
      'increased', 'increases', 'greater']

for answer in answers:
    col = answer['new_name']
    mapping = answer['mapping']

    # Skip columns that are not encoded or have no canonical answers
    if col not in cols_focus or not mapping or col == 'household_business_owners':
        continue

    # Canonical answers, cleaned the same way as the data itself
    new_vals = [replace_str(v.lower()) for v in mapping.values()]
    col_mapping = dict()

    if "yes" in new_vals and "no" in new_vals:
        # Binary question. The option keys use the underscore form that
        # replace_str produces (the former 'option 1' / 'option 2' keys could
        # never match cleaned data) and follow the option 1 = yes,
        # option 2 = no convention applied to the business columns earlier.
        # NOTE(review): confirm that convention holds for every yes/no question.
        col_mapping = {'no': 0, 'yes': 1,
                       'option_1': 1, 'option_2': 0}

    elif any(map(col.__contains__, ["_change", "_improvement"])):
        # Directional question: -1 = worse, 0 = unchanged, 1 = better.
        # Answers matching no keyword are left unencoded.
        for cat in new_vals:
            cat = str(cat)
            if cat == 'nan':
                continue
            if cat == 'no' or any(map(cat.__contains__, neutral)):
                col_mapping[cat] = 0
            elif any(map(cat.__contains__, down)):
                col_mapping[cat] = -1
            elif cat == 'yes' or any(map(cat.__contains__, up)):
                col_mapping[cat] = 1
    else:
        # Any other question: 1-based code from the position in the mapping
        # (a repeated canonical answer keeps the position of its last occurrence)
        for idx, val in enumerate(new_vals):
            col_mapping[val] = idx + 1

    # Assign back rather than replace(..., inplace=True) on df_enc[col]:
    # the in-place form acts on a temporary under pandas Copy-on-Write and
    # would leave df_enc unencoded.
    if col_mapping:
        df_enc[col] = df_enc[col].replace(col_mapping)

# df_enc.head()
df_enc.to_csv('datasets_encoded/post_encoded.csv')