In [1]:
import pandas as pd
import numpy as np
import json
import category_encoders as ce
import matplotlib.pyplot as plt

In [2]:
# Columns to read from the annotated post-connection survey export.
# The groupings below are only a reading aid; the list is consumed as
# `usecols` when the CSV is loaded.
cols_post = [
    # respondent / interview
    "end", "renewvia_id", "interviewed_before",
    "nigeria_community", "kenya_community",
    "connection_period", "country", "gender", "age",
    # household
    "occupation_change", "household_income_change",
    "avg_household_income", "household_headcount_change",
    # schooling
    "girls_schooling_change", "boys_schooling_change",
    "school_performance_change",
    "girls_unschooled_reasons", "boys_unschooled_reasons",
    # business
    "household_business_owners",
    "business_type", "business_recent", "business_from_minigrid",
    "business_use_minigrid",
    # power and appliances
    "power_sources", "power_sources_usage", "power_sources_primary",
    "appliances_count", "appliances_count_change", "cellphones_count",
    "appliances_type", "appliances_type_addition",
    # lighting
    "light_hours_current", "light_primary_sources",
    "kerosene_lamp_usage_change", "kerosene_lamps_count",
    "kerosene_lamp_usage_time", "kerosene_lamps_cost",
    # cooking and charging
    "cooking_energy_sources", "cooking_fuel_collection_time",
    "cooking_fuel_responsible", "cooking_energy_cost",
    "applicances_charging_sources", "applicances_charging_cost",
    # safety
    "community_lights", "home_exterior_lights",
    "exterior_lights_minigrid", "feel_safe_dark",
    "feel_safe_if_exterior_lights", "feel_unsafe_reasons",
    "feel_safer_with_minigrid",
    # phone charging
    "phone_charge_location", "phone_charge_frequency",
    "phone_charge_cost", "phone_charge_travel_distance",
    # water
    "water_source", "clean_drinking_water",
    "clean_drinking_water_source", "community_clean_water_source",
    "water_collection_travel_distance", "water_collection_time",
    "water_collection_responsible", "avg_person_age_water_collection",
    "water_cost",
    # health
    "clinic_travel_distance", "clinic_electricity_access_minigrid",
    "clinic_refrigeration_access", "better_access_health_minigrid",
    # perceived impact
    "minigrid_access_life_improvement",
    "minigrid_access_productivity_improvement",
    "minigrid_effect_on_household_female",
]

# Load only the survey columns listed in `cols_post`.
data = pd.read_csv("datasets_annotated/post_annotated.csv", usecols=cols_post)
print(data.shape)

# Standardize case. NOTE: astype(str) turns every missing value into the
# literal string "nan"; the cells below rely on that sentinel.
data = data.apply(lambda x: x.astype(str).str.lower())

# Get identified customers only. `.copy()` makes the result an independent
# frame so the column assignments below do not write into a slice.
data = data[~data['renewvia_id'].isin(["no meter", "nan", ""])].copy()

# Merge the two per-country community columns. Only a value that is exactly
# "nan" (the missing sentinel) is blanked; stripping "nan" from the
# concatenated text would also eat those letters out of real community names.
data["community"] = (data["nigeria_community"].replace("nan", "")
                     + data["kenya_community"].replace("nan", ""))

# Household income: keep the first (optionally signed, optionally
# comma-grouped) number found in the answer and convert it to numeric.
data["avg_household_income"] = pd.to_numeric(
    data["avg_household_income"]
    .str.extract(r'([-+]?\d*,?\d+)', expand=False)
    .str.replace(",", "", regex=False)
)

# Remove duplicates, keeping the row with the highest reported income per id.
# The id is normalised first (trailing ".0" float artefact dropped, upper
# case) so that e.g. "123" and "123.0" are recognised as the same customer.
df = (
    data.assign(
        renewvia_id=data["renewvia_id"]
        .str.replace(r"\.0$", "", regex=True)
        .str.upper()
    )
    .sort_values('avg_household_income', ascending=False)
    .drop_duplicates('renewvia_id')
    .sort_index()
    .rename(columns={"clinic_electricity_access_minigrid": "clinic_electricity_access"})
)
print(df.shape)
df.head()

(2603, 70)
(2304, 71)


Unnamed: 0,end,renewvia_id,interviewed_before,connection_period,country,nigeria_community,kenya_community,age,gender,occupation_change,...,avg_person_age_water_collection,water_cost,clinic_travel_distance,clinic_electricity_access,clinic_refrigeration_access,better_access_health_minigrid,minigrid_access_life_improvement,minigrid_access_productivity_improvement,minigrid_effect_on_household_female,community
0,2022-07-28 10:31:53,141201,not sure,24-36 months,kenya,,kalobeyei town,,female,yes,...,,,,,,,,,,kalobeyei town
1,2022-08-06 03:38:46,161147,not sure,24-36 months,kenya,,oyamo,,male,no,...,,,,,,,,,,oyamo
2,2022-08-03 09:54:38,ND0043,no,3-6 months,kenya,,ndeda,,male,yes,...,,,,,,,,,,ndeda
3,2022-07-23 05:29:51,131104,no,12-24 months,kenya,,kalobeyei settlement,24.0,male,yes,...,15 years or older,"i don't pay, it's free.",between 2-3 km,yes,yes,yes,"no, it's the same",no,no,kalobeyei settlement
4,2022-07-22 05:13:00,131110,no,12-24 months,kenya,,kalobeyei settlement,45.0,male,no,...,15 years or older,"i don't pay, it's free.",less than 1km,yes,yes,yes,"no, it's the same",,no female affected,kalobeyei settlement


In [3]:
# Collapse every answer that mentions "unsafe travel" into a single label.
df["feel_unsafe_reasons"] = df["feel_unsafe_reasons"].map(
    lambda reason: "unsafe_travel" if "unsafe travel" in reason else reason
)

# Harmonise the spellings of the top price bracket. The result is assigned
# back to the column: calling replace(..., inplace=True) on `df[col]` is
# chained assignment, which warns on recent pandas and leaves `df` unchanged
# once Copy-on-Write is enabled.
df['phone_charge_cost'] = df['phone_charge_cost'].replace(
    {'n_kes_and_above': '1000 n/kes and above',
     '1000 n and above': '1000 n/kes and above'}
)

In [4]:
# Cost questions whose "nothing" / "free" style answers are collapsed
# into the single label "nothing".
costs = [
    "kerosene_lamps_cost",
    "cooking_energy_cost",
    "applicances_charging_cost",
    "phone_charge_cost",
    "water_cost",
]


def _as_nothing_if_free(answer):
    """Return "nothing" when the answer mentions "nothing" or "free", else the answer."""
    if "nothing" in answer or "free" in answer:
        return "nothing"
    return answer


for col in costs:
    df[col] = df[col].map(_as_nothing_if_free)

In [5]:
# Derived 0/1 indicator columns based on substring matches.


def _contains_any(text, needles):
    """Return 1 if `text` contains at least one of `needles`, otherwise 0."""
    return 1 if any(needle in text for needle in needles) else 0


# Fuel keywords that mark an energy answer as "unclean".
# clean = ['battery', 'solar', 'renewvia']
unclean = ['petrol', 'diesel', 'kerosene', 'charcoal']

# Energy-source columns that get a "<col>_unclean" flag.
bin_cols = [
    "applicances_charging_sources",
    "cooking_energy_sources",
    "light_primary_sources",
    "power_sources",
    # "power_sources_primary",
]

# Chore columns that get a "<col>_binary" flag when an adult male is named.
chores_male_involved = [
    "cooking_fuel_responsible",
    "water_collection_responsible",
]

# Age brackets counted as school aged.
# NOTE(review): "15 years or older" is not listed here, yet a later cell
# relabels it to '15 -18 years old' - confirm whether it should count.
school_aged = ['12-15 years old', '15 -18 years old',
               '10-12 years old', '5-10 years old']

df["water_collection_school_aged"] = df["avg_person_age_water_collection"].map(
    lambda answer: _contains_any(answer, school_aged)
)

for col in bin_cols:
    df[col + "_unclean"] = df[col].map(
        lambda answer: _contains_any(answer, unclean)
    )

for col in chores_male_involved:
    df[col + "_binary"] = df[col].map(
        lambda answer: _contains_any(answer, ["adult male"])
    )

In [6]:
# Count of multiple selection columns


def _count_selected(answer, sep=";"):
    """Count the non-empty options in a `sep`-separated multi-select answer.

    Missing answers count as 0. At this point in the notebook a missing
    answer is the literal string "nan" (every column was cast to str when the
    data was loaded), so it must be excluded explicitly - otherwise it would
    be counted as one selected option.
    """
    if not isinstance(answer, str) or answer in ("", "nan"):
        return 0
    return sum(1 for option in answer.split(sep) if option.strip())


# Both minigrid business questions carry generic option labels.
# Assign the result back instead of calling replace(inplace=True) on the
# selected column, which is chained assignment and does not update `df`
# under pandas Copy-on-Write.
for col in ('business_use_minigrid', 'business_from_minigrid'):
    df[col] = df[col].replace({'option 1': 'yes', 'option 2': 'no'})

# Drop the explicit "none;" option so it is not counted as an owner.
df['household_business_owners'] = df['household_business_owners'].map(
    lambda owners: owners.replace("none;", "")
)
df['business_owners_count'] = df['household_business_owners'].map(_count_selected)
df['business_owners_binary'] = df['business_owners_count'].map(
    lambda count: 1 if count >= 1 else 0
)
df['business_owners_female'] = df['household_business_owners'].map(
    lambda owners: 1 if 'female' in owners else 0
)

# Added appliances are space separated; switch to ";" and count them.
df['appliances_type_addition'] = df['appliances_type_addition'].map(
    lambda added: added.replace(" ", ";")
)
df['appliances_addition_count'] = df['appliances_type_addition'].map(_count_selected)

# Flag respondents whose only stated reason is theft. Both the spaced and
# the underscored spelling are accepted, because the replacement maps used
# elsewhere in this notebook show that both spellings occur in the export.
df['unsafe_due_to_theft'] = df['feel_unsafe_reasons'].map(
    lambda reason: 1
    if isinstance(reason, str) and reason.replace(" ", "_") == "potential_theft"
    else 0
)

# Harmonise category labels. Every result is assigned back to its column:
# replace(inplace=True) on `df[col]` is chained assignment, which warns on
# recent pandas and is a no-op under Copy-on-Write.
df['connection_period'] = df['connection_period'].replace('1-3 months',
                                                          '0-3 months')

df['avg_person_age_water_collection'] = df['avg_person_age_water_collection'].replace(
    {'15 years or older': '15 -18 years old',
     'years_or_older': '18_or_older'}
)

df["community_lights"] = df["community_lights"].replace(
    {"no_none": "no", "street_lights": "yes"}
)

distance_cols = ['clinic_travel_distance',
                 'phone_charge_travel_distance',
                 'water_collection_travel_distance']

# One shared map for all distance questions, built once outside the loop.
# NOTE(review): 'between_2-5_km' is folded into 'between 2-3 km' - confirm
# that merging these brackets is intended.
rep_map = {
    'no_need_to_travel_-_charge_at_home': 'no travel',
    'no need to travel - charge at home': 'no travel',
    'no need to travel - at home water supply': 'no travel',
    'no_need_to_travel_-_at_home_water_supply': 'no travel',
    'between_2-5_km': 'between 2-3 km',
    'less than 1km': 'less than 1 km',
    'less_than_1_km': 'less than 1 km',
    'greater_than_10km': 'greater than 10 km',
}
for col in distance_cols:
    # Trailing whitespace is stripped first so that answers such as
    # 'less than 1 km ' hit the map keys instead of needing their own entry.
    df[col] = df[col].str.rstrip().replace(rep_map)

# NOTE(review): the cost-collapsing cell earlier in this notebook already
# turns every water_cost answer containing "free" into "nothing", so this
# replacement presumably never matches when the cells run in order.
df["water_cost"] = df["water_cost"].replace("i don't pay, it's free.",
                                            "i don't pay, it's free")

water_cols = ['water_source', "clean_drinking_water_source"]

# Canonical labels for the water-source answers, built once outside the loop.
rep_map = {
    'clear water source (fresh spring, lake etc.)': 'clear_water',
    'dirty water source (pond, contaminated well etc.)': 'dirty_water',
    'dirty water source (pond, contaminated well, etc)': 'dirty_water',
    'bottle water': 'bottled water',
    'boiled': 'boiled water',
    'boil': 'boiled water',
    'boild': 'boiled water',
    'treated / filtered water': 'treated',
}
for col in water_cols:
    # Strip trailing whitespace before the lookup so padded answers (e.g.
    # 'clean community source ') match the keys above; assign back rather
    # than using replace(inplace=True) on the selected column, which does
    # not update `df` under pandas Copy-on-Write.
    df[col] = df[col].str.rstrip().replace(rep_map)

# Remaining columns that only need their trailing whitespace removed.
trim_cols = ['clean_drinking_water_source',
             'phone_charge_travel_distance']
for c in trim_cols:
    df[c] = df[c].str.rstrip()


In [7]:
# distance_cols = ['clinic_travel_distance', 
#                  'phone_charge_travel_distance', 
#                  'water_collection_travel_distance']
# for col in distance_cols:
#     print(df[col].unique())

In [8]:
# Turn the "nan" string sentinel back into a real missing value.
df = df.replace('nan', np.nan)
def replace_str(value):
    """Normalise one survey answer into a compact, underscore-separated token.

    Values whose type is not ``str`` (e.g. NaN) are returned untouched. For a
    string, a fixed sequence of substrings is deleted first, then a second
    sequence is replaced by ``_``. Trailing whitespace is stripped after each
    individual substitution, and substitutions run strictly in the listed
    order, so a later entry only sees what the earlier ones left behind.
    """
    if type(value) is not str:
        return value

    # (replacement, substrings-to-replace) pairs, applied top to bottom.
    substitutions = (
        ('', (")", '.)', ',', '/', "ikom",
              'u\n', "'", ".0",
              "years", ' years', "dead",
              "_years",
              "*")),
        ('_', ('- ', ' (', ' ', '-', "__")),
    )

    cleaned = value
    for replacement, targets in substitutions:
        for target in targets:
            cleaned = cleaned.replace(target, replacement).rstrip()
    return cleaned

    
# Columns that must end up as nullable integers.
cols_num = ['appliances_count',
            'appliances_addition_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
            'business_owners_count', 'business_owners_female']

# Every column except the id and the already-numeric income is normalised
# with `replace_str`.
cols_replace = [c for c in df.columns
                if c not in ("renewvia_id", "avg_household_income")]

for col in cols_replace:
    df[col] = df[col].map(replace_str)
    if col in cols_num:
        # Keep the first run of digits and store it as a nullable integer.
        # Range answers such as "8-12" / "3-4" therefore resolve to their
        # lower bound; no separate replacement is needed (and an exact-match
        # one could not fire anyway, since replace_str has already turned
        # "-" into "_").
        df[col] = (
            df[col]
            .astype(str)
            .str.extract(r'(\d+)', expand=False)
            .astype(float)
            .astype('Int64')
        )

df[cols_num].head()

Unnamed: 0,appliances_count,appliances_addition_count,cellphones_count,light_hours_current,kerosene_lamps_count,business_owners_count,business_owners_female
0,,5,,,0,1,0
1,,2,,,0,0,0
2,,2,,,2,1,0
3,3.0,4,2.0,12.0,0,2,1
4,2.0,3,3.0,6.0,0,0,0


In [9]:
# Persist the cleaned frame; to_csv writes the row index as the first column by default.
df.to_csv("datasets_clean/post_clean.csv")

In [10]:
# Columns eligible for value encoding; a column is only encoded when the
# encoding file also carries an entry for it.
cols_enc = [
    # respondent / interview
    "renewvia_id", "interviewed_before", "connection_period", "country",
    "community", "age", "gender",
    # household
    "occupation_change", "household_income_change",
    "avg_household_income", "household_headcount_change",
    # schooling
    "girls_schooling_change", "boys_schooling_change",
    "school_performance_change",
    "girls_unschooled_reasons", "boys_unschooled_reasons",
    # business
    "household_business_owners", "business_type", "business_recent",
    "business_from_minigrid", "business_use_minigrid",
    # power and appliances
    "power_sources_usage", "appliances_count", "appliances_count_change",
    "cellphones_count", "appliances_type_addition",
    # lighting
    "light_hours_current", "kerosene_lamp_usage_change",
    "kerosene_lamps_count", "kerosene_lamp_usage_time",
    "kerosene_lamps_cost",
    # cooking and charging
    "cooking_energy_sources", "cooking_fuel_collection_time",
    "cooking_fuel_responsible", "cooking_energy_cost",
    "applicances_charging_sources", "applicances_charging_cost",
    # safety
    "community_lights", "home_exterior_lights", "exterior_lights_minigrid",
    "feel_safe_dark", "feel_safe_if_exterior_lights",
    "feel_unsafe_reasons", "feel_safer_with_minigrid",
    # phone charging
    "phone_charge_location", "phone_charge_frequency",
    "phone_charge_cost", "phone_charge_travel_distance",
    # water
    "water_source", "clean_drinking_water", "clean_drinking_water_source",
    "community_clean_water_source", "water_collection_travel_distance",
    "water_collection_time", "water_collection_responsible",
    "avg_person_age_water_collection", "water_cost",
    # health
    "clinic_travel_distance", "clinic_electricity_access",
    "clinic_refrigeration_access", "better_access_health_minigrid",
    # perceived impact
    "minigrid_access_life_improvement",
    "minigrid_access_productivity_improvement",
    "minigrid_effect_on_household_female",
    # derived indicators
    "water_collection_school_aged",
    "applicances_charging_sources_unclean",
    "cooking_energy_sources_unclean",
    "power_sources_unclean",
    "cooking_fuel_responsible_binary",
    "water_collection_responsible_binary",
    "business_owners_count", "business_owners_binary",
    "business_owners_female", "appliances_addition_count",
    "unsafe_due_to_theft",
]
# Encode categorical answers using the per-column maps in col_encoding.json.
df_enc = df.copy()
with open('col_encoding.json', 'r') as f:
    col_answers_map = json.load(f)

# Each entry carries a 'col_name' and the 'col_enc_map' to apply to it.
answers = col_answers_map["col_encoding"]
n_col = len(answers)

encodable = set(cols_enc)
for entry in answers:
    col = entry['col_name']
    if col in encodable:
        # Assign the result back: replace(inplace=True) on `df_enc[col]` is
        # chained assignment, which warns on recent pandas and leaves
        # `df_enc` unchanged once Copy-on-Write is enabled.
        df_enc[col] = df_enc[col].replace(entry['col_enc_map'])

df_enc.to_csv('datasets_encoded/post_encoded.csv')