In [1]:
import pandas as pd
import numpy as np
import json
import collections
import re
import category_encoders as ce

In [2]:
# Load the annotated pre-connection survey, normalise text case, derive the
# `community` and numeric `avg_household_income` columns, and keep one row per
# renewvia_id.
cols_pre = [
'end', 'renewvia_id', 'status',
'connection_period_initial', 'country',
'nigeria_community', 'kenya_community', 'age', 'gender',
'occupation', 'primary_provider', 'primary_provider_occupation',
'employement_type', 'avg_household_income', 'household_headcount',
'adult_headcount', 'girls_headcount', 'boys_headcount',
'girls_schooling', 'girls_unschooled_reasons',
'boys_schooling', 'boys_unschooled_reasons',
'household_business_owners',  'power_sources',
'power_sources_usage', 'power_sources_primary',
'appliances_count', 'cellphones_count', 'appliances_type',
'appliances_addition_type', 'light_hours_current',
'light_primary_sources', 'kerosene_lamps_count',
'kerosene_lamp_usage_time', 'kerosene_lamps_cost', 'cooking_energy_sources',
'cooking_fuel_collection_time', 'cooking_fuel_responsible',
'cooking_energy_cost', 'applicances_charging_sources',
'applicances_charging_cost', 'feel_safe_dark',
'community_lights', 'home_exterior_lights',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'phone_charge_location', 'phone_charge_frequency',
'phone_charge_cost', 'phone_charge_travel_distance', 'water_source',
'clean_drinking_water', 'clean_drinking_water_source',
'clean_water_source', 'water_collection_travel_distance',
'water_collection_time', 'water_collection_responsible',
'avg_person_age_water_collection', 'water_cost',
'clinic_travel_distance',
'clinic_electricity_access', 'clinic_refrigeration_access', 'end_date',
# 'cooking_energy_sources' was listed a second time here; the duplicate is dropped.
'occupation_secondary_provider',
'community_clean_water_source'
]

#Importing the datasets
data = pd.read_csv("datasets_annotated/initial_annotated.csv", usecols=cols_pre)
print(data.shape)
# standardize case: every column becomes a lower-cased string, so missing
# values turn into the literal text "nan" from here on.
data = data.apply(lambda x: x.astype(str).str.lower())
# Get identified customers only
#data = data[~data['renewvia_id'].isin(["no meter", "nan", ""])]

# Exactly one of the two country-specific community columns is expected to be
# filled per row. Blank out the "nan" placeholder as a WHOLE value before
# concatenating, so a community whose name merely contains "nan" is not mangled.
data["community"] = (
    data["nigeria_community"].replace("nan", "")
    + data["kenya_community"].replace("nan", "")
)

#Household Income
# Take the first signed integer in the answer, allowing any number of
# thousands separators ("1,000,000" -> 1000000, not 1000). Raw string avoids
# the invalid-escape warning for "\d".
data["avg_household_income"] = data["avg_household_income"].str.extract(
    r'([-+]?\d+(?:,\d+)*)', expand=False
)
data["avg_household_income"] = data["avg_household_income"].str.replace(",", "", regex=False)
data["avg_household_income"] = pd.to_numeric(data["avg_household_income"])

# Remove duplicates
df = data.copy()
# Normalise the id BEFORE de-duplicating: otherwise "570063" and "570063.0"
# are treated as different customers and both rows survive.
df['renewvia_id'] = df['renewvia_id'].map(lambda x: x.replace(".0", "").upper())
# Per id keep the row with the highest income (NaN incomes sort last), then
# restore the original row order.
df = df.sort_values('avg_household_income', ascending=False).drop_duplicates('renewvia_id').sort_index()
print(df.shape)
df.head()

(3952, 65)
(1338, 66)


Unnamed: 0,end,status,connection_period_initial,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,...,water_collection_responsible,avg_person_age_water_collection,water_cost,clinic_travel_distance,clinic_electricity_access,clinic_refrigeration_access,end_date,occupation_secondary_provider,community_clean_water_source,community
0,9/28/2021 20:25,pre-connection,,501121,nigeria,akipelai,,,,,...,,,,,,,9/28/2021,,,akipelai
1,10/24/2021 9:55,pre-connection,3-6 months,570063,nigeria,balep,,45,male,farming,...,child male,,0-500 n/kes,less_than_1_km,no,no,10/24/2021,,,balep
2,10/24/2021 11:16,pre-connection,3-6 months,570028,nigeria,balep,,52 years,male,education,...,child male,18 or older,"i don't pay, it's free",less_than_1_km,no,no,10/24/2021,,,balep
3,10/24/2021 11:22,pre-connection,3-6 months,570156,nigeria,balep,,23 years,male,farming,...,adult female,18 or older,"i don't pay, it's free",less_than_1_km,no,no,10/24/2021,,,balep
4,10/25/2021 19:50,pre-connection,3-6 months,570097,nigeria,balep,,27 years,male,education,...,child female,12-15 years old,"i don't pay, it's free",less_than_1_km,no,no,10/25/2021,,,balep


In [3]:
# Count of multiple selection columns
def _count_selected(answer):
    """Return how many options a ';'-delimited multi-select answer contains.

    An empty answer and the literal "nan" (what a missing value looks like
    after the earlier ``astype(str)``) both count as zero selections; before
    this guard a missing answer was counted as one selected option.
    """
    if answer in ("", "nan"):
        return 0
    # The last character is dropped before splitting so a trailing ';' does
    # not produce an extra empty item.
    return len(answer[:-1].split(";"))


df['household_business_owners'] = df['household_business_owners'].map(lambda x: x.replace("none;", ""))
df['business_owners_count'] = df['household_business_owners'].map(_count_selected)
df['appliances_type_count'] = df['appliances_type'].map(_count_selected)

df['business_owners_binary'] = df['business_owners_count'].map(lambda x: 1 if x >= 1 else 0)
df['business_owners_female'] = df['household_business_owners'].map(lambda x: 1
                                                                    if x != ""
                                                                    and ('female' in x)
                                                                    else 0)
# NOTE(review): appliances_type_count above is computed before spaces are
# turned into ';' here, so a space-separated answer counts as a single item.
# Confirm whether that ordering is intended.
df['appliances_type'] = df['appliances_type'].map(lambda x: x.replace(" ", ";"))

# Whole-value recodes. The result is assigned back to the column instead of
# calling `df[col].replace(..., inplace=True)`: that form mutates a temporary
# Series and leaves `df` untouched under pandas Copy-on-Write / pandas 3.
df['connection_period_initial'] = df['connection_period_initial'].replace('1-3 months', '0-3 months')

df['avg_person_age_water_collection'] = df['avg_person_age_water_collection'].replace(
    '15 years or older', '15 -18 years old'
)

# df["age"].replace({"_hours": "", "422": "42", "299": "29", "1989": "34"}, inplace=True)
df["water_cost"] = df["water_cost"].replace("i don't pay, it's free.",
                                            "i don't pay, it's free")

df["community_lights"] = df["community_lights"].replace({'street lights': 'yes', 'no, none': 'no'})

# Strip trailing whitespace so otherwise identical answers compare equal.
trim_cols = ['clean_drinking_water_source', 'phone_charge_travel_distance']
for c in trim_cols:
    df[c] = df[c].str.rstrip()

# Harmonise the spelling variants of the travel-distance answers.
distance_cols = ['clinic_travel_distance',
                 'phone_charge_travel_distance',
                 'water_collection_travel_distance']
distance_map = {
    'no_need_to_travel_-_charge_at_home': 'no travel charge at home',
    'no need to travel - charge at home': 'no travel charge at home',
    'no need to travel - at home water supply': 'no travel home water supply',
    'no_need_to_travel_-_at_home_water_supply': 'no travel home water supply',
    # NOTE(review): "2-5 km" is recoded to "2-3 km" - confirm this is the
    # intended canonical bucket and not a typo.
    'between_2-5_km': 'between 2-3 km',
    'less than 1km': 'less than 1 km',
    'less than 1 km ': 'less than 1 km',
    'less_than_1_km': 'less than 1 km',
    'greater_than_10km': 'greater than 10 km'}
for col in distance_cols:
    df[col] = df[col].replace(distance_map)

# Harmonise the spelling variants of the water-source answers.
water_cols = ['clean_water_source', 'water_source', "clean_drinking_water_source"]
water_map = {
    'clear water source (fresh spring, lake etc.)': 'clear_water',
    'dirty water source (pond, contaminated well etc.)': 'dirty_water',
    'dirty water source (pond, contaminated well, etc)': 'dirty_water',
    'bottle water': 'bottled water',
    'boiled': 'boiled water',
    'boil': 'boiled water',
    'boild': 'boiled water',
    'clean community source ': 'clean community source',
    'treated / filtered water': 'treated',
}
for col in water_cols:
    df[col] = df[col].replace(water_map)

# NOTE(review): despite its name this map is only applied to
# 'feel_unsafe_reasons'; the two "hours" entries look like they were meant for
# a duration column - confirm.
time_map = {
    'between 2 and 5 hours':'between 2-5 hours',
    'between 5 and 8 hours': 'between 5-8 hours',
    'unsafe travel to obtain, water, supplies or charging phones': "unsafe_travel",
    'unsafe travel to obtain, water, supplies and charging phones': "unsafe_travel",
}
df['feel_unsafe_reasons'] = df['feel_unsafe_reasons'].replace(time_map)

In [4]:
# Derive 0/1 indicator columns from free-text answers via substring matching.
# clean = ['battery', 'solar', 'renewvia']

# An energy answer is flagged "unclean" when it mentions any of these fuels.
unclean = ['petrol', 'diesel', 'kerosene', 'charcoal']

# Energy-source columns that each get a "<column>_unclean" flag.
bin_cols = [
    "applicances_charging_sources",
    "cooking_energy_sources",
    "light_primary_sources",
    "power_sources",
    "power_sources_primary",
]

# Household-chore columns that each get a "<column>_binary" flag, set when the
# answer mentions an adult male.
chores_male_involved = [
    "cooking_fuel_responsible",
    "water_collection_responsible",
]

# Age bands for which the water collector is flagged as school aged.
school_aged = ['12-15 years old', '15 -18 years old',
               '10-12 years old', '5-10 years old']

df["water_collection_school_aged"] = df["avg_person_age_water_collection"].map(
    lambda age_band: int(any(band in age_band for band in school_aged))
)

for col in bin_cols:
    df[col + "_unclean"] = df[col].map(
        lambda sources: int(any(fuel in sources for fuel in unclean))
    )

for col in chores_male_involved:
    df[col + "_binary"] = df[col].map(lambda person: int("adult male" in person))

In [5]:
# Cells holding exactly the text 'nan' become real missing values (np.nan)
# across the whole frame.
df.replace(to_replace='nan', value=np.nan, inplace=True)
def replace_str(value):
    """Normalise a free-text answer into a compact, underscore-separated token.

    Non-string inputs (numbers, NaN, None) are returned untouched. For strings,
    a fixed sequence of substrings is first deleted and a second sequence is
    then replaced by '_'. Substitutions run strictly in the listed order and
    trailing whitespace is stripped after every single step, so the order of
    the entries is part of the behaviour.
    """
    if type(value) is not str:
        return value

    # Substrings that are removed outright.
    deletions = (")", '.)', ',', '/', "ikom",
                 'u\n', "'", ".0",
                 "years", ' years', "dead",
                 "_years", "*",
                 # "000", "00",
                 )
    # Substrings that become a single underscore; the final "__" entry
    # collapses doubled underscores produced by the earlier entries.
    underscored = ('- ', ' (', ' ', '-', "__")

    for substitute, targets in (('', deletions), ('_', underscored)):
        for target in targets:
            value = value.replace(target, substitute).rstrip()

    return value

# Columns that must end up as nullable integers.
cols_num = ['household_headcount', 'age',
            'girls_schooling', 'boys_schooling', 'appliances_count',
            'appliances_type_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
            'business_owners_count', 'business_owners_female',
           'adult_headcount', 'girls_headcount', 'boys_headcount']

# Hand-corrections for specific age entries. Assigned back rather than using
# `df["age"].replace(..., inplace=True)`, which mutates a temporary Series and
# leaves `df` unchanged under pandas Copy-on-Write / pandas 3.
df["age"] = df["age"].replace({'34_hours':"34", '1989': "34", "299": "29", "422":"42"})

# Every column except the id and the already-numeric income gets its text
# normalised by replace_str.
cols_replace = [c for c in df.columns
                if c not in ("renewvia_id", "avg_household_income")]

# Convert to numeric columns
for col in cols_replace:
    df[col] = df[col].apply(replace_str)
    if col in cols_num:
        # if col == 'light_hours_current':
        #     df[col].replace({"8-12": "8", "3-4":"3"}, inplace=True)
        # Keep the first run of digits; answers without digits become <NA>.
        # Raw string avoids the invalid-escape warning for "\d".
        df[col] = (
            df[col]
            .astype(str)
            .str.extract(r'(\d+)', expand=False)
            .astype(float)
            .astype('Int64')
        )

In [6]:
# One 0/1 indicator per normalised 'feel_unsafe_reasons' answer. The flag is 1
# only when the whole answer equals the listed value; anything else, including
# a missing answer, gives 0.
unsafe_reason_flags = {
    'i_dont_feel_unsafe': "i_dont_feel_unsafe",
    'unsafe_due_to_theft': "potential_theft",
    'unsafe_due_to_unsafe_travel': "unsafe_travel",
    'unsafe_due_to_lack_of_community_lighting': "lack_of_community_lighting",
}
for flag_col, reason in unsafe_reason_flags.items():
    # `reason` is bound as a default argument so each lambda keeps its own value.
    df[flag_col] = df['feel_unsafe_reasons'].map(
        lambda answer, reason=reason: int(answer == reason)
    )

In [7]:
# Persist the cleaned (not yet encoded) survey; the DataFrame index is written
# as the first, unnamed CSV column.
clean_output_path = "datasets_clean/initial_clean.csv"
df.to_csv(clean_output_path)

In [8]:
# cols_focus = [
# 'renewvia_id', 'community',
# 'age', 'gender', 'status',
# 'connection_period_initial',
# 'avg_household_income', 
# 'minigrid_signup_primary_reason',
# 'light_hours_current', 
# 'kerosene_lamps_count',
# 'kerosene_lamp_usage_time', 'kerosene_lamps_cost',
# 'cooking_energy_sources', 'cooking_energy_sources', 
# 'cooking_fuel_collection_time', 'cooking_fuel_responsible',
# 'cooking_energy_cost', 
# 'feel_safe_dark', 'community_lights', 'home_exterior_lights',
# 'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
# 'clean_drinking_water', 'clean_drinking_water_source',
# 'clean_water_source', 'water_collection_travel_distance',
# 'water_collection_time', 'water_collection_responsible',
# 'avg_person_age_water_collection', 'water_cost',
# 'clinic_electricity_access', 'clinic_refrigeration_access', 
# 'end_date', 'appliances_count',
# 'business_owners_binary', 'business_owners_female',
#     'unsafe_due_to_theft'
# ]


# Encoding values: work on an independent deep copy so `df` keeps the cleaned,
# un-encoded answers.
df_enc = df.copy(deep=True)
# # mapping missing or mistyped answers
# with open('col_answers_mapping.json', 'r') as f:
#     col_answers_map = json.load(f)
    
# answers = col_answers_map['col_answers_map']
# n_col = len(answers)
    
# c_map = []
# for i in range(0, n_col):
#     col = answers[i]['new_name']
#     mapping = answers[i]['mapping']
#     if col in cols_focus and mapping != {} and col != 'household_business_owners':
#         vals = list(mapping.values())
#         new_vals = [replace_str(v.lower()) for v in vals]
#         if "yes" in new_vals and "no" in new_vals:
#             # c_map.append({col: })
#             col_mapping = {'no':0, 'yes':1, 
#                        'option 1': 0, 'option 2': 1}
            
#         elif col == "community_lights":
#             col_mapping = {"yes": 1, 'no': 0}
            
#         else:
#             col_mapping = dict()
#             rng = len(new_vals)
#             for idx, val in enumerate(new_vals):
#                 col_mapping[val] = idx+1
                
#         df_enc[col] = df_enc[col].replace(col_mapping)
# # df_enc.head()
# df_enc.to_csv('datasets_encoded/initial_encoded.csv')

In [9]:
# Columns eligible for encoding: an entry of col_encoding.json is applied only
# if its column name appears in this list.
cols_enc = [
 'renewvia_id', 'country',  'community', 'age',
 'status','connection_period_initial',
 'gender', 'occupation', 'employement_type',
 'avg_household_income', 'household_headcount',
 'adult_headcount', 'girls_headcount',
 'boys_headcount','girls_schooling',
 'girls_unschooled_reasons', 'boys_schooling',
 'boys_unschooled_reasons', 'power_sources_usage',
 'appliances_count', 'cellphones_count',
 'light_hours_current', 'kerosene_lamps_count',
 'kerosene_lamp_usage_time',
 'kerosene_lamps_cost', 'cooking_energy_sources',
 'cooking_fuel_collection_time', 'cooking_energy_cost',
 'applicances_charging_sources', 'applicances_charging_cost',
 'feel_safe_dark', 'community_lights',
 'home_exterior_lights', 'feel_safe_if_exterior_lights',
 'feel_unsafe_reasons', 'phone_charge_location',
 'phone_charge_frequency', 'phone_charge_cost',
 'phone_charge_travel_distance',
 'water_source', 'clean_drinking_water',
 'clean_drinking_water_source',
 'water_collection_travel_distance',
 'water_collection_time', 'water_collection_responsible',
 'avg_person_age_water_collection',
 'water_cost', 'clinic_travel_distance',
 'clinic_electricity_access', 'clinic_refrigeration_access',
 'business_owners_count', 'appliances_type_count',
 'business_owners_binary', 'business_owners_female',
 # Was 'i_dont_feel_safe', which is not a column of df; the flag column is
 # created as 'i_dont_feel_unsafe'.
 'i_dont_feel_unsafe',
 'unsafe_due_to_theft',
 'unsafe_due_to_lack_of_community_lighting',
 'unsafe_due_to_unsafe_travel',
 'water_collection_school_aged',
 'applicances_charging_sources_unclean',
 'cooking_energy_sources_unclean', 'power_sources_unclean',
 'cooking_fuel_responsible_binary',
 'water_collection_responsible_binary'
]

df_enc = df.copy()
with open('col_encoding.json', 'r') as f:
    col_answers_map = json.load(f)

# Each entry provides a 'col_name' and a 'col_enc_map' (answer -> code).
answers = col_answers_map["col_encoding"]

for answer in answers:
    col = answer['col_name']
    if col in cols_enc:
        # Assign the result back: `df_enc[col].replace(..., inplace=True)`
        # mutates a temporary Series and leaves df_enc unchanged under pandas
        # Copy-on-Write / pandas 3.
        df_enc[col] = df_enc[col].replace(answer['col_enc_map'])

# Collapse the encoded feel_safe_dark scale into three levels around its
# midpoint 3: above 3 -> -1, below 3 -> 1, otherwise 0.
# Assumes col_encoding.json maps feel_safe_dark to numbers - TODO confirm.
# NOTE(review): a missing feel_safe_dark fails both comparisons and therefore
# also lands on 0, the same level as a genuine 3.
df_enc["feel_safe_levels"] = np.where(df_enc.feel_safe_dark > 3, -1,
                                      (np.where(
                                          df_enc.feel_safe_dark < 3,
                                          1, 0)))
# for col in cols_enc:
#     # print(col)
#     print(col, df_enc[col].unique())

df_enc.to_csv('datasets_encoded/initial_encoded.csv')