In [1]:
import pandas as pd
import numpy as np
import json
import collections
import re
import category_encoders as ce

In [2]:
# Transform into ratio
cols_pre = [
'end', 'renewvia_id', 'status',
'connection_period_initial',
'country',
'nigeria_community', 'kenya_community', 'age', 'gender',
'occupation', 'primary_provider', 'primary_provider_occupation',
'employement_type', 'avg_household_income', 'household_headcount',
'adult_headcount', 'girls_headcount', 'boys_headcount', 'girls_age',
'boys_age', 'girls_schooling', 'girls_unschooled_reasons',
'boys_schooling',
'boys_unschooled_reasons', 
'household_business_owners', 'minigrid_signup_primary_reason',
'minigrid_signup_secondary_reason', 'power_sources',
'power_sources_usage',
'power_sources_primary',
'appliances_count', 'cellphones_count', 'appliances_type',
'appliances_explain', 'appliances_addition_type',
'light_hours_current',
'light_primary_sources', 'kerosene_lamps_count',
'kerosene_lamp_usage_time', 'kerosene_lamps_cost',
'cooking_energy_sources', 
'cooking_fuel_collection_time', 'cooking_fuel_responsible',
'cooking_energy_cost', 'applicances_charging_sources',
'applicances_charging_cost',
'feel_safe_dark', 'community_lights', 'home_exterior_lights',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'phone_charge_location',
'phone_charge_frequency',
'phone_charge_cost', 'phone_charge_travel_distance', 'water_source',
'clean_drinking_water', 'clean_drinking_water_source',
'clean_water_source', 'water_collection_travel_distance',
'water_collection_time', 'water_collection_responsible',
'avg_person_age_water_collection', 'water_cost',
'other_household_activities', 'clinic_travel_distance',
'clinic_electricity_access',
'clinic_refrigeration_access', 'end_date',
'occupation_secondary_provider',
'cooking_energy_sources', 'community_clean_water_source'
]

#Importing the datasets
data = pd.read_csv("datasets_annotated/initial_annotated.csv", usecols=cols_pre)
print(data.shape)
# standardize case
data = data.apply(lambda x: x.astype(str).str.lower())
# Get identified customers only
data = data[~data['renewvia_id'].isin(["no meter", "nan", ""])]
data["community"] = (data["nigeria_community"].astype(str) + 
                   data["kenya_community"].astype(str)).apply(lambda x: x.replace("nan", ""))

#Household Income
data["avg_household_income"] = data["avg_household_income"].str.extract('([-+]?\d*,?\d+)')
data["avg_household_income"] = data["avg_household_income"].str.replace(",", "")
data["avg_household_income"] = pd.to_numeric(data["avg_household_income"])

# Remove duplicates
df = data.sort_values('avg_household_income', ascending=False).drop_duplicates('renewvia_id').sort_index()
df['renewvia_id'] = df['renewvia_id'].map(lambda x: x.replace(".0", "").upper())
print(df.shape)
df.head()

(3952, 71)
(1336, 72)


Unnamed: 0,end,status,connection_period_initial,renewvia_id,country,nigeria_community,kenya_community,age,gender,occupation,...,avg_person_age_water_collection,water_cost,other_household_activities,clinic_travel_distance,clinic_electricity_access,clinic_refrigeration_access,end_date,occupation_secondary_provider,community_clean_water_source,community
0,2021-09-28 20:25:03,pre-connection,,501121,nigeria,akipelai,,,,,...,,,,,,,2021-09-28,,,akipelai
1,2021-10-24 09:55:40,pre-connection,3-6 months,570063,nigeria,balep,,45,male,farming,...,,0-500 n/kes,other food processing,less_than_1_km,no,no,2021-10-24,,,balep
2,2021-10-24 11:16:09,pre-connection,3-6 months,570028,nigeria,balep,,52 years,male,education,...,18 or older,"i don't pay, it's free",processing ugali/gari,less_than_1_km,no,no,2021-10-24,,,balep
3,2021-10-24 11:22:41,pre-connection,3-6 months,570156,nigeria,balep,,23 years,male,farming,...,18 or older,"i don't pay, it's free",other food processing,less_than_1_km,no,no,2021-10-24,,,balep
4,2021-10-25 19:50:12,pre-connection,3-6 months,570097,nigeria,balep,,27 years,male,education,...,12-15 years old,"i don't pay, it's free",processing palm oil,less_than_1_km,no,no,2021-10-25,,,balep


In [3]:
# Count of multiple selection columns
df['household_business_owners'] = df['household_business_owners'].map(lambda x: x.replace("none;", ""))
df['business_owners_count'] = df['household_business_owners'].map(lambda x: 
                                                                  len(x[:-1].split(";")) 
                                                                  if x != ""
                                                                  else 0)
df['business_owners_binary'] = df['business_owners_count'].map(lambda x: 1
                                                                  if x >= 1
                                                                  else 0)
df['business_owners_female'] = df['household_business_owners'].map(lambda x: 1 
                                                                            if x != ""
                                                                            and ('female' in x) 
                                                                            else 0)
df['appliances_type'] = df['appliances_type'].map(lambda x: x.replace(" ", ";"))
df['appliances_type_count'] = df['appliances_type'].map(lambda x: 
                                                          len(x[:-1].split(";")) 
                                                          if x != ""
                                                          else 0)
df['connection_period_initial'].replace('1-3 months', 
                                        '0-3 months', 
                                        inplace=True)

df['avg_person_age_water_collection'].replace('15 years or older', 
                                              '15 -18 years old', 
                                              inplace=True)

df["age"].replace({"_hours": "", "422": "42", "299": "29", "1989": "34"}, inplace=True)
df["water_cost"].replace("i don't pay, it's free.", 
                          "i don't pay, it's free", 
                          inplace=True)

trim_cols = ['clean_drinking_water_source', 'phone_charge_travel_distance']
for c in trim_cols:
    df[c] = df[c].str.rstrip()

distance_cols = ['clinic_travel_distance', 
                 'phone_charge_travel_distance', 
                 'water_collection_travel_distance']
for col in distance_cols:
    rep_map = {
               'no_need_to_travel_-_charge_at_home': 'no travel charge at home',
               'no need to travel - charge at home': 'no travel charge at home',
                'no need to travel - at home water supply': 'no travel home water supply',
                'no_need_to_travel_-_at_home_water_supply': 'no travel home water supply',
               'between_2-5_km': 'between 2-3 km',
               'less than 1 km ': 'less than 1 km',
               'greater_than_10km': 'greater than 10 km'}
    df[col].replace(rep_map, inplace=True)
    
water_cols = ['clean_water_source', 'water_source', "clean_drinking_water_source"]
for col in water_cols:
    rep_map = {
    'clear water source (fresh spring, lake etc.)': 'clear_water',
    'dirty water source (pond, contaminated well etc.)': 'dirty_water',
    'dirty water source (pond, contaminated well, etc)': 'dirty_water',
    'bottle water': 'bottled water', 
    'boiled': 'boiled water', 
    'boil': 'boiled water',
    'boild': 'boiled water',
    'clean community source ': 'clean community source',
    'treated / filtered water': 'treated', 
          }
    df[col].replace(rep_map, inplace=True)

In [4]:
df.replace('nan', np.nan, inplace=True)
def replace_str(value):
    replace_chars = {'': [")", '.)', ',', '/', "ikom",
                         'u\n', "'", ".0", 
                          "years", ' years', "dead",
                          "_years", "000", "00", "*"],
                    '_': ['- ',' (', ' ', '-', "__"]
                    }

    for new, old in replace_chars.items():
        for c in old:
            if type(value) == str:
                value = value.replace(c, new).rstrip()
            else:
                continue

    return value

cols_num = ['household_headcount', 
            'girls_schooling', 'boys_schooling', 'appliances_count', 
            'appliances_type_count',
            'cellphones_count', 'light_hours_current', 'kerosene_lamps_count',
            'business_owners_count', 'business_owners_female']

cols_replace = list(df.columns)
cols_replace.remove("renewvia_id")
cols_replace.remove("avg_household_income")

# Convert to numeric columns
for col in cols_replace:
    df[col] = df[col].apply(lambda x: replace_str(x))
    if col in cols_num:
        # if col == 'light_hours_current':
        #     df[col].replace({"8-12": "8", "3-4":"3"}, inplace=True)
        df[col] = df[col].astype(str)
        df[col] = df[col].str.extract('(\d+)', expand=False)
        df[col] = df[col].astype(float).astype('Int64')
    else:
        continue
    
    
df["community_lights"].replace({"no_none": "no", "street_lights":"yes"}, 
                          inplace=True)
df[cols_num].head()

Unnamed: 0,household_headcount,girls_schooling,boys_schooling,appliances_count,appliances_type_count,cellphones_count,light_hours_current,kerosene_lamps_count,business_owners_count,business_owners_female
0,,,,,1,,,,1,0
1,3.0,,1.0,5.0,2,3.0,5.0,,1,1
2,9.0,2.0,4.0,5.0,4,8.0,5.0,,1,1
3,2.0,,,4.0,4,2.0,5.0,,1,1
4,7.0,2.0,3.0,5.0,5,7.0,6.0,,1,0


In [5]:
df.to_csv("datasets_clean/initial_clean.csv")

# df_clean = 
# # df_clean.head()
# df_clean.to_csv("datasets_clean/initial_clean.csv")

In [6]:
# df.columns

In [7]:
cols_focus = [
'renewvia_id', 'community',
'age', 'gender', 'status',
'connection_period_initial',
'avg_household_income', 
'minigrid_signup_primary_reason',
'light_hours_current', 'light_primary_sources', 
'kerosene_lamps_count',
'kerosene_lamp_usage_time', 'kerosene_lamps_cost',
'cooking_energy_sources', 'cooking_energy_sources', 
'cooking_fuel_collection_time', 'cooking_fuel_responsible',
'cooking_energy_cost', 
'feel_safe_dark', 'community_lights', 'home_exterior_lights',
'feel_safe_if_exterior_lights', 'feel_unsafe_reasons',
'clean_drinking_water', 'clean_drinking_water_source',
'clean_water_source', 'water_collection_travel_distance',
'water_collection_time', 'water_collection_responsible',
'avg_person_age_water_collection', 'water_cost',
'clinic_electricity_access', 'clinic_refrigeration_access', 
'end_date', 
'business_owners_binary', 'business_owners_female',
]


# Encoding values
df_enc = df[cols_focus].copy()
# mapping missing or mistyped answers
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)
    
answers = col_answers_map['col_answers_map']
n_col = len(answers)
    
c_map = []
for i in range(0, n_col):
    col = answers[i]['new_name']
    mapping = answers[i]['mapping']
    if col in cols_focus and mapping != {} and col != 'household_business_owners':
        vals = list(mapping.values())
        new_vals = [replace_str(v.lower()) for v in vals]
        if "yes" in new_vals and "no" in new_vals:
            # c_map.append({col: })
            mapping = {'no':0, 'yes':1, 
                       'option 1': 0, 'option 2': 1}
            df_enc[col] = df_enc[col].replace(mapping)
            
        else:
            mapping = dict()
            rng = len(new_vals)
            for idx, val in enumerate(new_vals):
                mapping[val] = idx+1
                
            df_enc[col] = df_enc[col].replace(mapping)
# df_enc.head()
df_enc.to_csv('datasets_encoded/initial_encoded.csv')