In [6]:
import pandas as pd
import numpy as np
import json
import collections
import re
import category_encoders as ce

In [7]:
# Transform into ratio
# filter_cols = [
# 'renewvia_id',
# 'country',
# 'community',
# 'connection_period',
# 'business_type',
# 'operation_status',
# 'electricity_sources_non_minigrid',
# 'appliances_addition',
# 'kerosene_usage_change',
# 'diesel_usage_change',
# 'operations_hours_change',
# 'hours_increase',
# 'clean_drinking_water_access',
# 'new_prod_serv_add',
# 'workforce_change',
# 'workforce_change_female',
# 'weekly_monthly_earnings',
# 'health_offering_change',
# 'school_attendance_change',
# 'school_attendance_performance',
# ]

# Import the commercial dataset
annotated_csv = "datasets_annotated/ci_post_annotated.csv"
df = pd.read_csv(
    annotated_csv,
    # usecols=filter_cols  (column filter currently disabled)
)

# Standardize case: every column is cast to str and lower-cased, so all
# values are strings from here on. A later cell maps the literal strings
# 'nan' / '<na>' back to real missing values.
df = df.astype(str).apply(lambda column: column.str.lower())

df.head()

Unnamed: 0.1,Unnamed: 0,start,end,renewvia_id,country,kenya_community,nigeria_community,original_person,connection_period_non_original_subscriber,first_name,...,health_offering_change,clinic_offering_unable_prior_minigrid,school_attendance_change,school_offering_unable_prior_minigrid,school_performance,minigrid_benefits,minigrid_problems,end_date,customerAccountNumber,tariff
0,0,2022-07-20 04:18:02,2022-07-20 04:24:28,131206,kenya,kalobeyei settlement,,,,,...,none of these;,not available,"no, we have not noticed a change",not available,"no, there has not been a noticable change",increased production and effieciency,delayed payment and system updates,2022-07-20,131206.0,commercial
1,1,2022-07-20 04:55:11,2022-07-20 05:01:23,131231,kenya,kalobeyei settlement,,,,,...,none of these;,not available,"no, we have not noticed a change",not available,"no, there has not been a noticable change",not available,delayed update after payment,2022-07-20,131231.0,commercial
2,2,2022-07-20 05:03:46,2022-07-20 05:12:06,131542,kenya,kalobeyei settlement,,,,,...,none of these;,not available,"no, we have not noticed a change",not available,"no, there has not been a noticable change",not available,initially delayed update after payment,2022-07-20,131542.0,commercial
3,3,2022-07-20 05:12:18,2022-07-20 05:42:12,131220,kenya,kalobeyei settlement,,,,,...,none of these;,not available,"no, we have not noticed a change",not available,"no, there has not been a noticable change",increased sales,payment problem,2022-07-20,131220.0,commercial
4,4,2022-07-20 05:42:21,2022-07-20 05:47:14,131485,kenya,kalobeyei settlement,,,,,...,none of these;,not available,"no, we have not noticed a change",not available,"no, there has not been a noticable change",not available,,2022-07-20,131485.0,commercial


In [8]:
# Columns whose answers are normalised with replace_str() further down in
# this cell. Order is kept as originally written.
cols_change = [
    'kenya_community', 'nigeria_community',
    'original_person',
    'connection_period_non_original_subscriber', 'connection_period',
    'business_type', 'operation_status',
    'non_renewvia_electrical_sources',
    'kerosene_usage_change', 'diesel_usage_change',
    'operations_hours_change', 'business_hours_increase',
    'clean_drinking_water_access',
    'ci_offering_change',
    'workforce_change', 'workforce_change_female',
    'earnings_change',
    'school_attendance_change', 'school_performance',
    'tariff',
]

def replace_str(string):
    """Normalise an answer string into a snake_case-like token.

    The substitutions are applied one after another, in the order listed
    below; the order matters (e.g. '- ' must be handled before ' ' and '-').

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        `string` with separators turned into '_' and punctuation removed.
    """
    substitutions = (
        # separators -> underscore
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        # punctuation / residue -> removed
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),   # literal 'u' followed by a newline
        ("'", ''),
        ('.0', ''),    # removes every '.0' occurrence, not only a trailing one
    )

    cleaned = string
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)
    return cleaned

def _count_selections(value, placeholders=("none of these",)):
    """Count the options picked in a ';'-separated multi-select answer.

    Empty fragments (such as the one produced by a trailing ';') and
    placeholder answers are not counted, so "none of these;" yields 0
    instead of 2.

    Parameters
    ----------
    value : object
        Cell value; anything that is not a str (e.g. NaN) counts as 0.
    placeholders : tuple of str
        Answers that mean "nothing selected". "none of these" is the wording
        seen in `health_offering_change`; extend it if other columns use a
        different phrase -- TODO confirm against the survey options.

    Returns
    -------
    int
        Number of real selections.
    """
    if not isinstance(value, str):
        return 0
    picks = (part.strip() for part in value.split(";"))
    return sum(1 for pick in picks if pick and pick not in placeholders)


# Normalise the answers of the selected columns.
for col in cols_change:
    df[col] = df[col].map(replace_str)

# Every value is a string at this point, so missing values show up as the
# literal text 'nan' / '<na>'; turn them back into real NaN.
df = df.replace(['nan', '<na>'], np.nan)

# NOTE(review): this column goes through replace_str() above, which turns
# '-' into '_'; a negative entry would therefore fail the float cast below.
df["workforce_change_female"] = df["workforce_change_female"].astype(float).astype('Int64')

# No. of appliance types added
df['appliances_addition_count'] = df['appliances_addition_type'].map(_count_selections)
# No. of new services offered by clinic
df['health_offering_change_count'] = df['health_offering_change'].map(_count_selections)

# Fix the misspelt answer. Assign the result back instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form is deprecated and leaves `df` untouched under pandas Copy-on-Write.
df['school_performance'] = df['school_performance'].replace(
    'no_there_has_not_been_a_noticable_change',
    'no_there_has_not_been_a_noticeable_change',
)

# df.head()
# NOTE(review): to_csv also writes the index as an unnamed column, and the
# loaded frame already carries 'Unnamed: 0'-style columns. Consider
# index=False once downstream readers are confirmed not to rely on it.
df.to_csv("datasets_clean/ci_post_clean.csv")

In [9]:
# Encoding values
df_enc = df.copy()
cols = list(df.columns)

# mapping missing or mistyped answers
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['col_answers_map']
n_col = len(answers)

# Keywords used to give "_change" / "_improvement" answers a direction.
# They are compared against whole words of the answer, not raw substrings:
# a substring test for 'no' also fires on "noticed" or "noticeable" and
# would code such answers as neutral before the up/down keywords are checked.
_NEUTRAL_WORDS = frozenset({'no', 'not', 'none'})   # exact word match
_NEUTRAL_STEMS = ('same', 'stay')                   # word-prefix match
_DOWN_STEMS = ('lost', 'decrease')                  # also decreased / decreases
_UP_STEMS = ('add', 'better', 'increase', 'greater')


def _direction_code(answer):
    """Return 0 (neutral), -1 (down), 1 (up) or None for a normalised answer.

    The answer is split into words on '_' and any other non-word character.
    Precedence is neutral, then down, then up. None means no keyword
    matched, in which case the answer is left unencoded.
    """
    words = [w for w in re.split(r'[\W_]+', answer) if w]
    if any(w in _NEUTRAL_WORDS or w.startswith(_NEUTRAL_STEMS) for w in words):
        return 0
    if any(w.startswith(_DOWN_STEMS) for w in words):
        return -1
    if answer == 'yes' or any(w.startswith(_UP_STEMS) for w in words):
        return 1
    return None


for entry in answers:
    col = entry['new_name']
    mapping = entry['mapping']

    if col not in cols or not mapping or col == 'household_business_owners':
        continue

    # Canonical answers, normalised the same way as the cleaned columns.
    # NOTE(review): columns that were not passed through replace_str() keep
    # their raw lower-cased text and will only match keys that contain no
    # characters replace_str() rewrites.
    new_vals = [replace_str(v.lower()) for v in mapping.values()]

    if "yes" in new_vals and "no" in new_vals:
        # Binary answers. The underscore spellings cover columns already
        # normalised by replace_str(), where 'option 1' became 'option_1'.
        col_mapping = {'no': 0, 'yes': 1,
                       'option 1': 0, 'option 2': 1,
                       'option_1': 0, 'option_2': 1}

    elif any(tag in col for tag in ("_change", "_improvement")):
        # Directional answers: -1 / 0 / 1.
        col_mapping = {}
        for cat in new_vals:
            if cat == 'nan':
                continue
            code = _direction_code(cat)
            if code is not None:
                col_mapping[cat] = code

    else:
        # Any other column: 1-based position of the answer in the mapping
        # (for repeated answers the last position wins).
        col_mapping = {val: idx + 1 for idx, val in enumerate(new_vals)}

    if col_mapping:
        # Assign back instead of replace(..., inplace=True) on a column
        # selection, which does not update df_enc under Copy-on-Write.
        df_enc[col] = df_enc[col].replace(col_mapping)

# df_enc.head()
# NOTE(review): the index is written as an extra unnamed column here too.
df_enc.to_csv("datasets_encoded/ci_post_encoded.csv")