In [None]:
import pandas as pd
import numpy as np
import json
import collections
import re
import category_encoders as ce

In [None]:
# Import the commercial dataset
df = pd.read_csv("datasets_annotated/ci_post_annotated.csv")

# Standardize case: every cell is cast to a lower-case string, which also
# turns missing cells into the literal text "nan".
df = df.apply(lambda x: x.astype(str).str.lower())

# Merge the two per-country community columns into one.
# Only cells that are exactly "nan" (i.e. missing) are blanked before the
# concatenation; stripping the substring "nan" from the joined text would
# also mangle any community whose name happens to contain those letters.
df["community"] = (df["nigeria_community"].replace("nan", "") +
                   df["kenya_community"].replace("nan", ""))
df.head()

In [None]:
# Inspect the distinct answers present in 'school_performance' (runs before the cleaning cell below).
df['school_performance'].unique()

In [None]:
# Column names collected under `cols_change`.
# NOTE(review): this list is not referenced by the other cells shown in this
# notebook - confirm whether it is still needed.
cols_change = [
    "kenya_community",
    "nigeria_community",
    "original_person",
    "connection_period_non_original_subscriber",
    "connection_period",
    "business_type",
    "operation_status",
    "non_renewvia_electrical_sources",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "business_hours_increase",
    "clean_drinking_water_access",
    "ci_offering_change",
    "workforce_change",
    "workforce_change_female",
    "earnings_change",
    "school_attendance_change",
    "school_performance",
]

def replace_str(string):
    """Normalise an answer string into a compact, underscore-separated token.

    The substitutions are applied one after another, in the order listed
    below, each one operating on the result of the previous one:

    * separators ('- ', ' (', a single space, '-') become '_';
    * punctuation and the literal '.0' are removed wherever they occur.

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # (old, new) pairs; order matters because earlier substitutions change
    # what the later ones can match (e.g. '- ' must run before ' ' and '-').
    substitutions = (
        ('- ', '_'),
        (' (', '_'),
        (' ', '_'),
        ('-', '_'),
        ('.)', ''),
        (',', ''),
        ('/', ''),
        ('u\n', ''),
        ("'", ''),
        ('.0', ''),
    )

    cleaned = string
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
# Normalise the text of every column except the record identifier.
cols_replace = list(df.columns)
cols_replace.remove("renewvia_id")

for col in cols_replace:
    df[col] = df[col].apply(replace_str)

# The string cast applied when the data was loaded turned missing cells into
# the literal text 'nan' / '<na>'; convert them back to real missing values.
df = df.replace(['nan', '<na>'], np.nan)

# Nullable integer dtype so that missing values survive the integer cast.
df["workforce_change_female"] = df["workforce_change_female"].astype(float).astype('Int64')


def _count_choices(value):
    """Return the number of ';'-separated entries in a string cell, 0 for a non-string cell."""
    return len(value.split(";")) if isinstance(value, str) else 0


df['appliances_addition_count'] = df['appliances_addition_type'].map(_count_choices)
# No. of new services offered by clinic
df['health_offering_change_count'] = df['health_offering_change'].map(_count_choices)

# Harmonise answer spellings. The result is assigned back to the column:
# calling replace(..., inplace=True) on df[col] acts on an intermediate
# object, which pandas warns about and which leaves df untouched once
# Copy-on-Write is active.
df['school_performance'] = df['school_performance'].replace(
    'no_there_has_not_been_a_noticable_change',
    'no_there_has_not_been_a_noticeable_change')
df["operation_status"] = df["operation_status"].replace('no_it_closed', 'no_its_closed')

In [None]:
# Remove the 'none_of_these' placeholder (with or without a trailing ';') so
# that it is neither flagged nor counted below.
df['health_offering_change'] = df['health_offering_change'].replace(
    regex=['none_of_these;', 'none_of_these'], value='')


def _mentions(keyword):
    """Build a mapper that returns 1 when a string cell contains `keyword`, else 0."""
    def flag(value):
        return 1 if isinstance(value, str) and keyword in value else 0
    return flag


def _count_services(value):
    """Count the non-empty ';'-separated entries of a string cell.

    Non-string (missing) and empty cells count as 0. Entries made up only of
    separators/underscores are ignored, so the result is the same whether or
    not the answer string ends with a trailing ';'.
    """
    if not isinstance(value, str):
        return 0
    return sum(1 for part in value.split(";") if part.strip("_ "))


# New 0/1 indicator column -> keyword searched for in 'health_offering_change'.
health_flags = {
    'clinic_longer_hours': 'clinic_stays_open_longer_hours',
    'vaccine_cold_storage': 'clinic_now_has_a_refrigerator_for_vaccines',
    'more_patients_treated': 'clinic_can_see_more_patients',
    'shorter_wait': 'wait_times_are_shorter',
}
for flag_col, keyword in health_flags.items():
    df[flag_col] = df['health_offering_change'].map(_mentions(keyword))

# Recompute the count now that the placeholder has been removed.
df['health_offering_change_count'] = df['health_offering_change'].map(_count_services)
df[['health_offering_change_count', 'health_offering_change']]

# df.to_csv("datasets_clean/ci_post_clean.csv")


In [None]:
# Columns carried over into the encoded dataset.
cols_focus = [
    "renewvia_id",
    "country",
    "community",
    "connection_period_non_original_subscriber",
    "connection_period",
    "business_type",
    "operation_status",
    "non_renewvia_electrical_sources",
    "appliances_addition_type",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "business_hours_increase",
    "clean_drinking_water_access",
    "ci_offering_change",
    "workforce_change",
    "workforce_change_female",
    "earnings_change",
    "ci_new_offering",
    "health_offering_change",
    "clinic_offering_unable_prior_minigrid",
    "school_attendance_change",
    "school_offering_unable_prior_minigrid",
    "school_performance",
    "end_date",
    "appliances_addition_count",
    "health_offering_change_count",
]

# Encoding values
df_enc = df[cols_focus].copy()

# Per-column answer mappings (used for missing or mistyped answers).
# The JSON holds, under 'col_answers_map', a list of entries that each carry
# a 'new_name' (column name) and a 'mapping' dict.
with open('col_answers_mapping.json', 'r') as f:
    col_answers_map = json.load(f)

answers = col_answers_map['col_answers_map']


def _direction_code(answer):
    """Classify a cleaned answer by the keywords it contains.

    Returns 0 for a neutral answer, -1 for a decrease, 1 for an increase and
    None when no keyword matches. Neutral keywords are tested first, then the
    negative ones, then the positive ones.
    """
    neutral = ['no', 'same', 'stay', 'not']
    down = ['lost', 'decrease', 'decreased', 'decreases', 'worse']
    up = ['add', 'better', 'increase', 'increased', 'increases', 'greater']

    if any(word in answer for word in neutral):
        return 0
    if any(word in answer for word in down):
        return -1
    if answer == 'yes' or any(word in answer for word in up):
        return 1
    return None


for answer in answers:
    col = answer['new_name']
    mapping = answer['mapping']

    if col not in cols_focus or mapping == {} or col == 'household_business_owners':
        continue

    # Clean the canonical answers the same way the dataframe cells were cleaned.
    new_vals = [replace_str(v.lower()) for v in mapping.values()]

    if "yes" in new_vals and "no" in new_vals:
        # Binary answers. The dataframe cells have been through replace_str,
        # which turns spaces into underscores, so the underscore spellings of
        # the 'option' answers are the ones that can actually match; the
        # spaced spellings are kept for backward compatibility.
        col_mapping = {'no': 0, 'yes': 1,
                       'option 1': 0, 'option 2': 1,
                       'option_1': 0, 'option_2': 1}

    elif col == "school_performance":
        col_mapping = {
            "yes_its_gotten_worse": -1,
            'no_there_has_not_been_a_noticeable_change': 0,
            'yes_overall_school_performance_is_better': 1}

    elif any(tag in col for tag in ("_change", "_improvement")):
        # Directional answers: -1 / 0 / 1 by keyword; answers matching no
        # keyword get no entry and are therefore left as text.
        col_mapping = {}
        for cat in new_vals:
            if cat == 'nan':
                continue
            code = _direction_code(cat)
            if code is not None:
                col_mapping[cat] = code

    else:
        # Remaining columns: 1-based position of the answer in the mapping.
        # A repeated answer keeps the position of its last occurrence.
        col_mapping = {val: idx + 1 for idx, val in enumerate(new_vals)}

    if col_mapping:
        # Assign the result back: replace(..., inplace=True) on df_enc[col]
        # acts on an intermediate object and does not reliably update df_enc.
        df_enc[col] = df_enc[col].replace(col_mapping)

df_enc.to_csv("datasets_encoded/ci_post_encoded.csv")
# Keep the preview as the last expression so the notebook actually displays it.
df_enc.head()