In [1]:
import os
import re
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Columns read from the commercial post-connection survey.
cols = [
    "renewvia_id",
    "business_type",
    "operation_status",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "clean_drinking_water_access",
    "new_prod_serv_add",
    "workforce_change",
    "workforce_change_female",
    "weekly_monthly_earnings",
    "school_attendance_change",
    "school_attendance_performance",
]

# Columns excluded from ordinal encoding.
cols_non_ord = {
    "renewvia_id",
    "new_prod_serv_add",
    "business_type",
}

# Every remaining column, kept in the order it appears in `cols`.
cols_ord = []
for name in cols:
    if name in cols_non_ord:
        continue
    cols_ord.append(name)

In [5]:
# Import the commercial dataset, restricted to the columns listed in `cols`.
ci = pd.read_csv("datasets_clean/commercial_post_connection_merged.csv",
                 usecols=cols)

# Keyword groups, checked in this order; the first group containing a keyword
# that occurs as a substring of the category label decides its ordinal score.
# NOTE(review): matching is by substring, so the short keyword 'no' would also
# hit labels such as 'unknown' or 'not_sure' -- confirm against the real labels.
_ORDINAL_KEYWORDS = (
    (0, ('no', 'same')),
    (-1, ('lost', 'decrease')),
    (1, ('still', 'add', 'better', 'increase', 'greater')),
)


def _ordinal_score(label):
    """Return the ordinal score for a category label.

    Parameters
    ----------
    label : str
        Category label, already converted to a string.

    Returns
    -------
    int or None
        0, -1 or 1 according to the first matching keyword group in
        `_ORDINAL_KEYWORDS`; None when no keyword occurs in the label.
    """
    for score, keywords in _ORDINAL_KEYWORDS:
        if any(keyword in label for keyword in keywords):
            return score
    return None


# Create the mapping for each categorical variable with ordinality.
cats_ord_map = []
cats_unmapped = {}  # column -> labels that matched no keyword
for col in cols_ord:
    val_ord = {}
    for cat in ci[col].unique():
        label = str(cat)
        if label == 'nan':
            # Missing values get no entry in the mapping.
            continue
        score = _ordinal_score(label)
        if score is None:
            cats_unmapped.setdefault(col, []).append(label)
        else:
            val_ord[label] = score

    cats_ord_map.append({"col": col, "mapping": val_ord})

# Surface labels that were left out of the mapping instead of dropping them
# silently, so gaps in the keyword lists are visible when the cell is run.
if cats_unmapped:
    print("No ordinal score assigned (left out of the encoder mapping):",
          cats_unmapped)


# Ordinal encoding, driven by the keyword-based mapping built above.
enc_ord = ce.OrdinalEncoder(mapping=cats_ord_map,
                            handle_unknown="return_nan",
                            handle_missing="return_nan",
                            return_df=True)


# Binary encoding
enc_bin = ce.BinaryEncoder(cols=["new_prod_serv_add"],
                           handle_unknown="return_nan",
                           handle_missing="return_nan",
                           return_df=True)

# One-hot encoding
enc_hot = ce.OneHotEncoder(cols=["business_type"],
                           handle_unknown="return_nan",
                           handle_missing="return_nan",
                           use_cat_names=True,
                           return_df=True)

# Apply the encoders in sequence (binary -> ordinal -> one-hot) and save.
ci_bin = enc_bin.fit_transform(ci)
ci_ord = enc_ord.fit_transform(ci_bin)
ci_new = enc_hot.fit_transform(ci_ord)
ci_new.to_csv("datasets_clean/commercial_encoded.csv")

In [34]:
# Load the two household pre-connection files (commcare and ms_form),
# keeping only the columns listed below, and stack them into one frame.
filter_cols = [
    "renewvia_id",
    "avg_monthly_household_income",
    "household_headcount",
    "female_schooling",
    "male_schooling",
    "household_business_owners",
    "electronics_count",
    "cellphones_count",
    "light_hours_current",
    "kerosene_lamp_usage_count",
    "cooking_energy_sources",
    "cooking_fuel_collection_time",
    "cooking_energy_monthly_cost",
    "applicances_charging_monthly_cost",
    "feel_safe_dark",
    "community_lights",
    "home_exterior_lights",
    "feel_safe_if_exterior_lights",
    "phone_charge_location",
    "phone_charge_monthly_cost",
    "water_source",
    "clean_drinking_water",
    "clean_drinking_water_source",
    "water_collection_travel_distance",
    "water_collection_time",
    "water_monthly_cost",
    "clinic_travel_distance",
    "clinic_electricity_access",
    "clinic_open_hours",
    "clinic_refrigeration_access",
]
hs_pre_com = pd.read_csv("datasets_clean/household_pre_connection_commcare.csv",
                         usecols=filter_cols)
# The ms_form file is read with an explicit ISO-8859-1 encoding.
hs_pre_ms = pd.read_csv("datasets_clean/household_pre_connection_ms_form.csv",
                        usecols=filter_cols, encoding="ISO-8859-1")

# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it
# both inputs keep their own row labels and the result has duplicate labels,
# which are then also written into the CSV's index column.
hs_pre = pd.concat([hs_pre_com, hs_pre_ms], ignore_index=True)
hs_pre.to_csv("datasets_clean/household_pre_merged.csv")

In [35]:
# Print the distinct values of every column after the first three.
# Both the column list and the values come from `hs_pre`, so the cell runs on
# a fresh kernel without relying on any other variable.
for col in list(hs_pre.columns)[3:]:
    cats = hs_pre[col].unique()
    print(col, cats, "\n")

female_schooling [nan  2.  1.  4.  3.  0.  5.  6.  7. 36.] 

male_schooling [nan  1.  4.  3.  0.  2.  8.  5.  6.  7.] 

household_business_owners [nan 'adult_female' 'none' 'adult_male'] 

electronics_count [nan  5.  4.  2.  1.  3.  7.  9.  0.  8. 15.  6. 14. 12.] 

cellphones_count [nan  3.  8.  2.  7.  9. 15.  4. 11.  6.  1. 12.  5. 10. 17. 13. 18. 19.
  0.] 

light_hours_current [nan  5.  6.  4.  3.  2.  8.  7.  0. 46. 12.  9. 10. 11.  1. 13. 24.] 

kerosene_lamp_usage_count [nan  2.  0.  1.  3.  5.  4.  6.] 

cooking_energy_sources [nan 'firewood' 'kerosene' 'other' 'charcoal' 'biomass'
 'minigrid_electricity_renewvia'] 

cooking_fuel_collection_time [nan 'hours' 'less_than_1_hour' 'copy-1-of-hours' 'greater_than_5_hours'] 

cooking_energy_monthly_cost [nan 'nkes' 'copy-4-of-nkes' 'copy-1-of-nkes' 'copy-2-of-nkes'
 'copy-3-of-nkes'] 

applicances_charging_monthly_cost [nan 'copy-3-of-nkes' 'copy-4-of-nkes' 'copy-1-of-nkes' 'copy-2-of-nkes'
 'nkes'] 

feel_safe_dark [nan 'very_unsaf