In [1]:
import os
import re
import pandas as pd
import numpy as np

import category_encoders as ce

In [2]:
# Survey columns pulled from the commercial post-connection CSV; this list is
# passed as `usecols` when the file is read, so any column not named here is
# never loaded.
commercial_cols_imp = [
    "renewvia_id",
    "business_type",
    "operation_status",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "clean_drinking_water_access",
    "new_prod_serv_add",
    "workforce_change",
    "workforce_change_female",
    "weekly_monthly_earnings",
    "school_attendance_change",
    "school_attendance_performance",
]

In [3]:
# Shared three-point scale: -1 = decreased, 0 = unchanged, 1 = increased.
col_ord = {
    "yes_it_has_decreased": -1,
    "no_it_is_pretty_much_the_same": 0,
    "yes_it_has_increased": 1,
}

# Columns whose answers are coded with the shared `col_ord` scale.
cols = [
    "kerosene_usage_change",
    "weekly_monthly_earnings",
    "diesel_usage_change",
    "operations_hours_change",
    "clean_drinking_water_access",
]

# Per-column ordinal codes, as a list of {"col", "mapping"} entries.
# The first four columns have their own answer wording and therefore their
# own mapping; the `cols` columns are appended afterwards.
category_map = [
    {
        "col": "operation_status",
        "mapping": {
            "no_its_closed": 0,
            "yes_we_are_still_operating": 1,
        },
    },
    {
        "col": "school_attendance_change",
        "mapping": {
            "no_we_have_not_noticed_a_change": 0,
            "yes_we_have_seen_overall_greater_attendance": 1,
        },
    },
    {
        "col": "school_attendance_performance",
        "mapping": {
            "no_there_has_not_been_a_noticeable_change": 0,
            "yes_overall_school_performance_is_better": 1,
        },
    },
    {
        "col": "workforce_change",
        "mapping": {
            "yes_we_have_lost_workers": -1,
            "no_the_number_has_remained_the_same": 0,
            "yes_we_have_added_workers": 1,
        },
    },
]

# Every entry added here references the same `col_ord` dict object.
category_map.extend({"col": name, "mapping": col_ord} for name in cols)

In [4]:
 # Load only the survey columns listed in `commercial_cols_imp`.
ci = pd.read_csv(
    "datasets_clean/commercial_post_connection_merged.csv",
    usecols=commercial_cols_imp,
)

# All three encoders use "return_nan" for both unknown and missing values, so
# such cells are meant to come out as NaN rather than a numeric sentinel code.

# Binary encoding for `new_prod_serv_add`.
enc_bin = ce.BinaryEncoder(
    cols=["new_prod_serv_add"],
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)

# Ordinal encoding driven by the explicit per-column codes in `category_map`.
enc_ord = ce.OrdinalEncoder(
    mapping=category_map,
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)

# One-hot encoding for `business_type`; `use_cat_names=True` names the new
# columns after the category values instead of positional indices.
enc_hot = ce.OneHotEncoder(
    cols=["business_type"],
    handle_unknown="return_nan",
    handle_missing="return_nan",
    use_cat_names=True,
    return_df=True,
)

# The encoders are chained: each one is fitted on the previous one's output.
ci_bin = enc_bin.fit_transform(ci)
ci_ord = enc_ord.fit_transform(ci_bin)
ci_new = enc_hot.fit_transform(ci_ord)

# NOTE(review): "renewvia_id" and "workforce_change_female" are loaded but are
# not named in any encoder's `cols` nor in `category_map`; presumably they are
# passed through unencoded -- confirm that is intended.
# NOTE(review): `to_csv` is called with its defaults, so the row index is
# written as the first (unnamed) column of the output file.
ci_new.to_csv("commercial_encoded.csv")