In [1]:
import warnings

import category_encoders as ce
import numpy as np
import pandas as pd

In [2]:
# Survey columns read from the cleaned commercial dataset.
cols = [
    "renewvia_id",
    "business_type",
    "operation_status",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "clean_drinking_water_access",
    "new_prod_serv_add",
    "workforce_change",
    "workforce_change_female",
    "weekly_monthly_earnings",
    "school_attendance_change",
    "school_attendance_performance",
]

# Columns that are excluded from ordinal encoding.
cols_non_ord = {"business_type", "new_prod_serv_add", "renewvia_id"}

# Every remaining column is treated as ordinal; the order of `cols` is kept.
cols_ord = [name for name in cols if name not in cols_non_ord]

In [3]:
# Load the commercial survey dataset, keeping only the columns listed in `cols`.
ci = pd.read_csv(
    "datasets_clean/commercial_post_survey_clean.csv",
    usecols=cols,
)

# Create the mapping for each categorical variable with ordinality.
#
# Keyword groups are tried in the order listed and the first hit wins, so a
# label such as "no increase" is scored 0 rather than 1.
# NOTE(review): matching is a case-sensitive substring test, so "no" also
# matches inside longer words (e.g. "not", "unknown") -- confirm against the
# actual survey labels.
_ORDINAL_KEYWORDS = (
    (0, ("no", "same")),
    (-1, ("lost", "decrease")),
    (1, ("still", "add", "better", "increase", "greater")),
)


def _ordinal_score(label):
    """Return the ordinal score for a category label, or None if unmatched.

    Parameters
    ----------
    label : object
        Category value; its ``str()`` form is searched for the keywords.

    Returns
    -------
    int or None
        0, -1 or 1 for the first keyword group with a substring match,
        ``None`` when no keyword occurs in the label.
    """
    text = str(label)
    for score, keywords in _ORDINAL_KEYWORDS:
        if any(keyword in text for keyword in keywords):
            return score
    return None


cats_ord_map = []
# Categories that matched no keyword, per column. They get no entry in the
# mapping, so they are recorded here instead of vanishing silently.
cats_unmapped = {}
for col in cols_ord:
    val_ord = {}
    # dropna() skips missing values directly instead of comparing str(cat)
    # against 'nan'.
    for cat in ci[col].dropna().unique():
        score = _ordinal_score(cat)
        if score is None:
            cats_unmapped.setdefault(col, []).append(cat)
        else:
            # Key by the original value (not str(cat)) so the mapping matches
            # the cells of the column even when they are not strings.
            val_ord[cat] = score
    cats_ord_map.append({"col": col, "mapping": val_ord})

if cats_unmapped:
    warnings.warn(
        "Categories without an ordinal score (absent from cats_ord_map): "
        "{}".format(cats_unmapped)
    )

        
# Options shared by all three encoders: unknown and missing values are both
# returned as NaN, and the output is a DataFrame.
enc_opts = dict(
    handle_unknown="return_nan",
    handle_missing="return_nan",
    return_df=True,
)

# Binary encoding
enc_bin = ce.BinaryEncoder(cols=["new_prod_serv_add"], **enc_opts)

# Ordinal encoding, driven by the prebuilt per-column mapping
enc_ord = ce.OrdinalEncoder(mapping=cats_ord_map, **enc_opts)

# One-hot encoding
enc_hot = ce.OneHotEncoder(
    cols=["business_type"],
    use_cat_names=True,
    **enc_opts
)

# Apply the encoders in sequence: binary -> ordinal -> one-hot.
ci_bin = enc_bin.fit_transform(ci)
ci_ord = enc_ord.fit_transform(ci_bin)
ci_new = enc_hot.fit_transform(ci_ord)

# NOTE(review): to_csv is called with its default index handling, so the
# DataFrame index is written as an extra unnamed column -- confirm that
# downstream readers expect it.
ci_new.to_csv("datasets_clean/commercial_encoded.csv")