In [1]:
import warnings

import category_encoders as ce
import numpy as np
import pandas as pd

In [2]:
# Columns to load from the post-survey CSV.
cols = [
    "renewvia_id",
    "business_type",
    "operation_status",
    "kerosene_usage_change",
    "diesel_usage_change",
    "operations_hours_change",
    "clean_drinking_water_access",
    "new_prod_serv_add",
    "workforce_change",
    "workforce_change_female",
    "weekly_monthly_earnings",
    "school_attendance_change",
    "school_attendance_performance",
]

# Columns that are not given an ordinal mapping.
cols_non_ord = {"renewvia_id", "new_prod_serv_add", "business_type"}

# Ordinal columns: everything in `cols` (original order kept) except the
# non-ordinal ones and "workforce_change_female".
cols_ord = [
    name
    for name in cols
    if not (name in cols_non_ord or name == "workforce_change_female")
]

In [3]:
# Load the commercial post-survey dataset, restricted to the columns in `cols`.
df = (
    pd.read_csv("commercial_post_survey_clean.csv", usecols=cols)
    # Literal "nan" strings are turned into real missing values.
    .replace("nan", np.nan)
    # Go through float first, then to pandas' nullable integer dtype, so the
    # column can hold whole numbers next to missing entries.
    .assign(
        workforce_change_female=lambda frame: (
            frame["workforce_change_female"].astype(float).astype("Int64")
        )
    )
)

df.head()

Unnamed: 0,renewvia_id,business_type,operation_status,kerosene_usage_change,diesel_usage_change,operations_hours_change,clean_drinking_water_access,new_prod_serv_add,workforce_change,workforce_change_female,weekly_monthly_earnings,school_attendance_change,school_attendance_performance
0,131206,shop,,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,yes_it_has_increased,yes,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change
1,131231,other_business,,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,yes_it_has_increased,yes,yes_we_have_added_workers,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change
2,131542,shop,,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,yes_it_has_increased,yes,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change
3,131220,shop,,yes_it_has_increased,yes_it_has_increased,yes_they_have_increased,yes_it_has_increased,yes,yes_we_have_added_workers,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change
4,131485,shop,,yes_it_has_decreased,yes_it_has_decreased,yes_they_have_increased,yes_it_has_increased,no,no_the_number_has_remained_the_same,,yes_they_have_increased,no_we_have_not_noticed_a_change,no_there_has_not_been_a_noticeable_change


In [4]:
# Create the ordinal mapping for each ordered categorical variable.
#
# Every answer is scored from the keywords it contains:
#    0  -> "no" / "same"
#   -1  -> "lost" / "decrease"
#   +1  -> "still" / "add" / "better" / "increase" / "greater"
# The groups are checked in that order; the first group that matches wins.
def _ordinal_score(answer):
    """Return the ordinal score (-1, 0 or 1) for one survey answer.

    Parameters
    ----------
    answer : str
        Category label made of underscore-separated words.

    Returns
    -------
    int or None
        The score, or None when no keyword matches.
    """
    # "no" and "same" must be whole words. A plain substring test would also
    # fire on "not", "noticed", "noticeable", ... and score an answer such as
    # "yes_we_have_noticed_an_increase" as 0.
    words = answer.split("_")
    if "no" in words or "same" in words:
        return 0
    # The remaining keywords are stems ("decrease" -> "decreased",
    # "add" -> "added"), so substring matching is kept for them.
    if any(stem in answer for stem in ("lost", "decrease")):
        return -1
    if any(stem in answer for stem in ("still", "add", "better", "increase", "greater")):
        return 1
    return None


cats_ord_map = []
for col in cols_ord:
    val_ord = {}
    unmapped = []
    # Missing values get no mapping entry.
    for cat in df[col].dropna().unique():
        cat = str(cat)
        score = _ordinal_score(cat)
        if score is None:
            unmapped.append(cat)
        else:
            val_ord[cat] = score

    # Categories without a score have no entry in the mapping handed to the
    # encoder; report them instead of letting them disappear silently.
    if unmapped:
        warnings.warn(
            f"No ordinal score for categories {unmapped} in column '{col}'"
        )

    cats_ord_map.append({"col": col, "mapping": val_ord})

        

# Binary encoding of the "new_prod_serv_add" column.
# Both unknown and missing values are configured to come back as NaN.
enc_bin = ce.BinaryEncoder(
    cols=["new_prod_serv_add"],
    handle_missing="return_nan",
    handle_unknown="return_nan",
    return_df=True,
)
df = enc_bin.fit_transform(df)

# One-hot encoding of "business_type" -- currently disabled, kept for reference.
# enc_hot = ce.OneHotEncoder(
#     cols=["business_type"],
#     handle_missing="return_nan",
#     handle_unknown="return_nan",
#     use_cat_names=True,
#     return_df=True,
# )
# df = enc_hot.fit_transform(df)

# Ordinal encoding driven by the per-column mappings in `cats_ord_map`.
# Both unknown and missing values are configured to come back as NaN.
enc_ord = ce.OrdinalEncoder(
    mapping=cats_ord_map,
    handle_missing="return_nan",
    handle_unknown="return_nan",
    return_df=True,
)
df = enc_ord.fit_transform(df)

# df.head()

# Save the encoded frame. `index` is left at its default, so the DataFrame
# index is written as the first column of the file.
df.to_csv("commercial_post_survey_encoded.csv")