# Categorical Encoding

* In this notebook, we experiment with different encoders, models, and datasets

In [2]:
# !pip install fasttext dirty_cat category_encoders
# !pip install lightgbm xgboost

In [13]:
from get_data import dataloader
from column_encoder import ColumnEncoder #This is where the magic happens
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
import get_data
from joblib import Parallel, delayed
import multiprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
import pandas as pd

#from xgboost import XGBClassifier, XGBRegressor

# kc_train = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_train.get_input_target()
# kc_test = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_test.get_input_target()
# nominal = ['nom_'+str(i) for i in range(0,10)]

## Data

* Predict, using the position title, whether the employee earns more than 100k

In [58]:
# Data downloaded from here: https://opendata.vancouver.ca/explore/dataset/employee-remuneration-and-expenses-earning-over-75000/export/?disjunctive.department&disjunctive.title&sort=year
# Build the modelling frame: keep the job title and replace the raw remuneration
# with a boolean target that is True when it exceeds the threshold.
REMUNERATION_THRESHOLD = 100000
emp_df = (
    # Read only the two columns that are needed instead of slicing afterwards.
    pd.read_csv('public_data/employee.csv', sep=';', usecols=['Title', 'Remuneration'])
    # Vectorised comparison; no per-row Python lambda required.
    .assign(Remuneration_class=lambda df: df['Remuneration'] > REMUNERATION_THRESHOLD)
    # Return a new frame rather than mutating with inplace=True, so re-running the cell is safe.
    .drop(columns='Remuneration')
)
emp_df

Unnamed: 0,Title,Remuneration_class
0,Fire Captain,True
1,Trades Ii - Gardener,False
2,Corporate Compliance Analyst,False
3,Telecommunications Plant Coordinator,False
4,Civil Engineer I,False
...,...,...
25323,Firefighter,False
25324,Financial Analyst I,False
25325,Fire Prevention Inspector,False
25326,Firefighter,False


In [43]:
# Single text feature (job title) and the boolean target derived from remuneration.
X, y = emp_df['Title'], emp_df['Remuneration_class']
# List form of the feature column name(s).
X_col = ['Title']

In [38]:
def test_train_split(X, y, test_size=0.33, random_state=1):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

    return X_train, X_test, y_train, y_test

# Create the single stratified split using the function defaults (test_size=0.33, random_state=1).
X_train, X_test, y_train, y_test = test_train_split(X, y)

In [51]:
# The experiments currently run on the full split. For a quicker run, slice
# instead, e.g. X_train[:2000] / y_train[:2000] and X_test[:200] / y_test[:200].
sample_X_train, sample_y_train = X_train, y_train
sample_X_test, sample_y_test = X_test, y_test

## Encoding

### Integer Encoding

In [52]:
# Encode the title with the project's ColumnEncoder configured as 'OrdinalEncoder',
# then fit a LightGBM classifier and report per-class metrics on the test split.
# Alternative wiring through ColumnTransformer, kept for reference:
#from sklearn.preprocessing import OrdinalEncoder
#preprocessor = ColumnTransformer([(col, ColumnEncoder('OrdinalEncoder'), col) for col in X_col])
preprocessor = ColumnEncoder('OrdinalEncoder')
pipeline_le = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments
pipeline_le.fit(sample_X_train, sample_y_train)
pred = pipeline_le.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.80      0.72      0.76      5214
        True       0.60      0.71      0.65      3145

    accuracy                           0.71      8359
   macro avg       0.70      0.71      0.70      8359
weighted avg       0.73      0.71      0.72      8359



### Binary Encoder

In [53]:
# Encode the title with the project's ColumnEncoder configured as 'BinaryEncoder',
# then fit a LightGBM classifier and report per-class metrics on the test split.
# Alternative wiring through ColumnTransformer, kept for reference:
#preprocessor = ColumnTransformer([(col, ColumnEncoder('BinaryEncoder'), col) for col in X_col])
preprocessor = ColumnEncoder('BinaryEncoder')
pipeline_be = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

pipeline_be.fit(sample_X_train, sample_y_train)
pred = pipeline_be.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.82      0.77      0.79      5214
        True       0.65      0.72      0.69      3145

    accuracy                           0.75      8359
   macro avg       0.74      0.75      0.74      8359
weighted avg       0.76      0.75      0.75      8359



### One Hot Encoder

In [54]:
# Encode the title with the project's ColumnEncoder configured as 'OneHotEncoder',
# then fit a LightGBM classifier and report per-class metrics on the test split.
# Alternative wiring through ColumnTransformer, kept for reference:
#preprocessor = ColumnTransformer([(col, ColumnEncoder('OneHotEncoder'), col) for col in X_col])
preprocessor = ColumnEncoder('OneHotEncoder')
pipeline_ohe = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

pipeline_ohe.fit(sample_X_train, sample_y_train)
pred = pipeline_ohe.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.83      0.63      0.71      5214
        True       0.56      0.79      0.66      3145

    accuracy                           0.69      8359
   macro avg       0.70      0.71      0.69      8359
weighted avg       0.73      0.69      0.69      8359



### Hashing Encoder

In [55]:
# Encode the title with category_encoders' HashingEncoder (default settings),
# then fit a LightGBM classifier and report per-class metrics on the test split.
preprocessor = ce.HashingEncoder()

pipeline_fh = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

pipeline_fh.fit(sample_X_train, sample_y_train)
pred = pipeline_fh.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.67      0.55      0.60      5214
        True       0.42      0.55      0.48      3145

    accuracy                           0.55      8359
   macro avg       0.55      0.55      0.54      8359
weighted avg       0.58      0.55      0.56      8359



### Word based: Similarity Encoder

In [56]:
# Encode the title with the project's ColumnEncoder configured as 'SimilarityEncoder',
# reduce the encoded features to 100 components with TruncatedSVD, then fit a
# LightGBM classifier and report per-class metrics on the test split.
# Alternative wiring through ColumnTransformer, kept for reference:
#preprocessor = ColumnTransformer([(col, ColumnEncoder('SimilarityEncoder'), col) for col in X_col])
preprocessor = ColumnEncoder('SimilarityEncoder')
pipeline_se = Pipeline([
    ('enc', preprocessor),
    # NOTE(review): no random_state is set here, so scores may vary slightly
    # between runs if the SVD solver is randomised - confirm and pin if needed.
    ('svd', TruncatedSVD(n_components=100)),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

pipeline_se.fit(sample_X_train, sample_y_train)
pred = pipeline_se.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.85      0.82      0.83      5214
        True       0.71      0.76      0.74      3145

    accuracy                           0.79      8359
   macro avg       0.78      0.79      0.78      8359
weighted avg       0.80      0.79      0.80      8359



In [57]:
# Same 'SimilarityEncoder' experiment without the TruncatedSVD step: the encoded
# features go straight into the LightGBM classifier.
# Note: this rebinds `pipeline_se`, replacing any pipeline previously stored under that name.
# Alternative wiring through ColumnTransformer, kept for reference:
#preprocessor = ColumnTransformer([(col, ColumnEncoder('SimilarityEncoder'), col) for col in X_col])
preprocessor = ColumnEncoder('SimilarityEncoder')
pipeline_se = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM option: pass a real bool, not the string 'True'.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

pipeline_se.fit(sample_X_train, sample_y_train)
pred = pipeline_se.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

       False       0.85      0.81      0.83      5214
        True       0.71      0.76      0.74      3145

    accuracy                           0.79      8359
   macro avg       0.78      0.79      0.78      8359
weighted avg       0.80      0.79      0.80      8359

