# Categorical Encoding

* In this notebook, we experiment with different encoders, models, and datasets.

In [4]:
# !pip install dirty_cat category_encoders
# !pip install lightgbm xgboost
#fasttext

from get_data import dataloader
from column_encoder import ColumnEncoder #This is where the magic happens
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
import get_data
from joblib import Parallel, delayed
import multiprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor

# kc_train = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_train.get_input_target()
# kc_test = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_test.get_input_target()
# nominal = ['nom_'+str(i) for i in range(0,10)]

Collecting dirty_cat
[?25l  Downloading https://files.pythonhosted.org/packages/8d/84/7de88b45593b71fe8552c3038232502337eb3c0bd4b296361849a20fdabc/dirty_cat-0.0.5-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 3.7MB/s eta 0:00:011
[?25hCollecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 8.4MB/s eta 0:00:011
Collecting patsy>=0.5.1 (from category_encoders)
[?25l  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)
[K     |████████████████████████████████| 235kB 9.5MB/s eta 0:00:01
Collecting statsmodels>=0.9.0 (from category_encoders)
[?25l  Downloading https://files.pythonhosted.org/packages/cb/83/540fd83238a18abe6c2d280fa8e489ac5fcefa1f370f0ca1a

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


## Data

* Predict, using only the position title, whether the employee earns more than 100k.

In [5]:
# Data downloaded from here: https://opendata.vancouver.ca/explore/dataset/employee-remuneration-and-expenses-earning-over-75000/export/?disjunctive.department&disjunctive.title&sort=year
# Keep only the job title and derive the boolean target: does the employee earn more than 100k?
emp_df = (
    pd.read_csv('public_data/employee.csv', sep=';')[['Title', 'Remuneration']]
    # Vectorised comparison instead of calling a Python lambda once per row.
    .assign(Remuneration_class=lambda df: df['Remuneration'] > 100000)
    # Drop the raw amount by returning a new frame rather than mutating in place.
    .drop('Remuneration', axis=1)
)
emp_df

Unnamed: 0,Title,Remuneration_class
0,Fire Captain,True
1,Trades Ii - Gardener,False
2,Corporate Compliance Analyst,False
3,Telecommunications Plant Coordinator,False
4,Civil Engineer I,False
5,Property Development Officer Ii,True
6,Manager Project Management Office,True
7,Firefighter,True
8,Firefighter,True
9,Mgr Regulatory Compl Progm Impl & Admin,True


In [6]:
# Single input column (the job title) and the boolean target column.
X, y = emp_df['Title'], emp_df['Remuneration_class']
# Name of the input column, kept as a one-element list.
X_col = ['Title']

In [7]:
def test_train_split(X, y, test_size=0.33, random_state=1):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

    return X_train, X_test, y_train, y_test

# Stratified hold-out split using the function defaults (test_size=0.33, random_state=1).
X_train, X_test, y_train, y_test = test_train_split(X, y)

In [8]:
# experiments() reads the sample_* names below. The full split is used here;
# for quicker iteration, slice instead, e.g. X_train[:2000] / y_train[:2000]
# for training and X_test[:200] / y_test[:200] for evaluation.
sample_X_train, sample_y_train = X_train, y_train
sample_X_test, sample_y_test = X_test, y_test

## EXPERIMENTS

In [9]:
def experiments(encoder, dim_red=None, model=None):
    """Fit an encoder -> (optional reducer) -> model pipeline and score it.

    The pipeline is fitted on the module-level ``sample_X_train`` /
    ``sample_y_train`` and evaluated on ``sample_X_test`` / ``sample_y_test``.

    Parameters
    ----------
    encoder : transformer
        Used as the ``'enc'`` step.
    dim_red : transformer, optional
        When given, inserted as the ``'dim_red'`` step between the encoder
        and the model.
    model : estimator, optional
        Used as the ``'clf'`` step. When omitted, a new
        ``LGBMClassifier(is_unbalance='True')`` is created for this call.

    Returns
    -------
    tuple
        ``(pipeline, report)`` where ``pipeline`` is the fitted ``Pipeline``
        and ``report`` is the ``classification_report`` output for the test
        predictions.
    """
    if model is None:
        # Build the default classifier per call. An instance placed in the
        # signature is created once and shared by every call, so each later
        # fit would silently overwrite the 'clf' step of pipelines returned
        # earlier.
        model = LGBMClassifier(is_unbalance='True')

    steps = [('enc', encoder)]
    if dim_red:
        steps.append(('dim_red', dim_red))
    steps.append(('clf', model))

    pipeline = Pipeline(steps)
    pipeline.fit(sample_X_train, sample_y_train)
    pred = pipeline.predict(sample_X_test)
    return pipeline, classification_report(sample_y_test, pred)
        

## Encoding

### Integer Encoding

In [10]:
# Integer (ordinal) encoding run, scored with the default classifier of experiments().
# NOTE(review): ColumnEncoder is project-local; presumably it applies the named
# encoder to the input column -- confirm in column_encoder.py.
preprocessor = ColumnEncoder('OrdinalEncoder')
pipe_ie, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.80      0.72      0.76      5214
        True       0.60      0.71      0.65      3145

    accuracy                           0.71      8359
   macro avg       0.70      0.71      0.70      8359
weighted avg       0.73      0.71      0.72      8359



### Binary Encoder

In [11]:
# Binary encoding run, scored with the default classifier of experiments().
preprocessor = ColumnEncoder('BinaryEncoder')
pipe_be, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.82      0.77      0.79      5214
        True       0.65      0.72      0.69      3145

    accuracy                           0.75      8359
   macro avg       0.74      0.75      0.74      8359
weighted avg       0.76      0.75      0.75      8359



### One Hot Encoder

In [29]:
# One-hot encoding run, scored with the default classifier of experiments().
preprocessor = ColumnEncoder('OneHotEncoder')
pipe_ohe, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.83      0.63      0.71      5214
        True       0.56      0.79      0.66      3145

    accuracy                           0.69      8359
   macro avg       0.70      0.71      0.69      8359
weighted avg       0.73      0.69      0.69      8359



### Hashing Encoder

In [13]:
# Hashing run: category_encoders' HashingEncoder is used directly with its
# default arguments, not through the ColumnEncoder wrapper used by the other cells.
preprocessor = ce.HashingEncoder()
pipe_he, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.67      0.55      0.60      5214
        True       0.42      0.55      0.48      3145

    accuracy                           0.55      8359
   macro avg       0.55      0.55      0.54      8359
weighted avg       0.58      0.55      0.56      8359



### Word based: Similarity Encoder

In [16]:
# SimilarityEncoder output reduced to 100 components with TruncatedSVD before the classifier.
# random_state pins TruncatedSVD's randomized solver so the report is reproducible.
# The pipeline is stored as pipe_se_svd because the following cell binds pipe_se
# to the run without dimensionality reduction.
preprocessor = ColumnEncoder('SimilarityEncoder')
pipe_se_svd, report = experiments(preprocessor,
                                  dim_red=TruncatedSVD(n_components=100, random_state=1))
print(report)

              precision    recall  f1-score   support

       False       0.85      0.82      0.83      5214
        True       0.72      0.76      0.74      3145

    accuracy                           0.80      8359
   macro avg       0.78      0.79      0.78      8359
weighted avg       0.80      0.80      0.80      8359



In [17]:
# SimilarityEncoder again, this time with no dimensionality-reduction step.
preprocessor = ColumnEncoder('SimilarityEncoder')
pipe_se, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.85      0.81      0.83      5214
        True       0.71      0.76      0.74      3145

    accuracy                           0.79      8359
   macro avg       0.78      0.79      0.78      8359
weighted avg       0.80      0.79      0.80      8359



### Min Hash encoder

In [19]:
# MinHashEncoder run with n_components=300.
# NOTE(review): the keyword is presumably forwarded by ColumnEncoder to the
# underlying encoder -- confirm in column_encoder.py.
preprocessor = ColumnEncoder('MinHashEncoder', n_components=300)
pipe_mhe, report = experiments(preprocessor)
print(report)

              precision    recall  f1-score   support

       False       0.85      0.81      0.83      5214
        True       0.71      0.77      0.74      3145

    accuracy                           0.80      8359
   macro avg       0.78      0.79      0.79      8359
weighted avg       0.80      0.80      0.80      8359



### Online Gamma Poisson Factorization

In [None]:
# Online Gamma-Poisson factorization run with ngram_range=(2, 4) and n_components=10.
# NOTE(review): the keywords are presumably forwarded by ColumnEncoder to the
# underlying encoder -- confirm in column_encoder.py.
preprocessor = ColumnEncoder('OnlineGammaPoissonFactorization', ngram_range=(2, 4), n_components=10)
pipe_ogpf, report = experiments(preprocessor)
print(report)

In [16]:
# Show the feature names reported by the fitted 'enc' step of the Gamma-Poisson pipeline.
pipe_ogpf['enc'].get_feature_names()

['inspector, electrical, buildings',
 'lieutenant, prevention, captain',
 'firefighter, fire, refm',
 'consultant, accountant, coordination',
 'journeyman, subforeman, mechanic',
 'partnerships, emergency, manager',
 'planner, plannner, solicitor',
 'superintendent, equipment, supervisor',
 'programmer, business, systems',
 'application, applications, technology']