# Categorical Encoding

* In this notebook, we experiment with different encoders, models, and datasets

In [2]:
# !pip install fasttext dirty_cat category_encoders
# !pip install lightgbm xgboost

In [22]:
from get_data import dataloader
from column_encoder import ColumnEncoder #This is where the magic happens
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
import get_data
from joblib import Parallel, delayed
import multiprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
#from xgboost import XGBClassifier, XGBRegressor

# kc_train = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_train.get_input_target()
# kc_test = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_test.get_input_target()
# nominal = ['nom_'+str(i) for i in range(0,10)]

## Data

In [23]:
# Load the Insights CSV through the project's dataloader helper (from get_data).
insights = dataloader('data/Insights/insights.csv', "insights")
# Presumably populates insights.X / insights.y, which are read right after this
# call and in the split cell — TODO confirm against get_data.dataloader.
insights.get_input_target()
# Column labels of the feature table; the encoding cells build one encoder per label.
X_col = insights.X.columns

In [24]:
def test_train_split(X, y, test_size=0.33, random_state=1):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

    return X_train, X_test, y_train, y_test

# One stratified split of the Insights data, using test_train_split's defaults
# (test_size=0.33, random_state=1).
X_train, X_test, y_train, y_test = test_train_split(X=insights.X, y=insights.y)

In [25]:
# sample_X_train = X_train[:2000]
# sample_y_train = y_train[:2000]
# sample_X_test = X_test[:200]
# sample_y_test = y_test[:200]

# Use the full train/test split; swap in the commented slices above for a quick run.
sample_X_train, sample_y_train = X_train, y_train
sample_X_test, sample_y_test = X_test, y_test

## Encoding

### Integer Encoding

In [26]:
#try class weight

In [27]:
#from sklearn.preprocessing import OrdinalEncoder
# Ordinal-encoding baseline: every column in X_col gets its own
# ColumnEncoder('OrdinalEncoder') transformer inside a ColumnTransformer.
preprocessor = ColumnTransformer(
    [(name, ColumnEncoder('OrdinalEncoder'), name) for name in X_col]
)

# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_le = Pipeline(steps=[('enc', preprocessor), ('clf', LGBMClassifier())])

# Fit on the training split, then report per-class metrics on the held-out split.
pred = pipeline_le.fit(sample_X_train, sample_y_train).predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.60      0.65      0.62     13638
           1       0.72      0.68      0.70     18305

    accuracy                           0.67     31943
   macro avg       0.66      0.66      0.66     31943
weighted avg       0.67      0.67      0.67     31943



### Binary Encoder

In [28]:
# Binary-encoding variant: every column in X_col gets its own
# ColumnEncoder('BinaryEncoder') transformer inside a ColumnTransformer.
preprocessor = ColumnTransformer(
    [(name, ColumnEncoder('BinaryEncoder'), name) for name in X_col]
)

# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_be = Pipeline(steps=[('enc', preprocessor), ('clf', LGBMClassifier())])

# Fit on the training split, then report per-class metrics on the held-out split.
pred = pipeline_be.fit(sample_X_train, sample_y_train).predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.61      0.57      0.59     13638
           1       0.69      0.73      0.71     18305

    accuracy                           0.66     31943
   macro avg       0.65      0.65      0.65     31943
weighted avg       0.66      0.66      0.66     31943



### One Hot Encoder

In [34]:
# One-hot variant: every column in X_col gets its own
# ColumnEncoder('OneHotEncoder') transformer inside a ColumnTransformer.
preprocessor = ColumnTransformer(
    [(name, ColumnEncoder('OneHotEncoder'), name) for name in X_col]
)

# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_ohe = Pipeline(steps=[('enc', preprocessor), ('clf', LGBMClassifier())])

# Fit on the training split, then report per-class metrics on the held-out split.
pred = pipeline_ohe.fit(sample_X_train, sample_y_train).predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.59      0.69      0.64     13638
           1       0.74      0.64      0.69     18305

    accuracy                           0.67     31943
   macro avg       0.67      0.67      0.66     31943
weighted avg       0.68      0.67      0.67     31943



### Hashing Encoder

In [30]:
# Hashing variant: category_encoders' HashingEncoder (default arguments) is used
# directly as the first pipeline step, without a per-column ColumnTransformer.
preprocessor = ce.HashingEncoder()

# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_fh = Pipeline(steps=[('enc', preprocessor), ('clf', LGBMClassifier())])

# Fit on the training split, then report per-class metrics on the held-out split.
pred = pipeline_fh.fit(sample_X_train, sample_y_train).predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.55      0.31      0.40     13638
           1       0.61      0.81      0.70     18305

    accuracy                           0.60     31943
   macro avg       0.58      0.56      0.55     31943
weighted avg       0.59      0.60      0.57     31943



### Word based: Similarity Encoder

In [31]:
# Similarity-encoding variant followed by dimensionality reduction: every column
# in X_col gets its own ColumnEncoder('SimilarityEncoder') transformer, and the
# combined output is reduced to 100 components before classification.
preprocessor = ColumnTransformer([(col, ColumnEncoder('SimilarityEncoder'), col) for col in X_col])

# TruncatedSVD uses a randomized solver by default, so it is seeded here to make
# the reported scores reproducible across kernel restarts. The seed matches the
# default random_state of test_train_split.
# NOTE: scores may differ slightly from any output captured before the seed was set.
# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_se = Pipeline([
    ('enc', preprocessor),
    ('svd', TruncatedSVD(n_components=100, random_state=1)),
    ('clf', LGBMClassifier()),
])

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_se.fit(sample_X_train, sample_y_train)
pred = pipeline_se.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.60      0.59      0.59     13638
           1       0.70      0.71      0.70     18305

    accuracy                           0.66     31943
   macro avg       0.65      0.65      0.65     31943
weighted avg       0.66      0.66      0.66     31943



In [33]:
# Similarity-encoding variant without dimensionality reduction: every column in
# X_col gets its own ColumnEncoder('SimilarityEncoder') transformer and the
# encoded output goes straight to the classifier.
preprocessor = ColumnTransformer(
    [(name, ColumnEncoder('SimilarityEncoder'), name) for name in X_col]
)

# TODO: add a param_grid for dimensionality reduction / classifier experiments.
pipeline_se = Pipeline(steps=[('enc', preprocessor), ('clf', LGBMClassifier())])

# Fit on the training split, then report per-class metrics on the held-out split.
pred = pipeline_se.fit(sample_X_train, sample_y_train).predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.60      0.66      0.63     13638
           1       0.73      0.67      0.70     18305

    accuracy                           0.67     31943
   macro avg       0.66      0.67      0.66     31943
weighted avg       0.67      0.67      0.67     31943



##### Next Steps

* Tune the number of reduced dimensions to see whether dimensionality reduction can make sense here
* But we won't really have any way of tuning an unsupervised algorithm on its own
* Can we use hyperparameters tuned here in other cases as well?
* Get more similar datasets for repeating experiments
* Experiment with dimensionality reduction and classifiers

