# Categorical Encoding

* In this notebook, we experiment with different encoders, models, and datasets

In [2]:
# !pip install fasttext dirty_cat category_encoders
# !pip install lightgbm xgboost

In [8]:
from get_data import dataloader
from column_encoder import ColumnEncoder #This is where the magic happens
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
import get_data
from joblib import Parallel, delayed
import multiprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
#from xgboost import XGBClassifier, XGBRegressor

# kc_train = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_train.get_input_target()
# kc_test = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
# kc_test.get_input_target()
# nominal = ['nom_'+str(i) for i in range(0,10)]

## Data

In [9]:
# Load the "insights" dataset through the project's dataloader helper (imported from get_data).
insights = dataloader('data/Insights/insights.csv', "insights")
# Presumably populates insights.X / insights.y, which are read afterwards — TODO confirm in get_data.
insights.get_input_target()
# Column labels of the feature frame; the encoding cells build one encoder per entry.
X_col = insights.X.columns

In [10]:
def test_train_split(X, y, test_size=0.33, random_state=1):
    """Split ``X`` and ``y`` into a single stratified train/test partition.

    Parameters
    ----------
    X : pandas object supporting ``.iloc`` (e.g. DataFrame)
        Feature rows.
    y : pandas object supporting ``.iloc`` (e.g. Series)
        Targets aligned row-for-row with ``X``; passed to the splitter for
        stratification.
    test_size : default 0.33
        Forwarded to ``StratifiedShuffleSplit``.
    random_state : default 1
        Forwarded to ``StratifiedShuffleSplit`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(sss.split(X, y))

    # split() yields integer *positions*, so rows must be selected with .iloc;
    # label-based .loc only agrees with it when the index is a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    return X_train, X_test, y_train, y_test

# Single stratified split using the helper's defaults (test_size=0.33, random_state=1).
X_train, X_test, y_train, y_test = test_train_split(insights.X, insights.y)

In [11]:
# sample_X_train = X_train[:2000]
# sample_y_train = y_train[:2000]
# sample_X_test = X_test[:200]
# sample_y_test = y_test[:200]

# Run the experiments on the full train/test splits (no sub-sampling).
sample_X_train, sample_y_train = X_train, y_train
sample_X_test, sample_y_test = X_test, y_test

## Encoding

### Integer Encoding

In [26]:
#try class weight

In [12]:
#from sklearn.preprocessing import OrdinalEncoder
# One transformer per input column, each with its own ColumnEncoder configured
# by name as 'OrdinalEncoder'.
preprocessor = ColumnTransformer([(col, ColumnEncoder('OrdinalEncoder'), col) for col in X_col])

pipeline_le = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_le.fit(sample_X_train, sample_y_train)
pred = pipeline_le.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.57      0.77      0.66     13638
           1       0.77      0.57      0.65     18305

    accuracy                           0.65     31943
   macro avg       0.67      0.67      0.65     31943
weighted avg       0.69      0.65      0.65     31943



### Binary Encoder

In [13]:
# One transformer per input column, each with its own ColumnEncoder configured
# by name as 'BinaryEncoder'.
preprocessor = ColumnTransformer([(col, ColumnEncoder('BinaryEncoder'), col) for col in X_col])

pipeline_be = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_be.fit(sample_X_train, sample_y_train)
pred = pipeline_be.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.57      0.77      0.66     13638
           1       0.77      0.57      0.65     18305

    accuracy                           0.66     31943
   macro avg       0.67      0.67      0.66     31943
weighted avg       0.69      0.66      0.66     31943



### One Hot Encoder

In [14]:
# One transformer per input column, each with its own ColumnEncoder configured
# by name as 'OneHotEncoder'.
preprocessor = ColumnTransformer([(col, ColumnEncoder('OneHotEncoder'), col) for col in X_col])

pipeline_ohe = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_ohe.fit(sample_X_train, sample_y_train)
pred = pipeline_ohe.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.57      0.79      0.66     13638
           1       0.78      0.56      0.65     18305

    accuracy                           0.66     31943
   macro avg       0.67      0.67      0.66     31943
weighted avg       0.69      0.66      0.65     31943



### Hashing Encoder

In [15]:
# category_encoders' HashingEncoder with its default settings, applied to the
# whole input frame rather than wrapped per column in a ColumnTransformer.
preprocessor = ce.HashingEncoder()

pipeline_fh = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_fh.fit(sample_X_train, sample_y_train)
pred = pipeline_fh.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.51      0.81      0.62     13638
           1       0.74      0.41      0.53     18305

    accuracy                           0.58     31943
   macro avg       0.62      0.61      0.58     31943
weighted avg       0.64      0.58      0.57     31943



### Word based: Similarity Encoder

In [24]:
# Inspect category frequencies of 'pmanufacturer' in the training split; the
# recorded output lists several spellings that appear to name the same vendor
# (e.g. "Intel", "Intel(R) Corporation", "Intel Corporation").
sample_X_train['pmanufacturer'].value_counts()

GenuineIntel                    50341
Intel                            6483
Intel(R) Corporation             4214
Red Hat                          3046
AuthenticAMD                      509
Bochs                             137
AMD                                79
QEMU                               39
Intel Corporation                   2
nan                                 1
Advanced Micro Devices, Inc.        1
Name: pmanufacturer, dtype: int64

In [16]:
# One transformer per input column, each with its own ColumnEncoder configured
# by name as 'SimilarityEncoder'.
preprocessor = ColumnTransformer([(col, ColumnEncoder('SimilarityEncoder'), col) for col in X_col])

pipeline_se = Pipeline([
    ('enc', preprocessor),
    # Reduce the encoded features to 100 components. TruncatedSVD's default
    # solver is randomized, so seed it to make reruns of this cell repeatable.
    ('svd', TruncatedSVD(n_components=100, random_state=1)),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_se.fit(sample_X_train, sample_y_train)
pred = pipeline_se.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.58      0.73      0.65     13638
           1       0.75      0.60      0.67     18305

    accuracy                           0.66     31943
   macro avg       0.66      0.67      0.66     31943
weighted avg       0.68      0.66      0.66     31943



In [17]:
# One transformer per input column, each with its own ColumnEncoder configured
# by name as 'SimilarityEncoder'; here the encoded features go straight to the
# classifier with no dimensionality-reduction step.
preprocessor = ColumnTransformer([(col, ColumnEncoder('SimilarityEncoder'), col) for col in X_col])

# NOTE: this rebinds pipeline_se, replacing any pipeline previously stored
# under that name.
pipeline_se = Pipeline([
    ('enc', preprocessor),
    # is_unbalance is a boolean LightGBM flag for unbalanced training data.
    ('clf', LGBMClassifier(is_unbalance=True)),
])
# TODO: add param_grid for dimensionality reduction, classifier experiments

# Fit on the training split, then report per-class metrics on the held-out split.
pipeline_se.fit(sample_X_train, sample_y_train)
pred = pipeline_se.predict(sample_X_test)
print(classification_report(sample_y_test, pred))

              precision    recall  f1-score   support

           0       0.58      0.75      0.66     13638
           1       0.76      0.59      0.67     18305

    accuracy                           0.66     31943
   macro avg       0.67      0.67      0.66     31943
weighted avg       0.69      0.66      0.66     31943



##### Next Steps

* Tune the number of reduced dimensions to see whether dimensionality reduction is worthwhile
* However, we won't really have any way of tuning this in an unsupervised algorithm
* Can we use hyperparameters tuned here in other cases as well?
* Get more similar datasets for repeating experiments
* Experiment with dimensionality reduction and classifiers

