# Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)
import matplotlib.pyplot as plt

from pycaret.classification import *

from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

In [2]:
# Relative path to the directory of prepared artifacts: the pickled feature
# table is read from here and the finalized model files are written here.
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [3]:
# Read the pickled text-feature table from the prepared-data directory.
features_path = f'{DATA_PATH_PREP}/05_df_text_features.pkl'
df_text_features = pd.read_pickle(features_path)

# Replace the stored column labels with positional integers 0..n_columns-1.
n_columns = df_text_features.shape[1]
df_text_features.columns = np.arange(n_columns)

df_text_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,592,593,594,595,596,597,598,599,600,601
0,2,0,0,0,27,1,3,2,0,0,...,0.0,0.0,0.0,0.0,49.702143,10.939000,11.485714,6.074857,17.1,aleko-konstantinov
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.000000,15.000000,18.000000,0.370000,3.1,aleko-konstantinov
2,1,1,1,0,14,5,6,1,0,2,...,0.0,0.0,0.0,0.0,62.255004,5.406862,6.738253,6.584122,14.1,aleko-konstantinov
3,0,2,1,0,20,4,8,2,0,1,...,0.0,0.0,0.0,0.0,44.808968,7.441849,7.365079,8.631143,13.4,aleko-konstantinov
4,5,0,1,0,18,3,2,0,0,0,...,0.0,0.0,0.0,0.0,18.517500,12.798214,8.784127,12.651111,26.5,aleko-konstantinov
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,9,0,3,1,0,0,...,0.0,0.0,0.0,0.0,11.955081,14.000000,10.933333,12.230244,19.0,jordan-jovkov
596,4,0,0,0,12,1,7,3,0,0,...,0.0,0.0,0.0,0.0,44.467727,7.845284,6.627273,8.910227,17.4,jordan-jovkov
597,0,0,0,0,5,1,3,1,0,0,...,0.0,0.0,0.0,0.0,4.365000,14.000000,12.444444,12.678667,19.3,jordan-jovkov
598,0,0,0,0,5,1,2,1,0,3,...,0.0,0.0,0.0,0.0,41.242038,6.264717,6.000000,9.365283,15.0,jordan-jovkov


# Choose a model with PyCaret

## With PCA

In [4]:
# Experiment 1: classify on kernel-PCA-transformed features.
#
# session_id pins PyCaret's random seed so the train/test split and the
# cross-validation results can be reproduced after Restart & Run All.
# 4528 is the session id shown in the recorded output of this cell, which
# was originally drawn at random because no seed was supplied.
s = setup(
    data=df_text_features,
    target=601,           # last of the 602 columns: the author label
    train_size=0.8,       # 480 training rows / 120 hold-out rows
    pca=True,
    pca_method='kernel',
    session_id=4528,
)

Unnamed: 0,Description,Value
0,Session id,4528
1,Target,601
2,Target type,Multiclass
3,Target mapping,"aleko-konstantinov: 0, dimityr-dimov: 1, dimityr-talev: 2, elin-pelin: 3, ivan_vazov: 4, jordan-jovkov: 5"
4,Original data shape,"(600, 602)"
5,Transformed data shape,"(600, 480)"
6,Transformed train set shape,"(480, 480)"
7,Transformed test set shape,"(120, 480)"
8,Numeric features,601
9,Preprocess,True


In [5]:
# Trim the comparison table: drop the metrics that are not needed here ...
for unwanted_metric in ('AUC', 'Recall', 'Precision'):
    remove_metric(unwanted_metric)

# ... and register scikit-learn's log_loss as a custom, probability-based
# metric (lower is better). Kept as the last expression so the registered
# metric's description is displayed as the cell output.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    greater_is_better=False,
    target='pred_proba',
)

Name                                                                                           LogLoss
Display Name                                                                                   LogLoss
Score Function                                                   <function log_loss at 0x7f0a84c0e5f0>
Scorer               make_scorer(log_loss, greater_is_better=False, needs_proba=True, error_score=0.0)
Target                                                                                      pred_proba
Args                                                                                                {}
Greater is Better                                                                                False
Multiclass                                                                                        True
Custom                                                                                            True
Name: logloss, dtype: object

In [6]:
# Cross-validate the available classifiers on the PCA-transformed data and
# rank them by the custom LogLoss metric; `best` receives the top-ranked model.
# NOTE(review): in the recorded output 'svm' and 'ridge' report LogLoss = 0.0.
# That is presumably the scorer's error_score=0.0 fallback (these estimators
# likely expose no predict_proba), not a real loss -- so they sort first and
# `best` should not be trusted as "lowest log loss". Confirm before using it.
best = compare_models(sort='LogLoss')

Unnamed: 0,Model,Accuracy,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.5,0.4938,0.4,0.4045,0.0,0.093
ridge,Ridge Classifier,0.4604,0.4514,0.3525,0.3569,0.0,0.068
catboost,CatBoost Classifier,0.4521,0.4495,0.3425,0.3466,1.4798,254.716
lr,Logistic Regression,0.3896,0.3838,0.2675,0.2724,1.5811,1.201
et,Extra Trees Classifier,0.4021,0.3976,0.2825,0.2863,1.6276,0.235
rf,Random Forest Classifier,0.3521,0.3472,0.2225,0.2251,1.6563,0.369
gbc,Gradient Boosting Classifier,0.3396,0.3375,0.2075,0.2101,1.6585,13.065
xgboost,Extreme Gradient Boosting,0.3792,0.3773,0.255,0.2594,1.7123,4.326
ada,Ada Boost Classifier,0.2021,0.1916,0.0425,0.0447,1.7846,0.659
lda,Linear Discriminant Analysis,0.1917,0.1843,0.03,0.0307,1.7863,0.132


## Without PCA

In [8]:
# Experiment 2: classify on the raw text features, without PCA.
#
# session_id pins PyCaret's random seed so the train/test split and the
# cross-validation results can be reproduced after Restart & Run All.
# 7411 is the session id shown in the recorded output of this cell, which
# was originally drawn at random because no seed was supplied.
# NOTE(review): the PCA experiment was recorded under a different session id,
# so the two comparisons were made on different train/test splits.
s = setup(
    data=df_text_features,
    target=601,           # last of the 602 columns: the author label
    train_size=0.8,       # 480 training rows / 120 hold-out rows
    pca=False,            # explicit (this is the default): no dimensionality reduction
    session_id=7411,
)

Unnamed: 0,Description,Value
0,Session id,7411
1,Target,601
2,Target type,Multiclass
3,Target mapping,"aleko-konstantinov: 0, dimityr-dimov: 1, dimityr-talev: 2, elin-pelin: 3, ivan_vazov: 4, jordan-jovkov: 5"
4,Original data shape,"(600, 602)"
5,Transformed data shape,"(600, 602)"
6,Transformed train set shape,"(480, 602)"
7,Transformed test set shape,"(120, 602)"
8,Numeric features,601
9,Preprocess,True


In [9]:
# setup() was called again above, so the metric configuration is re-applied
# for the new experiment: drop the unneeded metrics ...
for unwanted_metric in ('AUC', 'Recall', 'Precision'):
    remove_metric(unwanted_metric)

# ... and register scikit-learn's log_loss as a probability-based metric
# (lower is better).
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    greater_is_better=False,
    target='pred_proba',
)

# Rank the candidate classifiers on the un-transformed features by LogLoss.
best = compare_models(sort='LogLoss')

Unnamed: 0,Model,Accuracy,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.4812,0.4398,0.3775,0.4145,0.0,0.09
ridge,Ridge Classifier,0.7625,0.7597,0.715,0.72,0.0,0.06
lr,Logistic Regression,0.7542,0.7504,0.705,0.7095,0.7168,1.858
catboost,CatBoost Classifier,0.7583,0.7585,0.71,0.7141,0.7325,31.437
xgboost,Extreme Gradient Boosting,0.7125,0.7073,0.655,0.6593,0.8567,2.541
gbc,Gradient Boosting Classifier,0.6896,0.6859,0.6275,0.6337,0.8851,2.167
lightgbm,Light Gradient Boosting Machine,0.7375,0.7347,0.685,0.6886,0.9172,0.443
et,Extra Trees Classifier,0.8104,0.8089,0.7725,0.7754,0.9185,0.19
rf,Random Forest Classifier,0.7667,0.7653,0.72,0.7234,1.049,0.209
ada,Ada Boost Classifier,0.4646,0.463,0.3575,0.3629,1.6031,0.2


# Training, tuning and finalizing the chosen models (Ridge and Extra Trees)

In [13]:
# Cross-validate a Ridge classifier on the current experiment;
# return_train_score=True adds the CV-Train rows to the displayed score grid.
ridge = create_model('ridge', return_train_score=True)
# Hyper-parameter search starting from the ridge model.
# NOTE(review): the result is stored as `tuned_lr` although it is a tuned
# *ridge* model (not logistic regression), and it is not used below --
# the un-tuned `ridge` is what gets finalized. Confirm this is intended.
tuned_lr = tune_model(ridge)
# Finalize the (un-tuned) ridge model via the experiment object returned by
# setup(); per the PyCaret docs this refits on the full dataset, hold-out included.
final_ridge = s.finalize_model(ridge)
# Display the finalized pipeline as the cell output.
final_ridge

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,F1,Kappa,MCC,LogLoss
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CV-Train,0,1.0,1.0,1.0,1.0,-0.0
CV-Train,1,1.0,1.0,1.0,1.0,-0.0
CV-Train,2,1.0,1.0,1.0,1.0,-0.0
CV-Train,3,1.0,1.0,1.0,1.0,-0.0
CV-Train,4,1.0,1.0,1.0,1.0,-0.0
CV-Train,5,1.0,1.0,1.0,1.0,-0.0
CV-Train,6,1.0,1.0,1.0,1.0,-0.0
CV-Train,7,1.0,1.0,1.0,1.0,-0.0
CV-Train,8,1.0,1.0,1.0,1.0,-0.0
CV-Train,9,1.0,1.0,1.0,1.0,-0.0


Unnamed: 0_level_0,Accuracy,F1,Kappa,MCC,LogLoss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.7917,0.7961,0.75,0.7527,-0.0
1,0.7708,0.7684,0.725,0.7277,-0.0
2,0.7292,0.7281,0.675,0.6761,-0.0
3,0.8125,0.804,0.775,0.7795,-0.0
4,0.7708,0.7685,0.725,0.7261,-0.0
5,0.8125,0.801,0.775,0.7795,-0.0
6,0.7917,0.7861,0.75,0.7608,-0.0
7,0.7708,0.758,0.725,0.7335,-0.0
8,0.8333,0.8306,0.8,0.8063,-0.0
9,0.8125,0.8087,0.775,0.7832,-0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [17]:
# Cross-validate an Extra Trees classifier on the current experiment;
# return_train_score=True adds the CV-Train rows to the displayed score grid.
et = create_model('et', return_train_score=True)
# Hyper-parameter search starting from the extra-trees model.
# NOTE(review): the result is stored as `tuned_lr` (the same name the ridge
# cell assigns, so it is overwritten here) although it is a tuned *extra trees*
# model, and it is not used below -- the un-tuned `et` is what gets finalized.
# The recorded output reports the original model was better than the tuned one.
tuned_lr = tune_model(et)
# Finalize the (un-tuned) extra-trees model via the experiment object returned
# by setup(); per the PyCaret docs this refits on the full dataset, hold-out included.
final_et = s.finalize_model(et)
# Display the finalized pipeline as the cell output.
final_et

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,F1,Kappa,MCC,LogLoss
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CV-Train,0,1.0,1.0,1.0,1.0,0.0
CV-Train,1,1.0,1.0,1.0,1.0,0.0
CV-Train,2,1.0,1.0,1.0,1.0,0.0
CV-Train,3,1.0,1.0,1.0,1.0,0.0
CV-Train,4,1.0,1.0,1.0,1.0,0.0
CV-Train,5,1.0,1.0,1.0,1.0,0.0
CV-Train,6,1.0,1.0,1.0,1.0,0.0
CV-Train,7,1.0,1.0,1.0,1.0,0.0
CV-Train,8,1.0,1.0,1.0,1.0,0.0
CV-Train,9,1.0,1.0,1.0,1.0,0.0


Unnamed: 0_level_0,Accuracy,F1,Kappa,MCC,LogLoss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.7292,0.7338,0.675,0.6807,1.4936
1,0.8125,0.8098,0.775,0.7791,1.4769
2,0.6875,0.6924,0.625,0.6303,1.4986
3,0.7917,0.7854,0.75,0.7575,1.4881
4,0.6458,0.64,0.575,0.5836,1.5301
5,0.6875,0.6925,0.625,0.6313,1.5078
6,0.7292,0.7199,0.675,0.6818,1.518
7,0.75,0.7493,0.7,0.7067,1.5181
8,0.6042,0.6079,0.525,0.5278,1.5333
9,0.6667,0.6702,0.6,0.6103,1.5043


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


# Saving to files

In [15]:
# Persist the finalized ridge pipeline under the prepared-data directory.
# save_model is the last expression, so its return value is shown as the
# cell output.
filename = f'{DATA_PATH_PREP}/06_pycaret_ridge_textfeats_ridge'
save_model(model=final_ridge, model_name=filename)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['0', '1', '2', '3', '4', '5', '6',
                                              '7', '8', '9', '10', '11', '12',
                                              '13', '14', '15', '16', '17', '18',
                                              '19', '20', '21', '22', '23', '24',
                                              '25', '26', '27', '28...
                  TransformerWrapper(exclude=None, include=[],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                         

In [18]:
# Persist the finalized extra-trees pipeline under the prepared-data directory.
# save_model is the last expression, so its return value is shown as the
# cell output.
# NOTE(review): the file name contains "ridge" although this is the Extra
# Trees model; left unchanged because other code may load it by this path.
filename = f'{DATA_PATH_PREP}/06_pycaret_ridge_textfeats_et'
save_model(model=final_et, model_name=filename)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['0', '1', '2', '3', '4', '5', '6',
                                              '7', '8', '9', '10', '11', '12',
                                              '13', '14', '15', '16', '17', '18',
                                              '19', '20', '21', '22', '23', '24',
                                              '25', '26', '27', '28...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt',
                                       max_leaf_nodes=N