# Imports

In [1]:
import numpy as np
import pandas as pd
# Let text columns show up to 150 characters in DataFrame displays before being truncated.
pd.set_option('max_colwidth', 150)
import matplotlib.pyplot as plt

from pycaret.classification import *

from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

In [2]:
# Base directory (relative path) that the cells below read the input pickle
# from and write the saved model to.
DATA_PATH_PREP = "../DATA/prepared"

# Load data

In [3]:
# Load the pickled sample table.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_sbert = pd.read_pickle(
    f'{DATA_PATH_PREP}/03_df_samples_sbert.pkl'
)

# Replace the original column labels with consecutive integers 0..n_cols-1,
# so the last column (the one later used as the target) is simply the highest index.
n_columns = df_sbert.shape[1]
df_sbert.columns = np.arange(n_columns)

df_sbert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13303,13304,13305,13306,13307,13308,13309,13310,13311,13312
0,0.023407,0.088533,0.017080,0.019026,-0.000730,-0.029901,-0.009842,0.026202,0.007030,0.034095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,aleko-konstantinov
1,0.008410,0.067847,-0.000749,0.005137,-0.012316,-0.036057,-0.002313,0.023626,-0.017833,-0.009246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,aleko-konstantinov
2,0.009139,-0.010337,0.004948,-0.015620,0.064593,0.014258,0.004488,-0.006584,0.021060,0.021273,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,aleko-konstantinov
3,-0.029518,0.040321,0.087215,0.004826,0.001398,-0.047775,0.064771,0.022354,0.049565,-0.018346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,aleko-konstantinov
4,0.000699,0.020800,-0.007530,0.008134,-0.002479,-0.000242,-0.013638,-0.045005,0.019961,0.006761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,aleko-konstantinov
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,-0.048833,-0.015363,-0.004902,0.006237,-0.023212,0.076304,-0.052319,-0.046082,-0.044669,0.018333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jordan-jovkov
596,-0.027543,-0.011782,-0.004672,-0.027033,0.011562,-0.017644,0.048794,-0.006814,0.020294,-0.056003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jordan-jovkov
597,-0.032206,-0.014207,-0.011887,-0.012598,-0.036125,0.045266,0.015762,0.022364,0.007701,0.022064,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jordan-jovkov
598,0.033862,-0.012871,0.017728,-0.009554,-0.054497,-0.021147,0.012415,0.017542,-0.018924,0.027988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jordan-jovkov


# Choose a model with PyCaret

## With PCA

In [4]:
# Initialise the PyCaret classification experiment on the feature table.
#
# * The last column is the target (the recorded output maps it to author names).
# * 80% of the rows are used for training, the rest is held out.
# * Linear PCA is applied to the numeric features during preprocessing.
# * session_id pins the random seed so that the split, the CV folds and the
#   models are reproducible across "Restart & Run All". 7221 is the id that
#   PyCaret drew at random for the run whose outputs are recorded below.
s = setup(
    data=df_sbert,
    target=df_sbert.columns[-1],
    train_size=0.8,
    pca=True,
    pca_method='linear',
    session_id=7221,
)

Unnamed: 0,Description,Value
0,Session id,7221
1,Target,13312
2,Target type,Multiclass
3,Target mapping,"aleko-konstantinov: 0, dimityr-dimov: 1, dimityr-talev: 2, elin-pelin: 3, ivan_vazov: 4, jordan-jovkov: 5"
4,Original data shape,"(600, 13313)"
5,Transformed data shape,"(600, 481)"
6,Transformed train set shape,"(480, 481)"
7,Transformed test set shape,"(120, 481)"
8,Numeric features,13312
9,Preprocess,True


In [5]:
# Drop the metrics that are not needed in the comparison tables.
for dropped_metric in ('AUC', 'Recall', 'Precision'):
    remove_metric(dropped_metric)

# Register log-loss as a custom metric: it is scored on predicted
# probabilities and lower values are better.
# NOTE(review): the scorer printed by this cell is built with error_score=0.0,
# so an estimator for which log-loss cannot be computed (presumably one without
# predict_proba -- verify) reports LogLoss = 0.0 instead of failing. Treat a
# LogLoss of exactly 0.0 as "not available", not as a perfect score.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target='pred_proba',
    greater_is_better=False,
)

Name                                                                                           LogLoss
Display Name                                                                                   LogLoss
Score Function                                                   <function log_loss at 0x7f42047ba9e0>
Scorer               make_scorer(log_loss, greater_is_better=False, needs_proba=True, error_score=0.0)
Target                                                                                      pred_proba
Args                                                                                                {}
Greater is Better                                                                                False
Multiclass                                                                                        True
Custom                                                                                            True
Name: logloss, dtype: object

In [6]:
# Cross-validate the available classifiers and rank them by the custom LogLoss metric.
# NOTE(review): in the recorded table the svm and ridge rows show LogLoss 0.0 and are
# therefore ranked first; that value looks like the scorer's error fallback rather than
# a real score, so `best` may not be the model with the lowest genuine log-loss --
# confirm before relying on it.
best = compare_models(
    sort='LogLoss',
)

Unnamed: 0,Model,Accuracy,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.7667,0.7661,0.72,0.7266,0.0,0.66
ridge,Ridge Classifier,0.8188,0.818,0.7825,0.7852,0.0,0.681
lr,Logistic Regression,0.7771,0.7765,0.7325,0.7355,0.9549,5.871
catboost,CatBoost Classifier,0.6812,0.6811,0.6175,0.6213,0.9769,275.989
xgboost,Extreme Gradient Boosting,0.6208,0.6191,0.545,0.548,1.0726,5.372
lightgbm,Light Gradient Boosting Machine,0.625,0.6169,0.55,0.5555,1.1841,5.845
gbc,Gradient Boosting Classifier,0.5104,0.5075,0.4125,0.4225,1.3628,15.881
et,Extra Trees Classifier,0.5042,0.4986,0.405,0.4149,1.4705,1.201
rf,Random Forest Classifier,0.5646,0.5595,0.4775,0.4833,1.4778,1.448
ada,Ada Boost Classifier,0.3188,0.3015,0.1825,0.1913,1.7247,1.663


# Create, tune and finalize the chosen model

In [8]:
# Ridge Classifier has the highest cross-validated Accuracy / F1 in the
# comparison table, so it is the model that gets trained and tuned here.
# (The variables were previously called `et` and `tuned_lr`, although both
# hold Ridge models.)
ridge_model = create_model('ridge', return_train_score=True)

# Hyper-parameter search starting from the Ridge model created above.
tuned_ridge = tune_model(ridge_model)

# NOTE(review): automl() selects the best model among *all* models trained in
# this session according to F1; it does not take `tuned_ridge` as input. The
# result is expected to be a Ridge model given the scores above, but that is
# not guaranteed -- check the displayed estimator before saving it.
final_ridge = automl(optimize='F1')
final_ridge

Unnamed: 0_level_0,Accuracy,F1,Kappa,MCC,LogLoss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.7292,0.7348,0.675,0.6764,-0.0
1,0.8542,0.85,0.825,0.8276,-0.0
2,0.8333,0.8308,0.8,0.8042,-0.0
3,0.7917,0.786,0.75,0.7531,-0.0
4,0.8333,0.8346,0.8,0.8029,-0.0
5,0.8333,0.8299,0.8,0.8038,-0.0
6,0.7917,0.7947,0.75,0.7516,-0.0
7,0.8125,0.8123,0.775,0.7754,-0.0
8,0.8958,0.8941,0.875,0.8759,-0.0
9,0.8125,0.809,0.775,0.7766,-0.0


# Saving to files

In [9]:
# Persist the selected model together with its preprocessing pipeline.
# Only the path stem is given here; the saved pipeline and the file name
# returned by save_model are shown as this cell's output.
filename = f'{DATA_PATH_PREP}/06_pycaret_sbert'
save_model(
    model=final_ridge,
    model_name=filename,
)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['0', '1', '2', '3', '4', '5', '6',
                                              '7', '8', '9', '10', '11', '12',
                                              '13', '14', '15', '16', '17', '18',
                                              '19', '20', '21', '22', '23', '24',
                                              '25', '26', '27', '28...
                                                     iterated_power='auto',
                                                     n_components=None,
                                                     n_oversamples=10,
                                                     power_