# Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)
import matplotlib.pyplot as plt

from pycaret.classification import *

from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

KeyboardInterrupt: 

In [None]:
# Directory of prepared artifacts: the feature pickle is read from here and
# the trained model is saved here at the end of the notebook.
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [None]:
# Load the pickled text-feature table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_text_features = pd.read_pickle(
    f'{DATA_PATH_PREP}/05_df_text_features.pkl'
)

# Swap the column labels for their integer positions (0 .. n_columns - 1).
# From here on every column, including the label column, is addressed by
# number (the label is column 601 in the recorded output).
df_text_features.columns = np.arange(df_text_features.shape[1])

df_text_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,592,593,594,595,596,597,598,599,600,601
0,2,0,0,0,27,1,3,2,0,0,...,0.0,0.0,0.0,0.0,49.702143,10.939000,11.485714,6.074857,17.1,aleko-konstantinov
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.000000,15.000000,18.000000,0.370000,3.1,aleko-konstantinov
2,1,1,1,0,14,5,6,1,0,2,...,0.0,0.0,0.0,0.0,62.255004,5.406862,6.738253,6.584122,14.1,aleko-konstantinov
3,0,2,1,0,20,4,8,2,0,1,...,0.0,0.0,0.0,0.0,44.808968,7.441849,7.365079,8.631143,13.4,aleko-konstantinov
4,5,0,1,0,18,3,2,0,0,0,...,0.0,0.0,0.0,0.0,18.517500,12.798214,8.784127,12.651111,26.5,aleko-konstantinov
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,9,0,3,1,0,0,...,0.0,0.0,0.0,0.0,11.955081,14.000000,10.933333,12.230244,19.0,jordan-jovkov
596,4,0,0,0,12,1,7,3,0,0,...,0.0,0.0,0.0,0.0,44.467727,7.845284,6.627273,8.910227,17.4,jordan-jovkov
597,0,0,0,0,5,1,3,1,0,0,...,0.0,0.0,0.0,0.0,4.365000,14.000000,12.444444,12.678667,19.3,jordan-jovkov
598,0,0,0,0,5,1,2,1,0,3,...,0.0,0.0,0.0,0.0,41.242038,6.264717,6.000000,9.365283,15.0,jordan-jovkov


# Choose a model with PyCaret

## With PCA

In [None]:
# Configure the PyCaret classification experiment with kernel-PCA preprocessing.
s = setup(
    data=df_text_features,
    # Label column; columns were renamed to integers when the data was loaded.
    target=601,
    train_size=0.8,  # 80 % train / 20 % hold-out
    pca=True,
    pca_method='kernel',
    # Fixed seed so the split and scores are reproducible on re-run; 308 is the
    # session id reported in the recorded output of this cell.
    session_id=308,
    # remove_multicollinearity=True,
)

Unnamed: 0,Description,Value
0,Session id,308
1,Target,601
2,Target type,Multiclass
3,Target mapping,"aleko-konstantinov: 0, dimityr-dimov: 1, dimityr-talev: 2, elin-pelin: 3, ivan_vazov: 4, jordan-jovkov: 5"
4,Original data shape,"(600, 602)"
5,Transformed data shape,"(600, 480)"
6,Transformed train set shape,"(480, 480)"
7,Transformed test set shape,"(120, 480)"
8,Numeric features,601
9,Preprocess,True


In [None]:
# Drop the metrics that should not appear in the score grid.
for metric_id in ('AUC', 'Recall', 'Precision'):
    remove_metric(metric_id)

# Register log loss as a custom metric computed on predicted probabilities;
# lower values are better. Kept as the last expression so the metric's
# description is displayed as the cell output.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    greater_is_better=False,
    target='pred_proba',
)

Name                                                                                           LogLoss
Display Name                                                                                   LogLoss
Score Function                                                   <function log_loss at 0x7f779c71a5f0>
Scorer               make_scorer(log_loss, greater_is_better=False, needs_proba=True, error_score=0.0)
Target                                                                                      pred_proba
Args                                                                                                {}
Greater is Better                                                                                False
Multiclass                                                                                        True
Custom                                                                                            True
Name: logloss, dtype: object

In [None]:
# Compare the candidate classifiers and sort the score grid by the custom
# LogLoss metric.
# NOTE(review): in the recorded output `svm` and `ridge` report LogLoss 0.0,
# which equals the scorer's error_score=0.0. This is presumably a placeholder
# for "metric could not be computed" (no probability output) rather than a
# perfect score, so they should not be read as the best models by LogLoss.
# Confirm before relying on `best`.
best = compare_models(sort='LogLoss')

Unnamed: 0,Model,Accuracy,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.4708,0.4563,0.365,0.3737,0.0,0.087
ridge,Ridge Classifier,0.4208,0.4124,0.305,0.3073,0.0,0.069
lr,Logistic Regression,0.3521,0.3401,0.2225,0.225,1.616,1.154
et,Extra Trees Classifier,0.375,0.371,0.25,0.2524,1.6193,0.178
rf,Random Forest Classifier,0.3604,0.3568,0.2325,0.2349,1.6812,0.361
gbc,Gradient Boosting Classifier,0.2896,0.2811,0.1475,0.1496,1.7035,10.574
xgboost,Extreme Gradient Boosting,0.3875,0.3812,0.265,0.2679,1.7478,3.357
lda,Linear Discriminant Analysis,0.2438,0.2362,0.0925,0.0941,1.7861,0.113
ada,Ada Boost Classifier,0.2083,0.1862,0.05,0.0518,1.7952,0.457
lightgbm,Light Gradient Boosting Machine,0.4,0.4001,0.28,0.2828,2.0007,3.861


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Without PCA

In [None]:
# Configure the PyCaret classification experiment without PCA, as a baseline
# for the PCA variant.
s = setup(
    data=df_text_features,
    # The columns were renamed to integers right after loading, so there is no
    # column called 'author' any more; the label is column 601.
    target=601,
    train_size=0.8,  # 80 % train / 20 % hold-out
    # Fixed seed for a reproducible split (308 is the session id recorded in
    # the output of the "With PCA" run).
    session_id=308,
    # pca=True,
    # pca_method='linear',
    # remove_multicollinearity=True,
)

In [None]:
# Drop the metrics that should not appear in the score grid.
for metric_id in ('AUC', 'Recall', 'Precision'):
    remove_metric(metric_id)

# Register log loss as a custom metric computed on predicted probabilities;
# lower values are better.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    greater_is_better=False,
    target='pred_proba',
)

# Compare the candidate classifiers and sort the score grid by LogLoss.
best = compare_models(sort='LogLoss')

# Training the chosen model (Ridge classifier)

In [None]:
# Configure the experiment used to train the final model (linear PCA).
s = setup(
    data=df_text_features,
    # The columns were renamed to integers right after loading, so there is no
    # column called 'author' any more; the label is column 601.
    target=601,
    train_size=0.8,  # 80 % train / 20 % hold-out
    pca=True,
    pca_method='linear',
    # Fixed seed for a reproducible split (308 is the session id recorded in
    # the output of the "With PCA" run).
    session_id=308,
    # remove_multicollinearity=True,
)

# Same score-grid configuration as in the comparison runs: drop the metrics
# that are not needed and register log loss on predicted probabilities
# (lower is better).
remove_metric('AUC')
remove_metric('Recall')
remove_metric('Precision')
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target='pred_proba')

In [None]:
# Train a Ridge classifier within the experiment configured above.
# NOTE(review): return_train_score=True presumably adds training scores to the
# displayed score grid next to the validation scores; confirm in PyCaret docs.
ridge = create_model('ridge', return_train_score=True)
# Finalize via the experiment object returned by setup().
# NOTE(review): per PyCaret docs, finalize_model refits on the full dataset,
# hold-out included; confirm for the installed version.
final_ridge = s.finalize_model(ridge)
# Display the finalized model as the cell output.
final_ridge

In [None]:
# lr = create_model('lr')
# tuned_lr = tune_model(lr)

# Saving to files

In [None]:
# Persist the finalized ridge model next to the other prepared artifacts.
# NOTE(review): the name has no extension; PyCaret's save_model is documented
# to append '.pkl' itself. Confirm for the installed version.
filename = f'{DATA_PATH_PREP}/06_pycaret_ridge'
save_model(final_ridge, filename)