# Imports

In [1]:
import numpy as np
import pandas as pd

pd.set_option('max_colwidth', 150)

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from catboost import CatBoostClassifier

In [2]:
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [3]:
data_table_soft = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_soft.pkl')
data_table_med = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_med.pkl')
data_table_hard = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_hard.pkl')

# Split data

In [9]:
X = data_table_hard.drop('author', axis=1)
y = data_table_hard['author']

# Run a model

In [10]:
strat_k_fold = StratifiedKFold(n_splits=10, shuffle=True)

lr = LogisticRegression()
print(f'LogisticRegression: {cross_val_score(lr, X, y, cv=strat_k_fold).mean()}')

knn = KNeighborsClassifier(n_neighbors = 5)
print(f'KNeighborsClassifier: {cross_val_score(knn, X, y, cv=strat_k_fold).mean()}')

gnb = GaussianNB()
print(f'GaussianNB: {cross_val_score(gnb, X, y, cv=strat_k_fold).mean()}')

mnb = MultinomialNB()
print(f'MultinomialNB: {cross_val_score(mnb, X, y, cv=strat_k_fold).mean()}')



LogisticRegression: 0.65




KNeighborsClassifier: 0.7583333333333333




GaussianNB: 0.6666666666666666




MultinomialNB: 0.6333333333333334


In [11]:
from sklearn.model_selection import cross_validate

clf = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=4, loss_function='MultiClass')

# Perform cross-validation
scores = cross_validate(clf, X, y, cv=5, scoring='neg_log_loss', return_train_score=True)

# Print the mean and standard deviation of the scores
# print("Mean score: ", np.mean(scores))
# print("Standard deviation: ", np.std(scores))
print(scores)



0:	learn: 1.5736996	total: 155ms	remaining: 2m 34s
1:	learn: 1.5528133	total: 202ms	remaining: 1m 40s
2:	learn: 1.4924041	total: 247ms	remaining: 1m 22s
3:	learn: 1.4427308	total: 292ms	remaining: 1m 12s
4:	learn: 1.4226778	total: 338ms	remaining: 1m 7s
5:	learn: 1.3902130	total: 386ms	remaining: 1m 3s
6:	learn: 1.3699635	total: 436ms	remaining: 1m 1s
7:	learn: 1.3346362	total: 483ms	remaining: 59.9s
8:	learn: 1.2850748	total: 531ms	remaining: 58.4s
9:	learn: 1.2568599	total: 576ms	remaining: 57s
10:	learn: 1.2257796	total: 628ms	remaining: 56.5s
11:	learn: 1.2029843	total: 680ms	remaining: 56s
12:	learn: 1.1734442	total: 728ms	remaining: 55.3s
13:	learn: 1.1593417	total: 772ms	remaining: 54.4s
14:	learn: 1.1330286	total: 817ms	remaining: 53.7s
15:	learn: 1.1148258	total: 867ms	remaining: 53.3s
16:	learn: 1.0889920	total: 921ms	remaining: 53.3s
17:	learn: 1.0695965	total: 967ms	remaining: 52.7s
18:	learn: 1.0435225	total: 1.01s	remaining: 52.4s
19:	learn: 1.0266001	total: 1.06s	remaini

Traceback (most recent call last):
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 312, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 2447, in log_loss
    raise ValueError(
ValueError: y_true and y_pred contain different number of classes 4, 6. Please provide the true labels explicitly through the labels argument. Classes found in y_true: ['aleko-konstantinov' 'elin-pelin' 'ivan_vazov' 'jordan-jovkov']



0:	learn: 1.7385122	total: 79.2ms	remaining: 1m 19s
1:	learn: 1.7034994	total: 130ms	remaining: 1m 5s
2:	learn: 1.6740480	total: 180ms	remaining: 60s
3:	learn: 1.6319008	total: 232ms	remaining: 57.7s
4:	learn: 1.6041553	total: 282ms	remaining: 56.1s
5:	learn: 1.5548215	total: 339ms	remaining: 56.2s
6:	learn: 1.5167890	total: 401ms	remaining: 56.8s
7:	learn: 1.4814380	total: 452ms	remaining: 56.1s
8:	learn: 1.4305231	total: 505ms	remaining: 55.6s
9:	learn: 1.4074661	total: 569ms	remaining: 56.3s
10:	learn: 1.3626630	total: 635ms	remaining: 57.1s
11:	learn: 1.3452017	total: 691ms	remaining: 56.9s
12:	learn: 1.3252169	total: 745ms	remaining: 56.6s
13:	learn: 1.3026774	total: 803ms	remaining: 56.5s
14:	learn: 1.2761623	total: 854ms	remaining: 56.1s
15:	learn: 1.2568152	total: 904ms	remaining: 55.6s
16:	learn: 1.2325542	total: 956ms	remaining: 55.3s
17:	learn: 1.2066211	total: 1.02s	remaining: 55.5s
18:	learn: 1.1934215	total: 1.07s	remaining: 55.2s
19:	learn: 1.1673305	total: 1.12s	remaini

Traceback (most recent call last):
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 312, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 2447, in log_loss
    raise ValueError(
ValueError: y_true and y_pred contain different number of classes 4, 6. Please provide the true labels explicitly through the labels argument. Classes found in y_true: ['aleko-konstantinov' 'elin-pelin' 'ivan_vazov' 'jordan-jovkov']



0:	learn: 1.7586091	total: 152ms	remaining: 2m 31s
1:	learn: 1.7224158	total: 246ms	remaining: 2m 2s
2:	learn: 1.6882650	total: 338ms	remaining: 1m 52s
3:	learn: 1.6528447	total: 456ms	remaining: 1m 53s
4:	learn: 1.6102757	total: 553ms	remaining: 1m 49s
5:	learn: 1.5695444	total: 646ms	remaining: 1m 47s
6:	learn: 1.5415018	total: 749ms	remaining: 1m 46s
7:	learn: 1.5184152	total: 823ms	remaining: 1m 42s
8:	learn: 1.4901693	total: 898ms	remaining: 1m 38s
9:	learn: 1.4631380	total: 946ms	remaining: 1m 33s
10:	learn: 1.4431560	total: 1.03s	remaining: 1m 32s
11:	learn: 1.4158486	total: 1.1s	remaining: 1m 30s
12:	learn: 1.3823352	total: 1.16s	remaining: 1m 27s
13:	learn: 1.3509665	total: 1.21s	remaining: 1m 25s
14:	learn: 1.3257511	total: 1.27s	remaining: 1m 23s
15:	learn: 1.3140798	total: 1.33s	remaining: 1m 21s
16:	learn: 1.2763668	total: 1.38s	remaining: 1m 19s
17:	learn: 1.2463140	total: 1.43s	remaining: 1m 18s
18:	learn: 1.2191770	total: 1.49s	remaining: 1m 16s
19:	learn: 1.1930750	tot

Traceback (most recent call last):
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 312, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/home/work/Dev/ati/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 2447, in log_loss
    raise ValueError(
ValueError: y_true and y_pred contain different number of classes 4, 6. Please provide the true labels explicitly through the labels argument. Classes found in y_true: ['aleko-konstantinov' 'elin-pelin' 'ivan_vazov' 'jordan-jovkov']



0:	learn: 1.5643576	total: 75.2ms	remaining: 1m 15s
1:	learn: 1.5313222	total: 127ms	remaining: 1m 3s
2:	learn: 1.5079333	total: 175ms	remaining: 58.2s
3:	learn: 1.4828163	total: 235ms	remaining: 58.6s
4:	learn: 1.4621715	total: 305ms	remaining: 1m
5:	learn: 1.4338185	total: 391ms	remaining: 1m 4s
6:	learn: 1.4092642	total: 461ms	remaining: 1m 5s
7:	learn: 1.3727666	total: 534ms	remaining: 1m 6s
8:	learn: 1.3567892	total: 630ms	remaining: 1m 9s
9:	learn: 1.3155532	total: 723ms	remaining: 1m 11s
10:	learn: 1.2911679	total: 818ms	remaining: 1m 13s
11:	learn: 1.2709730	total: 897ms	remaining: 1m 13s
12:	learn: 1.2434317	total: 958ms	remaining: 1m 12s
13:	learn: 1.2290081	total: 1.03s	remaining: 1m 12s
14:	learn: 1.2131667	total: 1.11s	remaining: 1m 12s
15:	learn: 1.1834648	total: 1.17s	remaining: 1m 11s
16:	learn: 1.1685102	total: 1.21s	remaining: 1m 9s
17:	learn: 1.1435767	total: 1.26s	remaining: 1m 8s
18:	learn: 1.1130815	total: 1.31s	remaining: 1m 7s
19:	learn: 1.0937192	total: 1.35s	r