# Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)
import matplotlib.pyplot as plt

import pickle

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier


from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

In [2]:
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [3]:
data_table_soft = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_soft.pkl')
data_table_med = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_med.pkl')
data_table_hard = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_hard.pkl')

# Split data

In [4]:
num_pca_components = 16
X = data_table_hard.drop('author', axis=1)
y = data_table_hard['author']
# X = data_table_soft.drop('author', axis=1)
# y = data_table_soft['author']

le = LabelEncoder()
y = le.fit_transform(y)
print(dict(enumerate(le.classes_)))

print(f'Reducing {X.shape[1]} to {num_pca_components}')
pca = PCA(n_components=num_pca_components)
pr_components = pca.fit_transform(X)
X_PCA = pd.DataFrame(data=pr_components, columns=[f'comp_{idx}' for idx in range(num_pca_components)])
X_PCA

seed = 42
# X_train, X_test_val_test, y_train, y_test_val_test = train_test_split(X_PCA, y, random_state=seed, train_size=0.8)
X_train, X_test_val_test, y_train, y_test_val_test = train_test_split(X, y, random_state=seed, train_size=0.8)
X_val, X_test, y_val, y_test = train_test_split(X_test_val_test, y_test_val_test, random_state=seed, train_size=0.5)

print(f'{X_train.shape} | {y_train.shape}')
print(f'{X_val.shape}  | {y_val.shape}')
print(f'{X_test.shape}  | {y_test.shape}')

{0: 'aleko-konstantinov', 1: 'dimityr-dimov', 2: 'dimityr-talev', 3: 'elin-pelin', 4: 'ivan_vazov', 5: 'jordan-jovkov'}
Reducing 27585 to 16
(480, 27585) | (480,)
(60, 27585)  | (60,)
(60, 27585)  | (60,)


# Set up pipeline

In [None]:

# Define the pipeline for feature extraction, dimensionality reduction, and model training
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('pca', PCA(n_components=16)),
])

# Define the parameter grids for SVC and LogisticRegression
svc_param_grid = {'clf': [SVC(kernel='linear', decision_function_shape='ovo')],
                  'clf__C': [0.1, 1, 10]}

lr_param_grid = {'clf': [LogisticRegression(multi_class='auto', solver='lbfgs')],
                 'clf__C': [0.1, 1, 10]}

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(pipe, svc_param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy
print("Best hyperparameters for SVC:", grid_search.best_params_)
print("Accuracy for SVC:", grid_search.score(X_test, y_test))

grid_search = GridSearchCV(pipe, lr_param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy
print("Best hyperparameters for LogisticRegression:", grid_search.best_params_)
print("Accuracy for LogisticRegression:", grid_search.score(X_test, y_test))


# Choose a model

In [5]:
cat_boost = CatBoostClassifier()
cat_boost.fit(X_train, y_train)

y_pred = cat_boost.predict(X_val)
y_pred_proba = cat_boost.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
neg_log_loss = log_loss(y_val, y_pred_proba)
mcc = matthews_corrcoef(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

Learning rate set to 0.07622
0:	learn: 1.7345841	total: 592ms	remaining: 9m 51s
1:	learn: 1.6978842	total: 1.08s	remaining: 8m 58s
2:	learn: 1.6595869	total: 1.53s	remaining: 8m 27s
3:	learn: 1.6263747	total: 2.01s	remaining: 8m 19s
4:	learn: 1.6116548	total: 2.37s	remaining: 7m 52s
5:	learn: 1.5815491	total: 2.77s	remaining: 7m 38s
6:	learn: 1.5498200	total: 3.13s	remaining: 7m 24s
7:	learn: 1.5188659	total: 3.5s	remaining: 7m 14s
8:	learn: 1.4961328	total: 3.87s	remaining: 7m 6s
9:	learn: 1.4729889	total: 4.24s	remaining: 7m
10:	learn: 1.4497711	total: 4.64s	remaining: 6m 57s
11:	learn: 1.4327207	total: 5s	remaining: 6m 52s
12:	learn: 1.4036555	total: 5.37s	remaining: 6m 47s
13:	learn: 1.3927134	total: 5.72s	remaining: 6m 43s
14:	learn: 1.3744063	total: 6.1s	remaining: 6m 40s
15:	learn: 1.3632610	total: 6.45s	remaining: 6m 36s
16:	learn: 1.3506237	total: 6.8s	remaining: 6m 33s
17:	learn: 1.3421988	total: 7.16s	remaining: 6m 30s
18:	learn: 1.3246870	total: 7.51s	remaining: 6m 27s
19:	

## `LogisticRegression`

In [6]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_val)
y_pred_proba = log_reg.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
neg_log_loss = log_loss(y_val, y_pred_proba)
mcc = matthews_corrcoef(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

acc=0.9833333333333333
neg_log_loss=0.959079661676191
mcc=0.9801284895735022
f1=0.9858906525573192


# Going with `LogisticRegression`

It's simple and it works.

In [7]:
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

acc = accuracy_score(y_test, y_pred)
neg_log_loss = log_loss(y_test, y_pred_proba)
mcc = matthews_corrcoef(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

acc=0.9833333333333333
neg_log_loss=0.9579831604337368
mcc=0.9802084802698515
f1=0.980952380952381


In [8]:
pickle.dump(log_reg, open(f'{DATA_PATH_PREP}/06_model_log_reg.pkl', 'wb'))