# Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from catboost import CatBoostClassifier

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

In [2]:
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [3]:
# data_table_soft = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_soft.pkl')
# data_table_med = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_med.pkl')
data_table_hard = pd.read_pickle(f'{DATA_PATH_PREP}/04_data_table_hard.pkl')

# Split data

In [5]:
num_pca_components = 15
X = data_table_hard.drop('author', axis=1)
y = data_table_hard['author']

le = LabelEncoder()
y = le.fit_transform(y)
print(dict(enumerate(le.classes_)))

print(f'Reducing {X.shape[1]} to {num_pca_components}')
pca = PCA(n_components=num_pca_components)
pr_components = pca.fit_transform(X)
X_PCA = pd.DataFrame(data=pr_components, columns=[f'comp_{idx}' for idx in range(num_pca_components)])
X_PCA

seed = 42
X_train, X_test_val_test, y_train, y_test_val_test = train_test_split(X_PCA, y, random_state=seed, train_size=0.8)
X_val, X_test, y_val, y_test = train_test_split(X_test_val_test, y_test_val_test, random_state=seed, train_size=0.5)

print(f'{X_train.shape} | {y_train.shape}')
print(f'{X_val.shape}  | {y_val.shape}')
print(f'{X_test.shape}  | {y_test.shape}')

{0: 'aleko-konstantinov', 1: 'dimityr-dimov', 2: 'dimityr-talev', 3: 'elin-pelin', 4: 'ivan_vazov', 5: 'jordan-jovkov'}
Reducing 240360 to 15
(144, 15) | (144,)
(18, 15)  | (18,)
(18, 15)  | (18,)


# Choose a model

## `LogisticRegression`

In [6]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_val)
y_pred_proba = log_reg.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
neg_log_loss = log_loss(y_val, y_pred_proba)
mcc = matthews_corrcoef(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

acc=1.0
neg_log_loss=0.25601648986378006
mcc=1.0
f1=1.0


# Going with `LogisticRegression`

It's simple and it works.

In [7]:
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

acc = accuracy_score(y_test, y_pred)
neg_log_loss = log_loss(y_test, y_pred_proba)
mcc = matthews_corrcoef(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

acc=1.0
neg_log_loss=0.33646550318857105
mcc=1.0
f1=1.0


In [8]:
pickle.dump(log_reg, open(f'{DATA_PATH_PREP}/06_model_log_reg.pkl', 'wb'))