# Imports

In [1]:
import numpy as np
import pandas as pd

import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from preprocessing import tokenize

from sklearn.metrics import accuracy_score, log_loss, f1_score, matthews_corrcoef

In [2]:
def identity(x):
  return x

DATA_PATH_PREP = '../DATA/prepared'

In [3]:
df_samples = pd.read_pickle(f'{DATA_PATH_PREP}/03_df_samples.pkl')
vectorizer = pickle.load(open(f'{DATA_PATH_PREP}/04_vectorizer_hard.pkl', 'rb'))

In [4]:
df_samples

Unnamed: 0,text,author
0,"сията й — злато с шепа, моля ти се. Късмет ли ...",aleko-konstantinov
1,сега не си раз,aleko-konstantinov
2,ме какво да правя.\n\t— Как какво да правиш! П...,aleko-konstantinov
3,"а, че не съчувствувай на македонците…\n\t(Я ту...",aleko-konstantinov
4,"шва)… значи, и да се разгатне енигмата на зеле...",aleko-konstantinov
...,...,...
595,"я занятието си с друго, не тъй почтено и краси...",jordan-jovkov
596,"с една галантност, в която се съглежда не уме...",jordan-jovkov
597,то градските учители и учителки напрягаха всич...,jordan-jovkov
598,"азник на панаира, за разходките по шосето, за ...",jordan-jovkov


In [5]:
def avg_len(group):
    return sum(len(x) for x in group) / len(group)

grouped = df_samples.groupby("author").agg({'text': avg_len})

print(grouped)

                      text
author                    
aleko-konstantinov  520.42
dimityr-dimov       525.64
dimityr-talev       504.14
elin-pelin          524.27
ivan_vazov          530.01
jordan-jovkov       525.37


# Split data

In [6]:
df_samples_tmp = df_samples

X = df_samples_tmp.drop('author', axis=1)
y = df_samples_tmp['author']

le = LabelEncoder()
y = le.fit_transform(y)
print(dict(enumerate(le.classes_)))

seed = 42
X_train, X_test_val_test, y_train, y_test_val_test = train_test_split(X, y, random_state=seed, train_size=0.8)
X_val, X_test, y_val, y_test = train_test_split(X_test_val_test, y_test_val_test, random_state=seed, train_size=0.5)
print(f'{X_train.shape} | {y_train.shape}')
print(f'{X_val.shape}  | {y_val.shape}')
print(f'{X_test.shape}  | {y_test.shape}')

{0: 'aleko-konstantinov', 1: 'dimityr-dimov', 2: 'dimityr-talev', 3: 'elin-pelin', 4: 'ivan_vazov', 5: 'jordan-jovkov'}
(480, 1) | (480,)
(60, 1)  | (60,)
(60, 1)  | (60,)


# Pipeline

In [7]:
class Tokenizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_cp = X.copy()
        X_cp['text'] = X_cp['text'].map(lambda text: tokenize(text, 'hard'))
        return X_cp

    
class Vectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return vectorizer.transform(X['text'])
 

pipe = Pipeline(steps=[
    ('tokenizer', Tokenizer()),
    ('vectorizer', Vectorizer()),
    ('svd', TruncatedSVD()),
    ('svc', SVC(probability=True)),
])

pipe.fit(X_train, y_train)

In [9]:
y_pred = pipe.predict(X_val)
y_pred_proba = pipe.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
neg_log_loss = log_loss(y_val, y_pred_proba)
mcc = matthews_corrcoef(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')

print(f'{acc=}' )
print(f'{neg_log_loss=}' )
print(f'{mcc=}' )
print(f'{f1=}' )

acc=0.4
neg_log_loss=1.3363672782320353
mcc=0.2854502346252153
f1=0.36419130259307836


# Saving to files

In [None]:
pickle.dump(pipe, open(f'{DATA_PATH_PREP}/06_pipe_hard.pkl', 'wb'))