<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/ML_and_NN_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install dependencies (for Google Colab)

In [None]:
!pip install transformers sacremoses --quiet
!pip install xgboost lightgbm --quiet

# 2. Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.tree import DecisionTreeClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from transformers import HerbertTokenizer, RobertaModel

# 3. Load and prepare data


In [None]:
# Load the semicolon-separated poetry corpus.
df_raw = pd.read_csv('/content/polish_poetry.csv', sep=';')

# Drop the superfluous columns: every column whose header contains 'Unnamed'.
drop_cols = list(filter(lambda name: 'Unnamed' in name, df_raw.columns))
df_raw = df_raw.drop(columns=drop_cols)

# Report the number of rows and the number of distinct labels.
print("Liczba wierszy:", len(df_raw))
print("Klasy:", df_raw['Label'].nunique())

# 4. Tokenization and embedding function

In [None]:
def make_embedding(df, model_info):
    """Embed every text of ``df`` with a transformer model and keep its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'Text'`` column (passed to the tokenizer one value at a
        time) and a ``'Label'`` column.
    model_info : tuple
        ``(model_name, tokenizer, model)``. ``tokenizer`` is called on each text and
        its result is unpacked as keyword arguments into ``model``; the model output
        must expose ``last_hidden_state``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with two columns: ``f"{model_name}_embedding"``
        (the hidden state at sequence position 0, as a NumPy vector) and ``"label"``.
    """
    model_name, tokenizer, model = model_info
    embeddings = []
    labels = []

    # Pure inference: disabling autograd avoids building (and holding on to) a
    # computation graph for every single forward pass.
    with torch.no_grad():
        for text, label in tqdm(zip(df['Text'], df['Label']), total=len(df)):
            inputs = tokenizer(
                text,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )
            outputs = model(**inputs)
            # Position 0 of the sequence for the single item in the batch.
            vector = outputs.last_hidden_state[:, 0, :].detach().numpy()[0]
            embeddings.append(vector)
            labels.append(label)

    return pd.DataFrame({
        f"{model_name}_embedding": embeddings,
        "label": labels
    })

# 5. HerBERT initialization

In [None]:
# NOTE(review): `herbert` is not defined in any cell shown before this one.
# make_embedding unpacks it as (model_name, tokenizer, model), and the later
# get_splits(..., 'Herbert_embedding') call implies model_name == "Herbert".
# Make sure the initialisation cell exists and has been run before this one.
df_embedded = make_embedding(df_raw, herbert)

# Shuffle the rows; the fixed seed keeps the order identical from run to run.
df_embedded = df_embedded.sample(frac=1, random_state=42).reset_index(drop=True)

# 7. Prepare X and y sets

In [None]:
def get_splits(df, embed_col, test_size=0.2, val_size=0.2, random_state=None):
    """Split an embedded dataset into stratified train / validation / test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'label'`` column and the column named by ``embed_col``, whose
        cells are equally shaped vectors (they are stacked into one array).
    embed_col : str
        Name of the column holding the embedding vectors.
    test_size : float, default 0.2
        Forwarded to the first ``train_test_split``: the share of all rows held
        out as the test set.
    val_size : float, default 0.2
        Forwarded to the second ``train_test_split``: the share of the rows that
        remain after removing the test set (not of the full dataset).
    random_state : int or None, default None
        Seed forwarded to both splits. ``None`` keeps the previous behaviour of a
        different split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X = np.stack(df[embed_col].to_list())
    y = df['label']

    # With fewer than 8 classes, re-encode the labels as integer codes 0..n-1.
    # sort=True ties each code to the sorted label order instead of the order of
    # first appearance, so the mapping does not change when the rows are shuffled.
    # NOTE(review): with 8 or more classes the labels are passed through untouched,
    # presumably because they are already integer codes; confirm against the data.
    if len(np.unique(y)) < 8:
        y = pd.factorize(y, sort=True)[0]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Unpack the stratified train / validation / test splits of the HerBERT embeddings.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = get_splits(df=df_embedded, embed_col='Herbert_embedding')


# 8. Machine learning model (Decision Tree)

In [None]:
def run_ml_model(X_train, y_train, X_test, y_test, model, model_name):
    """Fit ``model``, score it on the test data and plot its confusion matrix.

    The estimator is fitted in place on ``(X_train, y_train)`` and used to predict
    ``X_test``. A confusion matrix normalised over the true labels is drawn, with
    ``model_name`` and the accuracy (two decimals) in the title.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` as computed on the test data.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    normalized_cm = confusion_matrix(y_test, predicted, normalize='true')

    ConfusionMatrixDisplay(confusion_matrix=normalized_cm).plot(xticks_rotation=45)
    plt.title(f"{model_name} | Accuracy: {accuracy:.2f}")
    plt.show()

    return accuracy, normalized_cm

# Fit and evaluate a depth-limited decision tree on the embeddings.
# random_state pins the estimator's internal randomness so the reported
# accuracy and confusion matrix can be reproduced on a re-run.
tree_acc, tree_cm = run_ml_model(
    X_train, y_train, X_test, y_test,
    DecisionTreeClassifier(max_depth=20, random_state=42),
    "Decision Tree"
)

# 9. Neural network model

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling start from the same state on every run.
tf.keras.utils.set_random_seed(42)

input_size = X_train.shape[1]
num_classes = len(np.unique(y_train))

# One-hot encode the integer class labels for the categorical cross-entropy loss.
y_train_cat = to_categorical(y_train, num_classes)
y_val_cat = to_categorical(y_val, num_classes)

# Fully connected classifier: three ReLU layers of growing width, dropout, then a
# softmax over the classes. The input shape is declared with an explicit Input
# object rather than the deprecated `input_shape=` argument on the first layer.
model_NN = Sequential([
    tf.keras.Input(shape=(input_size,)),
    Dense(input_size, activation='relu'),
    Dense(2 * input_size, activation='relu'),
    Dense(4 * input_size, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model_NN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The validation split is only monitored during training; it is not used for fitting.
history = model_NN.fit(
    X_train, y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=50,
    batch_size=64,
    verbose=1
)

# 10. Learning curves (accuracy and loss per epoch)

In [None]:
def plot_learning_curve(history):
    """Plot training and validation curves side by side.

    ``history`` must expose a ``history`` mapping with the keys ``'accuracy'``,
    ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``. The left panel shows the
    accuracy curves and the right panel the loss curves, each with a 'train'
    and a 'val' line.
    """
    _fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    panels = (('accuracy', "Accuracy"), ('loss', "Loss"))
    for ax, (metric, panel_title) in zip(axs, panels):
        ax.plot(history.history[metric], label='train')
        ax.plot(history.history[f'val_{metric}'], label='val')
        ax.set_title(panel_title)
        ax.legend()

    plt.show()

# Plot the accuracy and loss curves recorded while fitting model_NN.
plot_learning_curve(history=history)

# 11. Neural network confusion matrix

In [None]:
# Predicted class = index of the largest softmax output for each test sample.
y_pred_nn = model_NN.predict(X_test).argmax(axis=1)

# Score the predictions; the confusion matrix is normalised over the true labels.
nn_acc = accuracy_score(y_test, y_pred_nn)
nn_cm = confusion_matrix(y_test, y_pred_nn, normalize='true')

# Draw the matrix with the accuracy (two decimals) in the title.
disp = ConfusionMatrixDisplay(confusion_matrix=nn_cm)
disp.plot(xticks_rotation=45)
plt.title(f"Neural Network | Accuracy: {nn_acc:.2f}")
plt.show()