<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/cosine_distance_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Polish Poetry Author Classification – Cosine and Euclidean Distance-Based Model

# 1. Install necessary packages

In [None]:
!pip install transformers
!pip install sacremoses

# 2. Imports

In [None]:
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import torch
from transformers import HerbertTokenizer, RobertaModel

# 3. Load HerBERT model and tokenizer

In [None]:
# Model bundle laid out as [display name, tokenizer, encoder]; make_embedding
# unpacks the list positionally, so the order of the three entries matters.
herbert_klej = ["Herbert-klej"]
herbert_klej.append(
    HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
)
herbert_klej.append(
    RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
)

# 4. Load dataset

In [None]:

# Adjust path as needed
df_raw = pd.read_csv('/content/wiersze_do_BERT_Herbert_Miłosz.csv', sep=';')
# Drop the three unnamed columns (presumably empty artefacts of trailing
# separators in the CSV -- NOTE(review): confirm against the file).
df_raw = df_raw.drop(columns=['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'])

# Keep only first 400 examples (200 male, 200 female)
df_raw = df_raw.iloc[:400]

# Fixed seed so that the shuffles below give the same row order on every
# "Restart & Run All"; change the value to draw a different permutation.
SHUFFLE_SEED = 42

# Split into male and female authors.
# Assumes rows 0-199 are male-authored and rows 200-399 female-authored, as the
# comment above states -- NOTE(review): verify the row order in the CSV.
df_women = (
    df_raw.iloc[200:]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_men = (
    df_raw.iloc[:200]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Prepare full dataframe: only the columns used downstream, rows shuffled.
df_orginal = (
    df_raw[["Text", "Label", "Author-short"]]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)


# 5. Define functions

In [None]:

# Embedding generation using CLS token
def make_embedding(df, model):
    """Embed every text in ``df`` with the given tokenizer/encoder pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Text'`` column (input to the tokenizer) and a
        ``'Label'`` column (copied through unchanged). Rows are read by
        position, so any index is accepted.
    model : sequence of length 3
        ``(name, tokenizer, encoder)``; the name is ignored here.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed ``0..len(df)-1``, with columns
        ``'embedding'`` (1-D numpy array) and ``'label'``.
    """
    _, tokenizer, encoder = model
    texts = df['Text']
    labels = df['Label']
    embedded = {}
    # Inference only: disabling autograd avoids building a computation graph
    # for every forward pass, which saves memory and time.
    with torch.no_grad():
        for idx in tqdm(range(len(df))):
            # Positional access so a filtered/shuffled frame whose index is not
            # 0..n-1 is still processed correctly.
            text = texts.iloc[idx]
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
            outputs = encoder(**inputs)
            # outputs[0] is indexed as (batch, token, feature); take the first
            # token position (the CLS-style token) of the single batch item.
            cls_embedding = outputs[0][:, 0, :].detach().numpy()[0]
            embedded[idx] = (cls_embedding, labels.iloc[idx])
    return pd.DataFrame.from_dict(embedded, orient='index', columns=['embedding', 'label'])


In [None]:

# Normalize embeddings per sample
def normalize_data(X):
    """Standardise each row of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Statistics are computed along axis 1, i.e. separately for every row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X``. A row whose values are all equal
        (standard deviation 0) is returned as all zeros instead of NaN.
    """
    X = np.asarray(X)
    centered = X - np.mean(X, axis=1, keepdims=True)
    std = np.std(X, axis=1, keepdims=True)
    # Guard against 0/0 for constant rows: their centered values are already
    # zero, so dividing by 1 leaves a well-defined all-zero row.
    std[std == 0] = 1
    return centered / std


In [None]:

# Split data into train and validation sets
def get_X_y_train(df, normalization=True, test_size=0.2, random_state=None):
    """Build feature/label arrays from an embedding frame and split them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'embedding'`` column of equal-length 1-D arrays and
        a ``'label'`` column.
    normalization : bool, default True
        If True, pass the stacked embeddings through ``normalize_data``
        before splitting.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; size of the held-out part.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a repeatable split.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_val, y_train, y_val)`` where ``y`` holds integer
        codes assigned by ``factorize`` in order of first appearance.
    """
    X = np.stack(df['embedding'].to_list())
    # Integer codes; the matching class names are df['label'].factorize()[1].
    y = df['label'].factorize()[0]
    if normalization:
        X = normalize_data(X)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X, y, X_train, X_val, y_train, y_val


In [None]:

# Predict using Euclidean distance from training samples
def predict_by_euclidean(df, n_realizations=10, normalize=False):
    """Score a nearest-neighbour (Euclidean) classifier over repeated splits.

    For each realization the data is split with ``get_X_y_train``; every
    held-out sample receives the label of its closest training sample. The
    resulting confusion matrix is row-normalised and the matrices are averaged
    across realizations, then plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'embedding'`` and ``'label'`` columns.
    n_realizations : int, default 10
        Number of independent random splits to average over.
    normalize : bool, default False
        Forwarded to ``get_X_y_train`` as ``normalization``.

    Returns
    -------
    (numpy.ndarray, float)
        The averaged row-normalised confusion matrix and the mean of its
        diagonal rounded to two decimals. Because rows are normalised, this
        figure is the mean per-class recall rather than the plain fraction of
        correct predictions.

    Raises
    ------
    ValueError
        If ``n_realizations`` is smaller than 1.
    """
    if n_realizations < 1:
        raise ValueError("n_realizations must be at least 1")

    classes = df['label'].factorize()[1].to_list()
    n_classes = len(classes)
    cm_total = np.zeros((n_classes, n_classes))
    # Number of realizations in which each true class appeared in the held-out
    # split; used as the per-row divisor when averaging.
    row_seen = np.zeros(n_classes)

    for _ in range(n_realizations):
        _, _, X_train, X_test, y_train, y_test = get_X_y_train(df, normalization=normalize)
        cm = np.zeros_like(cm_total)
        for i, x_test in enumerate(X_test):
            distances = np.linalg.norm(X_train - x_test, axis=1)
            y_pred = y_train[np.argmin(distances)]
            cm[y_test[i], y_pred] += 1
        # The split is not stratified, so a class can be missing from the
        # held-out part. Normalise only rows with samples; dividing an empty
        # row by its zero total would turn the whole average into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        present = row_totals[:, 0] > 0
        cm[present] /= row_totals[present]
        cm_total += cm
        row_seen += present

    # Average each row over the realizations that actually evaluated it. When
    # every class shows up in every split this equals dividing by
    # n_realizations.
    CM_aver = np.zeros_like(cm_total)
    evaluated = row_seen > 0
    CM_aver[evaluated] = cm_total[evaluated] / row_seen[evaluated, None]
    acc = round(np.mean(np.diag(CM_aver)[evaluated]), 2)

    disp = ConfusionMatrixDisplay(confusion_matrix=CM_aver)
    disp.plot()
    # Address the display's own axes/figure instead of pyplot's global state.
    ax = disp.ax_
    ax.set_title(f"Euclidean Distance | Acc: {acc}")
    tick_positions = np.arange(n_classes)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(classes, rotation=45)
    disp.figure_.set_size_inches(10, 10)
    return CM_aver, acc


# 6. Run model

In [None]:
# Example run for the original (full, shuffled) dataset: embed the texts, cast
# the labels to strings so they are of one type, then evaluate the
# nearest-neighbour classifier on the un-normalized embeddings.
df_embed = (
    make_embedding(df_orginal, herbert_klej)
    .assign(label=lambda frame: frame['label'].astype(str))
    .reset_index(drop=True)
)

cm, acc = predict_by_euclidean(df_embed, normalize=False)
print("Average accuracy:", acc)


In [None]:
# Example run for the women subset: embed the texts, cast the labels to
# strings so they are of one type, then evaluate the nearest-neighbour
# classifier on the un-normalized embeddings.
df_embed_women = (
    make_embedding(df_women, herbert_klej)
    .assign(label=lambda frame: frame['label'].astype(str))
    .reset_index(drop=True)
)

cm_women, acc_women = predict_by_euclidean(df_embed_women, normalize=False)
print("Average accuracy (women):", acc_women)


In [None]:

# Example run for the men subset: embed the texts, cast the labels to strings
# so they are of one type, then evaluate the nearest-neighbour classifier on
# the un-normalized embeddings.
df_embed_men = (
    make_embedding(df_men, herbert_klej)
    .assign(label=lambda frame: frame['label'].astype(str))
    .reset_index(drop=True)
)

cm_men, acc_men = predict_by_euclidean(df_embed_men, normalize=False)
print("Average accuracy (men):", acc_men)
