<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/data_analisys_herBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Poetry Classification and Embedding Analysis with HerBERT

# 1. Install Dependencies (for Colab)

In [None]:
!pip install -q transformers sacremoses umap-learn

# 2. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import umap
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import AutoModel, HerbertTokenizer, RobertaModel

# 3. Load and Clean Data

In [None]:
# Read the semicolon-separated corpus, discard the "Unnamed" columns,
# keep only rows that have a text, a label and an author, and renumber the rows.
df = (
    pd.read_csv("/mnt/data/polish_poetry.csv", sep=";")
    .pipe(lambda frame: frame.drop(columns=[name for name in frame.columns if "Unnamed" in name]))
    .dropna(subset=["Text", "Label", "Author-short"])
    .reset_index(drop=True)
)

# 4. Initialize HerBERT Model

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint.
herbert_tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-large-cased")

# AutoModel instantiates whichever architecture the checkpoint's own config declares,
# instead of forcing the RoBERTa implementation (and its position-id convention) onto it.
# NOTE(review): the published config for this checkpoint is believed to be BERT-type — confirm.
herbert_model = AutoModel.from_pretrained("allegro/herbert-large-cased")

# Inference only: make sure dropout is switched off.
herbert_model.eval()

# 5. Generate Embeddings

In [None]:
def generate_embeddings(texts, tokenizer, model):
    """Encode every text with ``model`` and collect the first-token ([CLS]) vector of each.

    Args:
        texts: Iterable of strings; each string is tokenized and encoded on its own.
        tokenizer: Callable that accepts a string plus the keyword arguments
            ``return_tensors``, ``truncation`` and ``max_length`` and returns a
            mapping of PyTorch tensors accepted by ``model(**inputs)``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``
            with the batch on axis 0 and the tokens on axis 1.

    Returns:
        ``np.ndarray`` with one row per input text, holding the hidden vector
        of the first token of that text.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    # Dropout must be off, otherwise the same text yields a different vector on every call.
    model.eval()
    # Run on whatever device the model's weights already live on.
    device = next(model.parameters()).device

    embeddings = []
    for text in tqdm(texts):
        # A single sequence needs no padding: padding each poem to 512 tokens only
        # adds masked positions the model still has to compute. Long texts are
        # truncated to the 512-token limit as before.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Token 0 of the only sequence in the batch; squeeze(0) removes just the batch axis.
        # .cpu() makes the conversion to NumPy work when the model runs on an accelerator.
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)

In [None]:
# Encode every poem, then keep each poem's vector next to its row in the frame.
X_embeddings = generate_embeddings(
    df["Text"],
    herbert_tokenizer,
    herbert_model,
)
df["Herbert_embedding"] = [vector for vector in X_embeddings]

# 6. Prepare X and y

In [None]:
# Stack the per-row embedding vectors back into a single feature matrix (one row per poem).
X = np.stack(df["Herbert_embedding"].tolist())
# Targets, in the same row order as X.
y = df["Label"].values

# 7. Normalize Data

In [None]:
def normalize_data(X: np.ndarray) -> np.ndarray:
    """Standardize every row of ``X`` to zero mean and unit standard deviation.

    The statistics are taken along axis 1, i.e. across the values of one row,
    not across rows. After this step the dot product of two rows divided by
    the number of columns equals the cosine of the angle between the two
    mean-centered rows.

    Args:
        X: 2-D array; rows are standardized independently of each other.

    Returns:
        A new array of the same shape. A row whose values are all identical
        (standard deviation 0) is returned as all zeros instead of NaN.
    """
    X = np.asarray(X)
    row_mean = X.mean(axis=1, keepdims=True)
    row_std = X.std(axis=1, keepdims=True)
    # Dividing by a zero standard deviation would produce NaN/inf; dividing the
    # (all-zero) centered row by 1 keeps it at zero instead.
    safe_std = row_std.copy()
    safe_std[safe_std == 0] = 1
    return (X - row_mean) / safe_std

# Row-standardized copy of the embedding matrix; X itself is not modified.
X_normalized: np.ndarray = normalize_data(X)

# 8. UMAP Projection

In [None]:
# UMAP is stochastic: a fixed random_state makes the 2-D layout identical on every
# Restart & Run All. (With a seed set, umap-learn runs single-threaded.)
reducer = umap.UMAP(
    n_neighbors=10,
    min_dist=0.1,
    n_components=2,
    metric='euclidean',
    random_state=42,
)
# Features are standardized column-wise before the projection.
X_umap = reducer.fit_transform(StandardScaler().fit_transform(X))

# One point per poem, coloured by author.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=X_umap[:, 0], y=X_umap[:, 1], hue=df['Author-short'], palette='Set2', ax=ax)
ax.set_title("UMAP Projection of Poem Embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

# 9. PCA 3D Projection

In [None]:
# Project the column-standardized embeddings onto their first three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X))

# Kept for older Matplotlib releases, where this import is what registers the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# One scatter series (and one legend entry) per class label.
for label in np.unique(y):
    mask = y == label  # boolean row selector; yields plain 1-D coordinate arrays
    # The legend needs a string, not a whole Series: name the author(s) found under this label.
    authors = df.loc[mask, 'Author-short'].unique()
    ax.scatter(
        X_pca[mask, 0],
        X_pca[mask, 1],
        X_pca[mask, 2],
        label=", ".join(map(str, authors)),
    )

ax.set_title("3D PCA Projection")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.legend()
plt.show()

# 10. Distance Matrices (Euclidean & Cosine)

In [None]:
# Gram matrix of the row-standardized embeddings; both measures below are derived from it.
# Computed in float64 because the distance formula subtracts nearly equal numbers.
X_norm64 = np.asarray(X_normalized, dtype=np.float64)
gram = X_norm64 @ X_norm64.T

# Pairwise Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b.
# This needs only (N, N) arrays, whereas broadcasting the differences would
# materialise an (N, N, D) temporary.
sq_norms = np.diag(gram)
sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * gram
np.maximum(sq_dist, 0.0, out=sq_dist)  # rounding can leave tiny negative values
np.fill_diagonal(sq_dist, 0.0)         # distance of a poem to itself is exactly 0
euc_dist = np.sqrt(sq_dist)

# Despite the variable name this is a similarity: dot product of row-standardized
# vectors divided by the number of features.
cos_dist = gram / X_norm64.shape[1]

In [None]:
# Two heatmaps side by side: Euclidean distances on the left, cosine similarities on the right.
fig, ax = plt.subplots(1, 2, figsize=(18, 8))
for panel, matrix, heading in zip(
    ax,
    (euc_dist, cos_dist),
    ("Euclidean Distance", "Cosine Similarity"),
):
    panel.imshow(matrix)
    panel.set_title(heading)
plt.show()

# 11. Save Results (Optional)

In [None]:
# %%
# Optional export, disabled by default: uncomment the lines below to write the results to disk.
# NOTE(review): to_csv writes the "Herbert_embedding" column as the text form of each array;
# NumPy abbreviates arrays longer than its print threshold (1000 elements by default) with "...",
# so long vectors would not survive a round-trip through CSV — confirm before relying on this file.
# df.to_csv("poetry_with_embeddings.csv", index=False)
# np.save("X_normalized.npy", X_normalized)
# np.save("euclidean_distance.npy", euc_dist)
# cos_dist holds similarities (larger = more alike), as the file name says.
# np.save("cosine_similarity.npy", cos_dist)