<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/data_analisis_herBERT_men_women.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gender-Based Poetry Embedding Analysis with HerBERT

# 1. Install Dependencies (only needed in Colab)

In [None]:
!pip install -q transformers sacremoses umap-learn

# 2. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import AutoModel, HerbertTokenizer, RobertaModel

# 3. Load and Prepare Data

In [None]:
# Read the semicolon-separated corpus, then keep only the columns whose
# name does not contain "Unnamed".
df = pd.read_csv("/mnt/data/polish_poetry.csv", sep=";")
df = df.loc[:, ~df.columns.str.contains("Unnamed", regex=False)]

In [None]:
# Label by row position: rows 0-199 get Gender "M", rows 200-399 get Gender "K".
# NOTE(review): this relies on the CSV row order (men first, then women) — confirm
# against the dataset.
df_men = df.iloc[:200].assign(Gender="M")
df_women = df.iloc[200:400].assign(Gender="K")

# Stack both groups into one frame with a fresh 0..n-1 index.
df_all = pd.concat([df_men, df_women], ignore_index=True)

# 4. Initialize HerBERT Model

In [None]:
# Load the HerBERT tokenizer and encoder from the Hugging Face Hub.
# AutoModel instantiates whichever architecture the checkpoint's own config
# declares, instead of forcing the RoBERTa class onto it.
# NOTE(review): the checkpoint is believed to declare a BERT-type model, in which
# case RobertaModel would apply RoBERTa-specific position handling — confirm
# against the model card / config.json.
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-large-cased")
# .eval() returns the model itself; it makes inference mode (dropout off) explicit.
model = AutoModel.from_pretrained("allegro/herbert-large-cased").eval()

# 5. Generate Embeddings

In [None]:
def generate_embeddings(texts, tokenizer, model, max_length=512):
    """Encode each text separately and collect the embedding at sequence position 0.

    Args:
        texts: Iterable of texts; each one is tokenized and passed through the
            model on its own (batch size 1).
        tokenizer: Callable following the Hugging Face tokenizer interface. It is
            called with ``return_tensors="pt"`` and must return a mapping of
            tensors that ``model(**inputs)`` accepts.
        model: Callable whose result exposes ``last_hidden_state`` with a leading
            batch axis of size 1. If it has a ``device`` attribute, the inputs
            are moved there before the forward pass.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 512, the value used previously.

    Returns:
        ``np.ndarray`` with one row per text, each row being the hidden state of
        the first token of that text.
    """
    # Hugging Face models expose `.device`; anything else is assumed to live on CPU.
    device = getattr(model, "device", torch.device("cpu"))

    embeddings = []
    for text in tqdm(texts):
        # One sequence per forward pass, so no padding is requested: padding every
        # text up to max_length would only add masked positions to compute.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Index batch item 0 / token 0 explicitly instead of squeeze(), and bring
        # the tensor back to CPU so .numpy() also works for a model on an accelerator.
        cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)

In [None]:
# Embed every poem in the "Text" column; the result has one row per poem.
X_embeddings = generate_embeddings(df_all["Text"], tokenizer, model)

# Store each row as its own array in an "Embedding" column of df_all.
df_all["Embedding"] = [row for row in X_embeddings]

# 6. Normalize Embeddings

In [None]:
def normalize_embeddings(X):
    """Standardize every row of ``X`` to zero mean and unit standard deviation.

    The statistics are taken per row (``axis=1``), i.e. across the features of
    each individual embedding, not across samples.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).

    Returns:
        Array of the same shape. A row whose values are all equal has zero
        standard deviation; it is returned as all zeros instead of NaN/inf.
    """
    X = np.asarray(X)
    mean = X.mean(axis=1, keepdims=True)
    std = X.std(axis=1, keepdims=True)
    # Avoid 0/0 for constant rows: their centered values are already 0.
    std[std == 0] = 1.0
    return (X - mean) / std

In [None]:
# Rebuild the 2-D embedding matrix from the per-row arrays stored in df_all,
# then standardize each row with normalize_embeddings.
X = np.stack(df_all["Embedding"].to_numpy())
X_norm = normalize_embeddings(X)

# 7. Dimensionality Reduction (UMAP, PCA, t-SNE)

In [None]:
# Fit a StandardScaler on the raw embedding matrix X (not X_norm) and apply it;
# X_scaled is the input to the projections below.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [None]:
# Project the scaled embeddings to 2-D with three methods.
# All three use the same fixed seed so a Restart & Run All reproduces the plots;
# previously only t-SNE was seeded and the UMAP layout changed on every run.
umap_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(X_scaled)
pca_2d = PCA(n_components=2, random_state=42).fit_transform(X_scaled)
tsne_2d = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X_scaled)

In [None]:
def plot_projection(data_2d, labels, title):
    """Show a scatter plot of a 2-D projection, coloured by label.

    Args:
        data_2d: Array indexed as ``data_2d[:, 0]`` (x) and ``data_2d[:, 1]`` (y).
        labels: One label per point; passed to seaborn as ``hue``.
        title: Title drawn above the axes.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    x_coords, y_coords = data_2d[:, 0], data_2d[:, 1]
    sns.scatterplot(x=x_coords, y=y_coords, hue=labels, palette="Set2", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# One figure per projection method, each coloured by the Gender column.
for _coords, _title in (
    (umap_2d, "UMAP Projection by Gender"),
    (pca_2d, "PCA Projection by Gender"),
    (tsne_2d, "t-SNE Projection by Gender"),
):
    plot_projection(_coords, df_all["Gender"], _title)

# 9. Pairwise Distance and Similarity Matrices (Euclidean Distance, Cosine Similarity)

In [None]:
def _pairwise_euclidean_and_cosine(X):
    """Return ``(euclidean_distance, cosine_similarity)`` matrices for the rows of X.

    Both matrices are derived from a single Gram matrix ``X @ X.T``, so no
    (n, n, d) intermediate array is built.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).

    Returns:
        Tuple of two float64 arrays of shape (n_samples, n_samples):
        Euclidean distances, and cosine similarities in [-1, 1]. A row with
        zero norm gets similarity 0 to every row.
    """
    X = np.asarray(X, dtype=np.float64)  # float64 limits cancellation error below
    gram = X @ X.T
    sq_norms = np.diag(gram).copy()

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * gram
    np.maximum(sq_dist, 0.0, out=sq_dist)  # clip tiny negatives caused by round-off
    np.fill_diagonal(sq_dist, 0.0)
    distances = np.sqrt(sq_dist)

    # Cosine similarity needs the dot product divided by both row norms;
    # the raw dot product alone is not bounded to [-1, 1].
    norms = np.sqrt(sq_norms)
    norms[norms == 0] = 1.0
    similarities = np.clip(gram / np.outer(norms, norms), -1.0, 1.0)
    return distances, similarities


euc_dist, cos_sim = _pairwise_euclidean_and_cosine(X_norm)

In [None]:
# Side-by-side heatmaps: Euclidean distances on the left, cosine similarities on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
for _axis, _matrix, _cmap, _title in zip(
    ax,
    (euc_dist, cos_sim),
    ("viridis", "coolwarm"),
    ("Euclidean Distance", "Cosine Similarity"),
):
    sns.heatmap(_matrix, ax=_axis, cmap=_cmap)
    _axis.set_title(_title)
fig.tight_layout()
plt.show()

# 10. Save (Optional)

In [None]:
# df_all.to_csv("poetry_gender_embeddings.csv", index=False)
# np.save("embeddings_normalized.npy", X_norm)
# np.save("euclidean_distance.npy", euc_dist)
# np.save("cosine_similarity.npy", cos_sim)