<a href="https://colab.research.google.com/github/NikolaJanik/Polish_poetry_classification_with_transformers/blob/main/data_analisis_herBERT_men_women.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gender-Based Poetry Embedding Analysis with HerBERT

# 1. Install Dependencies (only needed in Colab)

In [None]:
!pip install -q transformers sacremoses umap-learn

# 2. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import torch
from transformers import HerbertTokenizer, RobertaModel

# 3. Load and Prepare Data

In [None]:
# Read the semicolon-separated corpus and keep only rows that have a poem text,
# a label and an author short name; the index is rebuilt after filtering.
df = (
    pd.read_csv("/mnt/data/polish_poetry.csv", sep=";")
    .dropna(subset=["Text", "Label", "Author-short"])
    .reset_index(drop=True)
)

In [None]:
# Gender is assigned purely by row position: the first 200 rows are treated as
# poems by men and the following 200 rows as poems by women.
# NOTE(review): this split runs on the frame after rows with missing values were
# dropped, so any dropped row shifts the boundary — confirm the row order.
df_men = df.head(200).reset_index(drop=True)
df_women = df.iloc[200:400].reset_index(drop=True)

# 4. Initialize HerBERT Model

In [None]:
# Single source of truth for the checkpoint so tokenizer and encoder cannot drift apart.
MODEL_NAME = "allegro/herbert-large-cased"

tokenizer = HerbertTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): confirm that RobertaModel matches this checkpoint's architecture
# (its config may declare a different model type); a mismatched class can change
# the embeddings without raising an error.
# .eval() returns the model itself; it makes inference mode (dropout off) explicit.
model = RobertaModel.from_pretrained(MODEL_NAME).eval()

# 5. Generate Embeddings

In [None]:
def generate_embeddings(texts, tokenizer, model):
    """Encode each text and return the hidden state of its first token.

    Every text is tokenized on its own (truncated to at most 512 tokens), run
    through ``model`` without gradient tracking, and represented by the
    ``last_hidden_state`` vector at sequence position 0.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed; iterated once, in order.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors="pt", ...)`` and expected to
        return a mapping of tensors accepted by ``model(**inputs)``.
    model : torch.nn.Module
        Encoder whose output exposes ``last_hidden_state``. It is not switched
        to eval mode here; the caller is responsible for that.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_texts, hidden_size)``. For empty input an array of
        shape ``(0, model.config.hidden_size)`` is returned so that downstream
        row-wise operations still work.
    """
    # Run on whatever device the model lives on instead of assuming CPU.
    device = next(model.parameters()).device

    embeddings = []
    for text in tqdm(texts):
        # One sequence per forward pass, so padding to max_length would only add
        # masked positions and wasted compute; truncation still caps the length.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Batch index 0, token index 0 -> 1-D vector; .cpu() is required before
        # .numpy() when the model runs on an accelerator.
        cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
        embeddings.append(cls_embedding)

    if not embeddings:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

In [None]:
# Embed the poem texts of both groups with the same tokenizer/model pair
# (men first, then women).
X_men, X_women = (
    generate_embeddings(group_df["Text"], tokenizer, model)
    for group_df in (df_men, df_women)
)

# 6. Normalize Embeddings

In [None]:
def normalize_embeddings(X):
    """Standardize every row of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        One embedding vector per row.

    Returns
    -------
    np.ndarray
        Array of the same shape in which each row has had its own mean
        subtracted and been divided by its own (population) standard deviation.
        Rows whose values are all equal have zero standard deviation; they are
        returned as all zeros instead of NaN/inf.
    """
    X = np.asarray(X)
    centered = X - X.mean(axis=1, keepdims=True)
    row_std = X.std(axis=1, keepdims=True)
    # A constant row is already all zeros after centering, so dividing by 1
    # keeps it at zero and avoids a 0/0 division.
    safe_std = np.where(row_std == 0, 1.0, row_std)
    return centered / safe_std

In [None]:
# Row-wise standardization of both embedding matrices. The helper defined in
# this notebook is `normalize_embeddings`; a bare `normalize` is never defined
# or imported and would raise NameError on a fresh kernel.
X_men_norm = normalize_embeddings(X_men)
X_women_norm = normalize_embeddings(X_women)

# 7. Plotting Function for 2D Projections (UMAP, PCA, t-SNE)

In [None]:
def plot_projection(X, labels, method="UMAP", title="", random_state=42):
    """Reduce ``X`` to two dimensions and show a scatter plot coloured by label.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Vectors to project.
    labels : sequence of length n_samples
        Passed to seaborn as ``hue``; one colour per distinct value.
    method : {"UMAP", "PCA", "TSNE"}, default "UMAP"
        Reduction technique. For "UMAP" the features are standardized with
        ``StandardScaler`` first; "PCA" and "TSNE" receive ``X`` unchanged.
    title : str, default ""
        Prefix of the figure title (``"<title> — <method>"``).
    random_state : int or None, default 42
        Seed handed to the reducer so repeated runs give the same layout.
        Pass ``None`` for the unseeded behaviour.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "UMAP":
        reducer = umap.UMAP(
            n_neighbors=10, min_dist=0.1, n_components=2, random_state=random_state
        )
        X_proj = reducer.fit_transform(StandardScaler().fit_transform(X))
    elif method == "PCA":
        X_proj = PCA(n_components=2, random_state=random_state).fit_transform(X)
    elif method == "TSNE":
        # t-SNE needs perplexity < n_samples; keep 20 whenever the data allows it.
        perplexity = min(20, max(1, len(X) - 1))
        X_proj = TSNE(
            n_components=2, perplexity=perplexity, random_state=random_state
        ).fit_transform(X)
    else:
        raise ValueError("Unsupported method")

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(
        x=X_proj[:, 0], y=X_proj[:, 1], hue=labels, palette="Set2", s=100, ax=ax
    )
    ax.set_title(f"{title} — {method}")
    ax.set_xlabel(f"{method} 1")
    ax.set_ylabel(f"{method} 2")
    plt.show()

# 8. 2D Visualization

In [None]:
# Draw every 2D projection for both groups: one method at a time, and within
# each method the men's plot first, then the women's.
for proj_method in ("UMAP", "PCA", "TSNE"):
    for group_title, group_X, group_df in (
        ("Men", X_men_norm, df_men),
        ("Women", X_women_norm, df_women),
    ):
        plot_projection(
            group_X, group_df["Author-short"], method=proj_method, title=group_title
        )

# 9. 3D PCA Projection

In [None]:
from mpl_toolkits.mplot3d import Axes3D

def plot_pca_3d(X, labels, title="", random_state=42):
    """Show a 3D scatter plot of the first three principal components of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Vectors to project with PCA.
    labels : sequence of length n_samples
        Group label per row; each distinct label is drawn as its own series
        and listed in the legend. Lists, arrays and pandas Series are accepted.
    title : str, default ""
        Suffix of the figure title.
    random_state : int or None, default 42
        Seed passed to ``PCA`` so the result is repeatable when a randomized
        solver is selected.
    """
    # Work on a plain array so `labels == label` is always an element-wise,
    # position-based mask (a list would compare to a single False, and a Series
    # would carry its own index).
    labels = np.asarray(labels)

    X_pca = PCA(n_components=3, random_state=random_state).fit_transform(X)

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    for label in np.unique(labels):
        idx = labels == label
        ax.scatter(X_pca[idx, 0], X_pca[idx, 1], X_pca[idx, 2], label=label)
    ax.set_title(f"3D PCA Projection — {title}")
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
    ax.set_zlabel("PC 3")
    ax.legend()
    plt.show()


In [None]:
# 3D PCA view per group, coloured by author short name (men first, then women).
for group_title, group_X, group_df in (
    ("Men", X_men_norm, df_men),
    ("Women", X_women_norm, df_women),
):
    plot_pca_3d(group_X, group_df["Author-short"], group_title)

# 10. Distance and Similarity Matrices (Euclidean Distance, Cosine Similarity)

In [None]:
def plot_distance_matrices(X, title=""):
    """Show pairwise Euclidean distances and cosine similarities as heatmaps.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        One vector per row. The rows do not have to be standardized: cosine
        similarity is computed from the actual row norms. For rows with zero
        mean and unit standard deviation this equals ``X @ X.T / n_features``.
    title : str, default ""
        Prefix used in both panel titles.

    Notes
    -----
    Both matrices are derived from the Gram matrix ``X @ X.T``, so the extra
    memory is O(n_samples^2) instead of the (n, n, n_features) intermediate a
    broadcasted difference would allocate. Values can differ from a direct
    difference-based computation by floating-point rounding.
    """
    X = np.asarray(X, dtype=np.float64)

    gram = X @ X.T
    sq_norms = np.einsum("ij,ij->i", X, X)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; rounding can push tiny values
    # below zero, so clip before the square root and pin the diagonal to 0.
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * gram
    np.maximum(sq_dist, 0.0, out=sq_dist)
    np.fill_diagonal(sq_dist, 0.0)
    euc_dist = np.sqrt(sq_dist)

    # Cosine similarity; pairs involving a zero vector are reported as 0
    # rather than dividing by zero.
    norms = np.sqrt(sq_norms)
    denom = np.outer(norms, norms)
    cos_sim = np.divide(gram, denom, out=np.zeros_like(gram), where=denom > 0)

    fig, ax = plt.subplots(1, 2, figsize=(20, 8))
    euc_img = ax[0].imshow(euc_dist)
    ax[0].set_title(f"{title} — Euclidean Distance")
    fig.colorbar(euc_img, ax=ax[0])
    cos_img = ax[1].imshow(cos_sim)
    ax[1].set_title(f"{title} — Cosine Similarity")
    fig.colorbar(cos_img, ax=ax[1])
    plt.show()

In [None]:
# Pairwise distance/similarity heatmaps for each group (men first, then women).
for group_title, group_X in (("Men", X_men_norm), ("Women", X_women_norm)):
    plot_distance_matrices(group_X, title=group_title)

# 11. Save (Optional)

In [None]:
# Optional persistence — uncomment to write results to the current working directory.
# The .npy files hold the raw (un-normalized) embedding matrices X_men / X_women.
# NOTE(review): nothing in this notebook adds embedding columns to df_men / df_women,
# so the CSVs below would contain only the original poem columns despite their names.
# np.save("X_men.npy", X_men)
# np.save("X_women.npy", X_women)
# df_men.to_csv("men_poems_with_embeddings.csv", index=False)
# df_women.to_csv("women_poems_with_embeddings.csv", index=False)