# Loading models

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

# Hub identifier shared by the pre-trained encoder and the tokenizer.
model_name = "google-bert/bert-base-uncased"

# The tokenizer comes from the hub checkpoint, not from the fine-tuned folder.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Plain encoder without a task head: the "before fine-tuning" reference.
base_model = AutoModel.from_pretrained(model_name)

# Classifier restored from the local fine-tuning output directory.
ft_model = AutoModelForSequenceClassification.from_pretrained(
    "./finetuned_BERT", problem_type="multi_label_classification"
)

# Getting embeddings

In [None]:
import torch
import numpy as np
from collections import defaultdict

def collect_word_embeddings(model, tokenizer, words, combine="mean"):
    """Embed each word in isolation and pool its subword vectors into one.

    Every word is tokenized on its own (no special tokens added), passed
    through ``model``, and the last entry of the returned ``hidden_states``
    is reduced over the subword axis.

    Args:
        model: Torch module that accepts the tokenizer's tensors together
            with ``output_hidden_states=True`` and ``return_dict=True`` and
            returns an object exposing ``hidden_states``. It is switched to
            eval mode and left in that mode.
        tokenizer: Callable that turns a string into a mapping of PyTorch
            tensors whose leading (batch) dimension is 1.
        words: Iterable of strings to embed. A repeated word simply
            overwrites its earlier entry.
        combine: ``"mean"`` averages the subword vectors, ``"sum"`` adds them.

    Returns:
        dict mapping each word to a 1-D NumPy array.

    Raises:
        ValueError: If ``combine`` is neither ``"mean"`` nor ``"sum"``, or if
            a word tokenizes to zero tokens.
    """
    # Validate before any model work so a bad argument fails immediately,
    # even when ``words`` is empty.
    if combine == "mean":
        pool = torch.mean
    elif combine == "sum":
        pool = torch.sum
    else:
        raise ValueError("combine must be 'mean' or 'sum'")

    model.eval()

    # Tokenizer output lives on the CPU; follow the model's parameters so a
    # model that was moved to an accelerator does not hit a device mismatch.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    word_vectors = {}
    with torch.no_grad():
        for word in words:
            encoded = tokenizer(
                word,
                return_tensors="pt",
                truncation=True,
                add_special_tokens=False,
            )
            inputs = dict(encoded.items())
            if device is not None:
                inputs = {name: t.to(device) for name, t in inputs.items()}

            # With special tokens disabled, an empty/whitespace word has no
            # tokens at all; pooling nothing would give NaN (mean) or a
            # meaningless zero vector (sum).
            input_ids = inputs.get("input_ids")
            if input_ids is not None and input_ids.numel() == 0:
                raise ValueError(f"word {word!r} produced no tokens")

            outputs = model(**inputs, output_hidden_states=True, return_dict=True)
            # Drop the batch axis -> [num_subwords, hidden_dim]
            last_hidden = outputs.hidden_states[-1].squeeze(0)

            word_vectors[word] = pool(last_hidden, dim=0).cpu().numpy()

    return word_vectors

In [None]:
# Probe vocabulary: ten emotion nouns followed by ten emotion adjectives.
words = [
    # nouns
    "anger", "joy", "fear", "sadness", "surprise",
    "disgust", "love", "guilt", "curiosity", "relief",
    # adjectives
    "happy", "angry", "scared", "depressed", "ashamed",
    "jealous", "confident", "stressed", "hopeful", "lonely",
]

# Embed the same words with both encoders. For the fine-tuned classifier only
# the `.bert` submodule is passed, so the classification head is bypassed.
base_embeddings = collect_word_embeddings(
    base_model, tokenizer, words, combine="mean"
)
ft_embeddings = collect_word_embeddings(
    ft_model.bert, tokenizer, words, combine="mean"
)

# t-SNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_reduce(emb_dict, perplexity=5, random_state=42):
    """Project a word -> vector mapping to 2-D with t-SNE.

    Args:
        emb_dict: Mapping from token to a 1-D embedding; all vectors must
            share the same length so they can be stacked.
        perplexity: Requested t-SNE perplexity. It is lowered to
            ``n_samples - 1`` when it is not smaller than the number of
            embeddings, because scikit-learn rejects such values.
        random_state: Seed forwarded to ``TSNE`` for reproducible layouts.

    Returns:
        Tuple ``(tokens, reduced)``: the tokens in mapping order and the
        array returned by ``fit_transform``, one 2-D row per token.

    Raises:
        ValueError: If ``emb_dict`` is empty.
    """
    if not emb_dict:
        raise ValueError("emb_dict must contain at least one embedding")

    tokens = list(emb_dict)
    vecs = np.stack([emb_dict[token] for token in tokens])

    # Only kicks in for tiny inputs that would otherwise make TSNE raise;
    # the default of 5 is untouched for six or more embeddings.
    n_samples = len(tokens)
    if perplexity >= n_samples:
        perplexity = max(n_samples - 1, 1)

    reducer = TSNE(
        n_components=2,
        random_state=random_state,
        perplexity=perplexity,
    )
    reduced = reducer.fit_transform(vecs)
    return tokens, reduced

# Each embedding space is projected independently, so the two layouts do not
# share axes or orientation.
tokens_base, reduced_base = tsne_reduce(base_embeddings)
tokens_ft, reduced_ft = tsne_reduce(ft_embeddings)

# Plotting

In [None]:
import matplotlib.pyplot as plt

# One row, two panels: pre-trained projection on the left, fine-tuned on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
axes = axes.flatten()

# Per-panel settings: (axis, 2-D points, labels, marker colour, label colour, title)
panels = [
    (axes[0], reduced_base, tokens_base, 'royalblue', 'navy',
     "Pre-trained Bert Embedding Space"),
    (axes[1], reduced_ft, tokens_ft, 'crimson', 'darkred',
     "Fine-tuned Bert Embedding Space"),
]

for ax, points, labels, dot_color, text_color, title in panels:
    ax.scatter(
        points[:, 0], points[:, 1],
        color=dot_color, s=80, alpha=0.7, edgecolors='k', linewidths=0.5
    )
    # Write each word next to its marker, nudged slightly up and to the right.
    for (x, y), label in zip(points, labels):
        ax.text(
            x + 0.02, y + 0.02,
            label, fontsize=10, weight='bold', color=text_color
        )
    ax.set_title(title, fontsize=13, weight='bold')
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.grid(alpha=0.3)

# Figure-level title, layout pass, then save to disk before showing.
plt.suptitle("t-SNE Visualization of Emotion Embeddings", fontsize=15, weight='bold', y=1.03)
plt.tight_layout()
plt.savefig("finetuned_BERT/tsne_movement_bert.png", dpi=300, bbox_inches="tight")
plt.show()