In [1]:
TOKEN=""#FILL IN API TOKEN HERE
#!git clone data repo here

Cloning into 'stories_shorts'...
remote: Enumerating objects: 923, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 923 (delta 3), reused 3 (delta 3), pack-reused 918 (from 1)[K
Receiving objects: 100% (923/923), 24.69 MiB | 13.64 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [3]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
from tqdm import tqdm
import random
import nltk
import joblib
from torch.cuda.amp import autocast  # <-- AMP for CUDA

nltk.download("wordnet")
from nltk.corpus import wordnet  # noqa: F401 (placeholder for future augmentation)

# ============================================================
# DEVICE & AMP SETUP  (FORCE CUDA)
# ============================================================
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. Please enable a GPU runtime (e.g., Colab: Runtime â†’ Change runtime type â†’ GPU).")

device = "cuda"
print("CUDA available:", torch.cuda.is_available())
print("Using device:", device)
print("GPU:", torch.cuda.get_device_name(0))
torch.backends.cudnn.benchmark = True
use_amp = True  # mixed precision on GPU

# ============================================================
# SPLIT CR4 INTO TRAIN / VAL / TEST (80/10/10)
# ============================================================
from sklearn.model_selection import train_test_split

cr4_full = pd.read_csv("data/cr4/CR4NarrEmote_t1Yes.csv", low_memory=False)
cr4_full = cr4_full.rename(columns={"passage": "text", "EMO_arousal": "arousal"})[["text", "arousal"]]
cr4_full["arousal"] = pd.to_numeric(cr4_full["arousal"], errors="coerce")
cr4_full = cr4_full.dropna()

# 80% train, 20% temp
cr4_train, cr4_temp = train_test_split(
    cr4_full,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

# split remaining 20% into 10% val, 10% test
cr4_val, cr4_test = train_test_split(
    cr4_temp,
    test_size=0.50,
    random_state=42,
    shuffle=True,
)

# ============================================================
# LOAD TALES_VA AND RESPLIT TO 80/10/10
# ============================================================
aem_train_raw = pd.read_csv("data/tales_va/train.csv")[["text", "A_EWE"]]
aem_val_raw = pd.read_csv("data/tales_va/val.csv")[["text", "A_EWE"]]
aem_test_raw = pd.read_csv("data/tales_va/test.csv")[["text", "A_EWE"]]

aem_full = pd.concat([aem_train_raw, aem_val_raw, aem_test_raw], ignore_index=True)
aem_full = aem_full.rename(columns={"A_EWE": "arousal"})
aem_full["arousal"] = pd.to_numeric(aem_full["arousal"], errors="coerce")
aem_full = aem_full.dropna()

train_aem, temp_aem = train_test_split(
    aem_full,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)
val_aem, test_aem = train_test_split(
    temp_aem,
    test_size=0.50,
    random_state=42,
    shuffle=True,
)

# ============================================================
# COMBINE DATASETS WITHOUT DROPPING ANYTHING (OTHER THAN NaNs)
# ============================================================
train_aem["source"] = "aem"
val_aem["source"] = "aem"
test_aem["source"] = "aem"

cr4_train["source"] = "cr4"
cr4_val["source"] = "cr4"
cr4_test["source"] = "cr4"

train_df = pd.concat([train_aem, cr4_train], ignore_index=True)
val_df = pd.concat([val_aem, cr4_val], ignore_index=True)
test_df = pd.concat([test_aem, cr4_test], ignore_index=True)

print("Final dataset sizes (combined, no dropping):")
print(
    "TRAIN:", len(train_df),
    "| AEM:", (train_df["source"] == "aem").sum(),
    "| CR4:", (train_df["source"] == "cr4").sum(),
)
print(
    "VAL:  ", len(val_df),
    "| AEM:", (val_df["source"] == "aem").sum(),
    "| CR4:", (val_df["source"] == "cr4").sum(),
)
print(
    "TEST: ", len(test_df),
    "| AEM:", (test_df["source"] == "aem").sum(),
    "| CR4:", (test_df["source"] == "cr4").sum(),
)

# ============================================================
# NORMALIZE TARGETS (NO DATA LEAKAGE) - TO ~NORMAL
# ============================================================
print("Fitting quantile transformer (output_distribution='normal')...")
qt = QuantileTransformer(
    n_quantiles=200,
    output_distribution="normal",
    random_state=42,
)
qt.fit(train_df["arousal"].values.reshape(-1, 1))

train_df["arousal_norm"] = qt.transform(train_df["arousal"].values.reshape(-1, 1)).ravel()
val_df["arousal_norm"] = qt.transform(val_df["arousal"].values.reshape(-1, 1)).ravel()
test_df["arousal_norm"] = qt.transform(test_df["arousal"].values.reshape(-1, 1)).ravel()

# ============================================================
# AUGMENTATION (PLACEHOLDER)
# ============================================================
def augment(text: str, prob: float = 0.2) -> str:
    """
    Simple placeholder for text augmentation.
    Currently a no-op, but respects probability and is safe on CPU/GPU.
    """
    if prob <= 0.0:
        return text
    if random.random() > prob:
        return text
    # TODO: add actual augmentation (e.g., synonym replacement using wordnet)
    return text

# ============================================================
# DATASET + DATALOADERS
# ============================================================
# Use a smaller MiniLM model
MINILM_MODEL_NAME = "nreimers/MiniLM-L6-H384-uncased"

tokenizer = AutoTokenizer.from_pretrained(MINILM_MODEL_NAME)


class EmotionDataset(Dataset):
    def __init__(self, df: pd.DataFrame, augment_prob: float = 0.2):
        self.df = df.reset_index(drop=True)
        self.augment_prob = augment_prob

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        text = augment(row["text"], prob=self.augment_prob)

        enc = tokenizer(
            text,
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(float(row["arousal_norm"]), dtype=torch.float32),
        }


# Weighted sampling for train set with ~75% CR4, 25% AEM
train_sources = train_df["source"].values
n_aem = (train_sources == "aem").sum()
n_cr4 = (train_sources == "cr4").sum()

target_frac_aem = 0.25
target_frac_cr4 = 0.75

weights = np.where(
    train_sources == "aem",
    target_frac_aem / n_aem,
    target_frac_cr4 / n_cr4,
)
weights = torch.DoubleTensor(weights)

train_sampler = WeightedRandomSampler(
    weights=weights,
    num_samples=len(weights),
    replacement=True,
)

train_dl = DataLoader(
    EmotionDataset(train_df, augment_prob=0.2),
    batch_size=16,
    sampler=train_sampler,
    num_workers=4,
    pin_memory=True,  # GPU: good to use pinned memory
)

val_dl = DataLoader(
    EmotionDataset(val_df, augment_prob=0.0),
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

test_dl = DataLoader(
    EmotionDataset(test_df, augment_prob=0.0),
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# ============================================================
# MODEL (MiniLM-based Regressor)
# ============================================================
class TransformerRegressor(nn.Module):
    def __init__(self, dropout: float = 0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MINILM_MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        self.reg_head = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, ids, mask):
        out = self.encoder(ids, attention_mask=mask)
        # Use CLS token representation (position 0)
        pooled = out.last_hidden_state[:, 0, :]
        pooled = self.dropout(pooled)
        return self.reg_head(pooled).view(-1)

# ============================================================
# EVALUATION FUNCTION (NORMAL + ORIGINAL SCALE)
# ============================================================
def evaluate(model: nn.Module, loader: DataLoader, qt: QuantileTransformer):
    model.eval()
    mse_loss = nn.MSELoss()

    batch_losses = []
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", ncols=100):
            batch = {k: v.to(device) for k, v in batch.items()}

            with autocast(enabled=use_amp):
                out = model(batch["input_ids"], batch["attention_mask"])
                loss = mse_loss(out, batch["labels"])

            batch_losses.append(loss.item())
            all_preds.extend(out.detach().cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    preds = np.array(all_preds).reshape(-1, 1)
    labels = np.array(all_labels).reshape(-1, 1)

    # Normalized metrics (z-space)
    mse_norm = mean_squared_error(labels, preds)
    spear_norm = spearmanr(preds.ravel(), labels.ravel()).correlation

    # Back to original arousal scale
    preds_orig = qt.inverse_transform(preds).ravel()
    labels_orig = qt.inverse_transform(labels).ravel()

    mse_orig = mean_squared_error(labels_orig, preds_orig)
    spear_orig = spearmanr(preds_orig, labels_orig).correlation

    return {
        "avg_loss": float(np.mean(batch_losses)),
        "mse_norm": float(mse_norm),
        "spear_norm": float(spear_norm),
        "mse_orig": float(mse_orig),
        "spear_orig": float(spear_orig),
    }

# ============================================================
# TRAINING LOOP (with early stopping)
# ============================================================
hp = {"lr": 3e-5, "dropout": 0.05}
max_epochs = 15
patience_limit = 5


def train_model(hp):
    os.makedirs("checkpoints", exist_ok=True)

    model = TransformerRegressor(dropout=hp["dropout"]).to(device)

    optim = torch.optim.Adam(model.parameters(), lr=hp["lr"])
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=3, gamma=0.5)
    mse_loss = nn.MSELoss()

    best_val = float("-inf")
    patience = 0
    ckpt_path = "checkpoints/best_single_model.pt"

    print("\n==============================")
    print("ðŸ”¥ TRAINING MODEL")
    print("==============================")

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_dl, desc=f"Epoch {epoch}", ncols=100):
            batch = {k: v.to(device) for k, v in batch.items()}

            with autocast(enabled=use_amp):
                out = model(batch["input_ids"], batch["attention_mask"])
                loss = mse_loss(out, batch["labels"])

            optim.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()

            total_loss += loss.item()

        scheduler.step()

        val_metrics = evaluate(model, val_dl, qt)
        print(
            f"Epoch {epoch:02d} | "
            f"TrainLoss={total_loss:.4f} | "
            f"ValLoss={val_metrics['avg_loss']:.5f} | "
            f"ValMSE_norm={val_metrics['mse_norm']:.5f} | "
            f"ValMSE_orig={val_metrics['mse_orig']:.5f} | "
            f"Spearman_orig={val_metrics['spear_orig']:.4f}"
        )

        # Early stopping based on original-scale MSE
        if val_metrics["spear_orig"] > best_val:
            best_val = val_metrics["spear_orig"]
            patience = 0
            torch.save(model.state_dict(), ckpt_path)
        else:
            patience += 1
            if patience >= patience_limit:
                print("Early stopping triggered.")
                break

    return model


# ============================================================
# RUN TRAINING
# ============================================================
model = train_model(hp)

# Load best state
best_ckpt_path = "checkpoints/best_single_model.pt"
model.load_state_dict(torch.load(best_ckpt_path, map_location=device))

# Save artifacts
torch.save(model.state_dict(), "best_model.pt")
tokenizer.save_pretrained("best_model_tokenizer")
joblib.dump(qt, "quantile_transformer.pkl")

print("\nðŸŽ‰ Saved best_model.pt + tokenizer + quantile transformer")

# ============================================================
# FINAL TEST EVAL
# ============================================================
print("\n================ TEST RESULTS ================")
test_metrics = evaluate(model, test_dl, qt)
print("TEST MSE (norm):   ", test_metrics["mse_norm"])
print("TEST Spearman (norm):", test_metrics["spear_norm"])
print("TEST MSE (orig):   ", test_metrics["mse_orig"])
print("TEST Spearman (orig):", test_metrics["spear_orig"])

# ============================================================
# BASELINE TF-IDF
# ============================================================
print("\nRunning TF-IDF baseline...")
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df["text"])
X_test = tfidf.transform(test_df["text"])

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, train_df["arousal_norm"])
baseline_preds = ridge.predict(X_test)

baseline_mse = mean_squared_error(test_df["arousal_norm"], baseline_preds)
print("Baseline TF-IDF MSE (norm space):", baseline_mse)

joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("Saved TF-IDF vectorizer â†’ tfidf_vectorizer.pkl")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


CUDA available: True
Using device: cuda
GPU: NVIDIA A100-SXM4-40GB
Final dataset sizes (combined, no dropping):
TRAIN: 116171 | AEM: 11907 | CR4: 104264
VAL:   14521 | AEM: 1488 | CR4: 13033
TEST:  14523 | AEM: 1489 | CR4: 13034
Fitting quantile transformer (output_distribution='normal')...

ðŸ”¥ TRAINING MODEL


  with autocast(enabled=use_amp):
Epoch 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:01<00:00, 40.10it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 120.61it/s]


Epoch 00 | TrainLoss=6706.7555 | ValLoss=0.50032 | ValMSE_norm=0.50046 | ValMSE_orig=0.00520 | Spearman_orig=0.7988


  with autocast(enabled=use_amp):
Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.27it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 115.99it/s]


Epoch 01 | TrainLoss=4338.9541 | ValLoss=0.48581 | ValMSE_norm=0.48601 | ValMSE_orig=0.00530 | Spearman_orig=0.8130


  with autocast(enabled=use_amp):
Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.49it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.11it/s]


Epoch 02 | TrainLoss=2802.7443 | ValLoss=0.47769 | ValMSE_norm=0.47786 | ValMSE_orig=0.00585 | Spearman_orig=0.8334


  with autocast(enabled=use_amp):
Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.52it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 125.36it/s]


Epoch 03 | TrainLoss=1727.0115 | ValLoss=0.47587 | ValMSE_norm=0.47604 | ValMSE_orig=0.00534 | Spearman_orig=0.8376


  with autocast(enabled=use_amp):
Epoch 4: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.45it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 118.49it/s]


Epoch 04 | TrainLoss=1294.9092 | ValLoss=0.45468 | ValMSE_norm=0.45486 | ValMSE_orig=0.00533 | Spearman_orig=0.8461


  with autocast(enabled=use_amp):
Epoch 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.19it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 116.26it/s]


Epoch 05 | TrainLoss=1036.7859 | ValLoss=0.43637 | ValMSE_norm=0.43656 | ValMSE_orig=0.00535 | Spearman_orig=0.8584


  with autocast(enabled=use_amp):
Epoch 6: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.31it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.01it/s]


Epoch 06 | TrainLoss=806.9815 | ValLoss=0.43775 | ValMSE_norm=0.43794 | ValMSE_orig=0.00506 | Spearman_orig=0.8614


  with autocast(enabled=use_amp):
Epoch 7: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:57<00:00, 40.86it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.78it/s]


Epoch 07 | TrainLoss=693.6636 | ValLoss=0.43151 | ValMSE_norm=0.43169 | ValMSE_orig=0.00506 | Spearman_orig=0.8653


  with autocast(enabled=use_amp):
Epoch 8: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.49it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 119.86it/s]


Epoch 08 | TrainLoss=627.1142 | ValLoss=0.43441 | ValMSE_norm=0.43460 | ValMSE_orig=0.00511 | Spearman_orig=0.8635


  with autocast(enabled=use_amp):
Epoch 9: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.42it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 117.09it/s]


Epoch 09 | TrainLoss=549.5801 | ValLoss=0.43382 | ValMSE_norm=0.43401 | ValMSE_orig=0.00516 | Spearman_orig=0.8639


  with autocast(enabled=use_amp):
Epoch 10: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:58<00:00, 40.60it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 120.53it/s]


Epoch 10 | TrainLoss=501.6169 | ValLoss=0.43385 | ValMSE_norm=0.43404 | ValMSE_orig=0.00509 | Spearman_orig=0.8668


  with autocast(enabled=use_amp):
Epoch 11: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.29it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.40it/s]


Epoch 11 | TrainLoss=496.1463 | ValLoss=0.43187 | ValMSE_norm=0.43206 | ValMSE_orig=0.00498 | Spearman_orig=0.8653


  with autocast(enabled=use_amp):
Epoch 12: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.31it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.17it/s]


Epoch 12 | TrainLoss=449.3388 | ValLoss=0.42752 | ValMSE_norm=0.42771 | ValMSE_orig=0.00499 | Spearman_orig=0.8658


  with autocast(enabled=use_amp):
Epoch 13: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [02:59<00:00, 40.39it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 116.33it/s]


Epoch 13 | TrainLoss=443.9032 | ValLoss=0.42287 | ValMSE_norm=0.42306 | ValMSE_orig=0.00496 | Spearman_orig=0.8673


  with autocast(enabled=use_amp):
Epoch 14: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7261/7261 [03:00<00:00, 40.29it/s]
  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 121.80it/s]


Epoch 14 | TrainLoss=432.2257 | ValLoss=0.42487 | ValMSE_norm=0.42507 | ValMSE_orig=0.00495 | Spearman_orig=0.8669

ðŸŽ‰ Saved best_model.pt + tokenizer + quantile transformer



  with autocast(enabled=use_amp):
Validating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 908/908 [00:07<00:00, 118.19it/s]


TEST MSE (norm):    0.45738276839256287
TEST Spearman (norm): 0.8546728645786885
TEST MSE (orig):    0.005438676103949547
TEST Spearman (orig): 0.8546871886628019

Running TF-IDF baseline...
Baseline TF-IDF MSE (norm space): 0.6292091699258577
Saved TF-IDF vectorizer â†’ tfidf_vectorizer.pkl


In [6]:
#QUALITATIVE TESTING

import torch
from transformers import AutoTokenizer, AutoModel
import joblib

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# === 1. Reconstruct the full model definition exactly as in training ===
class TransformerRegressor(torch.nn.Module):
    def __init__(self, model_name="nreimers/MiniLM-L6-H384-uncased", dropout=0.05):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        self.reg_head = torch.nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, ids, mask):
        out = self.encoder(ids, attention_mask=mask)
        pooled = out.last_hidden_state[:, 0, :]
        pooled = self.dropout(pooled)
        return self.reg_head(pooled).view(-1)

# === 2. Load model + tokenizer + quantile transformer ===
model = TransformerRegressor()
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained("best_model_tokenizer")
qt = joblib.load("quantile_transformer.pkl")

# === 3. Define your example texts (low â†’ high arousal) ===
texts = [
    # 1. Low arousal
    "The sun rose slowly over the quiet lake. Birds chirped softly in the distance, and the water was still, reflecting the pale morning sky. A gentle breeze rustled the reeds at the waterâ€™s edge as the day began in peaceful silence.",
    # 2. Lowâ€“medium arousal
    "She strolled through the empty park in the early autumn afternoon. Leaves drifted lazily from the trees, and the scent of damp earth mixed with faint traces of wood smoke from distant chimneys. The air was cool, and the world felt calm and unhurried.",
    # 3. Medium arousal
    "He opened the letter and his hands trembled. The words inside were unexpected: absence, regret, final decisions. His chest tightened as he read each line slowly, feeling a strange mix of sorrow and lingering hope as the paper slipped from his grasp.",
    # 4. Mediumâ€“high arousal
    "Rain battered the windows, echoing in the hollow silence of the room. Thunder rolled overhead. She paced back and forth, heart thudding in her ears, shadows dancing across the walls. With every flash of lightning, memories she thought forgotten surged up â€” sharp, raw, and insistent.",
    # 5. High arousal
    "Smoke and fire roared around them, the building collapsing with a deafening crash. She reached out, screaming for help, her breath ragged and wild. Splinters and rubble rained down as panic clawed at her mind â€” every instinct screaming to run, but the world blurred into chaos.",
]

# === 4. Inference loop over all texts ===
for i, text in enumerate(texts, start=1):
    enc = tokenizer(
        text,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        norm_pred = model(enc["input_ids"], enc["attention_mask"]).cpu().item()
    orig_pred = qt.inverse_transform([[norm_pred]])[0, 0]

    print(f"___ Example {i} ___")
    print("Text:", text)
    print("Predicted arousal (normalized scale):", norm_pred)
    print("Predicted arousal (original scale):", orig_pred)
    print()


Using device: cuda
___ Example 1 ___
Text: The sun rose slowly over the quiet lake. Birds chirped softly in the distance, and the water was still, reflecting the pale morning sky. A gentle breeze rustled the reeds at the waterâ€™s edge as the day began in peaceful silence.
Predicted arousal (normalized scale): -1.7065761089324951
Predicted arousal (original scale): 0.2739526669822491

___ Example 2 ___
Text: She strolled through the empty park in the early autumn afternoon. Leaves drifted lazily from the trees, and the scent of damp earth mixed with faint traces of wood smoke from distant chimneys. The air was cool, and the world felt calm and unhurried.
Predicted arousal (normalized scale): -0.2581581473350525
Predicted arousal (original scale): 0.5171437196911505

___ Example 3 ___
Text: He opened the letter and his hands trembled. The words inside were unexpected: absence, regret, final decisions. His chest tightened as he read each line slowly, feeling a strange mix of sorrow and l