# 1. Load Parameters From Config File

In [None]:
import sys
from pathlib import Path
# The project root is taken to be two directory levels above the current
# working directory. It is appended to sys.path so the project-local
# packages imported further down (`src`, `models`) can be resolved.
_working_dir = Path.cwd().resolve()
PROJECT_ROOT = _working_dir.parents[1]
sys.path.append(f"{PROJECT_ROOT}")
print("Loaded project root:", PROJECT_ROOT)

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from src.dataset import DeepPHQDataset, build_vocab, create_balanced_dataloader, split_by_pid, DeepPHQValDataset
from models.transformer.transformer_model import DeepPHQTransformer
import yaml

# Load config
# The path is relative, so it is resolved against the current working directory.
CONFIG_PATH = Path("config.yaml")

# Pass the encoding explicitly so parsing does not depend on the platform's
# default locale encoding (YAML files are normally UTF-8).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty file makes safe_load return None; fail here with a clear message
# instead of an opaque "NoneType is not subscriptable" in a later cell.
if not isinstance(config, dict):
    raise ValueError(f"{CONFIG_PATH} must contain a YAML mapping at the top level")

# Bare expression: displayed as the cell output when run in a notebook.
config

# 2. Load processed CSV data

In [None]:
data_cfg = config["data"]
root = Path(data_cfg["data_root"])

level = data_cfg["level"]

# Each supported level names the config entry that holds its CSV file name.
# Only the entry for the selected level is ever read from the config.
for level_name, csv_key in (
    ("word", "word_csv"),
    ("sentence", "sentence_csv"),
    ("dialogue", "dialogue_csv"),
):
    if level == level_name:
        csv_path = root / data_cfg[csv_key]
        break
else:
    # No supported level matched.
    raise ValueError("Unknown level in config")

print("Loading:", csv_path)

df = pd.read_csv(csv_path)
# Bare expression: previews the first rows as the cell output in a notebook.
df.head()

# 3. Build Vocab and Balanced DataLoader

In [None]:
# The vocabulary is built from the "Text" column of the full frame.
# NOTE(review): this runs before the train/val/test split, so validation and
# test texts also contribute to the vocabulary - confirm this is intended.
text_column = df["Text"]
all_texts = text_column.tolist()
vocab_cfg = config["vocab"]
vocab = build_vocab(all_texts, min_freq=vocab_cfg["min_freq"])

In [None]:
# ---- Split ----
# split_by_pid returns the train, validation and test frames, in that order.
train_df, val_df, test_df = split_by_pid(df)

# The same maximum sequence length is used for all three datasets.
max_length = config["data"]["max_length"]

# ---- Datasets ----
# Training samples are (PID, Text, PHQ_Score) tuples.
train_samples = list(zip(train_df["PID"], train_df["Text"], train_df["PHQ_Score"]))
train_dataset = DeepPHQDataset(
    data=train_samples,
    vocab=vocab,
    max_length=max_length,
)

# Validation and test datasets are built with identical settings.
eval_kwargs = {"max_length": max_length, "stride": 128}
val_dataset = DeepPHQValDataset(val_df, vocab, **eval_kwargs)
test_dataset = DeepPHQValDataset(test_df, vocab, **eval_kwargs)

# ---- DataLoaders ----
# Evaluation loaders keep dataset order; the training loader comes from the
# project's balanced-loader helper with the configured batch size.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)
train_loader = create_balanced_dataloader(
    train_dataset,
    batch_size=config["dataloader"]["batch_size"],
)

# ---- Sanity check ----
# Pull one training batch and print the shape of each field.
batch = next(iter(train_loader))
for field in ("input_ids", "label", "pid"):
    print(batch[field].shape)

# 4. Init Transformer Model

In [None]:
# ---- Model hyper-parameters come from the "model" section of the config ----
model_cfg = config["model"]

# ---- Pick the compute device: CUDA first, then MPS, otherwise CPU ----
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "mps" if torch.backends.mps.is_available() else "cpu"

print("Using device:", device)

# ---- Build the model and move it to the selected device ----
# input_size is the vocabulary size; the rest is copied from the config.
model_kwargs = {
    key: model_cfg[key]
    for key in ("output_size", "hidden_dim", "nhead", "num_layers", "dropout")
}
model = DeepPHQTransformer(input_size=len(vocab), **model_kwargs)
model = model.to(device)

# 5. Train and Evaluate the Model

In [None]:
def evaluate_pid_level(model, val_loader, device="cuda"):
    """Evaluate `model` per participant (PID) rather than per sample.

    Every batch from `val_loader` must be a mapping with the tensors
    "input_ids", "label" and "pid". All predictions that share a PID are
    averaged into one prediction, which is compared with that PID's label.

    Args:
        model: Module called as `model(input_ids)`; it is put in eval mode.
        val_loader: Iterable of batches as described above.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple `(mse, mae, rmse)` computed over PIDs. The values are NaN if
        the loader yields no batches.
    """
    model.eval()
    pid2preds = {}
    pid2labels = {}

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].cpu().numpy()
            pids = batch["pid"].cpu().numpy()

            # Keep the batch dimension explicitly. A bare .squeeze() collapses
            # a batch of one into a 0-d array, which cannot be iterated by
            # zip() below (this happens when the last batch has one sample).
            outputs = model(input_ids)
            outputs = outputs.reshape(input_ids.shape[0], -1).squeeze(-1)
            outputs = outputs.cpu().numpy()

            for pid, lab, pred in zip(pids, labels, outputs):
                pid2preds.setdefault(pid, []).append(pred)
                # The last label seen for a PID wins; all samples of one PID
                # are expected to carry the same label.
                pid2labels[pid] = lab

    # Collapse each PID's predictions into their mean.
    final_preds = np.array([np.mean(preds) for preds in pid2preds.values()])
    final_labels = np.array([pid2labels[pid] for pid in pid2preds])

    errors = final_preds - final_labels
    mse = (errors ** 2).mean()
    mae = np.abs(errors).mean()
    rmse = mse ** 0.5

    return mse, mae, rmse

In [None]:
train_cfg = config["training"]

# Numeric hyper-parameters are cast with float(), presumably because YAML can
# load values written like "1e-4" as strings. The clipping threshold gets the
# same treatment for consistency.
optimizer = AdamW(
    model.parameters(),
    lr=float(train_cfg["learning_rate"]),
    weight_decay=float(train_cfg["weight_decay"])
)
max_grad_norm = float(train_cfg["gradient_clip"])

criterion = nn.MSELoss()

# Per-epoch history, plotted after training.
train_losses = []
val_mses = []
val_maes = []
val_rmses = []

for epoch in range(train_cfg["num_epochs"]):
    # ==========================
    #   1. TRAIN
    # ==========================
    # evaluate_pid_level() switches the model to eval mode, so training mode
    # has to be re-enabled at the start of every epoch.
    model.train()
    epoch_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        # Keep the batch dimension: a bare .squeeze() turns a one-sample batch
        # into a 0-d tensor, which MSELoss only accepts via broadcasting and
        # with a size-mismatch warning.
        outputs = model(input_ids).reshape(input_ids.shape[0], -1).squeeze(-1)

        loss = criterion(outputs, labels)
        loss.backward()

        # Clip the global gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        epoch_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ==========================
    #   2. PID-LEVEL VALIDATION
    # ==========================
    mse, mae, rmse = evaluate_pid_level(model, val_loader, device)

    val_mses.append(mse)
    val_maes.append(mae)
    val_rmses.append(rmse)

    print(f"[Epoch {epoch+1}] "
          f"Train Loss = {avg_train_loss:.4f} | "
          f"Val MSE = {mse:.4f} | Val MAE = {mae:.4f} | Val RMSE = {rmse:.4f}")

# 6. Save the Trained Model as a .pt Checkpoint

In [None]:
# The checkpoint is written to <save_dir>/<level>.pt, so each data level
# gets its own file.
save_dir = Path(config["checkpoint"]["save_dir"])
level = config["data"]["level"]   # "word", "sentence", "dialogue"

model_name = f"{level}.pt"
save_path = save_dir / model_name

# Create the target directory (and any missing parents) before saving.
save_dir.mkdir(parents=True, exist_ok=True)

# The weights are stored together with the vocabulary and the full config
# that were used in this run.
torch.save({
    "model_state": model.state_dict(),
    "vocab": vocab,
    "config": config,
}, save_path)

# The check mark is written as an escape so it cannot be corrupted by a
# wrong file encoding (it was previously stored as mojibake).
print(f"[\u2713] Saved model to: {save_path}")

In [None]:
import matplotlib.pyplot as plt

# Draw the per-epoch training loss and the three PID-level validation
# metrics on one shared axis.
curves = (
    (train_losses, "Train Loss (MSE)"),
    (val_mses, "Val MSE"),
    (val_maes, "Val MAE"),
    (val_rmses, "Val RMSE"),
)
for values, curve_label in curves:
    plt.plot(values, label=curve_label)

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss (PID-level)")
plt.grid(True)
plt.legend()
plt.show()