In [None]:
import sys
from pathlib import Path
# Make the repository root importable so the `src.*` and `models.*` imports
# below resolve. The root is taken to be two levels above the current
# working directory (parents[0] is the parent, parents[1] the grandparent).
PROJECT_ROOT = Path.cwd().resolve().parents[1]
_project_root_entry = str(PROJECT_ROOT)

# Guard the insert: re-running this cell must not stack duplicate entries
# on sys.path, so the path is only added (and reported) when it is missing.
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)
    print("Added:", PROJECT_ROOT)

print("CWD:", Path.cwd())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("sys.path contains project root?:", _project_root_entry in sys.path)

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from src.dataset import HANDataset, build_vocab, create_balanced_dataloader, split_by_pid

from models.attention_rnn.attention_model import AttentionRNN
import yaml

# Load config
# The path is relative, so it is resolved against the current working directory.
CONFIG_PATH = Path("config.yaml")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty YAML document parses to None; fail here with a clear message rather
# than later with "'NoneType' object is not subscriptable".
if not isinstance(config, dict):
    raise ValueError(
        f"{CONFIG_PATH} must contain a top-level mapping, got {type(config).__name__}"
    )

In [None]:
# Resolve the dataset directory named in the "data" section of the config.
data_cfg = config["data"]
root = Path(data_cfg["data_root"])

# Read the three CSV files in a fixed order: word, sentence, dialogue.
# Each file name comes from the corresponding key of the "data" section.
word_df, sentence_df, dialogue_df = (
    pd.read_csv(root / data_cfg[csv_key])
    for csv_key in ("word_csv", "sentence_csv", "dialogue_csv")
)

In [None]:
# Preview the first rows of each table with a single print call; the newline
# separator yields the same text as three consecutive print() calls.
print(
    # PID | Text (40–60 words) | PHQ_Score
    word_df.head(),
    # PID | Text (1–2 sentences) | PHQ_Score
    sentence_df.head(),
    # PID | Text | PHQ_Score
    dialogue_df.head(),
    sep="\n",
)

# Current HAN (Document → Sentences → Words)
# dialogue_df (Text) -> sentence_df (sentences) -> word tokenization

In [None]:
import nltk

# Fetch the NLTK "punkt" and "punkt_tab" resources before any dataset is built.
nltk.download("punkt")
nltk.download("punkt_tab")

# 1. Building comprehensive vocab
# Collect the "Text" column of all three tables (word, then sentence, then
# dialogue) into one flat list so the vocabulary covers every granularity.
all_texts = [
    text
    for frame in (word_df, sentence_df, dialogue_df)
    for text in frame["Text"].tolist()
]

# The minimum-frequency threshold is read from the "vocab" section of the config.
min_token_freq = config["vocab"]["min_freq"]
vocab = build_vocab(all_texts, min_freq=min_token_freq)

# 2-1. nltk (converting to hierarchical documents)
#      dialogue_df is the main data source: split_by_pid() produces the three
#      partitions, and each one is wrapped in a HANDataset sharing `vocab`.
train_df, val_df, test_df = split_by_pid(dialogue_df)

train_dataset, val_dataset, test_dataset = (
    HANDataset(partition, vocab) for partition in (train_df, val_df, test_df)
)

# 2-2. Index mapping (for each batch)
# The sorted distinct PHQ scores are numbered 0..K-1; score2idx and idx2score
# are the forward and inverse views of that numbering.
unique_scores = sorted(dialogue_df["PHQ_Score"].unique())
score2idx = dict(zip(unique_scores, range(len(unique_scores))))
idx2score = dict(enumerate(unique_scores))

# NOTE(review): this column is added to dialogue_df after split_by_pid() has
# already been called on it earlier in this cell, so whether the train/val/test
# frames carry "label_idx" depends on what split_by_pid returns — confirm that
# HANDataset does not rely on it.
dialogue_df["label_idx"] = dialogue_df["PHQ_Score"].map(score2idx)
# 3. dataloaders
# Training batches come from create_balanced_dataloader(), sized by the
# "dataloader" section of the config.
train_batch_size = config["dataloader"]["batch_size"]
train_loader = create_balanced_dataloader(train_dataset, batch_size=train_batch_size)

# Validation and test use plain, unshuffled DataLoaders with a fixed batch
# size of 32 (not taken from the config).
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=32, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# 4. verify shapes
# Pull a single training batch and print the shape of each field, one per line.
batch = next(iter(train_loader))
print(
    *(batch[field].shape for field in ("input_ids", "label", "pid")),
    sep="\n",
)

In [None]:
# ---- Load model config ----
# "model" section of the YAML config.
# NOTE(review): none of the cells visible here read model_cfg afterwards; the
# AttentionRNN constructor call passes literal hyperparameters — confirm intent.
model_cfg = config["model"]

# ---- Auto-select device ----
# Preference order: CUDA, then MPS, then CPU. `device` stays a plain string.
# The MPS backend module is looked up defensively: PyTorch builds that do not
# ship `torch.backends.mps` would otherwise raise AttributeError here.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

print("Using device:", device)

In [None]:
# Number of output classes: the count of distinct non-null PHQ scores in the
# dialogue table. The bare name on the last line displays the value.
NUM_CLASSES = len(dialogue_df["PHQ_Score"].dropna().unique())
NUM_CLASSES

In [None]:
# Model initialization
# Only the vocabulary size and the class count are derived from the data; the
# remaining hyperparameters are fixed literals in this cell.
model_kwargs = dict(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=64,
    num_classes=NUM_CLASSES,
    pad_idx=0,
    max_words=50,
    max_sentences=10,
)

# Build the network and move it to the selected device.
model = AttentionRNN(**model_kwargs).to(device)

In [None]:
train_cfg = config["training"]

# Both optimizer hyperparameters are passed through float() before use.
learning_rate = float(train_cfg["learning_rate"])
weight_decay = float(train_cfg["weight_decay"])

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Cross-entropy loss with label smoothing of 0.1.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

In [None]:
# # Training (earlier loop without validation or checkpointing; superseded by the next cell)
# num_epochs = train_cfg["num_epochs"]

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     for batch in train_loader:
#         inputs = batch["input_ids"].to(device)      
#         labels = batch["label"].to(device)
        
#         optimizer.zero_grad()

#         # forward pass
#         outputs = model(inputs) 
#         loss = criterion(outputs, labels)

#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

In [None]:
# Train for `num_epochs` epochs, validate after each one, and checkpoint the
# weights whenever the validation loss improves.
save_path = "best_han.pt"
best_val_loss = float("inf")

# Read and coerce the loop settings once, up front, mirroring the float()
# casts applied to the optimizer settings in the optimizer cell.
num_epochs = int(train_cfg["num_epochs"])
max_grad_norm = float(train_cfg["gradient_clip"])

avg_training_loss_list = []
avg_validation_loss_list = []


def _weighted_mean_loss(batch_losses, batch_sizes):
    """Return the per-sample mean of per-batch mean losses.

    Each batch loss is already a mean over its samples, so a plain average
    over batches would over-weight a smaller final batch. Weighting by batch
    size gives the true per-sample mean. Returns NaN when there are no batches.
    """
    if not batch_losses:
        return float("nan")
    return np.average(batch_losses, weights=batch_sizes)


for epoch in range(num_epochs):

    # =========================================================
    #   TRAINING
    # =========================================================
    model.train()
    train_losses = []
    train_batch_sizes = []

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress:
        optimizer.zero_grad()

        docs = batch["input_ids"].to(device)    # presumably [B, S, W] — TODO confirm
        labels = batch["label"].to(device)      # presumably [B] — TODO confirm

        # squeeze(1) only has an effect when dim 1 has size 1.
        # NOTE(review): with CrossEntropyLoss the output is presumably
        # [B, num_classes] logits — confirm the squeeze is still needed.
        outputs = model(docs).squeeze(1)

        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        # Read the scalar once; each .item() call copies it back to the host.
        batch_loss = loss.item()
        train_losses.append(batch_loss)
        train_batch_sizes.append(labels.size(0))
        progress.set_postfix({"loss": batch_loss})

    avg_train_loss = _weighted_mean_loss(train_losses, train_batch_sizes)
    avg_training_loss_list.append(avg_train_loss)

    # =========================================================
    #   VALIDATION
    # =========================================================
    model.eval()
    val_losses = []
    val_batch_sizes = []

    with torch.no_grad():
        for batch in val_loader:
            docs = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(docs).squeeze(1)
            loss = criterion(outputs, labels)

            val_losses.append(loss.item())
            val_batch_sizes.append(labels.size(0))

    avg_val_loss = _weighted_mean_loss(val_losses, val_batch_sizes)
    avg_validation_loss_list.append(avg_val_loss)

    print(f"Epoch {epoch+1}: Train={avg_train_loss:.4f}  Val={avg_val_loss:.4f}")

    # =========================================================
    #   SAVE BEST MODEL
    # =========================================================
    # A NaN validation loss (no validation batches) never compares as smaller,
    # so no checkpoint is written in that case.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)
        print(f"Saved best model → {save_path}")

print("\nTraining complete!")
print(f"Best validation loss: {best_val_loss:.4f}")

In [None]:
# Future exploration:
# 1. switching from GRU to BiLSTM 
    # self.gru = nn.LSTM(... bidirectional=True)
# 2. Utilizing sentence_df
    # sentence_df
# 3. Adding regularization (e.g. dropout)

In [None]:
import matplotlib.pyplot as plt

# Loss curves: one marker per epoch for the training and validation averages,
# drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 5))

for loss_history, curve_label in (
    (avg_training_loss_list, "Train Loss"),
    (avg_validation_loss_list, "Val Loss"),
):
    ax.plot(loss_history, label=curve_label, marker="o")

ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()
ax.grid(True)
plt.show()