In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.getcwd())

# Just run this block. Please do not modify the following code.
import math
import time
import io
import numpy as np
import csv
from IPython.display import Image

# Pytorch package
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Tqdm progress bar
from tqdm import tqdm_notebook, tqdm

# Code provide to you for training and evaluation
# Not sure why this import doesn't work
from utils import train, evaluate, set_seed_nb, plot_curves

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython


In [2]:
# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("You are using device: %s" % device)

You are using device: cuda


In [7]:
# Load in the transcripts and process them into their relevent chunks
from src import context_chunker 
import pandas as pd
from pathlib import Path
import re

# --- Paths ---
path = "data/raw/transcripts"
meta_path = "data/raw/full_test_split.csv"

# --- Load Data ---
phq_transcript_alignment = context_chunker.match_phq_transcripts(path, meta_path)


transcripts = context_chunker.generate_dataset(path, phq_transcript_alignment)


# --- Build Representations 
sequence_len = 512
num_samples_per_pid = 20
output_dir = "data/processed"
dataset_word, dataset_sentence, dataset_dialogue = context_chunker.build_text_representations(transcripts, sequence_len, num_samples_per_pid)

context_chunker.save_text_representations(
    dataset_word,
    dataset_sentence,
    dataset_dialogue,
    output_dir
)
    

Loaded PHQ mapping for 188 participants.
⚠ Missing PHQ score for: [303, 304, 305, 307, 310, 312, 313, 315, 316, 317, 318, 319, 320, 321, 322, 324, 325, 326, 327, 328, 330, 331, 333, 335, 336, 338, 339, 340, 341, 343, 344, 345, 346, 347, 348, 350, 351, 352, 353, 355, 356, 357, 358, 360, 362, 363, 364, 366, 367, 368, 369, 370, 371, 372, 374, 375, 376, 377, 379, 380, 381, 382, 383, 385, 386, 388, 389, 390, 391, 392, 393, 395, 397, 400, 401, 402, 403, 404, 406, 409, 412, 413, 414, 415, 416, 417, 418, 419, 420, 422, 423, 425, 426, 427, 428, 429, 430, 433, 434, 436, 437, 439, 440, 441, 443, 444, 445, 446, 447, 448, 449, 451, 454, 455, 456, 457, 458, 459, 463, 464, 468, 471, 472, 473, 474, 475, 476, 477, 478, 479, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492]
[!] Empty or malformed transcript for 487, skipping.
Loaded 187 transcripts with PHQ scores.
Saved word-level dataset → data/processed\word_level.csv
Saved sentence-level dataset → data/processed\sentence_level.csv
Saved dialogu

In [20]:
# Turn the chunks generated above into train split datasets
from src import dataset_builder

# if datasets were not loaded previously, load from csv
# df = pd.read_csv('your_data.csv')

df = pd.DataFrame(dataset_word, columns=["PID",'Text', 'PHQ_Score'])
normed_scores, score_mean, score_std = dataset_builder.normalize_scores(df["PHQ_Score"])
word_train, word_test, vocab = dataset_builder.preprocess_data(df["Text"], normed_scores)

Score normalization:
  Original: mean=1.01, std=4.72, range=[-1.00, 22.00]
  Normalized: mean=0.00, std=1.00, range=[-0.42, 4.45]
Preprocessing 3740 samples
Train size: 2805, Test size: 935
Vocabulary size: 8212
Train dataset size: 2805
Test dataset size: 935


In [6]:
from models import TextCNN
from models import embeddings

embeddings_file = "models/glove_wiki50/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt"

#embedding_matrix = embeddings.load_embedding_file(embeddings_file)

embedding_matrix = embeddings.create_embedding_matrix(vocab, embeddings_file, embedding_dim=50)


Loading embeddings from models/glove_wiki50/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt...
Loaded 1287614 word vectors (dimension: 50)
Building embedding matrix for vocabulary size: 8212
Found embeddings for 7728/8212 words (94.1%)


In [23]:
cnn_model = TextCNN.CNNTextRegressor(len(vocab), embedding_dim = 50, kernel_size = 4, pretrained_embedding = True, embedding_matrix = embedding_matrix, freeze_embeddings = False)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(cnn_model.parameters(),lr=0.001)

history = train(cnn_model, criterion, optimizer, word_train, word_test, lr=0.001)
results = evaluate(cnn_model, word_test)

denormalized_predictions = dataset_builder.denormalize_predictions(results['predictions'], score_mean, score_std)
denormalized_actual = dataset_builder.denormalize_predictions(results['actual'], score_mean, score_std)

print(results)


Epoch 1/20
Train Loss (MSE): 1.0693, Train MAE: 0.7623
Val Loss (MSE): 1.0140, Val MAE: 0.6666
--------------------------------------------------
Epoch 2/20
Train Loss (MSE): 1.0074, Train MAE: 0.6698
Val Loss (MSE): 1.0128, Val MAE: 0.6582
--------------------------------------------------
Epoch 3/20
Train Loss (MSE): 1.0026, Train MAE: 0.6690
Val Loss (MSE): 1.0126, Val MAE: 0.6569
--------------------------------------------------
Epoch 4/20
Train Loss (MSE): 1.0023, Train MAE: 0.6658
Val Loss (MSE): 1.0122, Val MAE: 0.6529
--------------------------------------------------
Epoch 5/20
Train Loss (MSE): 1.0020, Train MAE: 0.6599
Val Loss (MSE): 1.0121, Val MAE: 0.6522
--------------------------------------------------
Epoch 6/20
Train Loss (MSE): 1.0074, Train MAE: 0.6605
Val Loss (MSE): 1.0119, Val MAE: 0.6494
--------------------------------------------------
Epoch 7/20
Train Loss (MSE): 1.0069, Train MAE: 0.6570
Val Loss (MSE): 1.0118, Val MAE: 0.6488
-----------------------------