In [None]:
%load_ext autoreload
%autoreload 2

import sys, os

# Put the current working directory on the module search path so that the
# project-local imports further down (`utils`, `src`, `models`) can resolve --
# this assumes the notebook is launched from the project root.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Just run this block. Please do not modify the following code.
import math
import time
import io
import numpy as np
import csv
from IPython.display import Image

# Pytorch package
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Tqdm progress bar
from tqdm import tqdm_notebook, tqdm

# Code provided to you for training and evaluation
# NOTE: if this import fails, check that the working directory added to sys.path above contains the `utils` module
from utils import train, evaluate, set_seed_nb, plot_curves

# The %autoreload magics at the top of this cell auto-reload external modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython


In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("You are using device: %s" % device)

In [None]:
# Load the transcripts and process them into their relevant chunks.
import re
from pathlib import Path

import pandas as pd

from src import context_chunker

# --- Settings ---
path = "data/raw/transcripts"                # transcript location handed to the chunker
meta_path = "data/raw/full_test_split.csv"   # metadata CSV handed to match_phq_transcripts
output_dir = "data/processed"                # destination handed to save_text_representations
sequence_len = 512
num_samples_per_pid = 20

# --- Load data ---
# Align the PHQ metadata with the transcripts, then build the transcript dataset
# from that alignment.
phq_transcript_alignment = context_chunker.match_phq_transcripts(path, meta_path)
transcripts = context_chunker.generate_dataset(path, phq_transcript_alignment)

# --- Build representations ---
# build_text_representations returns three datasets, unpacked here as the
# word-, sentence- and dialogue-level variants.
(dataset_word, dataset_sentence, dataset_dialogue) = context_chunker.build_text_representations(
    transcripts,
    sequence_len,
    num_samples_per_pid,
)

# --- Save representations ---
context_chunker.save_text_representations(dataset_word, dataset_sentence, dataset_dialogue, output_dir)
    

In [None]:
# Turn the chunks generated above into train/test datasets.
from src import dataset_builder

# If the datasets were not built earlier in this session, load them from CSV instead:
# df = pd.read_csv('your_data.csv')

df = pd.DataFrame(
    dataset_word,
    columns=["PID", "Text", "PHQ_Score"],
)

# normalize_scores returns the normalised scores together with the mean and std;
# the latter two are kept so predictions can be mapped back to the PHQ scale later.
# NOTE(review): the statistics are computed over every row before the split below,
# so they include the rows that end up in the test set -- confirm this is intended.
(normed_scores, score_mean, score_std) = dataset_builder.normalize_scores(df["PHQ_Score"])

# NOTE(review): only the text and the scores are passed on, not the PID column, so
# preprocess_data cannot group rows by participant; chunks of one participant may
# land in both splits -- verify against preprocess_data.
(word_train, word_test, vocab) = dataset_builder.preprocess_data(df["Text"], normed_scores)

In [None]:
from models import TextCNN, embeddings

# Pretrained word-vector file (presumably 50-dimensional GloVe, judging by the path
# and the embedding_dim used below -- confirm).
embeddings_file = "models/glove_wiki50/wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt"

# Disabled alternative, kept for reference:
# embedding_matrix = embeddings.load_embedding_file(embeddings_file)

# Build the embedding matrix for the dataset vocabulary from the vector file.
embedding_matrix = embeddings.create_embedding_matrix(
    vocab,
    embeddings_file,
    embedding_dim=50,
)


In [None]:
# --- Hyperparameters ---
# A single learning rate feeds both the optimizer and the train() call, so the
# two values can no longer drift apart when one of them is edited.
LEARNING_RATE = 0.001
SEED = 42

# Seed the global torch and NumPy RNGs before the model is constructed so that
# weight initialisation and training are repeatable across runs -- this assumes
# the model and train() draw from the global generators.
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Model, loss and optimizer ---
cnn_model = TextCNN.CNNTextRegressor(
    len(vocab),
    embedding_dim=50,
    kernel_size=4,
    pretrained_embedding=True,
    embedding_matrix=embedding_matrix,
    freeze_embeddings=False,
)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

# --- Train, then evaluate on the test split ---
history = train(cnn_model, criterion, optimizer, word_train, word_test, lr=LEARNING_RATE)
results = evaluate(cnn_model, word_test)

# Map the predictions and targets back from normalised scores to the original
# PHQ scale, using the mean/std obtained when the scores were normalised.
denormalized_predictions = dataset_builder.denormalize_predictions(results['predictions'], score_mean, score_std)
denormalized_actual = dataset_builder.denormalize_predictions(results['actual'], score_mean, score_std)

# NOTE(review): this prints the raw (still normalised) results; the de-normalised
# values above are computed but not reported anywhere in this cell.
print(results)
