In [1]:
# IMPORTS

import os
import warnings
from transformers.utils import logging as hf_logging

warnings.filterwarnings("ignore", category=FutureWarning)
hf_logging.set_verbosity_error()

import numpy as np
import pandas as pd
import glob
from scipy.io import wavfile
from IPython.display import display
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import random
from transformers import BertConfig
from collections import defaultdict
import copy

In [2]:
# PARAMETERS

BATCH_SIZE = 16
PATIENCE = 15
DROPOUT = 0.4
NUM_EPOCHS = 80
LEARNING_RATE = 9e-6
MIN_DELTA = 0.005
CHUNK_LENGTH = 256
CHUNK_STRIDE = 128

In [3]:
# Seeder

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# Load CSVs and build label dicts

train_file = pd.read_csv('00_dataset_daicwoz/train_split_Depression_AVEC2017.csv')
val_file = pd.read_csv('00_dataset_daicwoz/dev_split_Depression_AVEC2017.csv')
test_file = pd.read_csv('00_dataset_daicwoz/full_test_split.csv')

train_label_dict = train_file.set_index('Participant_ID')['PHQ8_Binary'].to_dict()
val_label_dict = val_file.set_index('Participant_ID')['PHQ8_Binary'].to_dict()
test_label_dict = test_file.set_index('Participant_ID')['PHQ_Binary'].to_dict()

print("Train label distribution:\n", pd.Series(list(train_label_dict.values())).value_counts().to_string(index=True))
print("\nVal label distribution:\n", pd.Series(list(val_label_dict.values())).value_counts().to_string(index=True))
print("\nTest label distribution:\n", pd.Series(list(test_label_dict.values())).value_counts().to_string(index=True))

Train label distribution:
 0    77
1    30

Val label distribution:
 0    23
1    12

Test label distribution:
 0    33
1    14


In [5]:
# PUT ALL TRANSCRIPT ANSWERS AS ONE DATAFRAME

interviews = []

directory = os.fsencode('02_transcripts')
    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    
    interview = []
    transcript_file = pd.read_csv(directory + '/' + filename, sep='\t', lineterminator='\r')
    for index, row in transcript_file.iterrows():
        speaker = row.get('speaker', '')
        value = row.get('value', '')
        if(speaker and value):
            interview.append(f"{speaker}: ({value})")
    interviews.append([int(filename.split('_')[0]), ". ".join(interview)])

df = pd.DataFrame(interviews, columns=['Participant_ID', 'interview'])
print('Rows: ', len(df))
display(df.head())

lengths = [len(data) for _, data in interviews]
min_length = min(lengths)
max_length = max(lengths)
print("Min length:", min_length)
print("Max length:", max_length)

Rows:  187


Unnamed: 0,Participant_ID,interview
0,481,Participant: (<synch>). Ellie: (IntroV4Confirm...
1,407,Participant: (<sync>). Ellie: (IntroV4Confirma...
2,344,Ellie: (hi i'm ellie thanks for coming in toda...
3,393,Participant: (<sync>). Ellie: (IntroV4Confirma...
4,477,Participant: (<synch>). Ellie: (IntroV4Confirm...


Min length: 5595
Max length: 31505


In [6]:
# TRAIN & VAL DATAFRAME

train_subset = train_file[['Participant_ID', 'PHQ8_Binary']]
train_df = pd.merge(train_subset, df, on='Participant_ID', how='inner')

val_subset = val_file[['Participant_ID', 'PHQ8_Binary']]
val_df = pd.merge(val_subset, df, on='Participant_ID', how='inner')

test_subset = test_file[['Participant_ID', 'PHQ_Binary']]
test_df = pd.merge(test_subset, df, on='Participant_ID', how='inner')
test_df = test_df.rename(columns={'PHQ_Binary': 'PHQ8_Binary'})

print(f"Train Rows: {len(train_df)}, Val Rows: {len(val_df)}, Test Rows: {len(test_df)}")

Train Rows: 106, Val Rows: 35, Test Rows: 46


In [7]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512, stride=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.samples = []

        for _, row in df.iterrows():
            text = row['interview']
            label = row['PHQ8_Binary']
            participant_id = row['Participant_ID']

            # Encode full text (no truncation)
            tokens = tokenizer.encode(text, add_special_tokens=True)

            # Chunk the token list
            for i in range(0, len(tokens), max_length - stride):
                chunk = tokens[i:i + max_length]
                if len(chunk) < 10:
                    continue
                self.samples.append({
                    'input_ids': chunk,
                    'label': label,
                    'participant_id': participant_id
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        input_ids = sample['input_ids']
        label = sample['label']
        participant_id = sample['participant_id']

        encoding = self.tokenizer.prepare_for_model(
            input_ids,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        item['participant_id'] = torch.tensor(participant_id, dtype=torch.long)
        return item

In [8]:
# Early Stopping

class EarlyStopping:
    def __init__(self):
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss > MIN_DELTA:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= PATIENCE:
                self.early_stop = True

In [9]:
# Load Model and Tokenizer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name, hidden_dropout_prob=DROPOUT, attention_probs_dropout_prob=DROPOUT)
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [10]:
# Worker Seed

def seed_worker(worker_id):
    worker_seed = SEED + worker_id
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [11]:
# DataLoaders

train_dataset = TextDataset(train_df, tokenizer, CHUNK_LENGTH, CHUNK_STRIDE)
val_dataset = TextDataset(val_df, tokenizer, CHUNK_LENGTH, CHUNK_STRIDE)
test_dataset = TextDataset(test_df, tokenizer, CHUNK_LENGTH, CHUNK_STRIDE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, worker_init_fn=seed_worker)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0, worker_init_fn=seed_worker)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=0, worker_init_fn=seed_worker)

In [12]:
# Values

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['PHQ8_Binary']),
    y=train_df['PHQ8_Binary']
)
#class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights = torch.tensor([1.0, 3.0], dtype=torch.float).to(device)
print(class_weights);

loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.002)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=3)

tensor([1., 3.], device='cuda:0')


In [13]:
# Best Values

best_run_val_loss = float('inf')
best_run_f1 = 0.0
best_run_model_state = None

In [58]:
# Training Code
early_stopping = EarlyStopping()

for epoch in range(NUM_EPOCHS):
    # Training
    model.train()
    current_lr = optimizer.param_groups[0]['lr']
    train_loss = 0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'participant_id'}
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, inputs['labels'])

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    participant_logits = defaultdict(list)
    participant_labels = {}

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k not in ['participant_id']}
            participant_ids = batch['participant_id']

            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            val_loss += loss.item()

            logits = outputs.logits.detach().cpu().numpy()
            labels = inputs['labels'].cpu().numpy()

            for i in range(len(participant_ids)):
                pid = participant_ids[i].item()
                participant_logits[pid].append(logits[i])
                participant_labels[pid] = labels[i]

    val_loss /= len(val_loader)

    # Aggregate logits and compute predictions
    final_preds, final_labels = [], []
    for pid in participant_logits:
        avg_logits = np.mean(participant_logits[pid], axis=0)
        pred = np.argmax(avg_logits)
        final_preds.append(pred)
        final_labels.append(participant_labels[pid])
        
    # Compute macro F1 score
    val_f1 = f1_score(final_labels, final_preds, average='macro')    

    print(f"Epoch {epoch+1} complete. Train Loss: {train_loss:.4f}  Val Loss: {val_loss:.4f}  Val Macro F1: {val_f1:.4f}  LR: {current_lr:.8f}")

    # Save best model of current training
    if val_f1 > best_run_f1 or (val_f1 == best_run_f1 and val_loss < best_run_val_loss):
        best_run_f1 = val_f1
        best_run_val_loss = val_loss
        best_run_model_state = copy.deepcopy(model.state_dict())
    
    # Early stopping on val loss
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

    scheduler.step(val_loss)

Epoch 1 complete. Train Loss: 0.7071  Val Loss: 0.7067  Val Macro F1: 0.2553  LR: 0.00000900
Epoch 2 complete. Train Loss: 0.7043  Val Loss: 0.6870  Val Macro F1: 0.3966  LR: 0.00000900
Epoch 3 complete. Train Loss: 0.6990  Val Loss: 0.6929  Val Macro F1: 0.4898  LR: 0.00000900
Epoch 4 complete. Train Loss: 0.6788  Val Loss: 0.7655  Val Macro F1: 0.4582  LR: 0.00000900
Epoch 5 complete. Train Loss: 0.6660  Val Loss: 0.7683  Val Macro F1: 0.4582  LR: 0.00000900
Epoch 6 complete. Train Loss: 0.6514  Val Loss: 0.7410  Val Macro F1: 0.4582  LR: 0.00000900
Epoch 7 complete. Train Loss: 0.6251  Val Loss: 0.7524  Val Macro F1: 0.4582  LR: 0.00000630
Epoch 8 complete. Train Loss: 0.5990  Val Loss: 0.7461  Val Macro F1: 0.5100  LR: 0.00000630
Epoch 9 complete. Train Loss: 0.5855  Val Loss: 0.7587  Val Macro F1: 0.4582  LR: 0.00000630
Epoch 10 complete. Train Loss: 0.5565  Val Loss: 0.7660  Val Macro F1: 0.5100  LR: 0.00000630
Epoch 11 complete. Train Loss: 0.5303  Val Loss: 0.7741  Val Macro F1

In [59]:
# Evaluation of last training

if best_run_model_state is not None:
    print("Evaluating best model from last training run...")
    model.load_state_dict(best_run_model_state)
    model.eval()

    # Run validation again
    participant_logits = defaultdict(list)
    participant_labels = {}

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'participant_id'}
            participant_ids = batch['participant_id']

            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            labels = inputs['labels'].cpu().numpy()

            for i in range(len(participant_ids)):
                pid = participant_ids[i].item()
                participant_logits[pid].append(probs[i])
                participant_labels[pid] = labels[i]

    # Aggregate predictions
    final_probs, final_preds, final_labels = [], [], []
    for pid in participant_logits:
        avg_probs = np.mean(participant_logits[pid], axis=0)
        final_preds.append(np.argmax(avg_probs))
        final_probs.append(avg_probs[1])  # for AUROC
        final_labels.append(participant_labels[pid])

    # Metrics
    val_f1 = f1_score(final_labels, final_preds, average='macro')
    auroc = roc_auc_score(final_labels, final_probs)

    print(f"val_loss: {best_run_val_loss:.4f}")
    print(f"AUROC: {auroc:.4f}")
    print(classification_report(final_labels, final_preds, digits=2))
else:
    print("No best model was found during this run.")

Evaluating best model from last training run...
val_loss: 0.8274
AUROC: 0.6051
              precision    recall  f1-score   support

           0       0.69      0.78      0.73        23
           1       0.44      0.33      0.38        12

    accuracy                           0.63        35
   macro avg       0.57      0.56      0.56        35
weighted avg       0.61      0.63      0.61        35



In [None]:
# val_loss: 0.3843
#               precision    recall  f1-score   support

#            0       0.94      0.74      0.83        23
#            1       0.65      0.92      0.76        12

#     accuracy                           0.80        35
#    macro avg       0.80      0.83      0.79        35
# weighted avg       0.84      0.80      0.81        35

In [14]:
# Evaluation with the best model

filename_best = '02_best_text_model.pt'
checkpoint = torch.load(filename_best, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print("Loaded best model weights.")

# Aggregation storage
participant_probs = defaultdict(list)
participant_labels = {}

with torch.no_grad():
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'participant_id'}
        participant_ids = batch['participant_id']

        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
        labels = inputs['labels'].cpu().numpy()

        for i in range(len(participant_ids)):
            pid = participant_ids[i].item()
            participant_probs[pid].append(probs[i])
            participant_labels[pid] = labels[i]

# Average probabilities per participant
final_probs, final_labels = [], []
for pid in participant_probs:
    avg_prob = np.mean(participant_probs[pid])
    final_probs.append(avg_prob)
    final_labels.append(participant_labels[pid])

# AUROC and classification report
auroc = roc_auc_score(final_labels, final_probs)
preds = [1 if p >= 0.5 else 0 for p in final_probs]

print(f"\nValidation AUROC: {auroc:.4f}")
print(classification_report(final_labels, preds, digits=2))

Loaded best model weights.

Validation AUROC: 0.6377
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        23
           1       0.42      0.42      0.42        12

    accuracy                           0.60        35
   macro avg       0.56      0.56      0.56        35
weighted avg       0.60      0.60      0.60        35



In [None]:
# Loaded best model weights
# Validation AUROC: 0.8333
#               precision    recall  f1-score   support

#            0       0.90      0.78      0.84        23
#            1       0.67      0.83      0.74        12

#     accuracy                           0.80        35
#    macro avg       0.78      0.81      0.79        35
# weighted avg       0.82      0.80      0.80        35

In [15]:
# Test Evaluation

model.eval()
participant_probs = defaultdict(list)
participant_labels = {}

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'participant_id'}
        participant_ids = batch['participant_id']

        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
        labels = inputs['labels'].cpu().numpy()

        for i in range(len(participant_ids)):
            pid = participant_ids[i].item()
            participant_probs[pid].append(probs[i])
            participant_labels[pid] = labels[i]

# Aggregate per participant
final_probs, final_labels = [], []
for pid in participant_probs:
    avg_prob = np.mean(participant_probs[pid])
    final_probs.append(avg_prob)
    final_labels.append(participant_labels[pid])

# Compute AUROC and classification report
auroc = roc_auc_score(final_labels, final_probs)
preds = [1 if p >= 0.5 else 0 for p in final_probs]

print(f"Test AUROC: {auroc:.4f}")
print(classification_report(final_labels, preds, digits=2))

Test AUROC: 0.5982
              precision    recall  f1-score   support

           0       0.76      0.69      0.72        32
           1       0.41      0.50      0.45        14

    accuracy                           0.63        46
   macro avg       0.59      0.59      0.59        46
weighted avg       0.65      0.63      0.64        46



In [None]:
# Test AUROC: 0.5915
#               precision    recall  f1-score   support

#            0       0.74      0.72      0.73        32
#            1       0.40      0.43      0.41        14

#     accuracy                           0.63        46
#    macro avg       0.57      0.57      0.57        46
# weighted avg       0.64      0.63      0.63        46

In [50]:
# Save Best Model

torch.save({
    'model_state_dict': model.state_dict(),
    'best_val_loss': best_run_val_loss,
    'best_f1': val_f1,
}, filename_best)

print("Model Saved!")

Model Saved!
