In [None]:
# This script first pre-processes the dataset and deals with the problems raised by the Data Prep Team and by exploratory data analysis (EDA).
# It then develops a custom hybrid model,
# trains the model,
# uses the model to predict medication and therapy type based on age, gender and diagnosis,
# and finally evaluates performance and generates a .csv file of the results.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from datetime import datetime
import os

# Load dataset
df = pd.read_csv("train_dataset.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Preprocess categorical labels. The fitted encoders are kept at module level
# so later cells can apply the same mapping to the test split and decode predictions.
medication_encoder = LabelEncoder()
therapy_encoder = LabelEncoder()
gender_encoder = LabelEncoder()

df["Medication"] = medication_encoder.fit_transform(df["Medication"])
df["Therapy Type"] = therapy_encoder.fit_transform(df["Therapy Type"])
df["Gender"] = gender_encoder.fit_transform(df["Gender"])

# Normalize Age (the fitted scaler is reused on the test split)
scaler = StandardScaler()
df["Age"] = scaler.fit_transform(df[["Age"]])

# Compute class weights for Medication and Therapy.
# compute_class_weight returns one weight per entry of `classes`, in that order,
# and CrossEntropyLoss(weight=...) looks weights up by encoded class id.
# Series.unique() yields values in order of first appearance, which would
# attach weights to the wrong classes, so the ids are sorted first.
medication_weights = compute_class_weight(
    class_weight="balanced",
    classes=df["Medication"].sort_values().unique(),
    y=df["Medication"],
)
therapy_weights = compute_class_weight(
    class_weight="balanced",
    classes=df["Therapy Type"].sort_values().unique(),
    y=df["Therapy Type"],
)

# Convert to PyTorch tensors
medication_weights_tensor = torch.tensor(medication_weights, dtype=torch.float32).to(device)
therapy_weights_tensor = torch.tensor(therapy_weights, dtype=torch.float32).to(device)

# Load tokeniser
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


# Custom Dataset
class MentalHealthDataset(Dataset):
    """Map-style dataset pairing tokenised diagnosis text with tabular features and labels.

    Args:
        df: DataFrame with the columns "Diagnosis", "Age", "Gender",
            "Medication" and "Therapy Type". "Age" is converted to float32;
            the other three non-text columns are converted to int64, so they
            are assumed to be numerically encoded already.
        tokenizer: Callable invoked once per item as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            of tensors.
        max_length: Sequence length each diagnosis is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.diagnosis = df["Diagnosis"].tolist()
        # unsqueeze(1) turns the (N,) age vector into (N, 1)
        self.age = torch.tensor(df["Age"].values, dtype=torch.float32).unsqueeze(1)
        self.gender = torch.tensor(df["Gender"].values, dtype=torch.long)
        self.medication = torch.tensor(df["Medication"].values, dtype=torch.long)
        self.therapy = torch.tensor(df["Therapy Type"].values, dtype=torch.long)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per diagnosis string)."""
        return len(self.diagnosis)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return it with its features and labels.

        Returns:
            dict holding every tensor produced by the tokenizer plus the keys
            "age", "gender", "medication" and "therapy".
        """
        encoding = self.tokenizer(
            self.diagnosis[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when
        # max_length == 1, producing 0-d tensors that cannot be batched
        # into (batch, seq) inputs.
        item = {key: encoding[key].squeeze(0) for key in encoding}
        item["age"] = self.age[idx]
        item["gender"] = self.gender[idx]
        item["medication"] = self.medication[idx]
        item["therapy"] = self.therapy[idx]
        return item

# Model Class
class HybridMentalHealthModel(nn.Module):
    """Two-headed classifier combining a pretrained text encoder with age and gender.

    The encoder's first-token representation is concatenated with a 16-d
    projection of age and a 16-d gender embedding, passed through one hidden
    layer with ReLU, and fed to separate medication and therapy output layers.

    Args:
        bert_model: Identifier handed to ``AutoModel.from_pretrained``.
        num_genders: Number of rows in the gender embedding table.
        num_medications: Output width of the medication head.
        num_therapies: Output width of the therapy head.
        hidden_size: Width of the shared hidden layer.
    """

    def __init__(self, bert_model, num_genders, num_medications, num_therapies, hidden_size=128):
        super().__init__()
        demographic_dim = 16  # width of each of the age and gender feature vectors

        # Text encoder
        self.bert = AutoModel.from_pretrained(bert_model)
        text_dim = self.bert.config.hidden_size

        # Age projection and gender embedding
        self.age_fc = nn.Linear(1, demographic_dim)
        self.gender_fc = nn.Embedding(num_genders, demographic_dim)

        # Shared layer over [text | age | gender]
        self.fc = nn.Linear(text_dim + 2 * demographic_dim, hidden_size)

        # One output layer per prediction task
        self.medication_head = nn.Linear(hidden_size, num_medications)
        self.therapy_head = nn.Linear(hidden_size, num_therapies)

    def forward(self, input_ids, attention_mask, age, gender):
        """Return ``(medication_logits, therapy_logits)`` for a batch."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence representation.
        first_token = encoder_out.last_hidden_state[:, 0, :]

        features = torch.cat(
            (first_token, self.age_fc(age), self.gender_fc(gender)),
            dim=1,
        )
        shared = torch.relu(self.fc(features))

        medication_logits = self.medication_head(shared)
        therapy_logits = self.therapy_head(shared)
        return medication_logits, therapy_logits

# Hyperparameters. `length` and `num_epochs` are written into the checkpoint
# filename below, so they are tied to the values actually used for training
# rather than being maintained as separate, independently editable numbers.
epochs = 20
length = 128         # max token length, passed to the dataset
num_epochs = epochs  # alias used in the checkpoint name; always equals `epochs`

# Prepare DataLoader
dataset = MentalHealthDataset(df, tokenizer, max_length=length)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialise Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_genders = len(gender_encoder.classes_)
num_medications = len(medication_encoder.classes_)
num_therapies = len(therapy_encoder.classes_)

model = HybridMentalHealthModel("emilyalsentzer/Bio_ClinicalBERT", num_genders, num_medications, num_therapies)
model.to(device)

# Training Setup: one class-weighted cross-entropy loss per output head
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
medication_criterion = nn.CrossEntropyLoss(weight=medication_weights_tensor)
therapy_criterion = nn.CrossEntropyLoss(weight=therapy_weights_tensor)

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        age = batch["age"].to(device)
        gender = batch["gender"].to(device)
        medication_labels = batch["medication"].to(device)
        therapy_labels = batch["therapy"].to(device)

        # Forward pass
        med_pred, therapy_pred = model(input_ids, attention_mask, age, gender)

        # Compute weighted loss; the two task losses are summed with equal weight
        loss_med = medication_criterion(med_pred, medication_labels)
        loss_therapy = therapy_criterion(therapy_pred, therapy_labels)
        loss = loss_med + loss_therapy

        # Backpropagation
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean per-batch loss for the epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

os.makedirs("models/BERT", exist_ok=True)
# Save the model named with a timestamp and hyperparameter configurations
# (ML = max token length, E = number of epochs)
current_time = datetime.now().strftime("%d.%m.%Y-%H.%M")
model_save_path = f"models/BERT/{current_time}-ML{length}E{num_epochs}"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch 1/20, Loss: 3.1845
Epoch 2/20, Loss: 3.1490
Epoch 3/20, Loss: 3.1368
Epoch 4/20, Loss: 3.1281
Epoch 5/20, Loss: 3.1184
Epoch 6/20, Loss: 3.1268
Epoch 7/20, Loss: 3.1181
Epoch 8/20, Loss: 3.1312
Epoch 9/20, Loss: 3.1221
Epoch 10/20, Loss: 3.1230
Epoch 11/20, Loss: 3.1146
Epoch 12/20, Loss: 3.1101
Epoch 13/20, Loss: 3.1189
Epoch 14/20, Loss: 3.1118
Epoch 15/20, Loss: 3.1064
Epoch 16/20, Loss: 3.1064
Epoch 17/20, Loss: 3.1029
Epoch 18/20, Loss: 3.1113
Epoch 19/20, Loss: 3.1098
Epoch 20/20, Loss: 3.1125
Model saved to models/BERT/26.03.2025-00.46-ML128E20


In [3]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Initialise Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Derive the layer sizes from the encoders fitted on the training data instead
# of hard-coding them, so the rebuilt architecture always matches the saved
# state dict even if the label sets in the training CSV change.
num_genders = len(gender_encoder.classes_)
num_medications = len(medication_encoder.classes_)
num_therapies = len(therapy_encoder.classes_)

model = HybridMentalHealthModel("emilyalsentzer/Bio_ClinicalBERT", num_genders, num_medications, num_therapies)


# Load the saved state dictionary (map_location lets a GPU checkpoint load on CPU)
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.to(device)

model.eval()  # Put model in evaluation mode


HybridMentalHealthModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [4]:
# Read the held-out split
test_df = pd.read_csv("test_dataset.csv")

# Apply the encoders and the age scaler that were fitted on the training data,
# so test labels and features share the training-time mapping.
test_df = test_df.assign(**{
    "Medication": medication_encoder.transform(test_df["Medication"]),
    "Therapy Type": therapy_encoder.transform(test_df["Therapy Type"]),
    "Gender": gender_encoder.transform(test_df["Gender"]),
    "Age": scaler.transform(test_df[["Age"]]),
})


In [5]:
class MentalHealthTestDataset(Dataset):
    """Map-style evaluation dataset: tokenised diagnosis text, features and true labels.

    Args:
        df: DataFrame with the columns "Diagnosis", "Age", "Gender",
            "Medication" and "Therapy Type". "Age" is converted to float32;
            the other three non-text columns are converted to int64, so they
            are assumed to be numerically encoded already.
        tokenizer: Callable invoked once per item as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            of tensors.
        max_length: Sequence length each diagnosis is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.diagnosis = df["Diagnosis"].tolist()
        # unsqueeze(1) turns the (N,) age vector into (N, 1)
        self.age = torch.tensor(df["Age"].values, dtype=torch.float32).unsqueeze(1)
        self.gender = torch.tensor(df["Gender"].values, dtype=torch.long)
        self.medication = torch.tensor(df["Medication"].values, dtype=torch.long)
        self.therapy = torch.tensor(df["Therapy Type"].values, dtype=torch.long)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per diagnosis string)."""
        return len(self.diagnosis)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return it with its features and true labels.

        Returns:
            dict holding every tensor produced by the tokenizer plus the keys
            "age", "gender", "medication" and "therapy".
        """
        encoding = self.tokenizer(
            self.diagnosis[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when
        # max_length == 1.
        item = {key: encoding[key].squeeze(0) for key in encoding}
        item["age"] = self.age[idx]
        item["gender"] = self.gender[idx]
        item["medication"] = self.medication[idx]  # True labels
        item["therapy"] = self.therapy[idx]  # True labels
        return item


In [6]:
# Prepare DataLoader (shuffle=False keeps predictions aligned with test_df row order)
test_dataset = MentalHealthTestDataset(test_df, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set evaluation mode here as well, so dropout is disabled even if this cell
# is run without the cell that reloads the checkpoint having set it.
model.eval()

# Initialise lists for evaluation
true_medications, pred_medications = [], []
true_therapies, pred_therapies = [], []

# Inference loop (no gradients are needed for prediction)
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        age = batch["age"].to(device)
        gender = batch["gender"].to(device)
        true_medication = batch["medication"].cpu().numpy()
        true_therapy = batch["therapy"].cpu().numpy()

        # Get predictions
        med_logits, therapy_logits = model(input_ids, attention_mask, age, gender)

        # Convert logits to predicted class labels
        med_preds = torch.argmax(med_logits, dim=1).cpu().numpy()
        therapy_preds = torch.argmax(therapy_logits, dim=1).cpu().numpy()

        # Store results
        true_medications.extend(true_medication)
        pred_medications.extend(med_preds)
        true_therapies.extend(true_therapy)
        pred_therapies.extend(therapy_preds)


In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Encoded ids of every known class. Passing them as `labels` keeps each report
# row aligned with `target_names` even when a class appears in neither the
# true nor the predicted labels of the test split.
medication_label_ids = list(range(len(medication_encoder.classes_)))
therapy_label_ids = list(range(len(therapy_encoder.classes_)))

# Medication Prediction Accuracy
med_accuracy = accuracy_score(true_medications, pred_medications)
med_report = classification_report(
    true_medications,
    pred_medications,
    labels=medication_label_ids,
    target_names=medication_encoder.classes_,
    zero_division=0,  # report 0 for never-predicted classes without a warning
)

# Therapy Prediction Accuracy
therapy_accuracy = accuracy_score(true_therapies, pred_therapies)
therapy_report = classification_report(
    true_therapies,
    pred_therapies,
    labels=therapy_label_ids,
    target_names=therapy_encoder.classes_,
    zero_division=0,
)

print(f"Medication Prediction Accuracy: {med_accuracy:.4f}")
print(med_report)
print(f"Therapy Prediction Accuracy: {therapy_accuracy:.4f}")
print(therapy_report)

# Save results to CSV, decoding the integer class ids back to their original names
results_df = pd.DataFrame({
    "Diagnosis": test_df["Diagnosis"],
    "True Medication": medication_encoder.inverse_transform(true_medications),
    "Predicted Medication": medication_encoder.inverse_transform(pred_medications),
    "True Therapy": therapy_encoder.inverse_transform(true_therapies),
    "Predicted Therapy": therapy_encoder.inverse_transform(pred_therapies)
})

results_df.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")


Medication Prediction Accuracy: 0.1600
                  precision    recall  f1-score   support

 Antidepressants       0.00      0.00      0.00        14
  Antipsychotics       0.12      0.50      0.20         8
     Anxiolytics       0.00      0.00      0.00        13
 Benzodiazepines       0.11      0.10      0.11        10
Mood Stabilizers       0.00      0.00      0.00        17
           SSRIs       0.21      0.54      0.30        13

        accuracy                           0.16        75
       macro avg       0.07      0.19      0.10        75
    weighted avg       0.06      0.16      0.09        75

Therapy Prediction Accuracy: 0.2400
                                precision    recall  f1-score   support

  Cognitive Behavioral Therapy       0.33      0.23      0.27        13
Dialectical Behavioral Therapy       0.18      0.32      0.23        19
         Interpersonal Therapy       0.33      0.14      0.20        21
     Mindfulness-Based Therapy       0.26      0.27  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Rows where the predicted medication disagrees with the true medication
incorrect_predictions = results_df.loc[
    results_df["True Medication"].ne(results_df["Predicted Medication"])
]
print("Incorrect Medication Predictions:")
print(incorrect_predictions.head(10))

# Rows where the predicted therapy disagrees with the true therapy
incorrect_predictions_therapy = results_df.loc[
    results_df["True Therapy"].ne(results_df["Predicted Therapy"])
]
print("Incorrect Therapy Predictions:")
print(incorrect_predictions_therapy.head(10))


Incorrect Medication Predictions:
     Diagnosis   True Medication Predicted Medication  \
0      Anxiety       Anxiolytics                SSRIs   
1     Suicidal   Benzodiazepines       Antipsychotics   
2      Bipolar   Antidepressants       Antipsychotics   
4     Suicidal   Benzodiazepines       Antipsychotics   
5      Anxiety  Mood Stabilizers                SSRIs   
7      Anxiety   Antidepressants                SSRIs   
8   Depression  Mood Stabilizers      Benzodiazepines   
9      Bipolar  Mood Stabilizers       Antipsychotics   
11     Bipolar       Anxiolytics       Antipsychotics   
12     Anxiety  Mood Stabilizers                SSRIs   

                      True Therapy               Predicted Therapy  
0     Cognitive Behavioral Therapy  Dialectical Behavioral Therapy  
1        Mindfulness-Based Therapy           Interpersonal Therapy  
2            Interpersonal Therapy       Mindfulness-Based Therapy  
4            Interpersonal Therapy           Interpersonal The