In [None]:
!pip install --upgrade wandb -qqq

In [1]:
import wandb
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from datetime import datetime

#use gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Define the label mapping
#label_mapping = {"Anxiety": 0, "Normal": 1, "Depression": 2, "Suicidal": 3, "Stress": 4}

# Load data
train_df = pd.read_csv("Dataprep/train_dataset.csv")
val_df = pd.read_csv("Dataprep/val_dataset.csv")
test_df = pd.read_csv("Dataprep/test_dataset.csv")

# Map the diagnosis labels to numerical labels using the "status" column
medication_list = train_df["Medication"].unique().tolist()
medication_mapping = {med: idx for idx, med in enumerate(medication_list)}

train_df["label"] = train_df["Medication"].map(medication_mapping)
val_df["label"] = val_df["Medication"].map(medication_mapping)
test_df["label"] = test_df["Medication"].map(medication_mapping)

train_df.dropna(subset=['label'], inplace=True)
val_df.dropna(subset=['label'], inplace=True)
test_df.dropna(subset=['label'], inplace=True)

# Define hyperparameters
length= 128
num_epochs = 10

# Class for the dataset
class MentalHealthDataset2(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: encoding[key].squeeze() for key in encoding}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load Bio_ClinicalBERT tokenizer
model_name = "distilbert-base-uncased"  # Or "emilyalsentzer/Bio_ClinicalBERT" if needed

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(medication_mapping))
#tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
# Create dataset objects for train, validation, and test sets using the "statement" column for text
train_dataset = MentalHealthDataset2(train_df['Diagnosis'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = MentalHealthDataset2(val_df['Diagnosis'].tolist(), val_df['label'].tolist(), tokenizer)
test_dataset = MentalHealthDataset2(test_df['Diagnosis'].tolist(), test_df['label'].tolist(), tokenizer)

# Load the pre-trained Bio_ClinicalBERT model with a classification head
#model = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", num_labels=len(medication_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir="../results",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Evaluate on the test set after training
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test evaluation results:", eval_results)

# Save the model named with a timestamp and hyperparameter configurations
current_time = datetime.now().strftime("%d.%m.%Y-%H.%M")
model_save_path = f"models/BERT/{current_time}-ML{length}E{num_epochs}"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")


cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33magarwalm-9032[0m ([33magarwalm-9032-king-s-college-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,1.803254
2,No log,1.804792
3,1.786700,1.803906
4,1.786700,1.817041
5,1.777600,1.819546
6,1.777600,1.825556
7,1.766500,1.825692
8,1.766500,1.826506
9,1.766500,1.826138
10,1.767300,1.826421


Test evaluation results: {'eval_loss': 1.815127968788147, 'eval_runtime': 0.3102, 'eval_samples_per_second': 241.751, 'eval_steps_per_second': 16.117, 'epoch': 10.0}
Model saved to models/BERT/21.03.2025-01.27-ML128E10


In [4]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import logging
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the test data
test_df = pd.read_csv("Dataprep/test_dataset.csv")
num_rows = 100
test_df = test_df.head(num_rows)
total_examples = len(test_df)
logging.info(f"Loaded test data with {num_rows} examples")
current_directory = os.getcwd()

# Construct the model path relative to the current directory
model_path = os.path.join(current_directory, "models", "BERT", "21.03.2025-01.27-ML128E10")

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Put the model in evaluation mode
model.eval()

# Define the label mapping and its inverse
medication_mapping = {'Anxiolytics': 0, 'Benzodiazepines': 1, 'Antidepressants': 2, 'Mood Stabilizers': 3, 'Antipsychotics': 4, 'Stimulants': 5}
inv_medication_mapping = {v: k for k, v in medication_mapping.items()}
# List to hold prediction results
predictions = []

logging.info("Starting predictions on test data...")

# Loop through each row in the test data
for idx, row in test_df.iterrows():
    text = row['Diagnosis']

    # Tokenize the statement with the same settings as during training
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=128
    )

    # Disable gradient calculations for inference
    with torch.no_grad():
        outputs = model(**encoding)

    # Convert logits to probabilities using softmax
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1)

    # Get top 3 predictions
    top3_prob, top3_idx = torch.topk(probabilities, k=3, dim=1)
    top3_prob = top3_prob.squeeze()  # Tensor of shape (3,)
    top3_idx = top3_idx.squeeze()  # Tensor of shape (3,)

    predicted_med_1 = inv_medication_mapping[top3_idx[0].item()]
    predicted_med_2 = inv_medication_mapping[top3_idx[1].item()]
    predicted_med_3 = inv_medication_mapping[top3_idx[2].item()]

    confidence_score_1 = top3_prob[0].item()
    confidence_score_2 = top3_prob[1].item()
    confidence_score_3 = top3_prob[2].item()

    predictions.append({
        "example_number": row["example number"] if "example number" in row else idx,
        "statement": text,
        "true_status": row["Medication"] if "Medication" in row else None,
        "predicted_med_1": predicted_med_1,
        "confidence_score_1": confidence_score_1,
        "predicted_med_2": predicted_med_2,
        "confidence_score_2": confidence_score_2,
        "predicted_med_3": predicted_med_3,
        "confidence_score_3": confidence_score_3
    })

    # Log progress every 100 rows or at the end
    if (idx + 1) % 100 == 0 or (idx + 1) == total_examples:
        logging.info(f"Processed {idx + 1}/{total_examples} examples.")

# Create a DataFrame with predictions and save to CSV
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv("test_predictions.csv", index=False)
logging.info("Predictions with top 3 diagnoses and confidence scores saved to test_predictions.csv")

# --- Analysis ---

if predictions_df["true_status"].notnull().all():
    total = len(predictions_df)
    correct_top1 = 0
    correct_top2 = 0
    correct_top3 = 0

    # Loop over each prediction and check for correctness
    for idx, row in predictions_df.iterrows():
        true_status = row["true_status"]

        # Top-1 accuracy: Check if the top prediction matches true status
        if row["predicted_med_1"] == true_status:
            correct_top1 += 1

        # Top-2 accuracy: Check if true status is in either of the top 2 predictions
        if true_status in [row["predicted_med_1"], row["predicted_med_2"]]:
            correct_top2 += 1

        # Top-3 accuracy: Check if true status is in any of the top 3 predictions
        if true_status in [row["predicted_med_1"], row["predicted_med_2"], row["predicted_med_3"]]:
            correct_top3 += 1

    # Compute accuracy percentages
    top1_accuracy = (correct_top1 / total) * 100
    top2_accuracy = (correct_top2 / total) * 100
    top3_accuracy = (correct_top3 / total) * 100

    # Logging accuracy results
    logging.info(f"Top-1 accuracy: {correct_top1} correct out of {total} ({top1_accuracy:.2f}%).")
    logging.info(f"Top-2 accuracy: {correct_top2} correct out of {total} ({top2_accuracy:.2f}%).")
    logging.info(f"Top-3 accuracy: {correct_top3} correct out of {total} ({top3_accuracy:.2f}%).")

    # Save accuracy results to CSV
    accuracy_results = pd.DataFrame({
        "Metric": ["Top-1 Accuracy", "Top-2 Accuracy", "Top-3 Accuracy"],
        "Correct Predictions": [correct_top1, correct_top2, correct_top3],
        "Total Predictions": [total, total, total],
        "Accuracy (%)": [top1_accuracy, top2_accuracy, top3_accuracy]
    })

    accuracy_results.to_csv("model_accuracy_analysis.csv", index=False)
    logging.info("Model accuracy analysis saved to model_accuracy_analysis.csv.")
else:
   logging.warning("True status values not found in all rows. Skipping analysis.")

cuda
