In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

Load the data

In [6]:
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("validation.csv")
test_df = pd.read_csv("test.csv")

Tokenization of text

In [7]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Create a mapping from status labels to integers
label_map = {status: i for i, status in enumerate(train_df['status'].unique())}
id_map = {i: status for i, status in enumerate(train_df['status'].unique())}
print(id_map)

def tokenize(batch):
    # Map the status to numerical labels and add to the batch
    batch["label"] = [label_map[status] for status in batch["status"]]
    return tokenizer(batch["statement"], padding=True, truncation=True)

# Convert pandas DataFrames to Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply tokenization
train_dataset_encoded = train_dataset.map(tokenize, batched=True)
val_dataset_encoded = val_dataset.map(tokenize, batched=True)
test_dataset_encoded = test_dataset.map(tokenize, batched=True)

{0: 'Personality disorder', 1: 'Suicidal', 2: 'Depression', 3: 'Anxiety', 4: 'Normal', 5: 'Stress', 6: 'Bipolar'}


Map:   0%|          | 0/12360 [00:00<?, ? examples/s]

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

Map:   0%|          | 0/1546 [00:00<?, ? examples/s]

Model building

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=7)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(base_model, lora_config).to(device)
model.print_trainable_parameters()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 743,431 || all params: 67,702,286 || trainable%: 1.0981


In [9]:
batch_size = 64
model_name = "distilbert-LORA-finetuned-mental-health"

training_args = TrainingArguments(output_dir = model_name,
                                  num_train_epochs=3,
                                  learning_rate = 2e-5,
                                  per_device_train_batch_size= batch_size,
                                  per_device_eval_batch_size = batch_size,
                                  weight_decay=0.01,
                                  eval_strategy = 'epoch',
                                  disable_tqdm=False,
                                  report_to = "none")

Define metrics

In [10]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average='weighted')
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

Train the model

In [11]:
trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_dataset_encoded,
                  eval_dataset=val_dataset_encoded,
                  tokenizer=tokenizer)
trainer.train()

  trainer = Trainer(model=model,


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.605595,0.431068,0.361086
2,No log,1.353621,0.519741,0.467818
3,1.565800,1.292137,0.535275,0.493285


TrainOutput(global_step=582, training_loss=1.529619983791076, metrics={'train_runtime': 1481.7058, 'train_samples_per_second': 25.025, 'train_steps_per_second': 0.393, 'total_flos': 4997013171978240.0, 'train_loss': 1.529619983791076, 'epoch': 3.0})

Evaluate performance on test data

In [12]:
preds_outputs = trainer.predict(test_dataset_encoded)
preds_outputs.metrics

{'test_loss': 1.3088897466659546,
 'test_accuracy': 0.5394566623544631,
 'test_f1': 0.499537248422549,
 'test_runtime': 25.6583,
 'test_samples_per_second': 60.253,
 'test_steps_per_second': 0.974}

In [14]:
y_preds = np.argmax(preds_outputs.predictions, axis=1)
y_true = [label for label in test_dataset_encoded["label"]]
print(id_map)
print(classification_report(y_true, y_preds, zero_division=0))

{0: 'Personality disorder', 1: 'Suicidal', 2: 'Depression', 3: 'Anxiety', 4: 'Normal', 5: 'Stress', 6: 'Bipolar'}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        72
           1       0.52      0.68      0.59       302
           2       0.52      0.47      0.49       309
           3       0.46      0.55      0.50       239
           4       0.64      0.93      0.76       289
           5       0.47      0.37      0.42       182
           6       0.76      0.10      0.18       153

    accuracy                           0.54      1546
   macro avg       0.48      0.44      0.42      1546
weighted avg       0.53      0.54      0.50      1546



Testing for custom input

In [15]:
text = "I want to die."
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=1)
predicted_label_id = predictions.item()
print(f"Predicted label ID: {predicted_label_id}")
print(f"Predicted label: {id_map[predicted_label_id]}")

Predicted label ID: 1
Predicted label: Suicidal


In [16]:
text = "Today is another good day."
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=1)
predicted_label_id = predictions.item()
print(f"Predicted label ID: {predicted_label_id}")
print(f"Predicted label: {id_map[predicted_label_id]}")

Predicted label ID: 4
Predicted label: Normal
