In [None]:
# importing libs

import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding)

from sklearn.model_selection import train_test_split




In [None]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, otherwise CPU.

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
print(f"Using device: {device}")
if has_gpu:
    # Report which GPU is used and release any cached allocator memory.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.cuda.empty_cache()

In [None]:
# Load the raw spreadsheet into a DataFrame and show a quick overview of it.

raw_path = "dataset.xlsx"
df = pd.read_excel(raw_path)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Basic cleaning: keep only rows that have both the post text and a label.

required_columns = ['posts', 'labels']
df = df.dropna(subset=required_columns)
df.shape


In [None]:
# Label mapping: shift the raw labels 1-4 down to the class ids 0-3.
# Any label value that is not a key of label_map becomes NaN after .map().

label_map = {raw: raw - 1 for raw in range(1, 5)}
df = df.assign(labels=df['labels'].map(label_map))

df.head()

In [None]:
# Labels that were not keys of label_map are NaN after the mapping step.
# dropna() returns a new frame rather than modifying df, so the result has to
# be assigned back; otherwise the NaN rows survive and the integer cast fails.
df = df.dropna(subset=['labels']).astype({'labels': int})

df.shape

In [None]:
# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.

train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['labels'],
)



In [None]:
# Convert each pandas split into a Hugging Face Dataset (train, validation, test).

train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Bundle the three splits under the names used by the later cells.
splits = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
}
dataset = DatasetDict(splits)

# Report the size of each split (the percentages are the nominal split ratios).
print(f"Training samples: {len(train_dataset)} (80%)")
print(f"Validation samples: {len(val_dataset)} (10%)")
print(f"Testing samples: {len(test_dataset)} (10%)")

In [None]:
# Pretrained Bangla BERT checkpoint used for both the tokenizer and the model.
# NOTE(review): this cell was titled "BanglaBERT", but the id below is
# "sagorsarker/bangla-bert-base" -- confirm this is the intended checkpoint.

MODEL_ID = "sagorsarker/bangla-bert-base"
# Load the tokenizer that belongs to MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of posts.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``;
            only the ``"posts"`` column is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens.
        No padding is requested here.
    """
    # Spreadsheet cells are not guaranteed to be strings (a post consisting
    # only of digits is loaded as a number) and the tokenizer only accepts
    # text, so coerce every post to str first.
    posts = [str(post) for post in examples["posts"]]
    # Max length 128 is optimized for RTX 3050 VRAM.
    # If you get memory errors, reduce to 64. If you have space, try 256.
    return tokenizer(posts, truncation=True, max_length=128)

# Collator that pads each batch when it is assembled (padding is not done at tokenization time).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of the DatasetDict in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Model initialization: sequence-classification head with one output per severity class.

id2label = {0: "None", 1: "Mild", 2: "Medium", 3: "Severe"}
# Inverse lookup derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
# Move the model to the selected device.
model.to(device)


In [None]:
# Metrics: load the four evaluation metrics used by compute_metrics.

accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    results = {
        "accuracy": accuracy.compute(predictions=predicted, references=gold)["accuracy"],
    }
    # 'weighted' averaging accounts for class imbalance in the labels.
    for key, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        results[key] = metric.compute(
            predictions=predicted, references=gold, average="weighted"
        )[key]
    return results

In [None]:
# Training arguments [hyperparameters]

batch_size = 8
epochs = 5
learning_rate = 2e-5

# Mixed precision needs a CUDA device. Requesting it unconditionally makes
# TrainingArguments fail on CPU-only machines, so enable it only when a GPU
# is actually available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./bangla_depression_model",
    learning_rate=learning_rate,              # MIRoBERTa recommended rate [3]
    per_device_train_batch_size=batch_size,   # Optimized for 3050 GPU
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,   # Effective train batch size = batch_size * 2 = 16
    num_train_epochs=epochs,              # Good starting point
    weight_decay=0.01,
    eval_strategy="epoch",           # Evaluate once per epoch ...
    save_strategy="epoch",           # ... and checkpoint on the same schedule
    # No metric_for_best_model is set, so the library's default criterion
    # decides which checkpoint counts as "best".
    load_best_model_at_end=True,
    fp16=use_fp16,                   # Mixed precision (Faster & saves VRAM), GPU only
    logging_dir='./logs',
)

In [None]:


# Wire the model, training configuration, data and metrics into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    # The validation split (not the test split) is used for evaluation during training.
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Start fine-tuning with the configured Trainer. As the last expression of the
# cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Final evaluation: score the trained model on the held-out test split.
print("\nRunning Final Evaluation on Test Set...")
test_split = tokenized_dataset["test"]
test_results = trainer.evaluate(test_split)
print(test_results)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# 1. Get Predictions on the Test Set
print("Generating predictions...")
predictions_output = trainer.predict(tokenized_dataset["test"])
preds = np.argmax(predictions_output.predictions, axis=1)
true_labels = predictions_output.label_ids

# 2. Define Class Names (Map 0,1,2,3 back to text)
# Ensure this matches the order you used in label2id earlier
class_names = ["None", "Mild", "Medium", "Severe"]

# 3. Create Confusion Matrix
# Pin the label set explicitly. Without `labels`, the matrix only covers the
# classes that occur in true_labels or preds, so a class missing from both
# would shrink the matrix and shift the tick labels onto the wrong rows/columns.
class_ids = list(range(len(class_names)))
cm = confusion_matrix(true_labels, preds, labels=class_ids)

# 4. Plot Heatmap (MIRoBERTa Style)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix: Bangla Depression Severity')
plt.show()

In [None]:
# Pass the class ids explicitly so the report always has one row per entry of
# class_names, even when a class is absent from the test labels and the
# predictions (otherwise target_names no longer lines up with the labels found
# in the data). zero_division=0 reports 0 for undefined scores without warning.
print(classification_report(
    true_labels,
    preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
    zero_division=0,
))

In [None]:
import shap
import torch

# 1. Define a prediction function wrapper
# SHAP needs a function that takes a list of texts and returns probabilities
def f(texts):
    """Return class probabilities for a batch of texts.

    Args:
        texts: Iterable of inputs; every item is converted with ``str``.

    Returns:
        NumPy array of softmax scores with one row per input text.
    """
    # Ensure texts are strings
    batch = [str(text) for text in texts]
    # Switch to inference mode: with dropout active, repeated calls on the
    # same text would return different scores and make the attributions noisy.
    model.eval()
    # Tokenize input texts and move the tensors to the compute device
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Apply Softmax to get probabilities (scores between 0 and 1)
    return torch.softmax(logits, dim=1).cpu().numpy()

# 2. Build the explainer: a text masker driven by the tokenizer, wrapped
#    around the probability function f
masker = shap.maskers.Text(tokenizer)
explainer = shap.Explainer(f, masker)

# 3. Pick the first three posts of the TEST split as the examples to explain,
#    to see why the model classified them that way
examples_to_explain = test_df['posts'].head(3).tolist()

# 4. Compute SHAP values for those examples
print("Calculating SHAP values (this may take a minute)...")
shap_values = explainer(examples_to_explain)

# 5. Visualize
# Renders the red/blue highlighted-text plot (as in the MIRoBERTa paper, Fig 4)
shap.plots.text(shap_values)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
output_dir = "./bangla_depression_1"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)