In [3]:
!pip install -q transformers datasets accelerate scikit-learn


In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Record which torch build is in use and whether a CUDA device is visible,
# so the saved outputs document the runtime this notebook was executed on.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


Torch version: 2.8.0+cu126
CUDA available: True


In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset (read below) and the exported model (written at the end) persist
# outside this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:


# Path to the labeled dataset on the mounted Drive
DATA_PATH = '/content/drive/My Drive/collab_dataset/comments_author_replied.csv'

# Load the file into a DataFrame.
# A missing file is reported with a readable hint and then re-raised: the
# rest of the notebook cannot run without this data, so the cell must fail
# here instead of silently continuing without `df`.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the folder and file name.")
    raise
else:
    # Display the first few rows to confirm it loaded correctly
    print("Dataset loaded successfully!")
    print(df.head())

Dataset loaded successfully!
  forum_name  thread_id                               cleaned_post_content  \
0    anxiety     516409  hi everyone, i'm having a horrible week, so ju...   
1    anxiety     605332  dear alljust need a connection to feel the sup...   
2    anxiety     611099  recently my anxiety has 'flared'. as soon as i...   
3    anxiety     593234  hello everyone, i have had anxiety for the pas...   
4    anxiety     610773  i need a little help. on a generalised anxiety...   

                            cleaned_comments_content  \
0  hey whatsinaname, the shame spiral is such a g...   
1  hey sammy! i'm pretty much the same with relat...   
2  white knight: hi, welcome your symptoms are qu...   
3  hi jordan, thank you for your openness in shar...   
4  dear new member~welcome here to the forum wher...   

                                  comments_sentiment  Author_replied  \
0  positive || positive || positive || positive |...               1   
1                    

In [7]:


# Target classes: Ekman's six basic emotions plus a "neutral" class.
# The list order matters: a label's position in the list is its class id.
EMOTIONS = ['sadness', 'joy', 'anger', 'fear', 'disgust', 'neutral', 'surprise']
emotion_set = set(EMOTIONS)

# Two-way lookup between emotion name and integer class id
id2label = dict(enumerate(EMOTIONS))
label2id = {name: idx for idx, name in id2label.items()}
label2id, id2label


({'sadness': 0,
  'joy': 1,
  'anger': 2,
  'fear': 3,
  'disgust': 4,
  'neutral': 5,
  'surprise': 6},
 {0: 'sadness',
  1: 'joy',
  2: 'anger',
  3: 'fear',
  4: 'disgust',
  5: 'neutral',
  6: 'surprise'})

In [8]:
# Read the CSV from DATA_PATH into `df`, report its (rows, columns) shape
# and preview the first rows.
df = pd.read_csv(DATA_PATH)
print(f"Original shape: {df.shape}")
df.head()


Original shape: (7321, 9)


Unnamed: 0,forum_name,thread_id,cleaned_post_content,cleaned_comments_content,comments_sentiment,Author_replied,cleaned_authors_comment,pred_authors_reply_emotions,pred_authors_reply_probabilities
0,anxiety,516409,"hi everyone, i'm having a horrible week, so ju...","hey whatsinaname, the shame spiral is such a g...",positive || positive || positive || positive |...,1,"hi gems, thanks for the reply. depending on th...",joy,"{""anger"": 0.00171, ""disgust"": 0.000294, ""fear""..."
1,anxiety,605332,dear alljust need a connection to feel the sup...,hey sammy! i'm pretty much the same with relat...,neutral || negative,1,thanks a lot. your message was sound and clear...,joy,"{""anger"": 0.004643, ""disgust"": 0.000598, ""fear..."
2,anxiety,611099,recently my anxiety has 'flared'. as soon as i...,"white knight: hi, welcome your symptoms are qu...",neutral || neutral || neutral || positive || n...,1,thanks i'll have a look at these suggestions. ...,joy,"{""anger"": 9.6e-05, ""disgust"": 1.5e-05, ""fear"":..."
3,anxiety,593234,"hello everyone, i have had anxiety for the pas...","hi jordan, thank you for your openness in shar...",positive || neutral,1,i am replying after a year. thank you for your...,joy,"{""anger"": 7.7e-05, ""disgust"": 1.7e-05, ""fear"":..."
4,anxiety,610773,i need a little help. on a generalised anxiety...,dear new member~welcome here to the forum wher...,negative || neutral,1,it started when my best friend (alyssa) got qu...,sadness,"{""anger"": 0.024631, ""disgust"": 0.005077, ""fear..."


In [9]:
# Free-text columns: missing values become empty strings, everything is str
for text_column in ("cleaned_post_content", "cleaned_comments_content"):
    df[text_column] = df[text_column].fillna("").astype(str)

# Label-like columns (comment sentiment, author-reply flag, emotion label):
# cast to str, lowercase, and trim surrounding whitespace.
# NOTE(review): astype(str) can turn missing values into the literal string
# "nan" -- confirm these columns contain no nulls, or that "nan" is an
# acceptable value downstream.
for label_column in (
    "comments_sentiment",
    "Author_replied",
    "pred_authors_reply_emotions",
):
    df[label_column] = df[label_column].astype(str).str.lower().str.strip()

df.head()


Unnamed: 0,forum_name,thread_id,cleaned_post_content,cleaned_comments_content,comments_sentiment,Author_replied,cleaned_authors_comment,pred_authors_reply_emotions,pred_authors_reply_probabilities
0,anxiety,516409,"hi everyone, i'm having a horrible week, so ju...","hey whatsinaname, the shame spiral is such a g...",positive || positive || positive || positive |...,1,"hi gems, thanks for the reply. depending on th...",joy,"{""anger"": 0.00171, ""disgust"": 0.000294, ""fear""..."
1,anxiety,605332,dear alljust need a connection to feel the sup...,hey sammy! i'm pretty much the same with relat...,neutral || negative,1,thanks a lot. your message was sound and clear...,joy,"{""anger"": 0.004643, ""disgust"": 0.000598, ""fear..."
2,anxiety,611099,recently my anxiety has 'flared'. as soon as i...,"white knight: hi, welcome your symptoms are qu...",neutral || neutral || neutral || positive || n...,1,thanks i'll have a look at these suggestions. ...,joy,"{""anger"": 9.6e-05, ""disgust"": 1.5e-05, ""fear"":..."
3,anxiety,593234,"hello everyone, i have had anxiety for the pas...","hi jordan, thank you for your openness in shar...",positive || neutral,1,i am replying after a year. thank you for your...,joy,"{""anger"": 7.7e-05, ""disgust"": 1.7e-05, ""fear"":..."
4,anxiety,610773,i need a little help. on a generalised anxiety...,dear new member~welcome here to the forum wher...,negative || neutral,1,it started when my best friend (alyssa) got qu...,sadness,"{""anger"": 0.024631, ""disgust"": 0.005077, ""fear..."


In [10]:
# Normalised flag values that count as "author replied" -- extend if the data
# uses other spellings
TRUE_VALUES = {"yes", "y", "true", "1", "t"}

# Step 1: keep rows where the thread author replied
train_mask = df["Author_replied"].isin(TRUE_VALUES)
train_df = df.loc[train_mask].copy()

# Step 2: of those, keep rows whose emotion label is one of the known classes
train_df = train_df.loc[
    train_df["pred_authors_reply_emotions"].isin(emotion_set)
].copy()

print(f"Rows where author replied & emotion valid: {len(train_df)}")
train_df["pred_authors_reply_emotions"].value_counts()


Rows where author replied & emotion valid: 7321


Unnamed: 0_level_0,count
pred_authors_reply_emotions,Unnamed: 1_level_1
joy,5924
sadness,727
fear,281
surprise,257
anger,87
neutral,41
disgust,4


In [11]:
# Attach the integer class id for each emotion name, then preview the
# name -> id pairing.
train_df = train_df.assign(
    label=train_df["pred_authors_reply_emotions"].map(label2id)
)
train_df[["pred_authors_reply_emotions", "label"]].head()


Unnamed: 0,pred_authors_reply_emotions,label
0,joy,1
1,joy,1
2,joy,1
3,joy,1
4,sadness,0


In [12]:
def build_input_text(row):
    """Assemble the single text string fed to the classifier for one row.

    The result has three sections separated by blank lines: the post, the
    comments, and an upper-cased sentiment tag in square brackets.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            ``cleaned_post_content``, ``cleaned_comments_content`` and
            ``comments_sentiment``.

    Returns:
        str: ``"POST:\\n<post>\\n\\nCOMMENTS:\\n<comments>\\n\\n[SENTIMENT: <TAG>]"``.
    """
    post_section = f"POST:\n{row['cleaned_post_content']}"
    comments_section = f"COMMENTS:\n{row['cleaned_comments_content']}"
    # The sentiment value is stringified first, so non-string values are accepted
    sentiment_tag = str(row["comments_sentiment"]).upper()
    sentiment_section = f"[SENTIMENT: {sentiment_tag}]"
    return "\n\n".join((post_section, comments_section, sentiment_section))

# Build the model input for every row (row-wise apply), then preview it next
# to the target emotion.
train_df["input_text"] = train_df.apply(build_input_text, axis="columns")
train_df.loc[:, ["input_text", "pred_authors_reply_emotions"]].head()


Unnamed: 0,input_text,pred_authors_reply_emotions
0,"POST:\nhi everyone, i'm having a horrible week...",joy
1,POST:\ndear alljust need a connection to feel ...,joy
2,POST:\nrecently my anxiety has 'flared'. as so...,joy
3,"POST:\nhello everyone, i have had anxiety for ...",joy
4,POST:\ni need a little help. on a generalised ...,sadness


In [13]:
# 80/20 train/validation split, stratified on the class id and seeded so the
# split is repeatable.
train_split, val_split = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df["label"]
)

print(f"Train rows: {len(train_split)}")
print(f"Val rows: {len(val_split)}")

# Per-class counts of both splits, shown side by side
(
    train_split["pred_authors_reply_emotions"].value_counts(),
    val_split["pred_authors_reply_emotions"].value_counts(),
)


Train rows: 5856
Val rows: 1465


(pred_authors_reply_emotions
 joy         4738
 sadness      581
 fear         225
 surprise     206
 anger         70
 neutral       33
 disgust        3
 Name: count, dtype: int64,
 pred_authors_reply_emotions
 joy         1186
 sadness      146
 fear          56
 surprise      51
 anger         17
 neutral        8
 disgust        1
 Name: count, dtype: int64)

In [14]:
!pip install -q huggingface_hub
!pip install -q transformers


In [15]:
# Authenticate this runtime with the Hugging Face Hub.
# NOTE(review): presumably needed because the checkpoint loaded below is
# access-restricted -- confirm.
from huggingface_hub import login
login(new_session=False)

In [16]:
# MentalBERT checkpoint on the Hugging Face Hub
MODEL_NAME = "mental/mental-bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with one output per emotion; the name <-> id
# maps are passed along so they are stored with the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(EMOTIONS), id2label=id2label, label2id=label2id
)

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda')

In [17]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: iterable of input texts (materialised into a list).
        labels: iterable of class ids, parallel to ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` and returning a mapping
            of tensors with a leading batch dimension of 1.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Materialise so positional indexing works for any iterable
        # (e.g. a pandas Series with a non-contiguous index).
        self.texts = list(texts)
        self.labels = list(labels)

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus a ``labels`` tensor."""
        # NOTE(review): with truncation on a single string, text beyond
        # max_length is presumably cut from the end -- confirm that losing the
        # tail of long inputs is acceptable.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop the batch dimension the tokenizer adds for a single text
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample

# Wrap the train and validation splits with identical tokenization settings
train_dataset, val_dataset = (
    EmotionDataset(
        texts=frame["input_text"],
        labels=frame["label"],
        tokenizer=tokenizer,
        max_length=256,
    )
    for frame in (train_split, val_split)
)


In [18]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict: ``{"accuracy": float, "f1_macro": float}``.
    """
    logits, labels = eval_pred
    # If the predictions arrive as a tuple of outputs, assume the logits are
    # the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    # zero_division=0: classes that are never predicted still contribute 0 to
    # the macro average (same value as the default), but without emitting an
    # undefined-metric warning on every evaluation.
    f1_macro = f1_score(labels, preds, average="macro", zero_division=0)

    return {
        "accuracy": acc,
        "f1_macro": f1_macro,
    }


In [20]:
# Directory for checkpoints written during training (also reused for the
# final export).
output_dir = "mentalbert_author_top_emotion"

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# with the highest validation macro-F1.
# NOTE(review): the label counts are heavily skewed towards "joy" and
# nothing here compensates for that (e.g. class weights) -- worth
# confirming this is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # lower if GPU OOM
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    report_to="none",  # disable WandB, etc.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # triggered a warning when this cell was run.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [21]:
# Run fine-tuning with the Trainer configured above; the returned
# TrainOutput is displayed as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.6942,0.793475,0.809556,0.127822
2,0.704,0.703722,0.808191,0.134987
3,0.5243,0.763648,0.795904,0.148205


TrainOutput(global_step=2196, training_loss=0.6579291512188798, metrics={'train_runtime': 979.3488, 'train_samples_per_second': 17.938, 'train_steps_per_second': 2.242, 'total_flos': 2311271265484800.0, 'train_loss': 0.6579291512188798, 'epoch': 3.0})

In [22]:
# Evaluate the trainer's current model on its eval dataset and show the
# resulting metrics dict.
metrics = trainer.evaluate()
metrics


{'eval_loss': 0.763648271560669,
 'eval_accuracy': 0.7959044368600683,
 'eval_f1_macro': 0.14820464808899655,
 'eval_runtime': 29.7372,
 'eval_samples_per_second': 49.265,
 'eval_steps_per_second': 3.094,
 'epoch': 3.0}

In [23]:
# Per-class precision / recall / F1 on the validation split
logits, labels, _ = trainer.predict(val_dataset)
preds = np.argmax(logits, axis=-1)

print(classification_report(
    labels,
    preds,
    # Pin the class ids so each report row always lines up with its name in
    # `target_names`, even when a class is absent from both the true labels
    # and the predictions (which would otherwise raise a length mismatch).
    labels=list(range(len(EMOTIONS))),
    target_names=EMOTIONS,
    digits=4,
    # Report 0 for classes that are never predicted without emitting an
    # undefined-metric warning for each of them.
    zero_division=0,
))


              precision    recall  f1-score   support

     sadness     0.2778    0.1027    0.1500       146
         joy     0.8175    0.9705    0.8874      1186
       anger     0.0000    0.0000    0.0000        17
        fear     0.0000    0.0000    0.0000        56
     disgust     0.0000    0.0000    0.0000         1
     neutral     0.0000    0.0000    0.0000         8
    surprise     0.0000    0.0000    0.0000        51

    accuracy                         0.7959      1465
   macro avg     0.1565    0.1533    0.1482      1465
weighted avg     0.6895    0.7959    0.7334      1465



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
import json

# Destination folder for the exported model (created if it does not exist)
save_dir = Path(output_dir)
save_dir.mkdir(parents=True, exist_ok=True)

# Write the model held by the trainer and the tokenizer into the same folder
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Keep the label vocabulary next to the model as a standalone JSON file
mapping = {
    "id2label": id2label,
    "label2id": label2id,
    "emotions": EMOTIONS,
}
with open(save_dir / "label_mapping.json", "w") as f:
    f.write(json.dumps(mapping, indent=2))

print(f"Model saved to: {save_dir}")


Model saved to: mentalbert_author_top_emotion


In [25]:
# Copy the exported model folder to Drive. Create the destination first so
# `cp` does not fail when the target directory does not exist yet.
!mkdir -p /content/drive/MyDrive/mentalbert_models
!cp -r mentalbert_author_top_emotion /content/drive/MyDrive/mentalbert_models/
