<a href="https://colab.research.google.com/github/Saiphanindra-Git/Mental-Health-Risk-Assessment-Using-NLP-on-Social-Media-Posts/blob/main/p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW




In [None]:
df = pd.read_csv(
    "/instagram_social_media_mental_health_dataset_5000.csv" # Please replace "instagram_social_media_mental_health_dataset_5000.csv" with the actual path to your dataset file.
    # Common ways to make the file accessible:
    # 1. Upload the file directly to Colab session storage.
    # 2. Mount Google Drive: from google.colab import drive; drive.mount('/content/drive') and then use '/content/drive/MyDrive/path/to/your_dataset.csv'.
)

df.head()

Unnamed: 0,post_id,post_text,label,risk_level
0,1,Smiling in pics but breaking inside.,depression,medium_risk
1,2,Life update: overwhelmed.,stress,low_risk
2,3,Feels like I'm invisible to everyone.,depression,medium_risk
3,4,I don't think I can do this anymore.,suicide,high_risk
4,5,Feels like I'm invisible to everyone.,depression,medium_risk


In [None]:
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["label"])

print(le.classes_)


['depression' 'normal' 'stress' 'suicide']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df["post_text"],
    df["label_encoded"],
    test_size=0.2,
    random_state=42,
    stratify=df["label_encoded"]
)


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_dataset = TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(y_train.values)
)

test_dataset = TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
    torch.tensor(y_test.values)
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_)
)

model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)


In [None]:
model.train()

for epoch in range(1):
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed | Loss: {total_loss:.4f}")


In [None]:
model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())


In [None]:
print(
    classification_report(
        true_labels,
        predictions,
        target_names=le.classes_
    )
)


              precision    recall  f1-score   support

  depression       1.00      1.00      1.00       247
      normal       1.00      1.00      1.00       253
      stress       1.00      1.00      1.00       255
     suicide       1.00      1.00      1.00       245

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [None]:
def predict_text(text):
    model.eval()
    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_class_id = torch.argmax(probabilities, dim=1).item()

    predicted_label = le.inverse_transform([predicted_class_id])[0]
    # Convert probabilities to a list of floats for easier printing
    scores = probabilities[0].cpu().numpy().tolist()

    return predicted_label, scores

label, scores = predict_text(
    "I don’t feel motivated anymore and I avoid everyone"
)

print("Prediction:", label)
print("Scores:", scores)

Prediction: suicide
Scores: [0.08921278268098831, 0.019525056704878807, 0.06827861815690994, 0.8229835629463196]


In [None]:
import torch
import torch.nn.functional as F


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def analyze_post(text):
    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )

    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        probs = F.softmax(outputs.logits, dim=1)

    pred_id = torch.argmax(probs, dim=1).item()
    label = le.classes_[pred_id]
    confidence = probs[0][pred_id].item()

    keywords = [
        w for w in text.lower().split()
        if w in [
            "sad", "tired", "exhausted", "hopeless", "empty",
            "alone", "worthless", "nothing", "kill", "die",
            "depressed", "anxious", "stress"
        ]
    ]

    early_warning = (
        "Detected signs of emotional distress and negative affect that may indicate early-stage mental health risk."
        if label != "normal" # Changed from "No Risk" to "normal" based on le.classes_
        else
        "No strong distress indicators detected in the text."
    )

    severity_map = {
        "normal": "No Risk — language appears emotionally stable.", # Changed key from "No Risk" to "normal"
        "depression": "Moderate Risk — sustained emotional distress detected.", # Added mapping for 'depression'
        "stress": "Mild Distress — early emotional discomfort detected.", # Added mapping for 'stress'
        "suicide": "High / Suicidal Risk — urgent intervention may be required."
    }

    explanation = (
        f"Key influential words contributing to the prediction include: {', '.join(keywords)}."
        if keywords else
        "No strongly influential distress-related terms were detected."
    )

    return {
        "Early Warning & Severity Assessment": early_warning,
        "Multi-Level Risk Classification": severity_map[label],
        "Explainable AI Integration": explanation,
        "Confidence Score": round(confidence, 3)
    }

text = "I feel exhausted and hopeless, nothing feels meaningful anymore"


In [None]:
result = analyze_post(text)

for key, value in result.items():
    print(f"{key}: {value}\n")



Multi-Level Risk Classification: High / Suicidal Risk — urgent intervention may be required.

Explainable AI Integration: Key influential words contributing to the prediction include: exhausted, nothing.

Confidence Score: 0.607



In [29]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
!cd /content
!git clone https://github.com/Saiphanindra-Git/Mental-Health-Risk-Assessment-Using-NLP-on-Social-Media-Posts.git

Cloning into 'Mental-Health-Risk-Assessment-Using-NLP-on-Social-Media-Posts'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
