In [1]:
pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn tqdm



In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import re

In [5]:
df = pd.read_csv("suicidal_ideation_reddit_annotated.csv",
                 engine="python",
                 on_bad_lines="skip")


print(df.head())
print(df["label"].value_counts())

                                            usertext  label
0  I just want to end my life so badly. My life i...      1
1  My relationship is complicated and painful, bu...      1
2  I owe a lot of money , so I have to work.The c...      1
3  On the 2 of October I overdosed I just felt so...      1
4  Everyone tells me how wonderful I am, but not ...      1
label
1    6609
0    6047
Name: count, dtype: int64


In [6]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_text"] = df["usertext"].apply(clean_text)


In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

In [8]:
MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [9]:
def tokenize(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)

In [10]:
class StressDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
train_dataset = StressDataset(train_encodings, train_labels)
val_dataset = StressDataset(val_encodings, val_labels)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Using device:", device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Using device: cuda


In [14]:
def train_epoch(model, dataloader, optimizer):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [15]:
def eval_model(model, dataloader):
    model.eval()
    preds = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            predictions = torch.argmax(outputs.logits, dim=1)

            preds.extend(predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average="weighted")

    return acc, f1

In [16]:
EPOCHS = 3
best_acc = 0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer)
    val_acc, val_f1 = eval_model(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Accuracy: {val_acc:.4f}")
    print(f"Val F1: {val_f1:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("student_stress_text_model")
        tokenizer.save_pretrained("student_stress_text_model")
        print("✅ Best model saved")


Epoch 1/3


100%|██████████| 1266/1266 [04:32<00:00,  4.64it/s]


Train Loss: 0.2834
Val Accuracy: 0.9017
Val F1: 0.9008


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Best model saved

Epoch 2/3


100%|██████████| 1266/1266 [04:36<00:00,  4.58it/s]


Train Loss: 0.1888
Val Accuracy: 0.8886
Val F1: 0.8883

Epoch 3/3


100%|██████████| 1266/1266 [04:36<00:00,  4.58it/s]


Train Loss: 0.1328
Val Accuracy: 0.8910
Val F1: 0.8909


In [17]:
def predict_stress(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1)
    return probs.cpu().numpy()

print(predict_stress("I feel exhausted from exams and family pressure"))


[[4.6363223e-02 9.5350075e-01 1.3602656e-04]]


In [18]:
print(predict_stress("I had a productive day at college and finished my assignments on time."))


[[2.7579244e-02 9.7229135e-01 1.2939952e-04]]


In [19]:
print(predict_stress("I can’t sleep, I feel anxious all the time, and I don’t know how to cope anymore."))

[[4.5070544e-02 9.5458418e-01 3.4526928e-04]]
