<a href="https://colab.research.google.com/github/WesselBoi/Suicidal-Content-detection/blob/main/models_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab only: mount Google Drive at /content/drive so the project folder under
# MyDrive (used as the working directory later in this notebook) is reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -qqq transformers datasets

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# Import packages
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from evaluate import load as load_metric

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# Work from the project folder on Drive so the relative Data/ and Models/ paths
# used below resolve. Change this to your own directory.
try:
    os.chdir("/content/drive/MyDrive/SuicideModel")
except OSError:
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


## Define constants

In [None]:
# Training hyperparameters
EPOCHS = 1
BATCH_SIZE = 6
LEARNING_RATE = 1e-5
SEED = 4222  # shared by the dataset split and the Trainer

# Output locations, relative to the project working directory
MODEL_SAVE_PATH = "Models/bert"
MODEL_CHECKPOINT_PATH = "Models/bert_checkpoint"
MODEL_LOGGING_PATH = f"{MODEL_CHECKPOINT_PATH}/logs"

In [None]:
# Binary target used by the classifier: 1 = 'suicide', 0 = 'non-suicide'
LABEL_MAP = {'suicide': 1, 'non-suicide': 0}

df = pd.read_csv('Data/suicide_detection_final_cleaned.csv')
df['label'] = df['class'].map(LABEL_MAP)

# Series.map() turns every value missing from LABEL_MAP into NaN. Stop here rather
# than letting NaN labels leak into the stratified split and the training data.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'class'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'class' column: {unexpected}")

# The raw class string and the pre-cleaned text are not used past this point
df = df.drop(columns=['class', 'cleaned_text'])

df.head()


Unnamed: 0,text,label
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,It ends tonight.I can’t do it anymore. \nI quit.,1


In [None]:
def _stratified_split(frame, holdout_fraction):
    """Split `frame` into (kept, held-out) parts, stratified on its 'label' column."""
    return train_test_split(frame,
                            test_size=holdout_fraction,
                            random_state=SEED,
                            stratify=frame['label'])


# 80% train; the remaining 20% is halved into validation and test sets
train, temp = _stratified_split(df, 0.2)
val, test = _stratified_split(temp, 0.5)

## Load BERT model

In [None]:
# Tokenizer matching the pretrained BERT checkpoint that is fine-tuned below
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def dataset_conversion(train, test, val):
    """Wrap the three pandas splits in a single DatasetDict.

    Each frame's index is reset in place (the old index is dropped), so the
    caller's DataFrames are modified as a side effect.

    Returns:
        DatasetDict with the keys "train", "test" and "val".
    """
    for frame in (train, test, val):
        frame.reset_index(drop=True, inplace=True)

    splits = {
        "train": Dataset.from_pandas(train),
        "test": Dataset.from_pandas(test),
        "val": Dataset.from_pandas(val),
    }
    return DatasetDict(splits)

raw_datasets = dataset_conversion(train, test, val)

In [None]:
def tokenize_function(dataset):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    The tokenizer is looked up at call time, so this uses whichever tokenizer
    is currently bound to that name. Sequences are truncated and padded with
    padding="max_length"; no explicit max_length is given, so the length
    presumably falls back to the tokenizer's own maximum (TODO confirm).
    """
    texts = dataset["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# batched=True hands tokenize_function a batch of rows per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/139881 [00:00<?, ? examples/s]

Map:   0%|          | 0/17486 [00:00<?, ? examples/s]

Map:   0%|          | 0/17485 [00:00<?, ? examples/s]

In [None]:
# Pretrained BERT encoder with a two-class classification head, placed on `device`
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Metrics reported for every evaluation pass, in output order
_METRIC_NAMES = ("accuracy", "recall", "precision", "f1")

# Metric objects are loaded on first use and then reused, so repeated
# evaluation passes do not reload them each time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy, recall, precision and F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks to ``(logits, labels)``. The predicted
            class is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "recall", "precision" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in _METRIC_NAMES
    }

In [None]:
# Fine-tuning configuration: checkpoints are written to MODEL_CHECKPOINT_PATH
# every 1500 steps so an interrupted run can be resumed.
training_args = TrainingArguments(
    output_dir=MODEL_CHECKPOINT_PATH,
    overwrite_output_dir=True,
    report_to="none", # disables wandb
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    logging_dir=MODEL_LOGGING_PATH,
    save_strategy="steps",
    save_steps=1500
)

# NOTE(review): no evaluation strategy is set above, so eval_dataset and
# compute_metrics are presumably only used by explicit trainer.evaluate()
# calls - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is its replacement
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# Checkpoint directory written by the Trainer at step 15000
last_checkpoint_path = f"{MODEL_CHECKPOINT_PATH}/checkpoint-15000"

In [None]:
# trainer.train(resume_from_checkpoint=last_checkpoint_path)

In [None]:
# Evaluate the same checkpoint that training would resume from
checkpoint_path = last_checkpoint_path

In [None]:
# Replace the in-memory tokenizer and model with the ones saved in the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [None]:
# Evaluation-only arguments; this rebinds `training_args` from the training cell
training_args = TrainingArguments(
    output_dir="./results",  # or any output dir
    per_device_eval_batch_size=16,  # or appropriate batch size
    report_to="none",  # disables wandb, as in the training configuration
)

In [None]:
# Rebuild the validation set; tokenize_function picks up the tokenizer that is
# currently bound to `tokenizer` (the one reloaded from the checkpoint).
eval_dataset = Dataset.from_pandas(val)
tokenized_val = eval_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/17485 [00:00<?, ? examples/s]

In [None]:
# Trainer used only to evaluate the reloaded checkpoint on the validation set
trainer = Trainer(
    model=model,
    args=training_args,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is its replacement
    processing_class=tokenizer,
    eval_dataset=tokenized_val,  # Validation set
    # Without this, evaluate() reports the loss but no accuracy/recall/precision/F1
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# eval_results = trainer.evaluate()
# print(eval_results)

In [None]:
# import numpy as np

# small_val = val.sample(n=200, random_state=42)  # Choose n as per your time limit
# from datasets import Dataset
# eval_dataset = Dataset.from_pandas(small_val)
# tokenized_val = eval_dataset.map(tokenize_function, batched=True)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     tokenizer=tokenizer,
#     eval_dataset=tokenized_val
# )
# eval_results = trainer.evaluate()
# print(eval_results)


In [None]:
# Put the reloaded model in inference mode before the ad-hoc predictions below.
# As the last expression in the cell, this also displays the model's repr.
model.eval()  # Ensure model is in evaluation mode

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def predict_text(text):
    """Classify one piece of text with the module-level `model` and `tokenizer`.

    Args:
        text: Input passed straight to the tokenizer. Only the first row of
            the model output is used, so pass a single string.

    Returns:
        Tuple ``(pred_label, confidence)``: ``pred_label`` is the int index of
        the most probable class (1 = suicide, 0 = non-suicide, per the label
        mapping used to build the dataset) and ``confidence`` is that class's
        softmax probability as a NumPy scalar.
    """
    # NOTE(review): inputs are truncated to 128 tokens here, while
    # tokenize_function passes no max_length for the training data - confirm
    # that the mismatch is intended.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # The model may live on the GPU (Trainer moves it there); the inputs must
    # be on the same device or the forward pass fails.
    inputs = inputs.to(model.device)
    # Get logits (model prediction scores) without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits
    # .cpu() is required before .numpy() when the logits are on a CUDA device
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    # Get the most likely prediction
    pred_label = int(np.argmax(probs))
    confidence = probs[pred_label]
    return pred_label, confidence

In [None]:
# Example: classify a custom sentence and report the verdict with its confidence
sentence = "good weather today"

label, confidence = predict_text(sentence)
verdict = "Suicidal intent detected" if label == 1 else "No suicidal intent detected"
print(f"{verdict} ({confidence:.2f})")

No suicidal intent detected (1.00)
