In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
%pip install datasets transformers -q

## Loading dataset

In [3]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset("Ram07/Detection-for-Suicide")

# Convert the 'train' split of the dataset to a pandas DataFrame
df = dataset['train'].to_pandas()

print("DataFrame created from the 'train' split:")
display(df.head())

print("\nInfo about the DataFrame:")
df.info()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

detection_final_cleaned.csv:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/174436 [00:00<?, ? examples/s]

DataFrame created from the 'train' split:


Unnamed: 0,class,text,cleaned_text
0,suicide,Ex Wife Threatening SuicideRecently I left my ...,sex wife threaten suicide recently leave wife ...
1,non-suicide,Am I weird I don't get affected by compliments...,weird not affect compliment come know real lif...
2,non-suicide,Finally 2020 is almost over... So I can never ...,finally hear bad year swear fucking god annoying
3,suicide,i need helpjust help me im crying so hard,need help just help cry hard
4,suicide,It ends tonight.I can’t do it anymore. \nI quit.,end tonight not anymore quit



Info about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174436 entries, 0 to 174435
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   class         174436 non-null  object
 1   text          174436 non-null  object
 2   cleaned_text  174436 non-null  object
dtypes: object(3)
memory usage: 4.0+ MB


In [4]:
# Define the number of records to sample
num_samples = 20000

# Separate the dataframe by class
suicide_df = df[df['class'] == 'suicide']
non_suicide_df = df[df['class'] == 'non-suicide']

# Calculate the number of samples to take from each class to maintain balance
# We'll aim for roughly equal numbers of each class if possible within the num_samples limit
num_suicide_samples = min(num_samples // 2, len(suicide_df))
num_non_suicide_samples = min(num_samples - num_suicide_samples, len(non_suicide_df))

# If after the initial split, we still have samples left to reach num_samples,
# distribute the remaining samples proportionally based on the original class distribution
remaining_samples = num_samples - (num_suicide_samples + num_non_suicide_samples)

if remaining_samples > 0:
    total_len = len(suicide_df) + len(non_suicide_df)
    suicide_proportion = len(suicide_df) / total_len
    non_suicide_proportion = len(non_suicide_df) / total_len

    suicide_add = int(remaining_samples * suicide_proportion)
    non_suicide_add = remaining_samples - suicide_add

    num_suicide_samples += min(suicide_add, len(suicide_df) - num_suicide_samples)
    num_non_suicide_samples += min(non_suicide_add, len(non_suicide_df) - num_non_suicide_samples)


# Sample from each class
sampled_suicide_df = suicide_df.sample(n=num_suicide_samples, random_state=42)
sampled_non_suicide_df = non_suicide_df.sample(n=num_non_suicide_samples, random_state=42)

# Concatenate the sampled dataframes
sampled_df = pd.concat([sampled_suicide_df, sampled_non_suicide_df]).sample(frac=1, random_state=42).reset_index(drop=True)


print("Original dataset size:", len(df))
print("Sampled dataset size:", len(sampled_df))
print("\nDistribution of classes in sampled data:")
print(sampled_df['class'].value_counts())

Original dataset size: 174436
Sampled dataset size: 20000

Distribution of classes in sampled data:
class
non-suicide    10000
suicide        10000
Name: count, dtype: int64


## Data Preprocessing

In [5]:
sampled_df.head()

Unnamed: 0,class,text,cleaned_text
0,non-suicide,Petition to change the Rick Roll for September...,petition change rick roll september rick roll ...
1,suicide,I have tried to kill myself before.... Just go...,try kill get divorce breakup relatively calm a...
2,suicide,being selfishits my partners birthday and all ...,selfish it partner birthday think easy good da...
3,suicide,I'm feeling like there is no way outI have com...,feel like no way out i completely give not thi...
4,non-suicide,I am happy to announce that If all goes accord...,happy announce go accord plan february remove ...


In [6]:
sampled_df.isnull().sum()

Unnamed: 0,0
class,0
text,0
cleaned_text,0


## Tokenization

In [7]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the TinyBERT tokenizer
# We'll use a tokenizer compatible with a TinyBERT model, for example, 'google/bert_uncased_L-2_H-128_A-2'
# This is a very small BERT model often used as a TinyBERT equivalent
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

# Define the maximum sequence length
max_seq_length = 128

# Tokenize the 'cleaned_text' column
def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=max_seq_length)

# Apply the tokenizer to the sampled DataFrame
# First, convert the DataFrame to a Hugging Face Dataset
sampled_dataset = Dataset.from_pandas(sampled_df)

tokenized_datasets = sampled_dataset.map(tokenize_function, batched=True)

# Display the first tokenized example
print("First tokenized example:")
print(tokenized_datasets[0])

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

First tokenized example:
{'class': 'non-suicide', 'text': 'Petition to change the Rick Roll for September Rick Roll is getting old, [this](https://youtu.be/Gs069dndIYk) is what we need for September', 'cleaned_text': 'petition change rick roll september rick roll get old this need september', 'input_ids': [101, 9964, 2689, 6174, 4897, 2244, 6174, 4897, 2131, 2214, 2023, 2342, 2244, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'class' labels into integers (0 for 'non-suicide', 1 for 'suicide')
# The 'class' column is already in the original df
label_encoder = LabelEncoder()
sampled_df['labels'] = label_encoder.fit_transform(sampled_df['class'])

# Now, add the encoded labels to the tokenized dataset
# Ensure the order matches the original df and tokenized_datasets
tokenized_datasets = tokenized_datasets.add_column("labels", sampled_df['labels'].tolist())

print("Original DataFrame with encoded labels:")
display(sampled_df[['cleaned_text', 'class', 'labels']].head())

print("\nTokenized dataset structure with labels:")
print(tokenized_datasets)

print("\nFirst example of the tokenized dataset with label:")
print(tokenized_datasets[0])

Original DataFrame with encoded labels:


Unnamed: 0,cleaned_text,class,labels
0,petition change rick roll september rick roll ...,non-suicide,0
1,try kill get divorce breakup relatively calm a...,suicide,1
2,selfish it partner birthday think easy good da...,suicide,1
3,feel like no way out i completely give not thi...,suicide,1
4,happy announce go accord plan february remove ...,non-suicide,0



Tokenized dataset structure with labels:
Dataset({
    features: ['class', 'text', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

First example of the tokenized dataset with label:
{'class': 'non-suicide', 'text': 'Petition to change the Rick Roll for September Rick Roll is getting old, [this](https://youtu.be/Gs069dndIYk) is what we need for September', 'cleaned_text': 'petition change rick roll september rick roll get old this need september', 'input_ids': [101, 9964, 2689, 6174, 4897, 2244, 6174, 4897, 2131, 2214, 2023, 2342, 2244, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Split dataset into train and test

In [9]:
from datasets import ClassLabel, Value

# Cast the 'labels' column to ClassLabel
# First, get the unique class names from the label_encoder fitted earlier
# Assuming label_encoder is still available from the previous steps
class_names = label_encoder.classes_.tolist()

tokenized_datasets = tokenized_datasets.cast_column("labels", ClassLabel(names=class_names))


# Split the tokenized dataset into training and testing sets
# Use the 'labels' column for stratification to ensure similar class distribution in both splits
train_test_split_dataset = tokenized_datasets.train_test_split(test_size=0.2, stratify_by_column="labels")

train_dataset = train_test_split_dataset['train']
test_dataset = train_test_split_dataset['test']

print("Tokenized and split datasets:")
print(train_dataset)
print(test_dataset)

# Display class distribution in train and test sets to verify stratification
print("\nClass distribution in training set:")
train_labels = [example['labels'] for example in train_dataset]
train_label_counts = pd.Series(train_labels).value_counts()
print(train_label_counts)

print("\nClass distribution in testing set:")
test_labels = [example['labels'] for example in test_dataset]
test_label_counts = pd.Series(test_labels).value_counts()
print(test_label_counts)

Casting the dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Tokenized and split datasets:
Dataset({
    features: ['class', 'text', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 16000
})
Dataset({
    features: ['class', 'text', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4000
})

Class distribution in training set:
1    8000
0    8000
Name: count, dtype: int64

Class distribution in testing set:
1    2000
0    2000
Name: count, dtype: int64


## Model Training - Fine Tuning

In [10]:
from transformers import AutoModelForSequenceClassification

# Define the model checkpoint (the specific TinyBERT version)
model_checkpoint = "google/bert_uncased_L-2_H-128_A-2" # This corresponds to the tokenizer we used

# Load the pre-trained TinyBERT model for sequence classification
# We specify the number of labels (2 for suicide and non-suicide)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

print("TinyBERT model loaded successfully!")
print(model.config)

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TinyBERT model loaded successfully!
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [11]:
%pip install evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
# Including accuracy, precision, recall and f1 score during model training
import numpy as np
import evaluate

# Load the necessary metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Computes accuracy, precision, recall, and f1-score from prediction logits."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute individual metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="binary") # Use binary average for binary classification
    recall = recall_metric.compute(predictions=predictions, references=labels, average="binary")     # Use binary average for binary classification
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")         # Use binary average for binary classification

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [13]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    eval_strategy="epoch",     # evaluate every epoch
    save_strategy="epoch",           # save checkpoint every epoch
    load_best_model_at_end=True,     # load the best model at the end of training
    metric_for_best_model="eval_loss", # use evaluation loss to determine the best model
    report_to="none", # Disable reporting to services like Weights & Biases
)

# Initialize the Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=data_collator,         # Add the data collator here
    compute_metrics=compute_metrics      # Add the compute_metrics function here
)

print("Training arguments and Trainer initialized!")

Training arguments and Trainer initialized!


In [14]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3041,0.256529,0.9045,0.887824,0.926,0.90651
2,0.1793,0.233907,0.9195,0.914116,0.926,0.92002
3,0.3111,0.234915,0.92225,0.919523,0.9255,0.922502


TrainOutput(global_step=3000, training_loss=0.2889377280473709, metrics={'train_runtime': 48.9984, 'train_samples_per_second': 979.624, 'train_steps_per_second': 61.226, 'total_flos': 15245844480000.0, 'train_loss': 0.2889377280473709, 'epoch': 3.0})

In [15]:
# Evaluate the model on the test dataset
import evaluate
evaluation_results = trainer.evaluate()

print("Evaluation Results:")
print(evaluation_results)

Evaluation Results:
{'eval_loss': 0.2339065670967102, 'eval_accuracy': 0.9195, 'eval_precision': 0.914116485686081, 'eval_recall': 0.926, 'eval_f1': 0.920019870839543, 'eval_runtime': 5.977, 'eval_samples_per_second': 669.236, 'eval_steps_per_second': 41.827, 'epoch': 3.0}


## Saving the model and tokenizer

In [16]:
# Define the directory to save the model
save_directory = "./fine_tuned_tinybert_suicide_detection"

# Save the trained model
trainer.save_model(save_directory)

# Also save the tokenizer (it's usually saved with the model, but good to be explicit)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to: {save_directory}")

Model and tokenizer saved to: ./fine_tuned_tinybert_suicide_detection


In [17]:
import os
from google.colab import files

# Define the directory to be zipped
save_directory = "./fine_tuned_tinybert_suicide_detection"

# Define the name for the zip file
zip_filename = "fine_tuned_tinybert_suicide_detection.zip"

# Change to the parent directory to zip the saved_directory
os.chdir(os.path.dirname(save_directory) or '.')

# Create the zip file
!zip -r "{zip_filename}" "{os.path.basename(save_directory)}"

# Offer the zip file for download
print(f"Zipped model and tokenizer to {zip_filename}")
files.download(zip_filename)

  adding: fine_tuned_tinybert_suicide_detection/ (stored 0%)
  adding: fine_tuned_tinybert_suicide_detection/config.json (deflated 50%)
  adding: fine_tuned_tinybert_suicide_detection/model.safetensors (deflated 7%)
  adding: fine_tuned_tinybert_suicide_detection/vocab.txt (deflated 53%)
  adding: fine_tuned_tinybert_suicide_detection/special_tokens_map.json (deflated 42%)
  adding: fine_tuned_tinybert_suicide_detection/tokenizer.json (deflated 71%)
  adding: fine_tuned_tinybert_suicide_detection/tokenizer_config.json (deflated 74%)
  adding: fine_tuned_tinybert_suicide_detection/training_args.bin (deflated 53%)
Zipped model and tokenizer to fine_tuned_tinybert_suicide_detection.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>