In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
#!pip install -qqq transformers datasets wandb

In [4]:
# Import packages
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
     




In [5]:
# Specify GPU
device = torch.device("cuda")

## Define constants

In [6]:
# Define constants
EPOCHS = 1
BATCH_SIZE = 6
LEARNING_RATE = 1e-5
SEED = 4222

MODEL_SAVE_PATH = "Models/bert"
MODEL_CHECKPOINT_PATH = "Models/bert_checkpoint"
MODEL_LOGGING_PATH = "Models/bert_checkpoint/logs"

WANDB_ENTITY = "irshad-shariq-liverpool-john-moores-university"
WANDB_PROJECT = "suicide-detection"
WANDB_RUN = "bert"

### Load Datset

In [7]:
# Load dataset
df = pd.read_csv('suicide_detection_final_cleaned.csv', header=0, names=['text', 'label', 'cleaned_text'])
df['label'] = df['label'].map({'suicide': 1, 'non-suicide': 0})
df.drop(columns=['cleaned_text'], inplace = True)
df.head()


Unnamed: 0,text,label
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,It ends tonight.I can’t do it anymore. \nI quit.,1


In [8]:
# Split dataset into train, validation and test sets
train, temp = train_test_split(df,
                               random_state=SEED,
                               test_size=0.2,
                               stratify=df['label'])

val, test = train_test_split(temp,
                             random_state=SEED,
                             test_size=0.5,
                             stratify=temp['label'])

## Load BERT Model

In [9]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:

def dataset_conversion(train, test, val):
  """Converts pandas dataframe to Dataset."""

  train.reset_index(drop=True, inplace=True)
  test.reset_index(drop=True, inplace=True)
  val.reset_index(drop=True, inplace=True)

  train_dataset = Dataset.from_pandas(train)
  test_dataset = Dataset.from_pandas(test)
  val_dataset = Dataset.from_pandas(val)

  return DatasetDict({"train": train_dataset,
                      "test": test_dataset,
                      "val": val_dataset})

raw_datasets = dataset_conversion(train, test, val)

In [11]:
def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/140523 [00:00<?, ? examples/s]

Map:   0%|          | 0/17566 [00:00<?, ? examples/s]

Map:   0%|          | 0/17565 [00:00<?, ? examples/s]

In [12]:
# Tokenise datasets
SAMPLE_SIZE = 20
small_train_dataset = tokenized_datasets["train"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
small_val_dataset = tokenized_datasets["val"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))

full_train_dataset = tokenized_datasets["train"]
full_test_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]
     

In [13]:
# Import BERT-base pretrained model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import os
import wandb

# Set environment variables
os.environ['WANDB_BASE_URL'] = 'https://api.wandb.ai'
os.environ['WANDB_NOTEBOOK_NAME'] = 'bert.ipynb'

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mirshad-shariq[0m ([33mirshad-shariq-liverpool-john-moores-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
# Initialise wandb
wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=WANDB_RUN)

In [16]:
# Define custom metrics for computation
def compute_metrics(eval_pred):
    metric_acc = load_metric("accuracy",trust_remote_code=True )
    metric_rec = load_metric("recall",trust_remote_code=True)
    metric_pre = load_metric("precision",trust_remote_code=True)
    metric_f1 = load_metric("f1",trust_remote_code=True)
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    recall = metric_rec.compute(predictions=predictions, references=labels)["recall"]
    precision = metric_pre.compute(predictions=predictions, references=labels)["precision"]
    f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}

In [17]:

# Define model and training parameters
training_args = TrainingArguments(
    output_dir=MODEL_CHECKPOINT_PATH,
    overwrite_output_dir = True,
    report_to = 'wandb',
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    # evaluation_strategy="epoch",
    run_name=WANDB_RUN,
    logging_dir=MODEL_LOGGING_PATH,
    save_strategy="steps",
    save_steps=1500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
     

## Pre-trained BERT

In [18]:

# Predict before fine-tuning
trainer.predict(full_test_dataset).metrics

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/2928 [00:00<?, ?it/s]

  metric_acc = load_metric("accuracy",trust_remote_code=True )


{'test_loss': 0.6368388533592224,
 'test_accuracy': 0.655641580325629,
 'test_recall': 0.22600936220011703,
 'test_precision': 0.6708640903169778,
 'test_f1': 0.3381113907429697,
 'test_runtime': 407.1263,
 'test_samples_per_second': 43.146,
 'test_steps_per_second': 7.192}

## Fine tuned - BERT

In [19]:

# To observe training progress live
#%%wandb 

# Fine-tune model
trainer.train()

# Resume fine-tuning from checkpoint
trainer.train(MODEL_CHECKPOINT_PATH + "/" + "checkpoint-10500")

  0%|          | 0/23421 [00:00<?, ?it/s]

{'loss': 0.2599, 'grad_norm': 133.1292724609375, 'learning_rate': 9.7865163741941e-06, 'epoch': 0.02}
{'loss': 0.2104, 'grad_norm': 18.876489639282227, 'learning_rate': 9.573032748388199e-06, 'epoch': 0.04}
{'loss': 0.1786, 'grad_norm': 0.33040517568588257, 'learning_rate': 9.3595491225823e-06, 'epoch': 0.06}
{'loss': 0.1663, 'grad_norm': 2.0942471027374268, 'learning_rate': 9.146065496776399e-06, 'epoch': 0.09}
{'loss': 0.1803, 'grad_norm': 0.2263522744178772, 'learning_rate': 8.932581870970498e-06, 'epoch': 0.11}
{'loss': 0.1412, 'grad_norm': 0.09051486104726791, 'learning_rate': 8.719098245164597e-06, 'epoch': 0.13}
{'loss': 0.16, 'grad_norm': 1.1409860849380493, 'learning_rate': 8.505614619358696e-06, 'epoch': 0.15}
{'loss': 0.1436, 'grad_norm': 40.575416564941406, 'learning_rate': 8.292130993552795e-06, 'epoch': 0.17}
{'loss': 0.1413, 'grad_norm': 12.876981735229492, 'learning_rate': 8.078647367746895e-06, 'epoch': 0.19}
{'loss': 0.1676, 'grad_norm': 0.053067564964294434, 'learnin

  0%|          | 0/23421 [00:00<?, ?it/s]

{'loss': 0.1299, 'grad_norm': 0.006499331444501877, 'learning_rate': 5.3033602322701845e-06, 'epoch': 0.47}
{'loss': 0.1077, 'grad_norm': 0.0816364586353302, 'learning_rate': 5.089876606464284e-06, 'epoch': 0.49}
{'loss': 0.1121, 'grad_norm': 0.01721714809536934, 'learning_rate': 4.8763929806583834e-06, 'epoch': 0.51}
{'loss': 0.1018, 'grad_norm': 0.09837473928928375, 'learning_rate': 4.662909354852483e-06, 'epoch': 0.53}
{'loss': 0.1227, 'grad_norm': 0.01886860653758049, 'learning_rate': 4.449425729046582e-06, 'epoch': 0.56}
{'loss': 0.1263, 'grad_norm': 0.01636587083339691, 'learning_rate': 4.235942103240681e-06, 'epoch': 0.58}
{'loss': 0.1149, 'grad_norm': 0.1511491984128952, 'learning_rate': 4.022458477434781e-06, 'epoch': 0.6}
{'loss': 0.1244, 'grad_norm': 0.01034220214933157, 'learning_rate': 3.8089748516288807e-06, 'epoch': 0.62}
{'loss': 0.1366, 'grad_norm': 34.354061126708984, 'learning_rate': 3.5954912258229797e-06, 'epoch': 0.64}
{'loss': 0.113, 'grad_norm': 0.06431149691343

TrainOutput(global_step=23421, training_loss=0.05946184355974106, metrics={'train_runtime': 4951.4707, 'train_samples_per_second': 28.38, 'train_steps_per_second': 4.73, 'total_flos': 3.697315483235328e+16, 'train_loss': 0.05946184355974106, 'epoch': 1.0})

In [20]:

# Save fine-tuned model
trainer.save_model(MODEL_SAVE_PATH)

In [21]:
# Evaluate fine-tuned model
trainer.evaluate()

  0%|          | 0/2928 [00:00<?, ?it/s]

{'eval_loss': 0.09914879500865936,
 'eval_accuracy': 0.9769427839453458,
 'eval_recall': 0.9704505558806319,
 'eval_precision': 0.9703086148895714,
 'eval_f1': 0.970379580194544,
 'eval_runtime': 360.5237,
 'eval_samples_per_second': 48.721,
 'eval_steps_per_second': 8.122,
 'epoch': 1.0}

In [22]:
# Predict after fine-tuning
trainer.predict(full_test_dataset).metrics

  0%|          | 0/2928 [00:00<?, ?it/s]

{'test_loss': 0.09680378437042236,
 'test_accuracy': 0.9772856654901514,
 'test_recall': 0.9723522527794032,
 'test_precision': 0.9693743619658743,
 'test_f1': 0.970861023880815,
 'test_runtime': 418.5664,
 'test_samples_per_second': 41.967,
 'test_steps_per_second': 6.995}

In [25]:
def get_training_history(wandb_run):
  """Extract key metrics from training and eval from wandb run data."""

  # Get training history from wandb
  api = wandb.Api()
  run = api.run(wandb_run)
  history = run.history()

  # Rename columns
  train_column_dict = {'train/epoch': 'epoch', 'train/loss': 'training_loss'}
  val_column_dict = {'train/epoch': 'epoch', 'eval/loss': 'validation_loss', 'eval/accuracy': 'accuracy',
                'eval/precision': 'precision', 'eval/recall': 'recall', 'eval/f1': 'f1'}

  # Train data
  train_history = history[list(train_column_dict.keys())]
  train_history.columns = [train_column_dict.get(x, x) for x in train_history.columns]
  train_history = train_history.dropna()

  # Val data
  val_history = history[list(val_column_dict.keys())]
  val_history.columns = [val_column_dict.get(x, x) for x in val_history.columns]
  val_history = val_history.dropna()

  return pd.merge(train_history, val_history, how="right", on="epoch")

# Get dataframe for training history
WANDB_RUN_ID = "5jbw2h1y" # Replace with your wandb run details, found in the training cell

training_history = get_training_history(WANDB_ENTITY + "/" + WANDB_PROJECT + "/"  +WANDB_RUN_ID)
training_history

Unnamed: 0,epoch,training_loss,validation_loss,accuracy,precision,recall,f1
0,1.0,,0.099149,0.976943,0.970309,0.970451,0.97038


In [26]:
# Load fine-tuned model
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Load trainer after fine-tune
saved_trainer = Trainer(
    model=saved_model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Predict after fine-tuning
saved_trainer.predict(full_test_dataset).metrics

  0%|          | 0/2928 [00:00<?, ?it/s]

{'test_loss': 0.09680378437042236,
 'test_accuracy': 0.9772856654901514,
 'test_recall': 0.9723522527794032,
 'test_precision': 0.9693743619658743,
 'test_f1': 0.970861023880815,
 'test_runtime': 422.9951,
 'test_samples_per_second': 41.528,
 'test_steps_per_second': 6.922}

In [27]:
# Terminate wandb run
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
test/accuracy,▁██
test/f1,▁██

0,1
eval/accuracy,0.97694
eval/f1,0.97038
eval/loss,0.09915
eval/precision,0.97031
eval/recall,0.97045
eval/runtime,360.5237
eval/samples_per_second,48.721
eval/steps_per_second,8.122
test/accuracy,0.97729
test/f1,0.97086


## GPU Memory Utilities

In [28]:

# Delete variables and empty cache
del trainer
del model
torch.cuda.empty_cache()

In [29]:

# Python garbage collection
import gc
gc.collect()

1701

In [30]:
# Check memory allocation
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

456246784
635437056


In [31]:
# check memory summary
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 445553 KiB |   3148 MiB | 270726 GiB | 270725 GiB |
|       from large pool | 445056 KiB |   3144 MiB | 270459 GiB | 270459 GiB |
|       from small pool |    497 KiB |      4 MiB |    266 GiB |    266 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 445553 KiB |   3148 MiB | 270726 GiB | 270725 GiB |
|       from large pool | 445056 KiB |   3144 MiB | 270459 GiB | 270459 GiB |
|       from small pool |    497 KiB |      4 MiB |    266 GiB |    266 GiB |
|---------------------------------------------------------------

In [32]:
# Check GPU allocation and acprocesses
!nvidia-smi

Tue Jul 30 02:27:14 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 556.12                 Driver Version: 556.12         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   47C    P5             17W /   85W |     985MiB /   8192MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                