## Environment Configuration

In [None]:
!pip install -U tqdm
!pip install transformers datasets evaluate accelerate
!pip install -U datasets
!pip install -U scikit-learn
!pip install -U torch
!pip install -U numpy



In [None]:
# General Dataloaders
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
tqdm.pandas()
# Training
from transformers import pipeline
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoConfig
import evaluate
import torch
from datasets import load_metric
# Machine Learning
import torch
import numpy as np
# Data Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from evaluate import evaluator

In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


## Data Preparation

In [None]:
# Load the augmented dataset and drop rows with null values.
messages = pd.read_parquet('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Augments/UPDATED_Suicide_Detection.csv.parquet.gzip')
# The raw 'text' column is removed; later cells read 'clean_text' instead.
messages = messages.drop(['text'], axis=1)
# dropna() returns a new frame rather than modifying in place, so the result
# has to be assigned back -- otherwise rows containing nulls are silently kept.
messages = messages.dropna(axis=0)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  232074 non-null  int64  
 1   clean_text                  232074 non-null  object 
 2   average_words_per_sentence  232074 non-null  float64
 3   sentiment                   232074 non-null  object 
 4   num_emojis                  232074 non-null  int64  
 5   class                       232074 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 10.6+ MB


In [None]:
# Turn the string class labels into integer ids for BERT fine-tuning.

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(messages['class'])

# Model input is the cleaned text (kept as a one-column DataFrame);
# the target is the integer-encoded class.
X = messages[['clean_text']]
y = y_encoded

In [None]:
# Partition the data into train / validation / test sets for load_dataset.

# Step 1: keep 80% of all rows for training, hold the rest out as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=5)

# Step 2: keep 80% of the remaining rows for training, the rest is validation.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=21)

In [None]:
# Attach the encoded labels to each split and write it out as parquet.
# Order is unchanged: train, then test, then validation.
split_outputs = [
    (X_train, y_train, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/train.parquet'),
    (X_test, y_test, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/test.parquet'),
    (X_val, y_val, '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/val.parquet'),
]
for split_frame, split_labels, split_path in split_outputs:
    split_frame['label'] = split_labels
    split_frame.to_parquet(split_path)

## Training Pipeline

In [None]:
# Load the train, test and validation parquet files, keyed by split name.
split_files = {
    'train': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/train.parquet',
    'test': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/test.parquet',
    'validation': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/val.parquet',
}
suicide_dataset = load_dataset('parquet', data_files=split_files)

In [None]:
# Load the pretrained model and its tokenizer for fine-tuning.
hf_card = 'google-bert/bert-base-cased'
# The pretrained head is discarded; a randomly initialized head with 2 classes
# is fine-tuned instead.
model = AutoModelForSequenceClassification.from_pretrained(hf_card, num_labels=2)
model = model.to(device)

# No `device` argument here: it is not a tokenizer-loading option, and extra
# keyword arguments are forwarded to the tokenizer constructor as stray settings.
tokenizer = AutoTokenizer.from_pretrained(hf_card)



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    """Tokenize one batch of dataset rows.

    Args:
        examples: Batch whose "clean_text" entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch, truncated and padded with
        ``padding="max_length"``.
    """
    texts = examples["clean_text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize every split in batches.
tokenized_datasets = suicide_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/148527 [00:00<?, ? examples/s]

Map:   0%|          | 0/46415 [00:00<?, ? examples/s]

Map:   0%|          | 0/37132 [00:00<?, ? examples/s]

In [None]:
# `datasets.load_metric` is deprecated (see the warning it prints); the
# supported way to load a metric is through the `evaluate` library.
metric = evaluate.load("f1")

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded F1 metric.

    Args:
        eval_pred: Pair of (logits, labels) produced during evaluation.

    Returns:
        Whatever ``metric.compute`` returns for the argmax predictions
        against the reference labels.
    """
    logits, labels = eval_pred
    return metric.compute(
        # The predicted class is the index of the largest logit along the last axis.
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both follow the
# step schedule, and the checkpoint with the best eval loss is reloaded at the end.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning',
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-05,
    auto_find_batch_size=True,
    # Evaluation / checkpoint cadence
    evaluation_strategy="steps",
    save_strategy='steps',
    # Best-model selection
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Build the Trainer: fit on the train split, evaluate on the validation split,
# and report the metrics from compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss,F1
500,0.1062,0.296926,0.952824
1000,0.1524,0.126856,0.968174
1500,0.1574,0.12012,0.969311
2000,0.1811,0.140844,0.967239
2500,0.2646,0.317534,0.903269


KeyboardInterrupt: 

In [None]:
# Write the tokenizer files into the training output directory.
tokenizer.save_pretrained('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning')

('/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/tokenizer_config.json',
 '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/special_tokens_map.json',
 '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/vocab.txt',
 '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/added_tokens.json',
 '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/tokenizer.json')

## Model Playground

In [None]:
# Load a fine-tuned checkpoint and try it out through a text-classification
# pipeline. checkpoint-1500 is the step with the lowest validation loss in the
# training log.
hf_card = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/checkpoint-1500'
# The tokenizer was saved to the tuning output directory, not into the
# checkpoint folders (the Trainer was built without a tokenizer), so it is
# loaded from where it was actually written.
tokenizer_dir = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning'


# Human-readable names for the two class ids, stored on the model config so
# the pipeline reports labels instead of raw indices.
id2label = {
    0: 'non-suicide',
    1: 'suicide'
}

label2id = dict((v,k) for k,v in id2label.items())

model = AutoModelForSequenceClassification.from_pretrained(hf_card, id2label=id2label, label2id = label2id)
model = model.to(device)

# No `device` argument: it is not a tokenizer-loading option and would only be
# forwarded to the tokenizer constructor as a stray setting.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)


suicide_classifier = pipeline(
    'text-classification',
    model,
    tokenizer = tokenizer,
    padding = True,
    truncation = True
)

suicide_classifier('Hello World!')

[{'label': 'non-suicide', 'score': 0.9976804256439209}]

## Test

In [None]:
# Reload the fine-tuned checkpoint for evaluation on the test split.
hf_card = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning/checkpoint-1500'
# The tokenizer was saved to the tuning output directory, not into the
# checkpoint folders (the Trainer was built without a tokenizer), so it is
# loaded from where it was actually written.
tokenizer_dir = '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Models/Tuning'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForSequenceClassification.from_pretrained(hf_card)
model = model.to(device)

# No `device` argument: it is not a tokenizer-loading option and would only be
# forwarded to the tokenizer constructor as a stray setting.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [None]:
# Load the train, test and validation parquet files, keyed by split name.
parquet_by_split = {
    'train': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/train.parquet',
    'test': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/test.parquet',
    'validation': '/content/drive/MyDrive/Graduate/Research/SU24/Suicide/Code/Datasets/splits/val.parquet',
}
suicide_dataset = load_dataset('parquet', data_files=parquet_by_split)


In [None]:
# `datasets.load_metric` is deprecated (see the warning it prints); the
# supported way to load a metric is through the `evaluate` library.
metric = evaluate.load("f1")

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
def tokenize_function(examples):
    """Tokenize one batch of dataset rows.

    Args:
        examples: Batch whose "clean_text" entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch, truncated and padded with
        ``padding="max_length"``.
    """
    batch_texts = examples["clean_text"]
    return tokenizer(batch_texts, padding="max_length", truncation=True)


# Tokenize every split in batches.
tokenized_datasets = suicide_dataset.map(tokenize_function, batched=True)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded F1 metric.

    Args:
        eval_pred: Pair of (logits, labels) produced during evaluation.

    Returns:
        Whatever ``metric.compute`` returns for the argmax predictions
        against the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_ids, references=labels)
    return scores

In [None]:
# Trainer used for scoring only: the evaluation set is the held-out test split.
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Evaluate the loaded checkpoint on the Trainer's eval_dataset (the test split).
trainer.evaluate()