In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

class SuicideCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per access.

    Args:
        comments: Sequence of raw comment texts (list, NumPy array, pandas
            Series, ...). Items are converted with ``str`` before tokenizing.
        labels: Sequence of class names aligned with ``comments``. The value
            ``'suicide'`` is encoded as 1, every other value as 0.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', max_length=..., truncation=True, return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``comments`` and ``labels`` have different lengths.
    """

    def __init__(self, comments, labels, tokenizer, max_length=512):
        # Materialise as a list so __getitem__ always indexes by position.
        # A pandas Series with a shuffled index would otherwise be looked up
        # by label and raise KeyError for the positions a DataLoader requests.
        self.comments = list(comments)
        self.labels = [1 if label == 'suicide' else 0 for label in labels]
        if len(self.comments) != len(self.labels):
            raise ValueError(
                f"comments and labels must have the same length, "
                f"got {len(self.comments)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Tokenize the comment at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'labels'`` tensor.
        """
        comment = str(self.comments[idx])
        label = self.labels[idx]
        tokens = self.tokenizer(
            comment,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        return {
            # flatten() collapses the tokenizer output to 1-D so the DataLoader
            # can stack samples into (batch, max_length).
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Read the CSV and pull out the text / class columns as arrays.
df = pd.read_csv('data/sample.csv')
comments, labels = df['text'].values, df['class'].values

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split repeatable between runs.
(
    train_comments,
    val_comments,
    train_labels,
    val_labels,
) = train_test_split(
    comments,
    labels,
    test_size=0.1,
    random_state=42,
)

# One tokenizer instance is shared by both datasets.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap each split in the tokenizing dataset (train first, then validation).
train_dataset, val_dataset = (
    SuicideCommentsDataset(split_comments, split_labels, tokenizer)
    for split_comments, split_labels in (
        (train_comments, train_labels),
        (val_comments, val_labels),
    )
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SuicideClassifier(pl.LightningModule):
    """DistilBERT sequence classifier wrapped as a LightningModule.

    Args:
        num_labels: Number of output classes for the classification head.
        attn_implementation: Attention backend forwarded to ``from_pretrained``.
            Defaults to ``"flash_attention_2"``. Pass ``None`` to omit the
            argument so the library selects its own default backend.
        learning_rate: Step size handed to the AdamW optimizer.
    """

    def __init__(self, num_labels=2, attn_implementation="flash_attention_2", learning_rate=2e-5):
        super().__init__()
        self.learning_rate = learning_rate
        load_kwargs = {"num_labels": num_labels, "torch_dtype": torch.float32}
        # Only forward the backend when one was requested, so the baseline
        # loads the model exactly as a call without the keyword would.
        if attn_implementation is not None:
            load_kwargs["attn_implementation"] = attn_implementation
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", **load_kwargs
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; the output carries ``loss`` when labels are given."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch):
        """Split ``labels`` out of the batch and run a forward pass with them."""
        inputs = {key: val for key, val in batch.items() if key != 'labels'}
        return self(**inputs, labels=batch['labels'])

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch and log it as ``train_loss``."""
        loss = self._shared_step(batch).loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` (monitored by checkpointing) and ``val_acc`` for one batch."""
        outputs = self._shared_step(batch)
        self.log('val_loss', outputs.loss, prog_bar=True)
        predictions = outputs.logits.argmax(dim=-1)
        val_acc = (predictions == batch['labels']).float().mean()
        self.log('val_acc', val_acc)

    def configure_optimizers(self):
        """Build the AdamW optimizer over all model parameters."""
        # transformers.AdamW is deprecated in favour of torch.optim.AdamW.
        # eps and weight_decay are pinned to the values the transformers
        # implementation used by default so optimisation behaviour is unchanged.
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )


class SuicideClassifier_WithoutFA2(SuicideClassifier):
    """Baseline classifier: same model and training logic, no Flash Attention 2 request.

    Args:
        num_labels: Number of output classes for the classification head.
    """

    def __init__(self, num_labels=2):
        super().__init__(num_labels=num_labels, attn_implementation=None)


In [5]:


# Seed the Python, NumPy and torch RNGs so batch shuffling and the freshly
# initialised classifier head are repeatable from run to run.
pl.seed_everything(42, workers=True)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)

# Model checkpointing: keep only the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')


# Initialize the trainer
trainer = pl.Trainer(
    precision="16-mixed",      # run the forward/backward pass under fp16 autocast
    max_epochs=5,
    limit_train_batches=0.1,   # use 10% of the training batches per epoch
    callbacks=[checkpoint_callback],
    accelerator="auto",
    devices="auto"
)


# Initialize model (requests the Flash Attention 2 backend by default).
# No manual model.to('cuda'): trainer.fit places the module on the device
# selected through accelerator/devices, and a hard-coded 'cuda' would raise
# on a machine without a CUDA device.
model = SuicideClassifier()

# Train the model
trainer.fit(model, train_loader, val_loader)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                                | Params
--------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M
--------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Epoch 4: 100%|██████████| 77/77 [00:41<00:00,  1.85it/s, v_num=2, val_loss=0.199]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 77/77 [00:41<00:00,  1.85it/s, v_num=2, val_loss=0.199]


In [4]:
# Baseline run: classifier without the Flash Attention 2 request, full fp32.
# NOTE(review): this run differs from the "16-mixed" Flash Attention run in
# BOTH precision and attention backend, so the timing gap between the two
# cannot be attributed to Flash Attention alone.

# Seed the Python, NumPy and torch RNGs so batch shuffling and the freshly
# initialised classifier head are repeatable from run to run.
pl.seed_everything(42, workers=True)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)

# Keep only the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')

trainer = pl.Trainer(
    precision=32,
    max_epochs=5,
    limit_train_batches=0.1,   # use 10% of the training batches per epoch
    callbacks=[checkpoint_callback],
    accelerator="auto",
    devices="auto"
)

# Initialize model.
# No manual model.to('cuda'): trainer.fit places the module on the device
# selected through accelerator/devices, and a hard-coded 'cuda' would raise
# on a machine without a CUDA device.
model = SuicideClassifier_WithoutFA2()

# Train the model
trainer.fit(model, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                                | Params
--------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M
--------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Epoch 4: 100%|██████████| 77/77 [02:27<00:00,  0.52it/s, v_num=1, val_loss=0.164]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 77/77 [02:30<00:00,  0.51it/s, v_num=1, val_loss=0.164]
