In [None]:
!pip install torchmetrics --quiet
!pip install lightning --quiet
!pip install transformers --quiet
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install wandb --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m834.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive so the dataset CSVs under /content/drive/ are readable.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
import sys
import pathlib

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR, CyclicLR
from torch.utils.data import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import get_linear_schedule_with_warmup
from datasets import Dataset, DatasetDict

import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import LearningRateMonitor
from torchmetrics.classification import BinaryF1Score, BinaryAccuracy

In [None]:
# Hugging Face model id used for both the tokenizer and the classifier.
backbone = 'allenai/longformer-base-4096'

In [None]:
# Weights & Biases logger bound to the 'LLM-Detection' project.
wandb_logger = WandbLogger(project='LLM-Detection')
# Callback that records the learning rate at every optimizer step.
lr_monitor = LearningRateMonitor(logging_interval='step')

In [None]:
# Training hyperparameters.
# NOTE(review): NUM_EPOCHS is not referenced by the Trainer call visible in this
# notebook, which hard-codes max_epochs=1 — confirm which value is intended.
NUM_EPOCHS = 2
learning_rate = 2e-5  # AdamW learning rate passed to LLMDetectModel
decay = 0.01  # AdamW weight decay passed to LLMDetectModel

In [None]:
class TokenizeMapWrapper:
    """Callable adapter that tokenizes selected columns of a record.

    For every configured feature the tokenizer is called on ``batch[feature]``
    and each entry of its result is written back into the same record under
    the key ``<feature>_<key>``.
    """

    def __init__(self, tokenizer, features, option=None):
        """Store the tokenizer, the columns to process and the call options.

        Args:
            tokenizer: Callable invoked as ``tokenizer(value, **option)``; its
                result must support ``keys()`` and item access.
            features: Iterable of column names to tokenize.
            option: Keyword arguments forwarded to the tokenizer. ``None``
                selects ``max_length=4096`` with truncation and padding to
                that length.
        """
        self.tokenizer = tokenizer
        self.features = features
        self.option = (
            dict(max_length=4096, truncation=True, padding='max_length')
            if option is None
            else option
        )

    def __call__(self, batch):
        """Tokenize each configured column of ``batch`` in place and return it."""
        for column in self.features:
            encoded = self.tokenizer(batch[column], **self.option)
            prefix = column + '_'
            for name in encoded.keys():
                batch[prefix + name] = encoded[name]
        return batch

    def __repr__(self):
        """Return a short description naming the class and the wrapped tokenizer."""
        return f'{self.__class__.__name__}(tokenizer={self.tokenizer})'

In [None]:
class LLMDetectDataModule(L.LightningDataModule):
    """Lightning data module that splits, tokenizes and serves the training frame.

    ``train_df`` is converted to a ``datasets.Dataset`` and split into
    ``train`` / ``test`` parts. Each part is tokenized lazily the first time
    the matching stage is set up, then served as torch-formatted batches.

    Args:
        backbone: Model id handed to ``AutoTokenizer.from_pretrained``.
        train_df: pandas DataFrame holding the labelled data. The tokenizer
            wrapper reads its ``'text'`` and ``'instructions'`` columns.
        pred_df: pandas DataFrame converted to ``self.pred_dataset``. No
            dataloader defined in this class consumes it.
        batch_size: Batch size for both dataloaders.
        test_size: Fraction (or count) of rows held out as the ``test`` split;
            forwarded to ``Dataset.train_test_split``.
        seed: Optional seed forwarded to ``Dataset.train_test_split`` to make
            the split reproducible. ``None`` keeps the split random.
    """

    def __init__(self, backbone, train_df, pred_df, batch_size=1, test_size=0.1, seed=None):
        super().__init__()
        self.backbone = backbone
        self.tokenizer = AutoTokenizer.from_pretrained(self.backbone)
        self.train_df = train_df
        self.pred_df = pred_df
        self.batch_size = batch_size
        self.train_dataset = Dataset.from_pandas(self.train_df)
        self.dataset_dict = self.train_dataset.train_test_split(test_size=test_size, seed=seed)
        self.pred_dataset = Dataset.from_pandas(self.pred_df)
        self.tokenizer_wrapper = TokenizeMapWrapper(self.tokenizer, ['text', 'instructions'])
        # Names of the splits that have already been tokenized, so repeated
        # setup() calls do not redo the 4096-token padding pass.
        self._tokenized_splits = set()

    # NOTE(review): Lightning treats `prepare_data_per_node` as a boolean flag
    # assigned in the base-class initialiser, so this no-op method is presumably
    # shadowed by that instance attribute — confirm, then remove it.
    def prepare_data_per_node(self):
        pass

    def _tokenize_split(self, split):
        """Tokenize ``self.dataset_dict[split]`` once; later calls are no-ops."""
        if split in self._tokenized_splits:
            return
        self.dataset_dict[split] = self.dataset_dict[split].map(self.tokenizer_wrapper)
        self._tokenized_splits.add(split)

    def setup(self, stage=None):
        """Tokenize the split(s) required by ``stage``.

        ``'fit'`` prepares the train split and ``'test'`` the held-out split.
        ``None`` (the default, e.g. a manual ``setup()`` call) prepares both,
        so the dataloaders never serve untokenized rows. Other stages do
        nothing.
        """
        if stage in (None, 'fit'):
            self._tokenize_split('train')
        if stage in (None, 'test'):
            self._tokenize_split('test')

    def train_dataloader(self):
        """Return a DataLoader over the torch-formatted train split."""
        return DataLoader(self.dataset_dict['train'].with_format("torch"), batch_size=self.batch_size)

    def test_dataloader(self):
        """Return a DataLoader over the torch-formatted held-out split."""
        return DataLoader(self.dataset_dict['test'].with_format("torch"), batch_size=self.batch_size)

In [None]:
class LLMDetectModel(L.LightningModule):
    """Lightning module that fine-tunes a sequence classifier to flag generated text.

    The wrapped Hugging Face model is assumed to expose two output logits;
    column 1 is treated as the "generated" class. Metrics are computed from
    the softmax probability of that class, so the 0.5 threshold used by the
    binary metrics agrees with the model's argmax prediction.

    Args:
        backbone: Model id passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        lr: Peak learning rate for AdamW; the scheduler anneals down to
            ``lr / 10``.
        weight_decay: AdamW weight decay.
        t_max: ``T_max`` of the per-step ``CosineAnnealingLR`` schedule, i.e.
            the number of optimizer steps per half cosine cycle.
    """

    def __init__(self, backbone, lr, weight_decay, t_max=50):
        super().__init__()
        self.backbone = backbone
        self.lr = lr
        self.weight_decay = weight_decay
        self.t_max = t_max
        self.model = AutoModelForSequenceClassification.from_pretrained(backbone)

        self.accuracy_metric = BinaryAccuracy()
        self.f1_metric = BinaryF1Score()

        # Per-batch results collected by test_step and reduced in on_test_epoch_end.
        self.test_step_outputs = []

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped Hugging Face model."""
        return self.model(**kwargs)

    def configure_optimizers(self):
        """Build AdamW with a cosine-annealing schedule that is stepped every batch."""
        optimizer = AdamW(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = CosineAnnealingLR(optimizer, T_max=self.t_max, eta_min=self.lr / 10)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step',
            },
        }

    def _shared_step(self, batch):
        """Run the model on one batch; return (loss, P(generated), labels)."""
        y = batch['generated']
        outputs = self(
            input_ids=batch['text_input_ids'],
            attention_mask=batch['text_attention_mask'],
            labels=y,
        )
        # Softmax turns the two logits into probabilities; a raw logit or
        # exp(logit) is not a probability and would be thresholded incorrectly.
        prob_generated = torch.softmax(outputs.logits, dim=-1)[:, 1]
        return outputs.loss, prob_generated, y

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log loss, accuracy and F1."""
        loss, prob_generated, y = self._shared_step(batch)
        # Pass the batch size explicitly: the batch also carries non-tensor
        # columns, which makes Lightning's automatic inference ambiguous.
        batch_size = y.size(0)

        probs = prob_generated.detach()
        accuracy = self.accuracy_metric(probs, y)
        f1 = self.f1_metric(probs, y)

        self.log('train_loss', loss, on_epoch=True, batch_size=batch_size)
        self.log('accuracy', accuracy, on_epoch=True, batch_size=batch_size)
        self.log('F1', f1, on_epoch=True, batch_size=batch_size)

        return loss

    def test_step(self, batch, batch_idx):
        """Evaluate one batch and stash its loss, probabilities and labels."""
        loss, prob_generated, y = self._shared_step(batch)

        test_output = {
            'test_loss': loss.detach(),
            'pred': prob_generated.detach(),
            'y': y,
        }
        self.test_step_outputs.append(test_output)
        return test_output

    def on_test_epoch_end(self):
        """Reduce the collected test batches into loss, accuracy and F1 logs."""
        if not self.test_step_outputs:
            return

        # cat (not stack) so a smaller final batch does not break the reduction.
        pred = torch.cat([x['pred'] for x in self.test_step_outputs])
        ground_truth = torch.cat([x['y'] for x in self.test_step_outputs])
        # Unweighted mean of the per-batch losses.
        test_loss = torch.stack([x['test_loss'] for x in self.test_step_outputs]).mean()

        self.log('test_loss', test_loss)
        self.log('test_accuracy', self.accuracy_metric(pred, ground_truth))
        self.log('test_f1', self.f1_metric(pred, ground_truth))
        self.test_step_outputs.clear()


In [None]:
# Both CSVs live in the same Drive folder; keep that location in one place.
DATA_DIR = pathlib.Path('/content/drive/MyDrive/projects/detect-LLM/llm-detect-dataset')

train_df = pd.read_csv(DATA_DIR / 'train_data.csv')
test_df = pd.read_csv(DATA_DIR / 'test_essays.csv')

In [None]:
# Build the data module. `train_df` is split into train / held-out parts inside
# the module; `test_df` (test_essays.csv) is passed as the prediction frame.
datamodule = LLMDetectDataModule(backbone, train_df, test_df, batch_size=1)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Instantiate the classifier with the hyperparameters defined earlier.
model = LLMDetectModel(backbone=backbone, lr=learning_rate, weight_decay=decay)

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pass the W&B logger explicitly: without `logger=` the Trainer falls back to
# its default logger, so `wandb_logger` and the learning-rate curve recorded
# by `lr_monitor` never reach Weights & Biases.
# NOTE(review): max_epochs is hard-coded to 1 although NUM_EPOCHS = 2 is defined
# in the configuration cell — confirm which value is intended.
trainer = L.Trainer(max_epochs=1, accelerator='gpu', logger=wandb_logger, callbacks=[lr_monitor])

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Fine-tune the model on the data module's train split.
trainer.fit(model, datamodule=datamodule)



Map:   0%|          | 0/1240 [00:00<?, ? examples/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name            | Type                                | Params
------------------------------------------------------------------------
0 | model           | LongformerForSequenceClassification | 148 M 
1 | accuracy_metric | BinaryAccuracy                      | 0     
2 | f1_metric       | BinaryF1Score                       | 0     
------------------------------------------------------------------------
148 M     Trainable params
0         Non-trainable params
148 M     Total params
594.644   Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name            | Type                                | Params
------------------------------------------------------------------------
0 | model           | LongformerForSequenceClassification | 148 M 
1 | accuracy_metric | BinaryAccuracy                      | 0  

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
INFO: `Trainer.fit` stopped: `max_epochs=1` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# Evaluate the fine-tuned model on the split held out from the training frame.
trainer.test(model, datamodule=datamodule)

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.0005710849654860795, 'test_accuracy': 1.0, 'test_f1': 0.0}]