In [1]:
import numpy as np
import pandas as pd
import opendatasets as od

import os
import warnings
import torch

import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
np.random.seed(42)

In [2]:
data_path_dict = {
    'data_path': '../data',
    'model_data': '../data/model_data',
    'data_subset': '../data/model_data/data_subset',
    'models': '../data/models'
}

for file_path_key in data_path_dict:
    if not os.path.exists(data_path_dict[file_path_key]):
        print(f'Path does not Exist: {data_path_dict[file_path_key]}')

        os.makedirs(data_path_dict[file_path_key])

In [3]:
if not (os.path.exists('../data/human-vs-llm-text-corpus') or os.path.exists('../data/human_vs_llm_text_corpus')):
    od.download(dataset_id_or_url="https://www.kaggle.com/datasets/starblasters8/human-vs-llm-text-corpus", data_dir='../data/')
    os.rename('../data/human-vs-llm-text-corpus/', '../data/human_vs_llm_text_corpus/')

In [4]:
raw_data_path = '../data/human_vs_llm_text_corpus/data.parquet'
data_subset_path = '../data/model_data/data_subset/'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
raw_data = pd.read_parquet(raw_data_path)

raw_data = raw_data[raw_data['source'].isin(['Human', 'GPT-3.5'])]
raw_data = raw_data.groupby('source').apply(lambda row: row.sample(n=26000)).reset_index(drop=True)

raw_data['source'] = np.where(raw_data['source'] == 'Human', 0, 1)

X_train, X_test, y_train, y_test = train_test_split(raw_data['text'], raw_data['source'], test_size=0.02, stratify=raw_data['source'], random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.02, stratify=y_train, random_state=42)

X_train.shape, X_test.shape, X_val.shape

((49940,), (1040,), (1020,))

In [6]:
def count_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable_params, non_trainable_params

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_length=1024, is_split_into_words=True)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

for param in model.parameters():
    param.requires_grad = False

for param in model.roberta.encoder.layer[-2:].parameters():
    param.requires_grad = True

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 124,647,170
Non-trainable parameters: 0
Trainable parameters: 14,175,744
Non-trainable parameters: 110,471,426


In [7]:
train_dataset = datasets.Dataset.from_dict({'text': X_train, 'labels': y_train})
val_dataset = datasets.Dataset.from_dict({'text': X_val, 'labels': y_val})
test_dataset = datasets.Dataset.from_dict({'text': X_test, 'labels': y_test})

def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenization, batched=True, batch_size=128)
val_dataset = val_dataset.map(tokenization, batched=True, batch_size=128)
test_dataset = test_dataset.map(tokenization, batched=True, batch_size=128)

Map:   0%|          | 0/49940 [00:00<?, ? examples/s]

Map:   0%|          | 0/1020 [00:00<?, ? examples/s]

Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

In [8]:
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [10]:
training_args = TrainingArguments(
    output_dir='../data/models/roberta_base_model_checkpoint', num_train_epochs=5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
    gradient_accumulation_steps=8, disable_tqdm=False, load_best_model_at_end=True, overwrite_output_dir=True,
    logging_steps=8, fp16 = True, logging_dir='../model_logs', dataloader_num_workers = 8,
    run_name = 'roberta-base-classification-v1', evaluation_strategy='epoch', save_strategy='epoch')

In [11]:
trainer = Trainer(
    model=model, args=training_args, compute_metrics=compute_metrics,
    train_dataset=train_dataset, eval_dataset=val_dataset)

In [12]:
trainer.train()

  0%|          | 0/3900 [00:00<?, ?it/s]

{'loss': 0.6851, 'grad_norm': 0.6409693360328674, 'learning_rate': 4.9897435897435904e-05, 'epoch': 0.01}
{'loss': 0.6444, 'grad_norm': 1.40797758102417, 'learning_rate': 4.979487179487179e-05, 'epoch': 0.02}
{'loss': 0.5061, 'grad_norm': 2.27135968208313, 'learning_rate': 4.969230769230769e-05, 'epoch': 0.03}
{'loss': 0.3192, 'grad_norm': 1.144919753074646, 'learning_rate': 4.9589743589743594e-05, 'epoch': 0.04}
{'loss': 0.2051, 'grad_norm': 3.2549245357513428, 'learning_rate': 4.948717948717949e-05, 'epoch': 0.05}
{'loss': 0.1739, 'grad_norm': 1.5920976400375366, 'learning_rate': 4.9384615384615384e-05, 'epoch': 0.06}
{'loss': 0.1643, 'grad_norm': 2.354426860809326, 'learning_rate': 4.9282051282051285e-05, 'epoch': 0.07}
{'loss': 0.1639, 'grad_norm': 2.9242634773254395, 'learning_rate': 4.917948717948718e-05, 'epoch': 0.08}
{'loss': 0.1668, 'grad_norm': 0.9624106287956238, 'learning_rate': 4.907692307692308e-05, 'epoch': 0.09}
{'loss': 0.0959, 'grad_norm': 2.321692705154419, 'learnin

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.09480076283216476, 'eval_accuracy': 0.9715686274509804, 'eval_f1': 0.9722488038277513, 'eval_precision': 0.9495327102803738, 'eval_recall': 0.996078431372549, 'eval_runtime': 27.7572, 'eval_samples_per_second': 36.747, 'eval_steps_per_second': 4.611, 'epoch': 1.0}
{'loss': 0.0374, 'grad_norm': 1.575455665588379, 'learning_rate': 3.996153846153846e-05, 'epoch': 1.0}
{'loss': 0.0326, 'grad_norm': 0.11803194135427475, 'learning_rate': 3.985897435897436e-05, 'epoch': 1.01}
{'loss': 0.0444, 'grad_norm': 6.529575824737549, 'learning_rate': 3.975641025641026e-05, 'epoch': 1.03}
{'loss': 0.0237, 'grad_norm': 0.38208717107772827, 'learning_rate': 3.9653846153846156e-05, 'epoch': 1.04}
{'loss': 0.0275, 'grad_norm': 0.7189464569091797, 'learning_rate': 3.955128205128205e-05, 'epoch': 1.05}
{'loss': 0.0258, 'grad_norm': 1.602541208267212, 'learning_rate': 3.944871794871795e-05, 'epoch': 1.06}
{'loss': 0.0478, 'grad_norm': 1.2721970081329346, 'learning_rate': 3.9346153846153847e-05,

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.12702469527721405, 'eval_accuracy': 0.961764705882353, 'eval_f1': 0.9630331753554502, 'eval_precision': 0.9321100917431193, 'eval_recall': 0.996078431372549, 'eval_runtime': 27.8389, 'eval_samples_per_second': 36.639, 'eval_steps_per_second': 4.598, 'epoch': 2.0}
{'loss': 0.0151, 'grad_norm': 4.799314975738525, 'learning_rate': 2.991025641025641e-05, 'epoch': 2.01}
{'loss': 0.0065, 'grad_norm': 1.7374595403671265, 'learning_rate': 2.9807692307692308e-05, 'epoch': 2.02}
{'loss': 0.0145, 'grad_norm': 0.22422803938388824, 'learning_rate': 2.9705128205128206e-05, 'epoch': 2.03}
{'loss': 0.0176, 'grad_norm': 1.4680359363555908, 'learning_rate': 2.9602564102564108e-05, 'epoch': 2.04}
{'loss': 0.0122, 'grad_norm': 0.010903277434408665, 'learning_rate': 2.95e-05, 'epoch': 2.05}
{'loss': 0.0292, 'grad_norm': 0.23019663989543915, 'learning_rate': 2.93974358974359e-05, 'epoch': 2.06}
{'loss': 0.0289, 'grad_norm': 3.4860281944274902, 'learning_rate': 2.9294871794871798e-05, 'epoch'

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.04725463688373566, 'eval_accuracy': 0.9833333333333333, 'eval_f1': 0.9835430784123911, 'eval_precision': 0.97131931166348, 'eval_recall': 0.996078431372549, 'eval_runtime': 27.8102, 'eval_samples_per_second': 36.677, 'eval_steps_per_second': 4.603, 'epoch': 3.0}
{'loss': 0.0093, 'grad_norm': 0.0653543621301651, 'learning_rate': 1.9974358974358975e-05, 'epoch': 3.0}
{'loss': 0.0104, 'grad_norm': 0.09557028114795685, 'learning_rate': 1.987179487179487e-05, 'epoch': 3.01}
{'loss': 0.0039, 'grad_norm': 2.573183298110962, 'learning_rate': 1.976923076923077e-05, 'epoch': 3.02}
{'loss': 0.0188, 'grad_norm': 6.7460455894470215, 'learning_rate': 1.9666666666666666e-05, 'epoch': 3.03}
{'loss': 0.0067, 'grad_norm': 5.2027106285095215, 'learning_rate': 1.9564102564102564e-05, 'epoch': 3.04}
{'loss': 0.0092, 'grad_norm': 1.425170660018921, 'learning_rate': 1.9461538461538462e-05, 'epoch': 3.05}
{'loss': 0.0065, 'grad_norm': 0.03341037034988403, 'learning_rate': 1.935897435897436e-05

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.06667043268680573, 'eval_accuracy': 0.9833333333333333, 'eval_f1': 0.9835748792270531, 'eval_precision': 0.9695238095238096, 'eval_recall': 0.9980392156862745, 'eval_runtime': 27.7842, 'eval_samples_per_second': 36.712, 'eval_steps_per_second': 4.607, 'epoch': 4.0}
{'loss': 0.0061, 'grad_norm': 0.6552647352218628, 'learning_rate': 9.923076923076923e-06, 'epoch': 4.01}
{'loss': 0.0203, 'grad_norm': 0.27809083461761475, 'learning_rate': 9.820512820512821e-06, 'epoch': 4.02}
{'loss': 0.0083, 'grad_norm': 0.062367238104343414, 'learning_rate': 9.71794871794872e-06, 'epoch': 4.03}
{'loss': 0.0059, 'grad_norm': 0.008710673078894615, 'learning_rate': 9.615384615384616e-06, 'epoch': 4.04}
{'loss': 0.0037, 'grad_norm': 0.30105650424957275, 'learning_rate': 9.512820512820514e-06, 'epoch': 4.05}
{'loss': 0.0155, 'grad_norm': 2.5642073154449463, 'learning_rate': 9.41025641025641e-06, 'epoch': 4.06}
{'loss': 0.0076, 'grad_norm': 0.05147373676300049, 'learning_rate': 9.30769230769230

  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 0.12530921399593353, 'eval_accuracy': 0.9735294117647059, 'eval_f1': 0.9741626794258373, 'eval_precision': 0.9514018691588785, 'eval_recall': 0.9980392156862745, 'eval_runtime': 28.8652, 'eval_samples_per_second': 35.337, 'eval_steps_per_second': 4.434, 'epoch': 5.0}
{'train_runtime': 4119.4413, 'train_samples_per_second': 60.615, 'train_steps_per_second': 0.947, 'train_loss': 0.031737986804965215, 'epoch': 5.0}


TrainOutput(global_step=3900, training_loss=0.031737986804965215, metrics={'train_runtime': 4119.4413, 'train_samples_per_second': 60.615, 'train_steps_per_second': 0.947, 'total_flos': 6.566830964097024e+16, 'train_loss': 0.031737986804965215, 'epoch': 4.997597308986064})

In [17]:
model = RobertaForSequenceClassification.from_pretrained('../data/models/roberta_base_model_checkpoint/checkpoint-3900')

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

for param in model.parameters():
    param.requires_grad = True

trainable_params, non_trainable_params = count_parameters(model)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {non_trainable_params:,}")

Trainable parameters: 124,647,170
Non-trainable parameters: 0
Trainable parameters: 124,647,170
Non-trainable parameters: 0


In [18]:
trainer = Trainer(model=model, compute_metrics=compute_metrics)

In [19]:
trainer.evaluate(test_dataset, metric_key_prefix='test')

  0%|          | 0/130 [00:00<?, ?it/s]

{'test_loss': 0.20604324340820312,
 'test_accuracy': 0.9576923076923077,
 'test_f1': 0.959409594095941,
 'test_precision': 0.9219858156028369,
 'test_recall': 1.0,
 'test_runtime': 11.5177,
 'test_samples_per_second': 90.296,
 'test_steps_per_second': 11.287}

In [20]:
trainer.evaluate(train_dataset, metric_key_prefix='train')

  0%|          | 0/6243 [00:00<?, ?it/s]

{'train_loss': 0.11609598249197006,
 'train_accuracy': 0.9731678013616339,
 'train_f1': 0.9738505971430802,
 'train_precision': 0.9496840983481769,
 'train_recall': 0.9992791349619543,
 'train_runtime': 538.3044,
 'train_samples_per_second': 92.773,
 'train_steps_per_second': 11.598}