In [None]:
!pip install transformers==4.17



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import random
import gc
import os

In [None]:
# Load the 'train' split of the "artem9k/ai-text-detection-pile" dataset.
dataset = load_dataset("artem9k/ai-text-detection-pile")['train']

# Parallel lists of kept texts and their binary labels, plus the set of texts
# already kept (used to drop exact duplicates).
texts, labels, seen = [], [], set()
for sample in dataset:
    # Skip missing, empty, or whitespace-only texts.
    if not sample['text'] or not sample['text'].strip():
        continue
    # Skip texts that have already been kept once.
    if sample['text'] in seen:
        continue
    seen.add(sample['text'])
    texts.append(sample['text'])
    # Label 1 when the source is 'ai', 0 for anything else.
    labels.append(int(sample['source'] == 'ai'))

raw_dataset = Dataset.from_dict({"text": texts, "label": labels})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

(…)-00000-of-00007-bc5952582e004d67.parquet:   0%|          | 0.00/758M [00:00<?, ?B/s]

(…)-00001-of-00007-71c80017bc45f30d.parquet:   0%|          | 0.00/318M [00:00<?, ?B/s]

(…)-00002-of-00007-ee2d43f396e78fbc.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

(…)-00003-of-00007-529931154b42b51d.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

(…)-00004-of-00007-b269dc49374a2c0b.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

(…)-00005-of-00007-3dce5e05ddbad789.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

(…)-00006-of-00007-3d8a471ba0cf1c8d.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1392522 [00:00<?, ? examples/s]

In [None]:
def tokenize(batch):
    """Tokenize a batch of examples to a fixed sequence length.

    Relies on the module-level ``tokenizer``, which ``train_model`` assigns
    before mapping this function over the dataset.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts, with each sequence
        truncated or padded to exactly 256 tokens.
    """
    # Pad to the fixed max_length rather than with padding=True: the latter only
    # pads to the longest text in the current map batch, so rows produced by
    # different batches could have different lengths and fail to stack once
    # the dataset is shuffled and collated.
    return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=256)

In [None]:
def train_model(model_id, name, resume=False, seed=42):
    """Fine-tune a pretrained checkpoint as a 2-class classifier and report on it.

    Tokenizes the module-level ``raw_dataset`` with the tokenizer belonging to
    ``model_id``, holds out 20% of the rows for validation, trains with the
    Hugging Face ``Trainer`` and prints a classification report computed on
    the validation split.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained``.
        name: Label used in the printed messages and in the ``./results-{name}``
            and ``./logs-{name}`` directory names.
        resume: Forwarded unchanged to ``Trainer.train(resume_from_checkpoint=...)``.
        seed: Seed applied to the Python, NumPy and torch RNGs, to the
            train/validation split and to the ``TrainingArguments``.

    Returns:
        None. The report is printed; checkpoints are written under ``./results-{name}``.
    """
    print(f"\n Training {name}...")

    # Seed before the model is created so the newly initialised classification
    # head (and therefore the whole run) is repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # `tokenize` reads the tokenizer from module scope, so publish it there.
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    tokenized = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
    tokenized.set_format("torch")

    train_test = tokenized.train_test_split(test_size=0.2, seed=seed)
    train_dataset = train_test['train']
    val_dataset = train_test['test']

    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    # Resuming is controlled solely by the argument to `trainer.train` below, so
    # the flag is not also passed to TrainingArguments.
    training_args = TrainingArguments(
        output_dir=f"./results-{name}",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=f"./logs-{name}",
        logging_steps=500,
        learning_rate=2e-5,
        report_to="none",
        fp16=torch.cuda.is_available(),
        seed=seed
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # Passing the tokenizer saves it with every checkpoint and lets the
        # Trainer pad each batch to a common length when collating.
        tokenizer=tokenizer
    )
    trainer.train(resume_from_checkpoint=resume)

    preds = trainer.predict(val_dataset)
    y_pred = np.argmax(preds.predictions, axis=1)
    # Use the labels returned alongside the predictions so both arrays are
    # guaranteed to be aligned and are already NumPy arrays.
    y_true = preds.label_ids
    print(f"\n {name} Classification Report:")
    # zero_division=0 reports 0.0 for a class that was never predicted without
    # emitting an undefined-metric warning per average.
    print(classification_report(y_true, y_pred, zero_division=0))

    # Release the model before another one is trained in the same session.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [None]:
# Fine-tune and evaluate the BERT base (uncased) checkpoint.
train_model(model_id="bert-base-uncased", name="BERT")


 Training BERT...


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Map:   0%|          | 0/1385843 [00:00<?, ? examples/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss
1,0.5524,0.732828
2,0.5671,0.847979
3,0.5689,0.892085


***** Running Evaluation *****
  Num examples = 277169
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-69293
Configuration saved in ./results-BERT/checkpoint-69293/config.json
Model weights saved in ./results-BERT/checkpoint-69293/pytorch_model.bin
  ctx_manager = autocast(dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 277169
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-138586
Configuration saved in ./results-BERT/checkpoint-138586/config.json
Model weights saved in ./results-BERT/checkpoint-138586/pytorch_model.bin
  ctx_manager = autocast(dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 277169
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-207879
Configuration saved in ./results-BERT/checkpoint-207879/config.json
Model weights saved in ./results-BERT/checkpoint-207879/pytorch_model.bin
Deleting older checkpoint [results-BERT/checkpoint-138586] due to args.save_total_li


 BERT Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    204442
           1       0.26      1.00      0.42     72727

    accuracy                           0.26    277169
   macro avg       0.13      0.50      0.21    277169
weighted avg       0.07      0.26      0.11    277169



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
