In [None]:
!pip install transformers==4.17
!pip install datasets

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import random
import gc
import os


Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl.metadata (67 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m61.4/67.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from transformers==4.17)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses, transforme

In [None]:
# Load the train split and keep the first occurrence of every non-blank text.
# Rows whose `source` is 'ai' get label 1; every other source gets label 0.
print("\n🔹 Loading and deduplicating dataset...")
dataset = load_dataset("artem9k/ai-text-detection-pile")["train"]

texts = []
labels = []
seen = set()
for sample in dataset:
    text = sample['text']
    # Skip missing or whitespace-only texts and exact duplicates.
    if not text or not text.strip() or text in seen:
        continue
    seen.add(text)
    texts.append(text)
    labels.append(int(sample['source'] == 'ai'))



🔹 Loading and deduplicating dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

(…)-00000-of-00007-bc5952582e004d67.parquet:   0%|          | 0.00/758M [00:00<?, ?B/s]

(…)-00001-of-00007-71c80017bc45f30d.parquet:   0%|          | 0.00/318M [00:00<?, ?B/s]

(…)-00002-of-00007-ee2d43f396e78fbc.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

(…)-00003-of-00007-529931154b42b51d.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

(…)-00004-of-00007-b269dc49374a2c0b.parquet:   0%|          | 0.00/137M [00:00<?, ?B/s]

(…)-00005-of-00007-3dce5e05ddbad789.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

(…)-00006-of-00007-3d8a471ba0cf1c8d.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1392522 [00:00<?, ? examples/s]

In [None]:
# Balance the two classes by down-sampling the larger one, then shuffle.
# A dedicated, seeded RNG makes the sampled subset and its order identical on
# every Restart & Run All; the module-level `random` state is left untouched.
BALANCE_SEED = 42
balance_rng = random.Random(BALANCE_SEED)

ai_texts = [t for t, l in zip(texts, labels) if l == 1]
human_texts = [t for t, l in zip(texts, labels) if l == 0]
min_len = min(len(ai_texts), len(human_texts))
if min_len == 0:
    # Without this guard, zip(*combined) below fails with an opaque
    # "not enough values to unpack" ValueError.
    raise ValueError(
        f"Cannot balance dataset: {len(ai_texts)} AI texts and "
        f"{len(human_texts)} human texts; both classes need at least one sample."
    )

ai_texts = balance_rng.sample(ai_texts, min_len)
human_texts = balance_rng.sample(human_texts, min_len)

balanced_texts = ai_texts + human_texts
balanced_labels = [1] * min_len + [0] * min_len
combined = list(zip(balanced_texts, balanced_labels))
balance_rng.shuffle(combined)
# Unzip back into two parallel lists (zip(*...) alone would yield tuples).
balanced_texts, balanced_labels = (list(column) for column in zip(*combined))

raw_dataset = Dataset.from_dict({"text": balanced_texts, "label": balanced_labels})


In [None]:
def tokenize(batch):
    """Encode a batch of texts with the module-level ``tokenizer``.

    The ``tokenizer`` global is assigned by ``train_model`` before this
    function is mapped over the dataset, so it must not be called earlier.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        The result of the tokenizer call for those texts.
    """
    # Pad every row to the fixed max_length instead of to the longest text in
    # the current batch. With padding=True each map() batch is padded to its
    # own longest member, so rows from different batches only share a length
    # when every batch happens to contain a text that reaches the truncation
    # limit; fixed-length padding makes the row length uniform by construction.
    return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=256)

def train_model(model_id, name, resume=False):
    """Fine-tune a pretrained checkpoint as a 2-class classifier and print validation metrics.

    Tokenizes the module-level ``raw_dataset``, holds out 20% of it for
    validation, trains for three epochs with ``Trainer`` and prints a
    scikit-learn classification report for the held-out split.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        name: Label used in the printed messages and in the output and
            logging directory names.
        resume: Forwarded to ``Trainer.train(resume_from_checkpoint=...)``.

    Returns:
        None. Checkpoints are written under ``./results-{name}``.

    Side effects:
        Rebinds the module-level ``tokenizer`` that ``tokenize`` reads.
    """
    # Local import keeps this function self-contained with respect to the
    # notebook's import cell.
    from transformers import set_seed

    seed = 42

    print(f"\n Training {name}...")

    # `tokenize` looks the tokenizer up as a global, so it has to be published
    # at module level before the map() call below.
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    tokenized = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
    tokenized.set_format("torch")

    train_test = tokenized.train_test_split(test_size=0.2, seed=seed)
    train_dataset = train_test['train']
    val_dataset = train_test['test']

    # Seed before instantiating the model: the classification head is not part
    # of the pretrained checkpoint and is randomly initialised here, before the
    # Trainer exists, so any seeding the Trainer does comes too late to make
    # that initialisation repeatable.
    set_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    # `resume_from_checkpoint` is intentionally not set on TrainingArguments:
    # that field expects a checkpoint path (not a bool) and is not consumed by
    # Trainer; the resume flag is handed to trainer.train() instead.
    training_args = TrainingArguments(
        output_dir=f"./results-{name}",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=f"./logs-{name}",
        logging_steps=500,
        report_to="none",
        fp16=torch.cuda.is_available(),
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train(resume_from_checkpoint=resume)

    preds = trainer.predict(val_dataset)
    y_pred = np.argmax(preds.predictions, axis=1)
    # Use the labels gathered during prediction: they are aligned with
    # `preds.predictions`, and this avoids materialising the torch-formatted
    # dataset column a second time.
    y_true = preds.label_ids
    print(f"\n📊 {name} Classification Report:")
    print(classification_report(y_true, y_pred))


In [None]:
# Start a fresh BERT fine-tuning run rather than resuming from a checkpoint.
train_model(model_id="bert-base-uncased", name="BERT", resume=False)


 Training BERT...


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Map:   0%|          | 0/725532 [00:00<?, ? examples/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss
1,0.1886,0.148485
2,0.1473,0.130074
3,0.078,0.142492


***** Running Evaluation *****
  Num examples = 145107
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-36277
Configuration saved in ./results-BERT/checkpoint-36277/config.json
Model weights saved in ./results-BERT/checkpoint-36277/pytorch_model.bin
  ctx_manager = autocast(dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 145107
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-72554
Configuration saved in ./results-BERT/checkpoint-72554/config.json
Model weights saved in ./results-BERT/checkpoint-72554/pytorch_model.bin
  ctx_manager = autocast(dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 145107
  Batch size = 32
Saving model checkpoint to ./results-BERT/checkpoint-108831
Configuration saved in ./results-BERT/checkpoint-108831/config.json
Model weights saved in ./results-BERT/checkpoint-108831/pytorch_model.bin
Deleting older checkpoint [results-BERT/checkpoint-36277] due to args.save_total_limit



📊 BERT Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97     72483
           1       0.96      0.97      0.97     72624

    accuracy                           0.97    145107
   macro avg       0.97      0.97      0.97    145107
weighted avg       0.97      0.97      0.97    145107

