In [None]:
# Pin transformers to a fixed release so reruns of this notebook use the same library version.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install transformers==4.17



In [None]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): `datasets` is unpinned; consider pinning a release known to work with transformers 4.17.
%pip install datasets



In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import random
import gc
import os

In [None]:
# Pull the 'train' split of the AI-text-detection corpus from the Hugging Face Hub.
dataset = load_dataset("artem9k/ai-text-detection-pile")['train']

# Parallel lists of kept texts and their binary labels, plus a set used to
# reject exact duplicate texts.
texts, labels, seen = [], [], set()
for sample in dataset:
    # Skip rows whose text is missing/empty, whitespace-only, or already kept.
    if not sample['text'] or not sample['text'].strip() or sample['text'] in seen:
        continue
    seen.add(sample['text'])
    texts.append(sample['text'])
    # Label 1 when the row's source is 'ai', 0 for every other source value.
    labels.append(int(sample['source'] == 'ai'))

# In-memory dataset with "text" and "label" columns, consumed by train_model().
raw_dataset = Dataset.from_dict({"text": texts, "label": labels})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def tokenize(batch):
    """Tokenize one batch of examples for `Dataset.map(..., batched=True)`.

    Uses the module-level `tokenizer`, which `train_model` assigns before
    mapping this function over the dataset.

    Args:
        batch: Mapping with a 'text' entry holding the batch's raw strings.

    Returns:
        The tokenizer's output for the batch, with every sequence truncated
        and padded to exactly 256 tokens.
    """
    # Pad to the fixed `max_length` rather than to the longest text of this
    # particular map batch: rows produced by different map batches are later
    # shuffled together, and the default collator can only stack tensors
    # that all share the same length.
    return tokenizer(
        batch['text'],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

In [None]:
def train_model(model_id, name, resume=False, seed=42):
    """Fine-tune a sequence classifier on `raw_dataset` and print a validation report.

    The module-level `raw_dataset` is tokenized with `tokenize`, split 80/20
    into train/validation sets, used to train a 2-label classification model
    for 3 epochs, and the best checkpoint (lowest eval loss) is evaluated on
    the validation split.

    Args:
        model_id: Model identifier passed to the `from_pretrained` loaders.
        name: Short label used in printed messages and in the
            `./results-{name}` and `./logs-{name}` directories.
        resume: `False` trains from scratch. `True` continues from the newest
            checkpoint under `./results-{name}` and falls back to a fresh run
            when no checkpoint exists yet. A string is forwarded to the
            trainer unchanged as an explicit checkpoint path.
        seed: Seed applied to the train/validation split, the model
            initialisation, and the trainer.

    Returns:
        None. The classification report is printed.
    """
    # Imported here so the function does not depend on edits to the
    # notebook's import cell.
    from transformers import set_seed
    from transformers.trainer_utils import get_last_checkpoint

    print(f"\n Training {name}...")

    # `tokenize` reads the tokenizer from module scope, so publish it there.
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    tokenized = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
    tokenized.set_format("torch")

    train_test = tokenized.train_test_split(test_size=0.2, seed=seed)
    train_dataset = train_test['train']
    val_dataset = train_test['test']

    # Seed before building the model so the freshly initialised
    # classification head is reproducible between runs.
    set_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    output_dir = f"./results-{name}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_dir=f"./logs-{name}",
        logging_steps=500,
        report_to="none",
        fp16=torch.cuda.is_available(),
        seed=seed,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Resolve `resume=True` ourselves: handing True to `Trainer.train` raises
    # when the output directory holds no checkpoint (e.g. on a fresh runtime).
    checkpoint = resume if resume else None
    if resume is True:
        checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
        if checkpoint is None:
            print(f" No checkpoint found in {output_dir}; training {name} from scratch.")
    trainer.train(resume_from_checkpoint=checkpoint)

    preds = trainer.predict(val_dataset)
    y_pred = np.argmax(preds.predictions, axis=1)
    # Use the labels returned alongside the predictions so both arrays are
    # guaranteed to be aligned row-for-row.
    y_true = preds.label_ids
    print(f"\n {name} Classification Report:")
    print(classification_report(y_true, y_pred))


In [None]:
# Fine-tune DistilBERT; resume=True asks the trainer to continue from a
# checkpoint saved under ./results-DistilBERT.
train_model(
    model_id="distilbert-base-uncased",
    name="DistilBERT",
    resume=True,
)


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}




 Training DistilBERT...


loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.76ea01b4b85ac16e2cec55c398c

Map:   0%|          | 0/1385843 [00:00<?, ? examples/s]

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a

0it [00:00, ?it/s]

  ctx_manager = autocast(dtype=self.amp_dtype)


Epoch,Training Loss,Validation Loss
3,0.0189,0.084638


***** Running Evaluation *****
  Num examples = 277169
  Batch size = 32
Saving model checkpoint to ./results-DistilBERT/checkpoint-207879
Configuration saved in ./results-DistilBERT/checkpoint-207879/config.json
Model weights saved in ./results-DistilBERT/checkpoint-207879/pytorch_model.bin
Deleting older checkpoint [results-DistilBERT/checkpoint-138586] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results-DistilBERT/checkpoint-69293 (score: 0.07963487505912781).
***** Running Prediction *****
  Num examples = 277169
  Batch size = 32
  ctx_manager = autocast(dtype=self.amp_dtype)



 DistilBERT Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    204442
           1       0.94      0.98      0.96     72727

    accuracy                           0.98    277169
   macro avg       0.97      0.98      0.97    277169
weighted avg       0.98      0.98      0.98    277169

