In [19]:
import os
# Set WANDB_DISABLED before the transformers import below.
# NOTE(review): presumably this keeps the Weights & Biases integration from
# starting a run during training — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset
from transformers import (
    GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)

# 1. Load dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Keep only the training lines that contain something other than whitespace;
# this list is the corpus the tokenizer is trained on.
train_lines = dataset["train"]["text"]
texts = [line for line in train_lines if line.strip()]



In [56]:
# 2. Train tokenizer
# The pretrained GPT-2 tokenizer is used only as a template: a new
# 8000-entry vocabulary is learned from the non-empty WikiText lines in `texts`.
base_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer = base_tokenizer.train_new_from_iterator(texts, vocab_size=8000)

# Register "<pad>" as a real vocabulary entry. Plain attribute assignment
# (`tokenizer.pad_token = "<pad>"`) does not add the token to the freshly
# trained vocabulary, so its id lookup falls back to the unknown-token id and
# padding becomes indistinguishable from a real token.
# This grows the vocabulary by one; anything sized from len(tokenizer)
# (such as the model config built later) picks the extra entry up automatically.
tokenizer.add_special_tokens({"pad_token": "<pad>"})

# Sanity check: the pad id must round-trip to the pad token itself.
assert tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id) == "<pad>"

# 3. Tokenize dataset
def tok_fn(examples):
    """Tokenize a batch of rows with the notebook-level `tokenizer`.

    Reads `examples["text"]` and returns the tokenizer output unchanged.
    Every sequence is truncated to at most 128 tokens and padded up to
    exactly 128 (`padding="max_length"`).
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Drop rows whose text is empty or whitespace-only before tokenizing.
train_nonempty = dataset["train"].filter(lambda row: bool(row["text"].strip()))

# Tokenize in batches and discard the raw "text" column afterwards.
tokenized = train_nonempty.map(tok_fn, batched=True, remove_columns=["text"])

# Expose only the two model-input columns, formatted as torch tensors.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

# 4. Build small GPT-2 model
# The model is created from a config (randomly initialised, no pretrained
# weights are loaded here). The embedding size follows the tokenizer length
# and the pad id is taken from the tokenizer.
# NOTE(review): only vocab_size and pad_token_id are overridden; every other
# hyperparameter falls back to GPT2Config's defaults — confirm that this is
# the intended "small" size.
config_overrides = {
    "vocab_size": len(tokenizer),
    "pad_token_id": tokenizer.pad_token_id,
}
config = GPT2Config(**config_overrides)
model = GPT2LMHeadModel(config)

# 5. Train
# Language-modeling collator with masked-LM masking switched off (mlm=False).
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# One epoch, batch size 4 per device, a log line every 50 steps,
# no reporting integrations, outputs written under "out".
args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=50,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
)
trainer.train()


Map:   0%|          | 0/23767 [00:00<?, ? examples/s]



Step,Training Loss


KeyboardInterrupt: 