In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM ,TrainingArguments,Trainer
import torch
from datasets import load_dataset


In [4]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset["train"][0]

{'text': ''}

In [5]:
# from datasets import Dataset

# texts = open("my_corpus.txt").read().split("\n\n")
# dataset = Dataset.from_dict({"text": texts})


Tokenize Text for GPT-2

In [6]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, use EOS

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map: 100%|██████████| 4358/4358 [00:00<00:00, 6542.15 examples/s]
Map: 100%|██████████| 36718/36718 [00:03<00:00, 10085.97 examples/s]
Map: 100%|██████████| 3760/3760 [00:00<00:00, 17137.19 examples/s]


Load GPT-2 Model

In [8]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(2000)),  # small subset for speed
    eval_dataset=tokenized_datasets["validation"].select(range(500)),
)
