In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2Model

# GPT-2 tokenizer; the end-of-sequence token is reused as the pad token
# so that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# AG News dataset, loaded by name via `datasets.load_dataset`.
dataset = load_dataset('ag_news')

In [None]:
# Tokenize the raw text ahead of training
def preprocess_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 256 tokens.

    Padding is deliberately NOT applied here. This notebook hands a
    ``DataCollatorWithPadding`` to the ``Trainer``, which pads every batch
    to the length of its own longest sequence at collation time. Padding
    each row to 256 tokens up front would leave that collator with nothing
    to do and make every batch carry the full fixed length regardless of
    how short its texts are.

    Args:
        examples: A batch as supplied by ``dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer's output for the batch, unpadded and truncated to
        ``max_length=256``.
    """
    return tokenizer(examples['text'], truncation=True, max_length=256)

# Run the tokenization over the dataset in batches, then rename the
# "label" column to "labels".
tokenized_dataset = (
    dataset
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
)

In [None]:
from transformers import GPT2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# GPT-2 has no default pad token, so the end-of-sequence token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

# GPT-2 with a sequence-classification head configured for 4 labels.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=4)

# Record the padding token id on the model config as well.
model.config.pad_token_id = tokenizer.eos_token_id

# Padding-aware collator built from the tokenizer; passed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.
            ``logits`` must support ``.argmax(axis=-1)``
            (presumably a NumPy array of per-class scores — TODO confirm).

    Returns:
        A dict with the single key ``"accuracy"`` holding the result of
        ``accuracy_score(labels, predictions)``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    # Optimisation
    learning_rate=2e-5,
    num_train_epochs=4,
    # Batching: 2 examples per device x 2 accumulation steps
    # = 4 examples per optimizer step on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    # Logging
    logging_steps=500,
    # Evaluation and checkpointing share the same step-based schedule
    eval_strategy="steps",
    eval_steps=2_000,
    save_strategy="steps",
    save_steps=2_000,
    # Best-checkpoint selection by the "accuracy" metric
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Assemble the Trainer from the model, arguments, data splits, collator and metric.
# NOTE(review): the 'test' split is used as the evaluation set; if the best
# checkpoint is chosen by its metric, the reported test accuracy may be
# optimistic — consider a held-out validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

In [None]:
# Launch fine-tuning with the Trainer configured in the previous cell.
trainer.train()