<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/_Generative_AI_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
import inspect

import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Fetch the pretrained GPT-2 tokenizer and the raw WikiText-2 corpus
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Padding to a fixed length needs a pad token; register '[PAD]' only when the
# tokenizer does not already define one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the dataset
def tokenize_function(examples):
    """Encode the ``text`` field of ``examples`` for causal language modelling.

    The module-level ``tokenizer`` truncates and pads each entry to a fixed
    length of 512 tokens. A ``labels`` field holding a copy of ``input_ids``
    is attached before the encoding is returned.
    """
    encoded = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Causal LM targets are the inputs themselves (shallow copy of the id list)
    encoded['labels'] = encoded['input_ids'].copy()
    return encoded

# Tokenize every split in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop rows that contain no real tokens. WikiText has many blank lines; they
# tokenize to zero tokens and, after padding, become all-[PAD] rows. The
# language-modeling collator masks padding out of the labels, so a batch made
# up only of such rows has no target tokens and can produce a NaN loss.
tokenized_datasets = tokenized_datasets.filter(
    lambda example: any(example["attention_mask"])
)

# Split dataset for training and evaluation
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Collator for causal (GPT-style) language modeling; mlm=False turns off masked-LM
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Load pretrained GPT-2, then size its token embeddings to the tokenizer's
# vocabulary (which may include the added padding token)
model = GPT2LMHeadModel.from_pretrained('gpt2')
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Set training arguments

# fp16 mixed precision needs a CUDA device; requesting it unconditionally
# fails on CPU-only runtimes, so enable it only when a GPU is present.
use_fp16 = torch.cuda.is_available()

# Newer transformers releases name this option `eval_strategy`; older ones only
# accept `evaluation_strategy`. Use whichever name the installed version supports.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",        # Must match the evaluation strategy for load_best_model_at_end
    logging_dir='./logs',
    logging_steps=100,            # Log every 100 steps
    load_best_model_at_end=True,
    fp16=use_fp16,                # Mixed precision only when a CUDA GPU is available
    **{eval_strategy_key: "epoch"},  # Evaluate every epoch, matching save_strategy
)

# Gather everything the Trainer needs, then build it for fine-tuning
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer = Trainer(**trainer_kwargs)

# Run the training loop
trainer.train()