<a href="https://colab.research.google.com/github/Shadabur-Rahaman/30-days-ml-projects/blob/main/Day_13_FineTune_GPT2_TextGeneration/notebooks/Day_13_FineTune_GPT2_TextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📦 Install & Import Libraries
Run this cell first: it installs the notebook's dependencies and imports everything the later cells use.

In [None]:
!pip install transformers==4.41.2 datasets==2.19.1 torch tqdm --quiet

import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from datasets import load_dataset
from tqdm import tqdm

# 📂 Create Output Directories

In [None]:
# Ensure the folder that receives the per-epoch checkpoints exists;
# re-running the cell is harmless because an existing folder is accepted.
os.makedirs(name="model_checkpoints", exist_ok=True)

# 📝 Load & Prepare Dataset

In [None]:
# Demo-sized corpus: only the first 1% of the WikiText-2 (raw) training split.
raw_dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train[:1%]",
)

# 🔍 Initialize Tokenizer & Model

In [None]:
# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token so fixed-length padded batches can be built.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained GPT-2 language model and size its embedding matrix to
# the tokenizer's vocabulary.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assign the result instead of ending the cell on a bare expression: a bare
# `model.to(device)` makes the notebook print the whole module tree as output.
model = model.to(device)

# 🔢 Tokenize the Dataset

In [None]:
def tokenize_batch(examples):
    """Tokenize the "text" column of a batch into fixed-length encodings.

    Each text is truncated and padded to exactly 64 tokens; the tokenizer's
    output for the batch is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to the dataset in batches.
tokenized = raw_dataset.map(function=tokenize_batch, batched=True)

# 🧰 Convert to PyTorch Dataset & DataLoader

In [None]:
class TextDataset(Dataset):
    """Map-style PyTorch dataset over rows of tokenized examples.

    Every item is a dict holding the row's "input_ids" and "attention_mask"
    as int64 tensors.
    """

    def __init__(self, hf_dataset):
        # Only a reference is stored; rows are converted on access.
        self.examples = hf_dataset

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        row = self.examples[idx]
        # Any other columns present in the row are deliberately left out.
        return {
            field: torch.tensor(row[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }

# Wrap the tokenized rows and serve them in shuffled mini-batches of two.
train_dataset = TextDataset(hf_dataset=tokenized)
dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
)

# 🔄 Fine-Tuning Loop

In [None]:
# Seed torch's random number generators so batch shuffling and dropout are
# repeatable from one run of the notebook to the next.
torch.manual_seed(42)

# Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimization itself is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)
num_epochs = 3

model.train()
for epoch in range(num_epochs):
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # A causal LM uses its own inputs as labels. Padding positions are set
        # to -100, the index the loss ignores, so the model is not trained
        # (and the reported loss is not diluted) on predicting padding.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # Labels are shifted by one position inside the model, so a batch needs
        # at least one real token after position 0 to have anything to learn
        # from. Batches built only from empty lines would otherwise produce a
        # NaN loss, so they are skipped.
        if not (labels[:, 1:] != -100).any():
            continue

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

    # Save the model weights at the end of every epoch.
    ckpt_path = f"model_checkpoints/gpt2_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), ckpt_path)
    print(f"✅ Checkpoint saved: {ckpt_path}")

# ✨ Generate & Save Sample Outputs

In [None]:
# Switch to inference mode (disables dropout) before sampling.
model.eval()

prompt = "Once upon a time"
# Calling the tokenizer (instead of .encode) also returns the attention mask,
# which generate() needs because the padding token is the same as EOS.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = encoded_prompt["input_ids"]

# Sample three continuations of the prompt. attention_mask and pad_token_id
# are passed explicitly so generate() does not have to guess them and warn.
generated = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,
    num_return_sequences=3,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

# Write the samples as UTF-8 so generated non-ASCII characters never depend on
# the platform's default file encoding.
with open("generated_text_samples.txt", "w", encoding="utf-8") as f:
    for i, gen_ids in enumerate(generated, 1):
        text = tokenizer.decode(gen_ids, skip_special_tokens=True)
        f.write(f"=== Sample {i} ===\n{text}\n\n")
print("✅ Generated samples saved to generated_text_samples.txt")