<a href="https://colab.research.google.com/github/Shadabur-Rahaman/30-days-ml-projects/blob/main/Day_13_FineTune_GPT2_TextGeneration/notebooks/Day_13_FineTune_GPT2_TextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1: Install Required Libraries

In [None]:
!pip install transformers==4.41.2 datasets==2.19.1 numpy==2.0.0



## Step 2: Import Required Libraries

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

## Step 3: Load, Preprocess, and Tokenize the Dataset

In [None]:
# Pull the first 1% of the WikiText-2 (raw) training split
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

# GPT-2 tokenizer; the end-of-sequence token doubles as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_fn(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated or padded to exactly 64 tokens, so all
    rows produced by this function have the same length.
    """
    return tokenizer(
        examples["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )


# Apply the tokenizer over the dataset in batches
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Expose only the model inputs, returned as torch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

## Step 4: Create the DataLoader

In [None]:
# Batch the tokenized WikiText rows two at a time.
# NOTE(review): `dataloader` is rebound to a different dataset further down the notebook.
dataloader = DataLoader(dataset=tokenized_dataset, batch_size=2)

## Step 5: Initialize the GPT-2 Model

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained GPT-2 with its language-modeling head
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# The pad token reuses the EOS token here, so no new token was actually added.
model.resize_token_embeddings(len(tokenizer))

# Last expression of the cell: moves the model to `device` and displays its architecture
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Step 6: Define a PyTorch Dataset Wrapper

In [None]:
from torch.utils.data import Dataset as TorchDataset

class GPT2Dataset(TorchDataset):
    """Map-style dataset exposing `input_ids` and `attention_mask` per example.

    Args:
        hf_dataset: any object that can be indexed with the keys
            "input_ids" and "attention_mask", each yielding an iterable of
            per-example rows (lists of ints or torch tensors).
    """

    def __init__(self, hf_dataset):
        self.input_ids = [self._to_tensor(row) for row in hf_dataset["input_ids"]]
        self.attention_mask = [self._to_tensor(row) for row in hf_dataset["attention_mask"]]

    @staticmethod
    def _to_tensor(row):
        """Return an independent tensor copy of `row`.

        Rows that are already tensors are copied with `clone().detach()`;
        calling `torch.tensor()` on a tensor emits a copy-construct
        UserWarning. Anything else goes through `torch.tensor()` as before.
        """
        if isinstance(row, torch.Tensor):
            return row.clone().detach()
        return torch.tensor(row)

    def __len__(self):
        """Number of examples held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at `idx` as a dict of two tensors."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx]
        }

## Step 7: Reload the Tokenizer and GPT-2 Model

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

# Select the compute device first: GPU when available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fresh tokenizer; the EOS token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Fresh pretrained model; this rebinds `model`, replacing any instance created earlier
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Last expression of the cell: moves the model to `device` and displays its architecture
model.to(device)


## Step 8: Build a Small Custom Dataset and DataLoader

In [None]:
# Toy corpus: a few hand-written sentences used as fine-tuning examples
texts = [
    "Once upon a time, there was a brave knight.",
    "The AI revolution is happening faster than expected.",
    "ChatGPT is trained to assist with coding and writing.",
    "Deep learning models require lots of data to train."
]

# Encode each sentence on its own; every encoding is padded/truncated to 64 tokens
# and returned as PyTorch tensors
encodings = [
    tokenizer(
        sentence,
        return_tensors="pt",
        max_length=64,
        truncation=True,
        padding="max_length",
    )
    for sentence in texts
]

class MyDataset(Dataset):
    """Dataset over a sequence of per-example encodings.

    Each encoding is a mapping from field name to a tensor; on access the
    leading dimension of every tensor is squeezed away.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Number of stored encodings."""
        return len(self.encodings)

    def __getitem__(self, idx):
        """Return the encoding at `idx` with dimension 0 squeezed from each tensor."""
        encoding = self.encodings[idx]
        squeezed = {}
        for field, tensor in encoding.items():
            squeezed[field] = tensor.squeeze(0)
        return squeezed

# Wrap the toy encodings and batch them in pairs.
# Both `dataset` and `dataloader` are rebound here, replacing the objects created earlier in the notebook.
dataset = MyDataset(encodings)
dataloader = DataLoader(dataset=dataset, batch_size=2)


## Step 9: Fine-Tune the GPT-2 Model

In [None]:
# Fine-tune the model with AdamW for a fixed number of passes over `dataloader`
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

model.train()
for epoch in range(epochs):
    # One progress bar per epoch, labelled up front so the first refresh already shows it
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Causal-LM targets are the inputs themselves, but padding must not contribute
        # to the loss: positions with attention_mask == 0 are set to -100, the index the
        # loss ignores. Without this, the model is mostly trained to predict the pad/EOS
        # token because every sequence is padded to a fixed length.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Show the most recent batch loss next to the progress bar
        loop.set_postfix(loss=loss.item())


Epoch 1: 100%|██████████| 2/2 [00:22<00:00, 11.01s/it, loss=7.09]
Epoch 2: 100%|██████████| 2/2 [00:07<00:00,  3.51s/it, loss=2.7]
Epoch 3: 100%|██████████| 2/2 [00:07<00:00,  3.55s/it, loss=1.35]


## Step 10: Save the Model Checkpoint and Generated Text Samples

In [None]:
import os

# Create the checkpoint output folder if it does not already exist
# (exist_ok=True makes re-running this cell safe)
os.makedirs("model_checkpoints", exist_ok=True)


In [None]:
# Persist the fine-tuned weights. The file name uses `epoch`, the loop variable
# left over from the training cell, converted to a 1-based epoch number.
checkpoint_path = f"model_checkpoints/gpt2_epoch{epoch+1}.pt"
model_weights = model.state_dict()
torch.save(model_weights, checkpoint_path)
print(f"✅ Saved checkpoint: {checkpoint_path}")


✅ Saved checkpoint: model_checkpoints/gpt2_epoch3.pt


In [None]:
# Generate text samples from the fine-tuned model and save them to a file
model.eval()
prompt = "Once upon a time"

# Tokenize through the tokenizer's __call__ so the attention mask is returned with the ids
prompt_inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = prompt_inputs["input_ids"]

generated_ids = model.generate(
    input_ids,
    # Pass the mask and pad id explicitly: the pad token is the same as EOS, so
    # generate() cannot infer the mask and otherwise warns about unreliable results.
    attention_mask=prompt_inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,
    num_return_sequences=3,
    no_repeat_ngram_size=2,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

samples = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]

# Write to file; explicit UTF-8 so non-ASCII generated text is saved the same way on every platform
with open("generated_text_samples.txt", "w", encoding="utf-8") as f:
    for i, sample in enumerate(samples, 1):
        f.write(f"=== Sample {i} ===\n{sample}\n\n")

print("✅ Generated text saved to: generated_text_samples.txt")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Generated text saved to: generated_text_samples.txt
