<a href="https://colab.research.google.com/github/Adithyahh/2203A51804-NLP/blob/main/Assignment%208.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example data (Use a larger dataset for meaningful training)
text = """
Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods.
One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her.
"""

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos_token to avoid NoneType error
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)

# Prepare dataset without padding here, as padding will be handled in the collate function
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, max_length=50):
        self.tokens = tokenizer(text, return_tensors="pt", truncation=True)["input_ids"][0]

    def __len__(self):
        return len(self.tokens) - 50  # Number of training steps

    def __getitem__(self, idx):
        return self.tokens[idx:idx+50]

# Custom collate function for dynamic padding
def collate_fn(batch):
    max_length = max([len(x) for x in batch])
    padded_batch = [torch.cat([x, torch.full((max_length - len(x),), tokenizer.pad_token_id)]) for x in batch]
    return torch.stack(padded_batch)

dataset = TextDataset(text, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Training function
def train_model(epochs):
    model.train()
    optimizer = AdamW(model.parameters(), lr=3e-5)

    for epoch in range(epochs):
        for batch in dataloader:
            inputs = batch.to(device)
            labels = inputs.clone()
            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Train the model with different epochs
for epochs in [20, 60, 70]:
    print(f"Training with {epochs} epochs")
    train_model(epochs)

# Text generation function
def generate_text(seed_text, max_length=50):
    model.eval()
    input_ids = tokenizer.encode(seed_text, return_tensors="pt").to(device)
    generated_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Example of generating new text with seed text
seed_text = "Once upon a time"
generated_text = generate_text(seed_text)
print("Generated Text:", generated_text)


Training with 20 epochs
Epoch 1/20, Loss: 1.1686850786209106
Epoch 2/20, Loss: 0.4779810309410095
Epoch 3/20, Loss: 0.15723954141139984
Epoch 4/20, Loss: 0.08946692943572998
Epoch 5/20, Loss: 0.06836246699094772
Epoch 6/20, Loss: 0.06429968774318695
Epoch 7/20, Loss: 0.07478256523609161
Epoch 8/20, Loss: 0.0884733721613884
Epoch 9/20, Loss: 0.05638786777853966
Epoch 10/20, Loss: 0.027910945937037468
Epoch 11/20, Loss: 0.039185889065265656
Epoch 12/20, Loss: 0.09135650843381882
Epoch 13/20, Loss: 0.03171371668577194
Epoch 14/20, Loss: 0.20846140384674072
Epoch 15/20, Loss: 0.0012732901377603412
Epoch 16/20, Loss: 0.04455846548080444
Epoch 17/20, Loss: 0.009406703524291515
Epoch 18/20, Loss: 0.002999716904014349
Epoch 19/20, Loss: 0.004810686223208904
Epoch 20/20, Loss: 0.1789366900920868
Training with 60 epochs
Epoch 1/60, Loss: 0.0003511669347062707
Epoch 2/60, Loss: 0.004236217588186264
Epoch 3/60, Loss: 0.0003005146572832018
Epoch 4/60, Loss: 0.006520267575979233
Epoch 5/60, Loss: 0.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Epoch 70/70, Loss: 4.95052918267902e-05
Generated Text: Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods.
One day, her mother asked her to take a basket of goodies to her grandma. On her way through
