In [1]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from tqdm import tqdm

In [5]:
# Read the complete Shakespeare corpus from the Kaggle input directory into a
# single in-memory string; the context manager closes the file handle afterwards.
with open(
    "/kaggle/input/shakespeare-txt/shakespeare.txt",
    mode="r",
    encoding="utf-8",
) as file:
    data = file.read()

In [6]:
# Tokenize the dataset
# Load the pretrained "gpt2" tokenizer (its vocabulary files are downloaded on
# first use, as the progress bars in this cell's output show).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Encode the entire corpus in one call and request a PyTorch tensor back.
# Later cells index the result as tokenized_text[0], i.e. they treat it as
# having a leading batch dimension of size 1.
# NOTE: this call emits the "sequence length is longer than the specified
# maximum" warning seen in the output (1850440 > 1024). The full tensor is
# never passed to the model directly; the dataset cell cuts it into short
# windows first.
tokenized_text = tokenizer.encode(data, return_tensors="pt")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1850440 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
# Define a custom dataset
class ShakespeareDataset(Dataset):
    """Fixed-length token windows cut from one long tokenized text.

    Args:
        tokenized_text: Indexable container whose element ``[0]`` is the 1-D
            sequence of token ids (e.g. a ``(1, num_tokens)`` tensor).
        seq_length: Number of tokens in every returned window (>= 1).
        stride: Distance, in tokens, between the starts of consecutive
            windows (>= 1). The default of 1 keeps the original behaviour of
            maximally overlapping windows; ``stride=seq_length`` yields
            non-overlapping windows and a far shorter epoch.

    Raises:
        ValueError: If ``seq_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, tokenized_text, seq_length=50, stride=1):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.tokenized_text = tokenized_text
        self.seq_length = seq_length
        self.stride = stride

    def __len__(self):
        """Return how many complete windows fit into the text."""
        num_tokens = len(self.tokenized_text[0])
        if num_tokens < self.seq_length:
            # Text shorter than one window: report empty rather than negative.
            return 0
        # "+ 1" counts the final window that ends exactly at the last token.
        return (num_tokens - self.seq_length) // self.stride + 1

    def __getitem__(self, idx):
        """Return window number ``idx`` (negative indices count from the end).

        Raises:
            IndexError: If ``idx`` is out of range. Raising here (instead of
                returning a truncated slice) guarantees that every sample has
                exactly ``seq_length`` tokens and lets plain iteration stop.
        """
        num_windows = len(self)
        if idx < 0:
            idx += num_windows
        if not 0 <= idx < num_windows:
            raise IndexError("ShakespeareDataset index out of range")
        start = idx * self.stride
        return self.tokenized_text[0][start : start + self.seq_length]

In [18]:
# Build the training input pipeline: wrap the token tensor in the windowed
# dataset, then serve the windows as shuffled mini-batches of six.
dataset = ShakespeareDataset(tokenized_text=tokenized_text)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=6,
    shuffle=True,
)

In [19]:
# Define the GPT-2 model
# Load the *pretrained* "gpt2" weights. Constructing GPT2LMHeadModel(config)
# directly only builds the architecture with randomly initialized parameters,
# which would turn the "fine-tuning" below into training from scratch.
config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

In [20]:
# Fine-tune the model on the Shakespeare dataset
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the weights onto that device and switch to training mode. Both calls
# return the module itself, so the chained expression is still the model and
# the cell keeps displaying its architecture summary.
model.to(device).train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
)

In [22]:
# Run a fixed number of passes over the dataloader and report the mean
# per-batch loss after each pass.
num_epochs = 3
for epoch in range(num_epochs):
    # Re-assert training mode at the start of every epoch. Without this the
    # cell silently depends on an earlier cell having called model.train();
    # re-running it after the generation cells (which call model.eval())
    # would train with dropout disabled.
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        batch = batch.to(device)
        optimizer.zero_grad()
        # Using the inputs as labels asks the model for its language-modeling
        # loss on this batch.
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # .item() extracts a Python float so no autograd graph is retained.
        total_loss += loss.item()
        num_batches += 1

    # Average over the batches actually processed; an empty dataloader
    # reports NaN instead of raising ZeroDivisionError.
    average_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

Epoch 1/3:   0%|          | 1306/308399 [02:45<10:48:40,  7.89it/s]


KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model
# Write the tokenizer files next to the model weights so the directory is a
# self-contained checkpoint: both can later be restored with
# from_pretrained("shakespeare_fine_tuned_gpt2") without re-downloading "gpt2".
output_dir = "shakespeare_fine_tuned_gpt2"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Generate text using the fine-tuned model
# Switch the model to evaluation mode (turns off the Dropout layers) before
# sampling from it. As the last expression of the cell, the returned module is
# also displayed as the cell output.
model.eval()

In [None]:
# Prompt that seeds generation, encoded as a PyTorch tensor of token ids and
# placed on the same device as the model.
seed_text = "To be or not to be"
input_ids = tokenizer.encode(
    seed_text,
    return_tensors="pt",
).to(device)

In [None]:
# Extend the seed prompt in 100 successive chunks, printing each new chunk.
#
# Fixes relative to the previous version of this cell:
#   * max_length counted the prompt as well, so once the prompt reached 50
#     tokens nothing new could be generated; max_new_tokens budgets only the
#     continuation.
#   * The decoded output already contains the prompt, so re-encoding
#     seed_text + " " + generated_text duplicated it on every iteration; the
#     generated token ids are now fed back directly.
#   * temperature only takes effect when sampling, so do_sample=True is set.
#   * The context is clipped so prompt + new tokens never exceed the model's
#     position-embedding limit.
max_new_tokens = 50
max_context_tokens = model.config.n_positions - max_new_tokens

# Work on a separate name so `input_ids` keeps the encoded seed and re-running
# this cell restarts from the seed instead of from the previous run's text.
context_ids = input_ids
for _ in range(100):
    # Keep only the most recent tokens that still leave room for the new ones.
    context_ids = context_ids[:, -max_context_tokens:]
    output = model.generate(
        context_ids,
        # Every context token is real (no padding), hence an all-ones mask.
        attention_mask=torch.ones_like(context_ids),
        max_new_tokens=max_new_tokens,
        num_beams=5,
        do_sample=True,
        temperature=0.7,
        # GPT-2 defines no pad token; reuse EOS so generate() can pad beams.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the context; decode only what is new.
    new_token_ids = output[0, context_ids.shape[1]:]
    generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    print(generated_text)
    # Continue from the full sequence produced so far.
    context_ids = output