## Importing Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

## Preparing the training dataset

In [None]:
# Build (context window -> next token) training pairs by sliding a fixed-size
# window over every encoded sequence in the corpus.
context_size = 5
X, y = [], []

for sequence in encoded_corpus_ids:
    # A sequence no longer than the window gives a non-positive count, so the
    # inner range is empty and the sequence contributes no samples.
    n_windows = len(sequence) - context_size
    for start in range(n_windows):
        end = start + context_size
        X.append(sequence[start:end])  # the `context_size` input tokens
        y.append(sequence[end])        # the token that immediately follows

X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

# Shuffled mini-batches of 16 (window, target) pairs.
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

## Defining the model architecture

In [None]:
class MiniGPT(nn.Module):
    """Single-block, decoder-style transformer that emits per-position logits.

    The block is: token + learned positional embeddings, one causally masked
    multi-head self-attention layer, and a two-layer ReLU feed-forward
    network. Each sub-layer is wrapped in a residual connection followed by
    LayerNorm (post-norm). A final linear layer maps to vocabulary logits.

    Args:
        vocab_size: Number of rows in the token embedding table and size of
            the output logit dimension.
        context_size: Number of rows in the positional embedding table, i.e.
            the longest sequence ``forward`` can embed.
        embed_dim: Width of the embeddings and of the attention block.
        n_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
    """

    def __init__(self, vocab_size, context_size, embed_dim=64, n_heads=2, ff_dim=128):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(context_size, embed_dim)

        self.attn = nn.MultiheadAttention(embed_dim, num_heads=n_heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

        self.output_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(B, T, vocab_size)`` for ids ``x`` of shape ``(B, T)``.

        ``T`` must not exceed the ``context_size`` given at construction,
        because positions ``0..T-1`` are looked up in the positional table.
        """
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.token_embed(x) + self.pos_embed(positions)

        # Boolean mask where True means "may not attend": position i is only
        # allowed to see positions <= i. Without it, every non-final position
        # would read the tokens it is supposed to predict. The last position
        # already sees the whole window, so its logits are unaffected.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        attn_output, _ = self.attn(h, h, h, attn_mask=causal_mask, need_weights=False)
        h = self.ln1(h + attn_output)

        h = self.ln2(h + self.ff(h))

        return self.output_head(h)

## Model Training

In [None]:
# Instantiate the model over the tokenizer vocabulary and train it with Adam
# on cross-entropy loss for the token following each context window.
vocab_size = len(token2id)
model = MiniGPT(vocab_size=vocab_size, context_size=context_size)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in loader:
        # Only the final position's logits are scored against the target.
        last_logits = model(xb)[:, -1, :]
        loss = loss_fn(last_logits, yb)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    mean_loss = total_loss / len(loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

## Generating the output

In [None]:
def generate(model, seed_tokens, max_new_tokens=4, window_size=None):
    """Sample a continuation of ``seed_tokens`` from ``model``, one token at a time.

    At each step the trailing ``window_size`` tokens are fed to the model, the
    logits at the last position are turned into a probability distribution
    with softmax, and the next token id is drawn from it with
    ``torch.multinomial``.

    Args:
        model: Module called with a ``(1, T)`` long tensor. Its output is
            indexed as ``logits[:, -1, :]``, so it is expected to return
            per-position logits.
        seed_tokens: Sequence of token ids to start from. It is copied, not
            mutated.
        max_new_tokens: Number of tokens to sample and append.
        window_size: Number of trailing tokens shown to the model per step.
            When ``None``, the module-level ``context_size`` is used.

    Returns:
        A new list holding the seed tokens followed by the sampled token ids.

    Side effects:
        Switches ``model`` to evaluation mode via ``model.eval()``.
    """
    window = context_size if window_size is None else window_size

    # Build inputs on the model's own device so generation also works after
    # the model has been moved off the CPU. Parameter-free modules use CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    tokens = list(seed_tokens)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            x = torch.tensor(tokens[-window:], dtype=torch.long, device=device).unsqueeze(0)
            logits = model(x)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            tokens.append(next_token)
    return tokens


## Testing the model

In [None]:
input_text = ["Is this the "]

# The tokenizer is called with a list of strings and its result is indexed
# with [0] below, so it yields one id sequence per input string; take the ids
# of the single prompt.
# NOTE(review): confirm against gpt_tokenizer's definition.
encoded_prompts = gpt_tokenizer(input_text, merges)
prompt_ids = list(encoded_prompts[0])

# Ensure correct length: measure the prompt's token ids, not the outer
# per-string list. Short prompts are left-padded with id 0; long prompts keep
# only their last `context_size` ids.
# NOTE(review): id 0 is used as padding as in the original; confirm it is an
# appropriate filler token for this vocabulary.
if len(prompt_ids) < context_size:
    seed_tokens = [0] * (context_size - len(prompt_ids)) + prompt_ids
else:
    seed_tokens = prompt_ids[-context_size:]

# Generate and decode output
generated_ids = generate(model, seed_tokens, max_new_tokens=20)
print(generated_ids)
output_text = decode_token_ids(generated_ids)
print("Input Prompt:", input_text)
print("Generated Output:", output_text)