<a href="https://colab.research.google.com/github/Vik7am10/SportsGPT/blob/main/SportsGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 64
block_size = 128  # Increase context size for longer sentences
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 512  # Increased embedding size
n_head = 8  # More attention heads for better context understanding
n_layer = 8  # More transformer layers
dropout = 0.2

In [9]:
import sentencepiece as spm

# Train a tokenizer
spm.SentencePieceTrainer.train(input='espn_sports.txt', model_prefix='sports_gpt', vocab_size=400)

# Load the trained tokenizer
sp = spm.SentencePieceProcessor(model_file='sports_gpt.model')

# Encode the training data
with open("espn_sports.txt", "r", encoding="utf-8") as f:
    text = f.read()

encoded_text = sp.encode(text, out_type=int)

torch.save(torch.tensor(encoded_text), "espn_sports_encoded.pt")
print("Training data tokenized and saved!")

# Load tokenized data
data = torch.load("espn_sports_encoded.pt")

# Train and validation split
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

Training data tokenized and saved!


  data = torch.load("espn_sports_encoded.pt")


In [10]:
def get_batch(split):
    data = train_data if split == 'train' else val_data

    # Ensure block_size is within bounds
    if len(data) <= block_size:
        raise ValueError(f"Dataset too small! Length: {len(data)}, Block size: {block_size}")

    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)


In [11]:
# Define Transformer Model
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=n_embd, num_heads=n_head, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.attn(x, x, x)[0]
        x = self.norm1(x)
        x = x + self.ffn(x)
        x = self.norm2(x)
        return x

In [12]:
class SportsGPT(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Model setup
vocab_size = 30000  # Tokenized vocabulary size
model = SportsGPT(vocab_size, n_embd, n_head, n_layer).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = {"train": 0, "val": 0}
        model.eval()
        with torch.no_grad():
            for split in ['train', 'val']:
                loss_sum = 0
                for _ in range(eval_iters):
                    X, Y = get_batch(split)
                    _, loss = model(X, Y)
                    loss_sum += loss.item()
                losses[split] = loss_sum / eval_iters
        print(f"Step {iter}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}")
        model.train()

    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


Step 0: Train loss 10.4911, Val loss 10.5175
Step 500: Train loss 1.6366, Val loss 7.5207
Step 1000: Train loss 1.6225, Val loss 8.0250
Step 1500: Train loss 1.6170, Val loss 8.2370
Step 2000: Train loss 1.6150, Val loss 8.2175
Step 2500: Train loss 1.6163, Val loss 8.3465
Step 3000: Train loss 1.6150, Val loss 8.5725
Step 3500: Train loss 1.6181, Val loss 8.6348
Step 4000: Train loss 1.6112, Val loss 8.7202
Step 4500: Train loss 1.6070, Val loss 8.7971
Step 5000: Train loss 1.6083, Val loss 8.8162
Step 5500: Train loss 1.6062, Val loss 8.7779
Step 6000: Train loss 1.6046, Val loss 8.9021
Step 6500: Train loss 1.6030, Val loss 8.9220
Step 7000: Train loss 1.6040, Val loss 9.0345
Step 7500: Train loss 1.6060, Val loss 8.9627


In [None]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = model.generate(context, max_new_tokens=500)[0].tolist()
print(sp.decode(generated_text))
