<a href="https://colab.research.google.com/github/ShreyMhatre/nanoGPT/blob/main/nanoGPT_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**`nanoGPT`**

In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "validation.csv"

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "thedevastator/tinystories-narrative-classification",
  file_path,
)

df.to_csv('input.txt', header=False, index=False)

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/thedevastator/tinystories-narrative-classification?dataset_version_number=2&file_name=validation.csv...


100%|██████████| 5.74M/5.74M [00:00<00:00, 32.2MB/s]

Extracting zip of validation.csv...





### Imports

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim

### Hyperparameters & Config

In [3]:
batch_size = 64   # Number of sequences per batch
block_size = 512  # Maximum context length (tokens)
max_iters = 5000  # Total training iterations
eval_interval = 100 # Evaluate every N steps
learning_rate = 1e-3 # Initial learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Use GPU if available
eval_iters = 200 # Number of batches for evaluation
n_embd = 192  # Embedding dimension
n_head = 8  # Number of attention heads
n_layer = 8  # Number of transformer blocks
dropout = 0.2  # Dropout rate for regularization

# Enable FlashAttention if available
if torch.cuda.is_available():
    torch.backends.cuda.enable_flash_sdp(True)

torch.manual_seed(1337)

<torch._C.Generator at 0x7da233d367d0>

### Dataset Loading and Preprocessing

In [4]:
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# Simple character-level tokenizer (can be replaced with BPE/WordPiece)
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)} # char -> index
itos = {i: ch for i, ch in enumerate(chars)} # index -> char
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Convert full text to tensor of token indices
data = torch.tensor(encode(text), dtype=torch.long)

# 90-10 train/validation split
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

### Data Batching Utilities

In [5]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])         # Input sequence
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix]) # Target sequence
    return x.to(device), y.to(device)

### Evaluation Utility

In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

### Model Components

In [7]:
# Single attention head with masking for causal (autoregressive) behavior
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=True)
        self.query = nn.Linear(n_embd, head_size, bias=True)
        self.value = nn.Linear(n_embd, head_size, bias=True)
        self.register_buffer('mask', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k, q, v = self.key(x), self.query(x), self.value(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5 # Scaled dot-product attention
        wei = wei.masked_fill(self.mask[:T, :T] == 0, float('-inf'))  # Causal masking
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        return wei @ v

# Multi-head attention: runs several attention heads in parallel
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

# Feedforward MLP with GELU activation
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

# Transformer Block: self-attention + feedforward + layer norm
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # Residual + LayerNorm + MHA
        return x + self.ffwd(self.ln2(x)) # Residual + LayerNorm + FFN

### Language Model Definition

In [8]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # Output logits
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = self.blocks(tok_emb + pos_emb)
        logits = self.lm_head(self.ln_f(x))
        loss = F.cross_entropy(logits.view(B * T, -1), targets.view(B * T)) if targets is not None else None
        return logits, loss

    # Autoregressive generation
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx[:, -block_size:])
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

### Model Initialization & Optimizer

In [9]:
model = BigramLanguageModel().to(device)
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

optimizer = optim.AdamW(model.parameters(), lr=learning_rate, fused=True)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_iters)
scaler = torch.amp.GradScaler("cuda") # For mixed precision training

3.696485 M parameters


### Training loop

In [10]:
for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    with torch.autocast(device_type='cuda', dtype=torch.float16):
        logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    scheduler.step()

Step 0: Train loss 5.3065, Val loss 5.3023
Step 100: Train loss 2.3398, Val loss 2.3348
Step 200: Train loss 2.2984, Val loss 2.2932
Step 300: Train loss 2.2731, Val loss 2.2695
Step 400: Train loss 2.2134, Val loss 2.2110
Step 500: Train loss 2.0708, Val loss 2.0756
Step 600: Train loss 1.8602, Val loss 1.8750
Step 700: Train loss 1.6904, Val loss 1.7064
Step 800: Train loss 1.5450, Val loss 1.5663
Step 900: Train loss 1.4406, Val loss 1.4628
Step 1000: Train loss 1.3435, Val loss 1.3640
Step 1100: Train loss 1.2780, Val loss 1.2995
Step 1200: Train loss 1.2201, Val loss 1.2463
Step 1300: Train loss 1.1762, Val loss 1.1997
Step 1400: Train loss 1.1356, Val loss 1.1608
Step 1500: Train loss 1.1065, Val loss 1.1312
Step 1600: Train loss 1.0827, Val loss 1.1020
Step 1700: Train loss 1.0566, Val loss 1.0779
Step 1800: Train loss 1.0344, Val loss 1.0589
Step 1900: Train loss 1.0139, Val loss 1.0408
Step 2000: Train loss 1.0031, Val loss 1.0271
Step 2100: Train loss 0.9855, Val loss 1.0099


KeyboardInterrupt: 

### Text Generation

In [11]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_tokens = model.generate(context, max_new_tokens=1000)
generated_text = decode(generated_tokens[0].tolist())

print("\nGenerated Text:\n", generated_text)


Generated Text:
 
One day, a little girl named All named Lucy happened her called the coves were to make drumses and had but fly, had never seen bed seen. One day, Lily played to the box, the door into an a little girl was all hor hair. Lily was a big carrot, but she asked because if her nerguerer. From then day went up, hennigs playing in the widge of every colow."
"Once upon a time, there was a grow. It was a bead tire, Lily day. Buzzy said her friend continued to keep for sill around her love. They went back to the seal flow something and ally to.
One day, Lily went to deep it the shelf appa on the sky and wanted the sned asked with the sunshine. 

Anbobow it, Billy said, ""The undover think youm, mom an+e with mine, ged with bread. A sweep fawing messs with her. With had teelep!"

Lily said thank the hugson the hill had not reall with the barage and replixed on the growadory together. Everyone with morning she look at her was tired. Sarah ran to the to play 1ty with there around t

In [12]:
torch.save(model.state_dict(), 'model.pth')