In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install datasets tokenizers

import torch
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    device = 'cuda'
else:
    print("Using Single GPU")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Loading 1,000,000 rows from TinyStories...")
dataset = load_dataset("roneneldan/TinyStories", split="train[:1000000]")

def truncate(example):
    return {"text": example["text"][:200]}

dataset = dataset.map(truncate, num_proc=4)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"], vocab_size=4096)

def batch_iterator(dataset, batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

print("Training Tokenizer (Vocab 4096)...")
tokenizer.train_from_iterator(batch_iterator(dataset), trainer=trainer)
vocab_size = tokenizer.get_vocab_size()
print(f"Tokenizer Ready. Vocab Size: {vocab_size}")
print("Encoding Data (this may take a minute)...")
all_text = "".join(dataset["text"])
encoded_ids = tokenizer.encode(all_text).ids
data = torch.tensor(encoded_ids, dtype=torch.long)

n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"Total Training Tokens: {len(train_data)/1e6:.2f} Million")

Using 2 GPUs!
Loading 1,000,000 rows from TinyStories...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000000 [00:00<?, ? examples/s]

Training Tokenizer (Vocab 4096)...



Tokenizer Ready. Vocab Size: 4096
Encoding Data (this may take a minute)...
Total Training Tokens: 43.38 Million


In [2]:
import torch.nn as nn
from torch.nn import functional as F

batch_size = 64 
block_size = 256
n_embd = 768
n_head = 12
n_layer = 16   

dropout = 0.1

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C**-0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(), 
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = GPTLanguageModel()

if torch.cuda.device_count() > 1:
    print(f"Wrapping model in DataParallel for {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)
model = model.to(device)

if isinstance(model, nn.DataParallel):
    param_count = sum(p.numel() for p in model.module.parameters())
else:
    param_count = sum(p.numel() for p in model.parameters())

print(f"Model Parameters: {param_count/1e6:.2f} Million")

Wrapping model in DataParallel for 2 GPUs
Model Parameters: 119.86 Million


In [3]:
import time

if isinstance(model, nn.DataParallel):
    optimizer = torch.optim.AdamW(model.module.parameters(), lr=3e-4)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_batch(split):
    data_source = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_source) - block_size, (batch_size,))
    x = torch.stack([data_source[i:i+block_size] for i in ix])
    y = torch.stack([data_source[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(100)
        for k in range(100):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
        
            if loss.ndim > 0: loss = loss.mean()
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
max_iters = 2000 
eval_interval = 200
start_time = time.time()

print(f"Starting Baseline Training (120M Params) on {device}...")
for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        dt = time.time() - start_time
        print(f"Step {iter}: Train {losses['train']:.4f}, Val {losses['val']:.4f} | Time: {dt:.1f}s")

    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
  
    if loss.ndim > 0: loss = loss.mean()
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Baseline Training Complete.")

Starting Baseline Training (120M Params) on cuda...




KeyboardInterrupt: 

In [4]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_baseline(num_samples=50):
    generation_model = model.module if isinstance(model, nn.DataParallel) else model
    generation_model.eval()
    
    scores = []
    smoothing = SmoothingFunction().method1

    print("Calculating Baseline BLEU...")
    with torch.no_grad():
        for _ in range(num_samples):
            idx = torch.randint(len(val_data) - 300, (1,)).item()
            chunk = val_data[idx:idx+300]
            text = tokenizer.decode(chunk.tolist())
            
            words = text.split()
            if len(words) < 60: continue

            prompt_str = " ".join(words[:10])
            ref_words = words[10:60]

            context_ids = tokenizer.encode(prompt_str).ids
            context_tensor = torch.tensor([context_ids], dtype=torch.long, device=device)

            gen_ids = generation_model.generate(context_tensor, max_new_tokens=60)
            full_gen = tokenizer.decode(gen_ids[0].tolist())
            gen_text = full_gen[len(prompt_str):]
 
            ref = " ".join(ref_words).split()
            cand = gen_text.split()
            if len(cand) > 0:
                score = sentence_bleu([ref], cand, weights=(0.5, 0.5), smoothing_function=smoothing)
                scores.append(score)

    avg = sum(scores) / len(scores)
    print(f"\nBASELINE BLEU SCORE: {avg:.4f}")
    return avg

baseline_score = evaluate_baseline()

Calculating Baseline BLEU...

BASELINE BLEU SCORE: 0.1855


In [5]:
import random

configs = [
    {'lr': 5e-4, 'dropout': 0.2, 'name': 'Aggressive'},
    {'lr': 1e-4, 'dropout': 0.05, 'name': 'Conservative'},
    {'lr': 3e-4, 'dropout': 0.2,  'name': 'Balanced'}
]

results = {}

print("--- Starting Short-Horizon Search (500 steps each) ---")

for config in configs:
    print(f"\nTesting Config: {config['name']} (LR: {config['lr']}, Drop: {config['dropout']})")
 
    dropout = config['dropout'] 
    model = GPTLanguageModel()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model = model.to(device)

    if isinstance(model, nn.DataParallel):
        optimizer = torch.optim.AdamW(model.module.parameters(), lr=config['lr'])
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

    for iter in range(500): 
        xb, yb = get_batch('train')
        _, loss = model(xb, yb)
        if loss.ndim > 0: loss = loss.mean()
        
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    final_loss = estimate_loss()['val']
    print(f"Result {config['name']}: Val Loss {final_loss:.4f}")
    results[config['name']] = final_loss

best_config_name = min(results, key=results.get)
print("-" * 50)
print(f"Winner: {best_config_name} with Loss {results[best_config_name]:.4f}")
print(f"Baseline at step 500 was ~2.42. Did we beat it?")

--- Starting Short-Horizon Search (500 steps each) ---

Testing Config: Aggressive (LR: 0.0005, Drop: 0.2)




Result Aggressive: Val Loss 2.2245

Testing Config: Conservative (LR: 0.0001, Drop: 0.05)
Result Conservative: Val Loss 2.7469

Testing Config: Balanced (LR: 0.0003, Drop: 0.2)
Result Balanced: Val Loss 2.3390
--------------------------------------------------
Winner: Aggressive with Loss 2.2245
Baseline at step 500 was ~2.42. Did we beat it?


In [4]:
if isinstance(model, nn.DataParallel):
    optimizer = torch.optim.AdamW(model.module.parameters(), lr=3e-4)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_batch(split):
    data_source = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_source) - block_size, (batch_size,))
    x = torch.stack([data_source[i:i+block_size] for i in ix])
    y = torch.stack([data_source[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


In [5]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(100) 
        for k in range(100):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            if loss.ndim > 0: loss = loss.mean()
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [6]:
import time

dropout = 0.2
learning_rate = 5e-4

print("Initializing Final Model (Aggressive Config)...")
model = GPTLanguageModel()
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model = model.to(device)

if isinstance(model, nn.DataParallel):
    optimizer = torch.optim.AdamW(model.module.parameters(), lr=learning_rate)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

max_iters = 10000
eval_interval = 500
start_time = time.time()
best_val_loss = float('inf')

print(f"Starting Final Training (10000 Steps) on {device}...")


for iter in range(max_iters):

    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()

        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            
        dt = time.time() - start_time
        print(f"Step {iter}: Train {losses['train']:.4f}, Val {losses['val']:.4f} | Time: {dt:.1f}s")

    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
    
    if loss.ndim > 0: loss = loss.mean()

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("-" * 50)
print(f"FINAL TRAINING COMPLETE.")
print(f"Best Validation Loss: {best_val_loss:.4f}")

Initializing Final Model (Aggressive Config)...
Starting Final Training (10000 Steps) on cuda...
--------------------------------------------------
Step 0: Train 8.5004, Val 8.5007 | Time: 183.8s
Step 500: Train 2.2523, Val 2.2232 | Time: 1674.6s
Step 1000: Train 1.8962, Val 1.8963 | Time: 3164.6s
Step 1500: Train 1.7538, Val 1.7492 | Time: 4655.6s
Step 2000: Train 1.6743, Val 1.6799 | Time: 6146.7s
Step 2500: Train 1.6145, Val 1.6325 | Time: 7639.6s
Step 3000: Train 1.5705, Val 1.5854 | Time: 9132.5s
Step 3500: Train 1.5387, Val 1.5703 | Time: 10625.0s
Step 4000: Train 1.5067, Val 1.5397 | Time: 12116.3s
Step 4500: Train 1.4801, Val 1.5154 | Time: 13609.1s
Step 5000: Train 1.4589, Val 1.4977 | Time: 15101.3s
Step 5500: Train 1.4413, Val 1.4772 | Time: 16593.6s
Step 6000: Train 1.4240, Val 1.4716 | Time: 18086.3s
Step 6500: Train 1.4166, Val 1.4670 | Time: 19577.8s
Step 7000: Train 1.3910, Val 1.4549 | Time: 21070.8s
Step 7500: Train 1.3910, Val 1.4508 | Time: 22562.1s
Step 8000: Train

In [7]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_final(num_samples=50):
    generation_model = model.module if isinstance(model, nn.DataParallel) else model
    generation_model.eval()
    
    scores = []
    smoothing = SmoothingFunction().method1

    print("Calculating Final BLEU Score...")
    with torch.no_grad():
        for _ in range(num_samples):

            idx = torch.randint(len(val_data) - 300, (1,)).item()
            chunk = val_data[idx:idx+300]
            text = tokenizer.decode(chunk.tolist())
            
            words = text.split()
            if len(words) < 60: continue
            prompt_str = " ".join(words[:10])
            ref_words = words[10:60]
            context_ids = tokenizer.encode(prompt_str).ids
            context_tensor = torch.tensor([context_ids], dtype=torch.long, device=device)
            
            gen_ids = generation_model.generate(context_tensor, max_new_tokens=60)
            full_gen = tokenizer.decode(gen_ids[0].tolist())
            gen_text = full_gen[len(prompt_str):]
            ref = " ".join(ref_words).split()
            cand = gen_text.split()
            if len(cand) > 0:
                score = sentence_bleu([ref], cand, weights=(0.5, 0.5), smoothing_function=smoothing)
                scores.append(score)

    avg = sum(scores) / len(scores)
    print("-" * 30)
    print(f"BASELINE BLEU: 0.1855")
    print(f"FINAL BLEU:    {avg:.4f}")
    
    if avg > 0.1855:
        print("RESULT: SUCCESS - Hyperparameter Tuning Improved Performance!")
    else:
        print("RESULT: NEUTRAL - Model might need even more training.")
        
    return avg

final_score = evaluate_final()

Calculating Final BLEU Score...
------------------------------
BASELINE BLEU: 0.1855
FINAL BLEU:    0.2224
RESULT: SUCCESS - Hyperparameter Tuning Improved Performance!


In [8]:
import torch.nn.functional as F

def generate_creative_story(prompt, temp=0.8):
    generation_model = model.module if isinstance(model, nn.DataParallel) else model
    generation_model.eval()
    context_ids = tokenizer.encode(prompt).ids
    idx = torch.tensor([context_ids], dtype=torch.long, device=device)

    print(f"Prompt: {prompt}")

    with torch.no_grad():
        for _ in range(250):
            idx_cond = idx[:, -block_size:]
            logits, _ = generation_model(idx_cond)
            logits = logits[:, -1, :] 
            logits = logits / temp 
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

    return tokenizer.decode(idx[0].tolist())

story = generate_creative_story("One day, a tiny robot found a magic key", temp=0.8)
print(story)


Prompt: One day, a tiny robot found a magic key
Generating...
--------------------------------------------------
One day , a tiny robot found a magic key . The robot was very happy and wanted to show the way to t Once upon a time , there was a little boy named Timmy . One day , Timmy went to the park to play . He saw a squirrel and wanted to pet it , but the squirrel was too fast and ran away . Timmy was sad and he de Once upon a time , there was a little boy named Timmy . Timmy loved to play outside in the snow with his friends . One day , Timmy ' s mom told him that he needed to wear his gloves so he wouldn ' t get his Once upon a time , there was a little girl named Lily . She loved to play outside and run around in the grass . One day , while she was playing , she found a shiny silver coin on the ground . She was so ve Once upon a time , there was a little girl named Lily . Lily loved to play outside , but one day it was very cold outside . Lily put on her warm coat , but her hands