## CHAYA_62M — Context-aware Hybrid Autoregressive Transformer Architecture

**Phase 1 Training** on WikiText-103 dataset

This notebook implements a from-scratch BPE-based GPT architecture with:

- GPT-2 tokenizer (HuggingFace, 50,257 vocab)
- Factorized embeddings (V×E) with projection E→D and tied output D→E
- **Modern transformer components:** RoPE, RMSNorm, SwiGLU
- Transformer backbone: D_MODEL = 640, E_DIM = 192, 8 layers, 8 heads
- FFN multiplier = 4 with SwiGLU activation
- Causal masked self-attention with Rotary Positional Embeddings
- SEQ_LEN = 256
- **~62M trainable parameters**

Trained for 6 epochs on WikiText-103, achieving:
- Final Val Loss: 3.63
- Final Val PPL: 37.87
- Final Val Accuracy: 40.55%


## Imports

In [1]:
import os
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
from datasets import load_dataset
from torchinfo import summary
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

torch.backends.cudnn.benchmark = True

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7a1d3cc29b30>

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


# GPT-2 Tokenizer

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
vocab_size = tokenizer.vocab_size
print("Loaded GPT-2 tokenizer. Vocab size:", vocab_size)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loaded GPT-2 tokenizer. Vocab size: 50257


# Dataset loading and Tokenization

In [6]:


raw_train = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
raw_val = load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")

raw_train = raw_train.filter(lambda x: x["text"] is not None and x["text"].strip() != "")
raw_val = raw_val.filter(lambda x: x["text"] is not None and x["text"].strip() != "")

def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        add_special_tokens=False,
        return_attention_mask=False
    )

print("Tokenizing train...")
tokenized_train = raw_train.map(
    tokenize_function,
    batched=True,
    num_proc=12,    remove_columns=["text"]
)

print("Tokenizing val...")
tokenized_val = raw_val.map(
    tokenize_function,
    batched=True,
    num_proc=12,
    remove_columns=["text"]
)

train_ids = [id for ids in tokenized_train["input_ids"] for id in ids]
val_ids = [id for ids in tokenized_val["input_ids"] for id in ids]

print("Train tokens:", len(train_ids))
print("Val tokens:", len(val_ids))


Tokenizing train...
Tokenizing val...
Train tokens: 117920140
Val tokens: 247289


# Training and val batch creation

In [7]:
BATCH_SIZE = 40
SEQ_LEN = 256

class TokenDataset(Dataset):
    def __init__(self, ids, seq_len):
        self.ids = ids
        self.seq_len = seq_len
        self.n = max(0, (len(self.ids) - 1) // seq_len)

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        start = idx * self.seq_len
        x = torch.tensor(self.ids[start : start + self.seq_len], dtype=torch.long)
        y = torch.tensor(self.ids[start + 1 : start + self.seq_len + 1], dtype=torch.long)
        return x, y

train_ds = TokenDataset(train_ids, SEQ_LEN)
val_ds = TokenDataset(val_ids, SEQ_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

print("Train batches:", len(train_loader), "Val batches:", len(val_loader))


Train batches: 11515 Val batches: 24


## Model Architecture

In [None]:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization (no mean subtraction)."""
    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)        
        x_normed = x / rms
        return self.scale * x_normed

class RotaryEmbedding(nn.Module):
    """Precompute inverse frequencies for RoPE and provide cos/sin for a sequence length."""
    def __init__(self, dim):
        super().__init__()
        assert dim % 2 == 0, "RoPE head dim must be even"
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))        
        self.register_buffer("inv_freq", inv_freq)

    def get_cos_sin(self, seq_len, device, dtype=torch.float32):
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)        
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)        
        emb = torch.cat((freqs, freqs), dim=-1)        
        cos = emb.cos().to(dtype)[None, None, :, :]        
        sin = emb.sin().to(dtype)[None, None, :, :]        
        return cos, sin

def apply_rope(q, k, cos, sin):
    """
    q, k : (B, heads, T, head_dim)
    cos/sin: (1,1,T,head_dim)
    Returns rotated q,k with RoPE applied.
    """
    def rotate_half(x):
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        return torch.cat((-x2, x1), dim=-1)    
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.0):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.scale = math.sqrt(self.head_dim)

        self.qkv = nn.Linear(d_model, d_model * 3, bias=False)
        self.out = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

        self.rotary = RotaryEmbedding(self.head_dim)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv(x)        q, k, v = qkv.chunk(3, dim=-1)
        q = q.view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(B, T, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        cos, sin = self.rotary.get_cos_sin(T, x.device, dtype=q.dtype)        
        q, k = apply_rope(q, k, cos, sin)

        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        mask = torch.tril(torch.ones((T, T), device=x.device, dtype=torch.bool)).unsqueeze(0).unsqueeze(0)        
        scores = scores.masked_fill(~mask, float("-inf"))

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)        
        out = out.permute(0, 2, 1, 3).contiguous().view(B, T, C)        
        return self.out(out)

class SwiGLU(nn.Module):
    """
    SwiGLU variant:
      out = W3( SiLU(W1(x)) * W2(x) )
    This is a gated linear unit with SiLU nonlinearity; effective and parameter-efficient.
    """
    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x):
        return self.w3(F.silu(self.w1(x)) * self.w2(x))

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_mult=4, dropout=0.1):
        super().__init__()
        hidden_dim = int(d_model * ff_mult)

        self.norm1 = RMSNorm(d_model)
        self.attn = MultiHeadSelfAttention(d_model, num_heads, dropout=dropout)

        self.norm2 = RMSNorm(d_model)
        self.ffn = SwiGLU(d_model, hidden_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x + self.dropout(self.attn(self.norm1(x)))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x


## Building & Instantiating the model

In [None]:

E_DIM = 192        
D_MODEL = 640
NUM_LAYERS = 8
NUM_HEADS = 8     
FF_MULT = 4

LR = 0.00019
WEIGHT_DECAY = 1e-2
CLIP_GRAD = 1.0
DROPOUT = 0.05



class CHAYA_62M(nn.Module):
    def __init__(
        self,
        vocab_size,
        e_dim=E_DIM,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        num_layers=NUM_LAYERS,
        seq_len=SEQ_LEN,
        dropout=DROPOUT,
        ff_mult=FF_MULT
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.e_dim = e_dim
        self.d_model = d_model
        self.seq_len = seq_len

        self.token_embedding = nn.Embedding(vocab_size, e_dim)

        self.emb_proj = nn.Linear(e_dim, d_model, bias=False)

        self.layers = nn.ModuleList([
            TransformerBlock(
                d_model=d_model,
                num_heads=num_heads,
                ff_mult=ff_mult,
                dropout=dropout
            )
            for _ in range(num_layers)
        ])

        self.norm_f = RMSNorm(d_model)

        self.out_proj = nn.Linear(d_model, e_dim, bias=False)

    def forward(self, idx):

        emb_e = self.token_embedding(idx)        
        x = self.emb_proj(emb_e)
        for layer in self.layers:
            x = layer(x)

        x = self.norm_f(x)
        e_logits = self.out_proj(x)        
        logits = e_logits @ self.token_embedding.weight.t()
        return logits

# -----------------------------------Instantiating a new model with random weight------------------------------------- #

#Instantiating the model
#model = CHAYA_62M(vocab_size=vocab_size, e_dim=E_DIM, d_model=D_MODEL, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, seq_len=SEQ_LEN, dropout=DROPOUT).to(device)
#print("Model instantiated")
# -----------------------------------Instantiating a new model with random weight------------------------------------- #



# -------------------------------Loading fully saved model <not checkpoint model>------------------------------------- #

ckpt = torch.load("/kaggle/input/---", map_location=device)

model = CHAYA_62M(vocab_size=ckpt["vocab_size"], e_dim=ckpt["e_dim"], d_model=ckpt["d_model"],
                    num_heads=ckpt["num_heads"], num_layers=ckpt["num_layers"], seq_len=ckpt["seq_len"]).to(device)
model.load_state_dict(ckpt["model_state_dict"])
print("previously trained model loaded")


if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)
# -------------------------------Loading fully saved model <not checkpoint model>------------------------------------- #


# --------------------------------For loading the checkpoint model----------------------------------------------------- #
#        |                                                                                           |
#        |                                                                                           |
# -------|------1--< Instantiate a new model with random weight >------------------------------------|------------ #
#        |                                                                                           |
# -------|------2--< The checkpoint model will be loaded in the training loop section >--------------|------------ #
#        |                                                                                           |
# -------|---------< Checkpoint Loading Function to be specific >------------------------------------|------------ #
#        |                                                                                           |
#        |                                                                                           |
# --------------------------------For loading the checkpoint model------------------------------------------------------ #


previously trained model loaded
Using 2 GPUs


## model summary

In [13]:
dummy_input = torch.zeros((BATCH_SIZE, SEQ_LEN), dtype=torch.long).to(device)
summary(model, input_data=dummy_input)



Layer (type:depth-idx)                             Output Shape              Param #
DataParallel                                       [40, 256, 50257]          --
├─MiniGPT_BPE: 1-1                                 [20, 256, 50257]          62,334,784
├─MiniGPT_BPE: 1-4                                 --                        (recursive)
│    └─Embedding: 2-1                              [20, 256, 192]            9,649,344
├─MiniGPT_BPE: 1-3                                 [20, 256, 50257]          --
├─MiniGPT_BPE: 1-4                                 --                        (recursive)
│    └─Embedding: 2-2                              [20, 256, 192]            --
│    └─Linear: 2-3                                 [20, 256, 640]            122,880
│    └─Linear: 2-4                                 [20, 256, 640]            --
│    └─ModuleList: 2-9                             --                        (recursive)
│    │    └─TransformerBlock: 3-1                  [20, 256, 640]   

In [14]:
if hasattr(model, "module"):
    real_model = model.module
else:
    real_model = model

total_params = sum(p.numel() for p in real_model.parameters())
print("Total parameters:", total_params / 1e6, "Million")


Total parameters: 62.334784 Million


## Training optimizations & train val loop

In [None]:
EPOCHS = 3

# -----------------------------------------optimizer---------------------------------------- #
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# -----------------------------------------optimizer---------------------------------------- #


# --------------------------------------warmup scheduler------------------------------------ #
class WarmupLinearScheduler:
    def __init__(self, optimizer, warmup_steps, total_steps, base_lr):
        self.optimizer = optimizer
        self.warmup_steps = max(1, warmup_steps)
        self.total_steps = max(1, total_steps)
        self.base_lr = base_lr
        self.step_num = 0

    def step(self):
        self.step_num += 1

        if self.step_num < self.warmup_steps:
            lr_scale = self.step_num / self.warmup_steps
        else:
            progress = (self.step_num - self.warmup_steps) / (self.total_steps - self.warmup_steps)
            lr_scale = max(0.0, 1.0 - progress)

        lr = self.base_lr * lr_scale
        for g in self.optimizer.param_groups:
            g["lr"] = lr

total_steps = max(1, len(train_loader) * EPOCHS)
warmup_steps = 2000
scheduler = WarmupLinearScheduler(
    optimizer,
    warmup_steps=warmup_steps,
    total_steps=total_steps,
    base_lr=LR
)

scaler = torch.cuda.amp.GradScaler()
# --------------------------------------warmup scheduler------------------------------------ #


# --------------------------------------Metrics helpers------------------------------------- #
def token_accuracy(logits, targets):
    preds = logits.argmax(dim=-1)
    correct = (preds == targets).float().sum().item()
    total = targets.numel()
    return correct / total

def perplexity_from_loss(loss):
    try:
        return math.exp(loss)
    except OverflowError:
        return float('inf')
# --------------------------------------Metrics helpers------------------------------------- #


# -------------------------------------Train & val loop------------------------------------- #

def train_epoch(model, loader, optimizer, criterion, scheduler=None, scaler=None, clip_grad=1.0):
    model.train()
    total_loss, total_acc = 0.0, 0.0

    pbar = tqdm(loader, desc="Training", leave=False)
    for x, y in pbar:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            logits = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))

        scaler.scale(loss).backward()

        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        scaler.step(optimizer)
        scaler.update()

        if scheduler:
            scheduler.step()
            if scheduler.step_num % 1000 == 0:
                print("LR:", optimizer.param_groups[0]["lr"])

        acc = token_accuracy(logits, y)
        total_loss += loss.item()
        total_acc += acc

        pbar.set_postfix(
            loss=f"{loss.item():.4f}",
            acc=f"{acc:.4f}",
            ppl=f"{perplexity_from_loss(loss.item()):.4f}"
        )


    return total_loss / len(loader), total_acc / len(loader)


def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_acc = 0.0, 0.0
    pbar = tqdm(loader, desc="Validation", leave=False)
    with torch.no_grad():
        for x, y in pbar:
            x = x.to(device); y = y.to(device)
            logits = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))
            acc = token_accuracy(logits, y)
            total_loss += loss.item()
            total_acc += acc
            pbar.set_postfix(loss=f"{loss.item():.4f}", acc=f"{acc:.4f}")
    return total_loss / len(loader), total_acc / len(loader)
# -------------------------------------Train & val loop------------------------------------- #



  scaler = torch.cuda.amp.GradScaler()


## Training loop

In [None]:
save_path = "chaya_62m_ckp.pt" # for saving the path should be remin as it is , for loading the checkpoint model path update needed  <Pranoy71>
best_val = float("inf")

train_losses, val_losses = [], []
train_accs, val_accs = [], []


# -------|-----------------------For loading the checkpoint model------------------------------------|---------------- #
def load_checkpoint(model, optimizer, scheduler, scaler, path=save_path):
    if not os.path.exists(path):
        print("No checkpoint found, starting fresh.")
        return 1, float("inf"), model

    checkpoint = torch.load(path)
    state_dict = checkpoint["model_state_dict"]
    has_module_prefix = any(k.startswith("module.") for k in state_dict.keys())

    if has_module_prefix and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)
        model.load_state_dict(state_dict)
    elif not has_module_prefix and isinstance(model, torch.nn.DataParallel):
        model.module.load_state_dict(state_dict)
    else:
        model.load_state_dict(state_dict)

    if "optimizer_state_dict" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    if "scheduler_state_dict" in checkpoint:
        scheduler.__dict__.update(checkpoint["scheduler_state_dict"])
    if "scaler_state_dict" in checkpoint:
        scaler.load_state_dict(checkpoint["scaler_state_dict"])

    start_epoch = checkpoint.get("epoch", 0) + 1
    best_val = checkpoint.get("best_val", float("inf"))

    print(f"Resuming from epoch {start_epoch}")
    return start_epoch, best_val, model

start_epoch, best_val, model = load_checkpoint(model, optimizer, scheduler, scaler, save_path)
# -------|-----------------------For loading the checkpoint model------------------------------------|---------------- #


# -----------------------------------------------Training loop-------------------------------------------------------- #
for epoch in range(start_epoch, EPOCHS + 1):
    tr_loss, tr_acc = train_epoch(
        model, train_loader, optimizer, criterion,
        scheduler=scheduler, scaler=scaler, clip_grad=CLIP_GRAD
    )
    va_loss, va_acc = eval_epoch(model, val_loader, criterion)

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    train_accs.append(tr_acc)
    val_accs.append(va_acc)

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"Train Loss: {tr_loss:.4f} | Train Acc: {tr_acc:.4f} | "
        f"Val Loss: {va_loss:.4f} | Val Acc: {va_acc:.4f} | "
        f"Val PPL: {perplexity_from_loss(va_loss):.2f}"
    )

    if va_loss < best_val:
        best_val = va_loss

        state_dict = model.module.state_dict() if hasattr(model, "module") else model.state_dict()

        torch.save({
            "epoch": epoch,
            "best_val": best_val,
            "model_state_dict": state_dict,
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.__dict__,
            "scaler_state_dict": scaler.state_dict(),
            "vocab_size": vocab_size,
            "e_dim": E_DIM,
            "d_model": D_MODEL,
            "num_layers": NUM_LAYERS,
            "num_heads": NUM_HEADS,
            "seq_len": SEQ_LEN
        }, save_path)

        tokenizer.save_pretrained("./gpt2_tokenizer_saved")
        print("Saved checkpoint with full training state.")
        print("For saving the full model without checkpoint for clean loading in future, run the <-Save Final Model->cell ")
# -----------------------------------------------Training loop-------------------------------------------------------- #


# --------------------------------------------Results of epochs-------------------------------------------------------- #
#Epoch 1/6 | Train Loss: 5.8140 | Train Acc: 0.2767 | Val Loss: 4.4591 | Val Acc: 0.3468 | Val PPL: 86.41
#Epoch 2/6 | Train Loss: 4.3180 | Train Acc: 0.3522 | Val Loss: 4.0670 | Val Acc: 0.3718 | Val PPL: 58.38
#Epoch 3/6 | Train Loss: 4.0009 | Train Acc: 0.3743 | Val Loss: 3.8697 | Val Acc: 0.3868 | Val PPL: 47.93
#Epoch 4/6 | Train Loss: 3.8499 | Train Acc: 0.3829 | Val Loss: 3.7936 | Val Acc: 0.3896 | Val PPL: 44.42
#Epoch 5/6 | Train Loss: 3.7142 | Train Acc: 0.3939 | Val Loss: 3.6811 | Val Acc: 0.3995 | Val PPL: 39.69
#Epoch 6/6 | Train Loss: 3.5927 | Train Acc: 0.4059 | Val Loss: 3.6342 | Val Acc: 0.4055 | Val PPL: 37.87
# --------------------------------------------Results of epochs-------------------------------------------------------- #


Resuming from epoch 1


Training:   0%|          | 0/11515 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


LR: 9.5e-05
LR: 0.00019
LR: 0.0001841619296358888
LR: 0.00017832385927177754
LR: 0.00017248578890766633
LR: 0.00016664771854355508
LR: 0.00016080964817944386
LR: 0.0001549715778153326
LR: 0.0001491335074512214
LR: 0.00014329543708711017
LR: 0.00013745736672299893


Validation:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 1/3 | Train Loss: 3.8499 | Train Acc: 0.3829 | Val Loss: 3.7936 | Val Acc: 0.3896 | Val PPL: 44.42
Saved checkpoint with full training state.


Training:   0%|          | 0/11515 [00:00<?, ?it/s]

LR: 0.0001316192963588877
LR: 0.00012578122599477646
LR: 0.00011994315563066524
LR: 0.00011410508526655401
LR: 0.00010826701490244278
LR: 0.00010242894453833154
LR: 9.659087417422031e-05
LR: 9.075280381010908e-05
LR: 8.491473344599785e-05
LR: 7.907666308188661e-05
LR: 7.323859271777539e-05
LR: 6.740052235366416e-05


Validation:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 2/3 | Train Loss: 3.7142 | Train Acc: 0.3939 | Val Loss: 3.6811 | Val Acc: 0.3995 | Val PPL: 39.69
Saved checkpoint with full training state.


Training:   0%|          | 0/11515 [00:00<?, ?it/s]

LR: 6.156245198955293e-05
LR: 5.5724381625441694e-05
LR: 4.988631126133046e-05
LR: 4.4048240897219235e-05
LR: 3.8210170533108e-05
LR: 3.237210016899677e-05
LR: 2.653402980488554e-05
LR: 2.0695959440774307e-05
LR: 1.4857889076663077e-05
LR: 9.019818712551846e-06
LR: 3.1817483484406143e-06


Validation:   0%|          | 0/24 [00:00<?, ?it/s]

Epoch 3/3 | Train Loss: 3.5927 | Train Acc: 0.4059 | Val Loss: 3.6342 | Val Acc: 0.4055 | Val PPL: 37.87
Saved checkpoint with full training state.


In [None]:
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.plot(train_losses, label="train")
plt.plot(val_losses, label="val")
plt.legend(); plt.title("Loss")

plt.subplot(1,2,2)
plt.plot(train_accs, label="train")
plt.plot(val_accs, label="val")
plt.legend(); plt.title("Accuracy")

plt.show()


NameError: name 'plt' is not defined

In [None]:
def top_k_logits(logits, k=None):
    if k is None:
        return logits
    v, _ = torch.topk(logits, k)
    min_v = v[:, -1].unsqueeze(1)
    return torch.where(logits < min_v, torch.full_like(logits, -1e10), logits)

@torch.no_grad()
def generate_bpe(model, tokenizer, prompt, length=100, temperature=1.0, top_k=50):
    model.eval()
    ids = tokenizer.encode(prompt)
    if len(ids) == 0:
        ids = [tokenizer.unk_token_id or 1]
    tokens = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)    for _ in range(length):
        context = tokens[:, -SEQ_LEN:]
        logits = model(context)        
        next_logits = logits[:, -1, :] / max(temperature, 1e-8)
        next_logits = top_k_logits(next_logits, top_k)
        probs = F.softmax(next_logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, next_id], dim=1)
    out_ids = tokens[0].tolist()
    return tokenizer.decode(out_ids, clean_up_tokenization_spaces=True)


In [None]:
model.eval()

print(generate_bpe(model, tokenizer, "india the great country", length=200, temperature=0.8, top_k=50))


NameError: name 'model' is not defined

## Save Final Model


In [None]:
# After the full training , this save cell will save the final model 
# without optimizer, scheduler and scaler states, 
# since those are only needed for resuming training, not for inference. 

# -----------------------------------------------Clean Model Saving-------------------------------------------------------- #
final_save_path = "CHAYA_62M_Phase1_Final.pt"

torch.save({
    "model_state_dict": model.state_dict(),
    "vocab_size": vocab_size,
    "e_dim": E_DIM,
    "d_model": D_MODEL,
    "num_layers": NUM_LAYERS,
    "num_heads": NUM_HEADS,
    "seq_len": SEQ_LEN
}, final_save_path)

tokenizer.save_pretrained("./gpt2_tokenizer_saved")

print(f"<---> Saved final model to {final_save_path}")
print("<---> Saved tokenizer to ./gpt2_tokenizer_saved")
print(f"\nFinal training complete!")
print(f"  Epochs trained: {EPOCHS}")
print(f"  Best validation loss: {best_val:.4f}")
# -----------------------------------------------Clean Model Saving-------------------------------------------------------- #
