In [2]:
import re
import torch.nn as nn
import numpy as np 
import pandas as pd
import os
import tiktoken
from torch.utils.data import Dataset, DataLoader
import torch
import math
import time
# Walk /kaggle/input recursively and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get a usable path.
        print(os.path.join(dirname, filename))

In [3]:
# Tokenizer: tiktoken's "gpt2" encoding. The module-level `tokenizer` is the one
# used by the text_to_tokens / tokens_to_text helpers defined in this notebook.
import tiktoken
tokenizer =  tiktoken.get_encoding("gpt2")

In [4]:
def text_to_tokens(text):
    """Encode `text` with the notebook-level tokenizer and return the ids as a tensor.

    The returned tensor has a leading dimension of size 1 added via `unsqueeze(0)`,
    so it can be fed to the model as a single-element batch.
    """
    token_ids = tokenizer.encode(text)
    return torch.tensor(token_ids).unsqueeze(0)
def tokens_to_text(tokens):
    """Decode a tensor of token ids back into a string with the notebook-level tokenizer.

    The leading dimension is removed with `squeeze(0)` before the ids are converted
    to a plain Python list for decoding.
    """
    flat_ids = tokens.squeeze(0).tolist()
    return tokenizer.decode(flat_ids)

In [5]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once; every window of `max_length` tokens becomes an
    input sample and the same window shifted right by one token becomes its target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing `encode(text, allowed_special=...)`.
        max_length: Number of tokens in each input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets stop early enough that the shifted target window still fits.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the `idx`-th (input, target) pair of token-id tensors."""
        return self.input_ids[idx], self.target_ids[idx]
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) token batches from `txt`.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1; the remaining arguments are forwarded to the dataset
    (`max_length`, `stride`) or to `DataLoader` (everything else).
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [6]:
# Hyperparameters read by the GPT, TransformerBlock and FeedForward classes in this notebook.
LLM_Config = {
    "vocab_size": 50257, # Number of token ids: rows of the token-embedding table and width of the output logits
    'context_length': 256, # Maximum sequence length: sizes the positional-embedding table and the causal mask
    'emb_dim': 768, # Dimensionality of the token / positional embedding vectors
    "n_heads": 12, # Number of attention heads inside each transformer block
    "n_layers": 12, # Number of stacked transformer blocks
    "drop_rate": 0.1, # Dropout probability passed to every nn.Dropout in the model
    "qkv_bias": False # Whether the query/key/value projections get a bias term
}

In [7]:
class Causual_attentionV1(torch.nn.Module):
    """Single-head causal self-attention with dropout on the attention weights.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections (and of the output).
        context_len: Side length of the pre-built causal mask.
        dropout: Dropout probability applied to the attention weights.
        bias: Whether the three projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_len, dropout, bias=False):
        super().__init__()
        self.d_out = d_out
        # Projections are created in query, key, value order.
        self.W_query = torch.nn.Linear(d_in, d_out, bias=bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=bias)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=bias)
        self.dropout = torch.nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        future_positions = torch.triu(torch.ones(context_len, context_len), diagonal=1)
        self.register_buffer("mask", future_positions)

    def forward(self, x):
        """Return the context vectors for a 3-D input `x` (the shape is unpacked into three dims)."""
        _, seq_len, _ = x.shape
        k = self.W_key(x)
        q = self.W_query(x)
        v = self.W_value(x)

        scores = torch.matmul(q, k.transpose(1, 2))
        # Crop the mask to the actual sequence length and blank out future positions.
        hidden = self.mask[:seq_len, :seq_len].bool()
        scores = scores.masked_fill(hidden, -torch.inf)

        # Scale by sqrt of the key dimension before normalising.
        scale = k.shape[-1] ** 0.5
        weights = self.dropout(torch.softmax(scores / scale, dim=-1))
        return weights @ v

class multiheadAttentionV1(torch.nn.Module):
    """Multi-head causal self-attention with a final output projection.

    The query/key/value projections are computed once at width `d_out` and then
    split into `num_heads` heads of size `d_out // num_heads`.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total projection width across all heads; must be divisible by `num_heads`.
        context_len: Side length of the pre-built causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_len, dropout, num_heads, bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=bias)
        self.W_key = nn.Linear(d_in, d_out, bias=bias)
        self.W_value = nn.Linear(d_in, d_out, bias=bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the "future" positions to hide.
        future_positions = torch.triu(torch.ones(context_len, context_len), diagonal=1)
        self.register_buffer("mask", future_positions)

    def _to_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq, d_out) into (batch, heads, seq, head_dim)."""
        per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the attended and output-projected representation of a 3-D input `x`."""
        batch, seq_len, _ = x.shape

        k = self._to_heads(self.W_key(x), batch, seq_len)
        q = self._to_heads(self.W_query(x), batch, seq_len)
        v = self._to_heads(self.W_value(x), batch, seq_len)

        # Scaled dot-product scores, then hide positions that lie in the future.
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        hidden = self.mask[:seq_len, :seq_len].bool()
        scores = scores.masked_fill(hidden, float("-inf"))

        weights = self.dropout(torch.softmax(scores, dim=-1))

        # Merge the heads back into a single d_out-wide vector per token.
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)

In [8]:
import torch.nn as nn
class GPT(nn.Module):
    """GPT-style decoder: token + positional embeddings, a stack of transformer
    blocks, a final layer norm and a linear head producing vocabulary logits.

    Args:
        cfg: Mapping with the keys 'vocab_size', 'context_length', 'emb_dim',
            'drop_rate' and 'n_layers' (plus whatever TransformerBlock reads).
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)
        # Weight tying: the output head shares its weight matrix with the token embedding.
        self.out_head.weight = self.tok_emb.weight

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # One positional vector per position; the (seq_len, emb_dim) result
        # broadcasts over the batch dimension in the addition below.
        positions = torch.arange(seq_len, device=in_idx.device)
        pos_embeds = self.pos_emb(positions)
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

class GELU(nn.Module):
    """GELU activation using the tanh approximation:

        0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
    """

    # sqrt(2 / pi) is a constant: compute it once as a Python float instead of
    # allocating a tensor and taking its square root on every forward call.
    _SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi)

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of `x`."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * x ** 3)
        return 0.5 * x * (1 + torch.tanh(inner))

class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers, each
    preceded by a LayerNorm and wrapped in a residual connection with dropout.

    Args:
        cfg: Mapping with the keys 'emb_dim', 'context_length', 'drop_rate',
            'n_heads' and 'qkv_bias'.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.att = multiheadAttentionV1(
            d_in=width,
            d_out=width,
            context_len=cfg['context_length'],
            dropout=cfg['drop_rate'],
            num_heads=cfg['n_heads'],
            bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # The same dropout module is applied to both residual branches.
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Run the attention sub-layer, then the feed-forward sub-layer."""
        # Attention branch: norm -> attention -> dropout, added back onto the input.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward branch: norm -> MLP -> dropout, added back onto the result.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        cfg: Mapping providing 'emb_dim'.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden = 4 * width
        # Kept as a positional nn.Sequential so the linear layers stay at indices 0 and 2.
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        """Apply the MLP to `x`."""
        return self.layers(x)

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension being normalised.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise `x` to zero mean / unit (biased) variance along dim -1, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = torch.sqrt(x.var(dim=-1, unbiased=False, keepdim=True) + self.eps)
        centred = x - mu
        return self.scale * (centred / sigma) + self.shift

In [9]:
class DNN(nn.Module):
    """Stack of Linear + GELU layers with optional residual (shortcut) connections.

    Args:
        layers: Sequence of layer widths; consecutive entries `layers[i]` and
            `layers[i + 1]` define one Linear + GELU layer. Any length >= 2 is
            accepted (a 6-entry list yields the five layers this class used to
            hard-code).
        shortcut: If true, a layer's output is added to its input whenever the
            two have the same shape.

    Raises:
        ValueError: If fewer than two widths are given.
    """

    def __init__(self, layers, shortcut):
        super().__init__()
        if len(layers) < 2:
            raise ValueError("layers must contain at least two sizes (input and output)")
        self.shortcut = shortcut
        # One Linear + GELU pair per consecutive (in, out) width pair.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(n_in, n_out), GELU())
            for n_in, n_out in zip(layers[:-1], layers[1:])
        ])

    def forward(self, x):
        """Pass `x` through every layer, adding shortcuts where shapes allow."""
        for layer in self.layers:
            output = layer(x)
            # A shortcut is only possible when the layer preserves the shape.
            if self.shortcut and x.shape == output.shape:
                x = x + output
            else:
                x = output
        return x

In [10]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
def gen_text(model, idx, max_new_tokens, context_size):
    """Greedy decoding: repeatedly append the most probable next token to `idx`.

    Args:
        model: Callable mapping token ids (batch, seq) to logits (batch, seq, vocab).
        idx: Tensor of token ids (batch, seq) to continue.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last `context_size` tokens are fed to the model.

    Returns:
        `idx` extended along dim 1 by `max_new_tokens` token ids.
    """
    for _step in range(max_new_tokens):
        window = idx[:, -context_size:]
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]  # logits of the final position only
        probabilities = torch.softmax(last_logits, dim=-1)
        next_token = probabilities.argmax(dim=-1, keepdim=True)
        idx = torch.cat([idx, next_token], dim=1)
    return idx

In [12]:
def batch_loss(inputs, targets, model, device):
    """Cross-entropy loss of `model` on one batch.

    Both tensors are moved to `device`; the first two dimensions of the logits
    are merged and the targets are flattened before the loss is computed.
    """
    inputs = inputs.to(device)
    targets = targets.to(device)

    logits = model(inputs)
    per_token_logits = torch.flatten(logits, 0, 1)
    per_token_targets = torch.flatten(targets)
    return torch.nn.functional.cross_entropy(per_token_logits, per_token_targets)

def calc_loss_loader(data_loader, model, device, num=None):
    """Average `batch_loss` over the first `num` batches of `data_loader`.

    Args:
        data_loader: Iterable of (input_batch, target_batch) pairs supporting `len()`.
        model: Model passed through to `batch_loss`.
        device: Device passed through to `batch_loss`.
        num: Number of batches to average over; `None` means all of them, and
            values larger than the loader are clamped to its length.

    Returns:
        The mean loss as a float, or NaN when the loader is empty.
    """
    available = len(data_loader)
    if available == 0:
        return float('nan')
    num = available if num is None else min(num, available)

    running_total = 0
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num:
            break
        running_total += batch_loss(input_batch, target_batch, model, device).item()
    return running_total/num

In [13]:
def gen_print_sample(model, tokenizer, device, start_context):
    """Generate 50 tokens greedily from `start_context` and print them on one line.

    The model is switched to eval mode for the generation and afterwards put
    back into whichever mode it was in before the call (also when generation
    raises), so calling this on a model in eval mode no longer flips it to train.

    Args:
        model: Model exposing `pos_emb`; the number of rows of `pos_emb.weight`
            is used as the context size.
        tokenizer: Accepted for interface compatibility; encoding and decoding
            go through `text_to_tokens` / `tokens_to_text`, which use the
            notebook-level tokenizer.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
    """
    was_training = model.training
    model.eval()
    try:
        cs = model.pos_emb.weight.shape[0]
        encoded = text_to_tokens(start_context).to(device)
        with torch.no_grad():
            token_ids = gen_text(model, encoded, 50, cs)
        text = tokens_to_text(token_ids)
        print(text.replace('\n', ' '))
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)
    

In [14]:
#torch.manual_seed(123)
#train_ratio = int(0.90 * len(verdict_story))
#train = verdict_story[:train_ratio]
#test = verdict_story[train_ratio:]

#train_loader = create_dataloader_v1(train, 2, LLM_Config['context_length'],LLM_Config['context_length'], True, True, 0 )
#test_loader = create_dataloader_v1(test, 2, LLM_Config['context_length'],LLM_Config['context_length'], False, False, 0 )

In [15]:
def training_cycle(model, train_loader, test_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train `model` for `num_epochs` epochs, evaluating periodically.

    Every `eval_freq` optimizer steps (starting with step 0) the train and test
    losses are computed via `evalm`, recorded and printed. After each epoch a
    text sample is generated from `start_context` and printed.

    Returns:
        (train losses, test losses, total number of input tokens processed).
    """
    train_history, test_history, token_checkpoints = [], [], []
    tokens_seen = 0
    step = -1

    for epoch_idx in range(num_epochs):
        model.train()

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            batch_loss(inputs, targets, model, device).backward()
            optimizer.step()

            tokens_seen += inputs.numel()
            step += 1

            if step % eval_freq != 0:
                continue

            training_loss, test_loss = evalm(model, train_loader, test_loader, device, eval_iter)
            train_history.append(training_loss)
            test_history.append(test_loss)
            token_checkpoints.append(tokens_seen)
            print(f"Epoch: {epoch_idx+1}, Step {step}:\n Train_loss:{training_loss}, Test_loss: {test_loss}")

        gen_print_sample(model, tokenizer, device, start_context)

    # NOTE(review): the per-evaluation token counts in `token_checkpoints` are
    # collected but the final scalar `tokens_seen` is what gets returned —
    # confirm which one callers actually want.
    return train_history, test_history, tokens_seen

In [16]:
def evalm(model, train_loader, test_loader, device, eval_iter):
    """Estimate the train and test loss without tracking gradients.

    Args:
        model: Model to evaluate.
        train_loader: Loader the training loss is computed on.
        test_loader: Loader the test loss is computed on.
        device: Device passed through to `calc_loss_loader`.
        eval_iter: Maximum number of batches to average over per loader.
            `None` or a non-positive value means "use the whole loader".

    Returns:
        (train_loss, test_loss) as returned by `calc_loss_loader`.
    """
    # eval_iter used to be ignored, which made every evaluation sweep both
    # loaders completely; it now caps the number of batches.
    num_batches = eval_iter if eval_iter is not None and eval_iter > 0 else None

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, num=num_batches)
            test_loss = calc_loss_loader(test_loader, model, device, num=num_batches)
    finally:
        # Put the model back into the mode the caller had it in.
        model.train(was_training)
    return train_loss, test_loss

In [17]:
import time

In [18]:
#model.to(device)
#optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

#num_epochs = 10
#train_losses, val_lossess, tokens_seen = training_cycle(model, train_loader, test_loader, optimizer, device, num_epochs=num_epochs, eval_freq=5,eval_iter=5, 
                                                       # start_context="Every effort moves you", tokenizer =tokenizer)
#end_time = time.time()
#print(f"Run Time: {end_time - start_time}")

In [19]:
def generate(model, idx, max_new_tokens, context_size, temp=0.0, top_k=None, eos_id=0):
    """Autoregressively extend `idx` with optional top-k filtering and temperature sampling.

    Args:
        model: Callable mapping token ids (batch, seq) to logits (batch, seq, vocab).
        idx: Tensor of token ids (batch, seq) to continue.
        max_new_tokens: Upper bound on the number of tokens appended.
        context_size: Only the last `context_size` tokens are fed to the model.
        temp: Sampling temperature. Values > 0 sample from the scaled
            distribution; otherwise the arg-max token is taken.
        top_k: If given, only the `top_k` highest logits stay eligible
            (clamped to the vocabulary size).
        eos_id: Token id that stops generation, or `None` to never stop early.
            With a batch, generation stops once every row produced `eos_id`
            in the same step.
            NOTE(review): the default of 0 is kept for compatibility; confirm
            it is the intended end-of-sequence id for the tokenizer in use.

    Returns:
        `idx` extended along dim 1 by the generated token ids (the stopping
        token itself is not appended).
    """
    for _step in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # keep only the final position

        if top_k is not None:
            # Clamp so that asking for more candidates than the vocabulary holds does not raise.
            k = min(top_k, logits.size(-1))
            top_logits, _top_idx = torch.topk(logits, k)
            min_val = top_logits[:, -1].unsqueeze(-1)
            logits = torch.where(logits < min_val, torch.full_like(logits, float("-inf")), logits)

        if temp > 0:
            logits = logits / temp
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Element-wise comparison works for any batch size, unlike `.item()`,
        # which raises for tensors with more than one element.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx


In [20]:
#token_ids = generate(model, text_to_tokens("Every effor moves you"), 15, 256, top_k=25, temp=1.4 )

In [21]:
# Print the installed TensorFlow and tqdm versions.
# NOTE(review): presumably both are needed by the GPT-2 checkpoint download/loading
# helper imported further down — confirm against that package's requirements.
import tensorflow as tf
import tqdm

print(tf.__version__)
print(tqdm.__version__)

2026-02-06 04:44:55.283055: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770353095.695090      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770353095.849579      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770353096.810836      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770353096.810877      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770353096.810880      55 computation_placer.cc:177] computation placer alr

2.19.0
4.67.1


In [22]:
pip install llms-from-scratch

Collecting llms-from-scratch
  Downloading llms_from_scratch-1.0.19-py3-none-any.whl.metadata (17 kB)
Collecting jupyterlab>=4.0 (from llms-from-scratch)
  Downloading jupyterlab-4.5.3-py3-none-any.whl.metadata (16 kB)
Collecting pip>=25.0.1 (from llms-from-scratch)
  Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)
Collecting async-lru>=1.0.0 (from jupyterlab>=4.0->llms-from-scratch)
  Downloading async_lru-2.1.0-py3-none-any.whl.metadata (5.3 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab>=4.0->llms-from-scratch)
  Downloading jupyter_lsp-2.3.0-py3-none-any.whl.metadata (1.8 kB)
Downloading llms_from_scratch-1.0.19-py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.3/83.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab-4.5.3-py3-none-any.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading p

In [23]:
from llms_from_scratch.ch05 import download_and_load_gpt2

In [24]:
# Fetch the "124M" GPT-2 checkpoint into ./gpt2 and load it: `settings` holds the
# model hyperparameters and `params` the weight arrays.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 150kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 6.03MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 194kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:54<00:00, 9.10MiB/s] 
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 12.7MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 4.23MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.38MiB/s]


In [25]:
# Inspect the hyperparameters that came with the checkpoint.
settings

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [26]:
# Top-level entries of the loaded weights dictionary.
params.keys()

dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

In [27]:
# Shape of the 'wte' array, which load_weights copies into the token-embedding table.
params['wte'].shape

(50257, 768)

In [28]:
# Same hyperparameters as LLM_Config, but with the 1024-token context window and
# the query/key/value biases that the downloaded GPT-2 weights are loaded into.
GPT2_Config = {**LLM_Config, "context_length": 1024, "qkv_bias": True}
GPT2_Config

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [29]:
def assign(left, right):
    """Return `right` as a new trainable parameter after checking it fits `left`.

    Args:
        left: Existing parameter/tensor whose shape the new values must match.
        right: Replacement values (NumPy array, tensor, or anything
            `torch.tensor` accepts) exposing `.shape`.

    Returns:
        A `torch.nn.Parameter` holding a copy of `right`.

    Raises:
        ValueError: If the shapes differ; the message names both shapes.
    """
    if left.shape != right.shape:
        raise ValueError(
            f"Shape mismatch. Left: {tuple(left.shape)}, Right: {tuple(right.shape)}"
        )
    if isinstance(right, torch.Tensor):
        # torch.tensor() on an existing tensor emits a warning; copy explicitly instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    return torch.nn.Parameter(values)

In [30]:
# I can't possibly train an LLM on my barely working PC,
# so I will use the publicly available GPT-2 weights and load them into my model.

def load_weights(gpt, params):
    """Copy the weights in `params` into the matching parameters of `gpt`.

    Every assignment goes through `assign`, which rejects shape mismatches.
    Weight matrices of linear layers are transposed before assignment; biases
    and layer-norm parameters are copied as they are.

    Args:
        gpt: Model exposing `pos_emb`, `tok_emb`, `trf_blocks`, `final_norm`
            and `out_head`.
        params: Dictionary with 'wpe', 'wte', 'g', 'b' and a 'blocks' sequence
            holding one nested dictionary per transformer block.
    """
    # Positional and token embedding tables.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b in range(len(params["blocks"])):
        source = params["blocks"][b]
        block = gpt.trf_blocks[b]
        attention = block.att

        # The fused c_attn weight is split into equal query / key / value thirds.
        q_w, k_w, v_w = np.split(source["attn"]["c_attn"]["w"], 3, axis=-1)
        attention.W_query.weight = assign(attention.W_query.weight, q_w.T)
        attention.W_key.weight = assign(attention.W_key.weight, k_w.T)
        attention.W_value.weight = assign(attention.W_value.weight, v_w.T)

        # The fused c_attn bias is split the same way.
        q_b, k_b, v_b = np.split(source["attn"]["c_attn"]["b"], 3, axis=-1)
        attention.W_query.bias = assign(attention.W_query.bias, q_b)
        attention.W_key.bias = assign(attention.W_key.bias, k_b)
        attention.W_value.bias = assign(attention.W_value.bias, v_b)

        # Attention output projection.
        attention.out_proj.weight = assign(attention.out_proj.weight, source["attn"]["c_proj"]["w"].T)
        attention.out_proj.bias = assign(attention.out_proj.bias, source["attn"]["c_proj"]["b"])

        # Feed-forward network: index 0 is the expansion layer, index 2 the projection back.
        feed_forward = block.ff
        feed_forward.layers[0].weight = assign(feed_forward.layers[0].weight, source["mlp"]["c_fc"]["w"].T)
        feed_forward.layers[0].bias = assign(feed_forward.layers[0].bias, source["mlp"]["c_fc"]["b"])
        feed_forward.layers[2].weight = assign(feed_forward.layers[2].weight, source["mlp"]["c_proj"]["w"].T)
        feed_forward.layers[2].bias = assign(feed_forward.layers[2].bias, source["mlp"]["c_proj"]["b"])

        # The two layer norms of the block.
        block.norm1.scale = assign(block.norm1.scale, source["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, source["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, source["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, source["ln_2"]["b"])

    # Final layer norm, then re-tie the output head to the freshly loaded token embedding.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = gpt.tok_emb.weight
        

In [31]:
# Build the model with the GPT-2 configuration and fill it with the downloaded weights.
model = GPT(GPT2_Config)

load_weights(model, params)

# The model is only used for inference from here on: switch to eval mode so the
# dropout layers (drop_rate 0.1) are disabled during generation.
model.eval();

In [35]:
def talk(talk):
    """Continue the prompt `talk` with up to 30 sampled tokens and return the full text.

    Uses the notebook-level `model` with a 1024-token context window, top-k
    filtering (k=25) and a sampling temperature of 1.4.
    """
    prompt_ids = text_to_tokens(talk)
    generated_ids = generate(
        model,
        prompt_ids,
        max_new_tokens=30,
        context_size=1024,
        temp=1.4,
        top_k=25,
    )
    return tokens_to_text(generated_ids)

In [36]:
# Sample a continuation for a test prompt. talk() samples with temperature 1.4 and
# no random seed is set in this cell, so the printed text differs between runs.
print(talk("3rd World Countries have high cases of poverty and diseases, with the most common disease being"))

3rd World Countries have high cases of poverty and diseases, with the most common disease being hepatitis B [1], chronic liver disease [2] and kidney disease.

The World Bank reported in 2014 it has the highest mortality rates per


In [40]:
torch.save(model.state_dict(), "/kaggle/working/gpt_state_dict.pt")
# Export the model's weights (its state_dict) so that they can be loaded in main.py.