<a href="https://colab.research.google.com/github/Rstam59/ds-portfolio/blob/main/train_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import math

class CasualSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused query/key/value projection.

    `config` must provide `n_embd`, `n_head` and `block_size`; `n_embd` has to
    be divisible by `n_head`.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # One linear layer produces queries, keys and values for every head.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection applied after the heads are merged back together.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # Marker attribute; GPT._init_weights checks for it with hasattr().
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # Lower-triangular mask stored under the buffer name 'bias'. forward()
        # does not read it (the fused attention kernel is called with
        # is_causal=True), but it remains part of the state dict.
        causal_mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer(
            'bias',
            causal_mask.view(1, 1, config.block_size, config.block_size),
        )

    def forward(self, x):
        """Apply causal self-attention to `x` of shape (B, T, C); returns (B, T, C)."""
        B, T, C = x.size()  # batch size, sequence length, embedding width
        head_dim = C // self.n_head

        # Split the fused projection into q, k, v and reshape each to
        # (B, n_head, T, head_dim) so the heads act as a batch dimension.
        q, k, v = (
            part.view(B, T, self.n_head, head_dim).transpose(1, 2)
            for part in self.c_attn(x).split(self.n_embd, dim=2)
        )

        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Put the heads back side by side, then project: (B, nh, T, hs) -> (B, T, C).
        merged = attended.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)


In [None]:
class TanhGelu(nn.Module):
    """Tanh approximation of the GELU activation.

    Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))),
    element-wise, which is the same formula as nn.GELU(approximate='tanh').
    """

    def forward(self, input):
        """Return the tanh-approximated GELU of `input`, element-wise."""
        inner = math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3))
        # The leading `input *` factor is what makes this GELU; without it the
        # expression is only the gating term and maps 0 to 0.5 instead of 0.
        return 0.5 * input * (1.0 + torch.tanh(inner))


In [None]:
class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, tanh-GELU, project back.

    `config` must provide `n_embd`.
    """

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        # Marker attribute; GPT._init_weights checks for it with hasattr().
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Map `x` (..., n_embd) through the MLP; the output has the same shape."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [None]:
class Block(nn.Module):
    """One transformer layer: pre-LayerNorm attention and MLP, each with a residual add."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return `x` after the attention and MLP residual branches, in that order."""
        # Each sub-layer sees a normalised copy of the stream and its output is
        # added back onto the un-normalised residual.
        for norm, sublayer in ((self.ln_1, self.attn), (self.ln_2, self.mlp)):
            x = x + sublayer(norm(x))
        return x

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters consumed by GPT and its sub-modules.

    The defaults are the same values GPT.from_pretrained uses for 'gpt2'.
    """
    block_size: int = 1024  # number of positions in the position embedding; longest sequence forward() accepts
    vocab_size: int = 50257  # rows of the token embedding / outputs of lm_head
    n_layer: int = 12  # number of transformer Blocks
    n_head: int = 12  # attention heads per block; must divide n_embd
    n_embd: int = 768  # embedding width

In [None]:
class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    `config` supplies `vocab_size`, `block_size`, `n_layer`, `n_head` and
    `n_embd`. The token embedding and the output head share one weight tensor.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList(Block(config) for _ in range(config.n_layer)),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight sharing scheme: the token embedding reuses the lm_head matrix.
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise `module` in place; intended to be passed to nn.Module.apply.

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear
        biases are zeroed. Linear layers carrying a NANOGPT_SCALE_INIT
        attribute (the residual output projections) get their std scaled
        down by 1 / sqrt(2 * n_layer).
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                # Scale the base std rather than replace it; 2 * n_layer is the
                # number of residual additions (attention + MLP per block).
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids `idx` of shape (B, T).

        Returns `(logits, loss)`: `logits` has shape (B, T, vocab_size) and
        `loss` is the cross-entropy against `targets` (same shape as `idx`),
        or None when `targets` is not given. T must not exceed
        `config.block_size`.
        """
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.transformer.wte(idx)  # (B, T, n_embd)
        pos_emb = self.transformer.wpe(pos)  # (T, n_embd), broadcast over the batch
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        # Project to the vocabulary once and reuse the result for the loss.
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        '''Loads pretrained GPT-2 model weights from huggingface.

        `model_type` is one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        Returns a GPT whose parameters were copied from the matching
        `transformers.GPT2LMHeadModel` checkpoint.
        '''
        assert model_type in ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
        from transformers import GPT2LMHeadModel
        print('Loading weights from huggingface...')

        config_args = {
            "gpt2":  dict(n_layer=12, n_head=12, n_embd=768), #124M params
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024), #355M params
            "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280), #774M params
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600), #1558M params
        }[model_type]
        config_args['vocab_size'] = 50257
        config_args['block_size'] = 1024
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        # The causal-mask buffer is not a learned parameter; leave it out of the comparison.
        sd_keys = [k for k in sd.keys() if not k.endswith('.attn.bias')]

        # Init a huggingface/transformers model.
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Copy while ensuring all of the parameters are aligned and match in names and shapes.
        sd_keys_hf = [k for k in sd_hf.keys() if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        # These checkpoint weights are stored with the two axes swapped relative
        # to nn.Linear, so they are transposed while being copied.
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys) == len(sd_keys_hf), f'mismatched number of keys: {len(sd_keys)} vs {len(sd_keys_hf)}'
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd_hf[k].shape[::-1] == sd[k].shape, f'mismatched shape for {k}: {sd[k].shape} vs {sd_hf[k].shape}'
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd_hf[k].shape == sd[k].shape, f'mismatched shape for {k}: {sd[k].shape} vs {sd_hf[k].shape}'
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

# Smoke test: build the 'gpt2' (124M params) model from the Hugging Face
# checkpoint and print its module tree.
model = GPT.from_pretrained('gpt2')
print(model)




Loading weights from huggingface...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)


# Seed the CPU generator, and the CUDA generator when CUDA was selected.
torch.manual_seed(1337)
if device == 'cuda':
    torch.cuda.manual_seed(1337)

cuda


In [None]:
# Settings read only by the commented-out sampling cells further down.
num_return_sequences = 5
max_length = 30

# Swap in the pretrained checkpoint instead of random weights by uncommenting:
# model = GPT.from_pretrained('gpt2')
# Randomly initialised model with a vocabulary of 50304 rather than the
# default 50257 (presumably padded up to a rounder size -- TODO confirm intent).
model = GPT(GPTConfig(vocab_size = 50304))
model.eval()
model.to(device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [None]:
# !pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.2 MB[0m [31m14.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
# import tiktoken
# enc = tiktoken.get_encoding("gpt2")
# tokens = enc.encode("Hello, I'm a language model")
# tokens = torch.tensor(tokens, dtype=torch.long)
# tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
# x = tokens.to(device)

# torch.manual_seed(42)
# torch.cuda.manual_seed(42)
# while x.size(1) < max_length:
#     with torch.no_grad():
#         logits, _ = model(x)
#         logits = logits[:, -1, :]
#         probs = F.softmax(logits, dim=-1)
#         # next_token = torch.multinomial(probs, num_samples=1)
#         # tokens = torch.cat([tokens, next_token], dim=1)
#         topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
#         ix = torch.multinomial(topk_probs, 1)
#         xcol = torch.gather(topk_indices, -1, ix)
#         x = torch.cat([x, xcol], dim=1)

In [None]:
# for i in range(num_return_sequences):
#     tokens = x[i, :max_length].tolist()
#     decoded = enc.decode(tokens)
#     print('>', decoded)

In [None]:
# !wget https://raw.githubusercontent.com/karpathy/build-nanogpt/refs/heads/master/input.txt

--2025-02-04 13:31:40--  https://raw.githubusercontent.com/karpathy/build-nanogpt/refs/heads/master/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-02-04 13:31:41 (27.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
import numpy as np


# Scratch cell: draw a 6x10 array of random integers from [0, 60).
np.random.randint(0, 60, size = (6, 10))

array([[50, 52, 16, 49, 30, 42, 39, 11, 28, 11],
       [26, 46, 17,  1, 23, 20, 29, 45,  8, 30],
       [ 6, 52,  5, 11, 14, 58, 19,  8, 53, 42],
       [54, 55, 38, 33, 47, 58, 48, 23,  0,  2],
       [46, 52,  3,  0, 55, 58, 12, 48, 31, 13],
       [18, 50, 37, 46,  9, 47, 21, 26,  9, 37]])

In [None]:
import tiktoken
# Sanity check: overfit a freshly initialised model on one fixed batch.
enc = tiktoken.get_encoding('gpt2')
with open('input.txt', 'r') as f:
    text = f.read()
# Only the first 1000 characters are tokenised; the batch below needs B*T + 1 tokens.
text = text[:1000]
tokens = enc.encode(text)
B, T = 4, 32
buf = torch.tensor(tokens[:B*T + 1])
buf = buf.to(device)
# Inputs and targets are the same token run offset by one position.
x = buf[:-1].view(B, T)
y = buf[1:].view(B, T)
model = GPT(GPTConfig())
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas = (0.9, 0.95), eps = 1e-8)
# The same (x, y) pair is used at every step, so the loss should fall towards zero.
for i in range(50):
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    print(f"step {i}, loss: {loss.item()}")

# import sys; sys.exit(0)

step 0, loss: 10.8391695022583
step 1, loss: 8.467406272888184
step 2, loss: 7.6278791427612305
step 3, loss: 7.323618412017822
step 4, loss: 6.522852897644043
step 5, loss: 6.038125991821289
step 6, loss: 5.57582950592041
step 7, loss: 5.062967300415039
step 8, loss: 4.563469409942627
step 9, loss: 3.976896286010742
step 10, loss: 3.226181983947754
step 11, loss: 3.2245426177978516
step 12, loss: 2.614743232727051
step 13, loss: 2.4133968353271484
step 14, loss: 1.928017497062683
step 15, loss: 1.628973126411438
step 16, loss: 1.1532443761825562
step 17, loss: 0.9028757214546204
step 18, loss: 0.6945186853408813
step 19, loss: 0.4917982518672943
step 20, loss: 0.3521387577056885
step 21, loss: 0.24547846615314484
step 22, loss: 0.16462981700897217
step 23, loss: 0.11487781256437302
step 24, loss: 0.0849955603480339
step 25, loss: 0.06487804651260376
step 26, loss: 0.050571419298648834
step 27, loss: 0.040224138647317886
step 28, loss: 0.03273648023605347
step 29, loss: 0.0272296201437

In [None]:
# Cross-entropy of a uniform guess over the 50257-token vocabulary; a freshly
# initialised model's first loss can be compared against this value.
-np.log(1 / 50257)

10.82490511970208

In [None]:
class DataLoaderLite:
    """Sequential batch iterator over the GPT-2 tokenisation of 'input.txt'.

    The whole file is tokenised once at construction. Each call to
    next_batch() returns the next B*T tokens as inputs together with the
    same window shifted by one token as targets, wrapping to the start when
    a full batch no longer fits.
    """

    def __init__(self, B, T):
        """Load and tokenise 'input.txt'; `B` is the batch size, `T` the sequence length."""
        self.B = B
        self.T = T

        # Explicit encoding so the read does not depend on the platform default.
        with open('input.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B*T)} batches")

        # Index of the first token of the next batch.
        self.current_position = 0

    def next_batch(self):
        """Return `(x, y)`, two (B, T) tensors of inputs and next-token targets."""
        B, T = self.B, self.T
        # One extra token is read so the last input position has a target.
        buf = self.tokens[self.current_position:self.current_position + B*T + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)
        # Advance by exactly B*T: the extra token is only a target here and
        # becomes the first input of the next batch, so no token is skipped.
        self.current_position += B * T

        # Start over when the next batch (plus its extra target) would run past the end.
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

In [None]:
# Train on successive batches from the data loader rather than one fixed batch.
train_loader = DataLoaderLite(4, 32)


# NOTE: `model` is whatever an earlier cell left in scope; it is not
# re-initialised here, only given a new optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for i in range(50):
    x, y = train_loader.next_batch()
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(x, y)
    # Debug hook: uncomment to drop into an interactive console at this point.
    # import code; code.interact(local=locals())
    loss.backward()


    optimizer.step()
    print(f"step {i}, loss: {loss.item()}")

loaded 338025 tokens
1 epoch = 2640 batches
step 0, loss: 0.0037562581710517406
step 1, loss: 9.976945877075195
step 2, loss: 10.49441146850586
step 3, loss: 9.231887817382812
step 4, loss: 8.05864143371582
step 5, loss: 7.871607780456543
step 6, loss: 8.50680923461914
step 7, loss: 7.795938014984131
step 8, loss: 7.358175754547119
step 9, loss: 7.397676467895508
step 10, loss: 7.770276069641113
step 11, loss: 6.63814115524292
step 12, loss: 6.903777599334717
step 13, loss: 6.934110164642334
step 14, loss: 7.094895362854004
step 15, loss: 7.114306449890137
step 16, loss: 7.802863121032715
step 17, loss: 8.318770408630371
step 18, loss: 6.702744007110596
step 19, loss: 7.878968238830566
step 20, loss: 7.33986759185791
step 21, loss: 7.122585773468018
step 22, loss: 6.599552154541016
step 23, loss: 6.899408340454102
step 24, loss: 6.51102352142334
step 25, loss: 6.55588960647583
step 26, loss: 6.602773189544678
step 27, loss: 7.831883907318115
step 28, loss: 6.579095363616943
step 29, lo

In [None]:

!nvidia-smi

Tue Feb  4 13:32:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P0             81W /   70W |    4394MiB /  15360MiB |     96%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Scratch cell: inspect the dtype of the logits left over from the last training step.
logits.dtype

torch.float32

In [None]:
import time

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)


# Seed the CPU generator, and the CUDA generator when CUDA was selected.
torch.manual_seed(1337)
if device == 'cuda':
    torch.cuda.manual_seed(1337)


train_loader = DataLoaderLite(B=4, T=1024)
torch.set_float32_matmul_precision('high')

# Fresh default-config model, moved to the device and then compiled.
model = torch.compile(GPT(GPTConfig()).to(device))


# Learning-rate schedule constants read by get_lr().
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10
max_steps = 50

def get_lr(it):
    """Return the learning rate for step `it`.

    Reads the module-level `max_lr`, `min_lr`, `warmup_steps` and `max_steps`.
    The rate rises linearly over the first `warmup_steps` steps, follows a
    cosine curve from `max_lr` down to `min_lr` until `max_steps`, and stays
    at `min_lr` afterwards.
    """
    # Linear warmup; `it + 1` keeps the very first step above zero.
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # Past the end of the schedule: hold the minimum rate.
    if it > max_steps:
        return min_lr
    # In between: cosine decay, with progress running from 0 to 1.
    progress = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return min_lr + cosine_weight * (max_lr - min_lr)

# Timed training loop with mixed precision, gradient clipping and a scheduled learning rate.
# The lr passed to AdamW is only a placeholder; it is overwritten from get_lr() every step.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas = (0.9, 0.95), eps = 1e-8)
for step in range(20):
    t0 = time.time()
    x, y = train_loader.next_batch()
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad(set_to_none=True)
    # The bfloat16 variant is left disabled; autocast uses the device's default dtype instead.
    # NOTE(review): on CUDA that default is float16 and no GradScaler is used here -- confirm this is intended.
    # with torch.autocast(device_type=device, dtype = torch.bfloat16):
    with torch.autocast(device_type=device):
        logits, loss = model(x, y)
    loss.backward()
    # Clip the global gradient norm to 1.0; `norm` is the norm before clipping.
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    # Wait for queued GPU work so the timing below is meaningful. Only valid
    # when CUDA is the selected device; on cpu/mps the call would raise.
    if device == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0) * 1000
    tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
    # lr is of order 1e-4, so print it in scientific notation (':.2f' always showed 0.00).
    print(f"step {step}, loss: {loss.item()},|lr {lr:.4e} | norm {norm:.4f} |time: {dt:.2f}ms, tok/sec: {tokens_per_sec:.4f}")


cuda
loaded 338025 tokens
1 epoch = 82 batches
step 0, loss: 10.911243438720703,|lr 0.00 | norm 10.5407 |time: 43778.20ms, tok/sec: 93.5626
step 1, loss: 9.986283302307129,|lr 0.00 | norm 5.9940 |time: 307.47ms, tok/sec: 13321.6419
step 2, loss: 9.522709846496582,|lr 0.00 | norm 3.4028 |time: 315.95ms, tok/sec: 12964.0898
step 3, loss: 9.291451454162598,|lr 0.00 | norm 4.7312 |time: 321.15ms, tok/sec: 12754.0201
step 4, loss: 8.905405044555664,|lr 0.00 | norm 3.1478 |time: 310.54ms, tok/sec: 13190.1112
step 5, loss: 8.662164688110352,|lr 0.00 | norm 3.2452 |time: 310.72ms, tok/sec: 13182.2169
step 6, loss: 8.626821517944336,|lr 0.00 | norm 3.1642 |time: 311.47ms, tok/sec: 13150.3617
step 7, loss: 8.504057884216309,|lr 0.00 | norm 3.3615 |time: 309.86ms, tok/sec: 13219.0056
step 8, loss: 8.430315017700195,|lr 0.00 | norm 2.4430 |time: 312.12ms, tok/sec: 13123.2697
step 9, loss: 8.176719665527344,|lr 0.00 | norm 2.0570 |time: 313.04ms, tok/sec: 13084.4197
step 10, loss: 8.124675750732422

In [None]:
import torch
torch.cuda.empty_cache()