In [1]:
import os; os.chdir('..')  # NOTE(review): not idempotent - re-running this cell moves the working directory up one more level
import numpy as np
from dataclasses import dataclass
import torch
from torch import nn
from torch.nn import functional as F
# from transformers import GPT2LMHeadModel
import matplotlib.pyplot as plt 

# from tqdm import tqdm, trange
from tqdm.notebook import tqdm

# NOTE(review): star imports hide where names come from; presumably they provide
# init_graph, get_device and cprint used in this notebook - confirm and import by name
from utils import *; from boring_utils.utils import *
from data_structure import add_to_class

# Block is the per-layer module stacked by the GPT classes defined in this notebook
from hf_gpt import (
    Block
)

init_graph()
# device used by the model and batches in the training cells
device = get_device()

# Load Tiny Dataset

In [2]:
# tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# The path is relative to the current working directory.
# Decode explicitly as UTF-8 so the text (and the token ids derived from it)
# does not depend on the platform's default locale encoding.
with open('./data/shakespeare_char/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
data = text[:1000]  # first 1,000 characters
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
import tiktoken
# tokenizer for the 'gpt2' encoding; `enc` is reused by later cells
enc = tiktoken.get_encoding('gpt2')
# `data` is the 1,000-character prefix sliced in the dataset cell
tokens = enc.encode(data)
print(tokens[:24])  # preview the first 24 token ids

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13]


In [4]:
import torch

# B sequences of T tokens each
B, T = 4, 6
# one extra token so that the last input position also has a next-token label
buff = torch.tensor(tokens[:B * T + 1])

# Inputs x and labels y: y is x shifted by one token (next-token prediction)
x = buff[:-1].view(B, T)  # torch.tensor(tokens[:24]).view(4, 6)
y = buff[1:].view(B, T)
print(x)
print(y)

tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]])
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]])


In [5]:
# NOTE(review): GPT and GPTConfig are not defined or imported by name in any cell
# above this one; presumably they come from the star imports - verify that this
# cell runs on a fresh kernel.
# model = GPT.from_pretrained('gpt2')
model = GPT(GPTConfig())
model.to(device)
# passing targets makes the model return (logits, loss)
logits, loss = model(x.to(device), y.to(device))
print(loss)  # with random init the loss should be around -ln(1/50257) = 10.82

tensor(10.8915, device='cuda:0', grad_fn=<NllLossBackward0>)


# Optimizer

In [11]:
# How to reset the parameters of the model: every submodule that exposes
# `reset_parameters` is printed and then re-initialised in place.
for layer in model.modules():
    if not hasattr(layer, 'reset_parameters'):
        continue  # containers and parameter-free modules have nothing to reset
    print(layer)
    layer.reset_parameters()

Embedding(50257, 768)
Embedding(1024, 768)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
Linear(in_features=768, out_features=3072, bias=True)
Linear(in_features=3072, out_features=768, bias=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
Linear(in_features=768, out_features=3072, bias=True)
Linear(in_features=3072, out_features=768, bias=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
Linear(in_features=768, out_features=3072, bias=True)
Linear(in_features=3072, out_features=768, bias=True)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
LayerNorm((768,), eps

In [None]:
def reset_model_weights(model):
    """Re-initialise, in place, every submodule of `model` that can reset itself.

    Walks `model.modules()` and calls `reset_parameters()` on each module that
    defines it. The re-initialisation is whatever each layer's own
    `reset_parameters` does, not any custom init applied by the model.

    Args:
        model: an `nn.Module` whose submodules should be re-initialised.

    Returns:
        None.
    """
    resettable = (m for m in model.modules() if hasattr(m, 'reset_parameters'))
    for submodule in resettable:
        submodule.reset_parameters()

# Single Batch Case

In [7]:
# Overfit one fixed batch: start from freshly reset weights and a new optimizer.
reset_model_weights(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# The batch never changes in this cell, so move it to the device once instead
# of repeating the copy on every one of the 50 steps.
x_dev, y_dev = x.to(device), y.to(device)

pbar = tqdm(range(50), desc="Training")
for i in pbar:
    optimizer.zero_grad()
    logits, loss = model(x_dev, y_dev)
    loss.backward()
    optimizer.step()
    # tqdm.write(f"Step {i}, Loss: {loss.item():.4f}")
    pbar.set_description(f"Step {i}, Loss: {loss.item():.4f}")

Training:   0%|          | 0/50 [00:00<?, ?it/s]

# Data Loader

In [8]:
def load_tokens(filename):
    """Load a saved NumPy token array and return it as a `torch.long` tensor.

    Args:
        filename: path passed to `np.load`.

    Returns:
        A tensor of dtype `torch.long` holding the array's values after an
        intermediate cast to int32.
    """
    # int32 cast was added after the video; the tensor itself is created as long
    token_array = np.load(filename).astype(np.int32)
    return torch.tensor(token_array, dtype=torch.long)


class DataLoaderTiny:
    '''A lite version without process_rank and num_processes

    Tokenizes a text file once with the tiktoken 'gpt2' encoding and serves
    consecutive (B, T) batches of inputs and next-token targets, wrapping back
    to the start when the following batch would run past the end of the data.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        input_path: text file to tokenize (defaults to the tiny shakespeare file).

    Raises:
        ValueError: if B or T is not positive, or if the file holds fewer than
            B * T + 1 tokens (not enough for a single batch plus its targets).
    '''
    def __init__(self, B, T, input_path='./data/shakespeare_char/input.txt'):
        if B <= 0 or T <= 0:
            raise ValueError(f"B and T must be positive, got B={B}, T={T}")
        self.B = B
        self.T = T
        self.num_processes = 1

        # explicit encoding: the decoded text must not depend on the locale
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        self.tokens = torch.tensor(enc.encode(text))
        batch_size = B * T
        cprint(len(self.tokens))

        print("calc: 1 epoch = ? batches")
        cprint(len(self.tokens) // batch_size)

        # Fail early with a clear message; otherwise the first next_batch()
        # call fails inside .view() with an opaque shape error.
        if len(self.tokens) < batch_size + 1:
            raise ValueError(
                f"need at least B * T + 1 = {batch_size + 1} tokens, "
                f"but {input_path!r} only has {len(self.tokens)}"
            )

        # state
        self.current_position = 0

    def next_batch(self):
        '''Return the next (x, y) pair, each of shape (B, T).

        y holds the same tokens as x shifted by one position, so y[b, t] is the
        token that follows x[b, t] in the data.
        '''
        B, T = self.B, self.T
        # B*T inputs plus one extra token for the last target
        buff = self.tokens[self.current_position: self.current_position + B*T + 1]
        x = (buff[:-1]).view(B, T) # inputs
        y = (buff[1:]).view(B, T)  # targets

        # advance the position in the tensor
        self.current_position += B * T * self.num_processes

        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
            self.current_position = 0
        return x, y


In [9]:
# batches of 4 sequences x 32 tokens; the constructor prints the token count
# and the number of batches per epoch
train_loader = DataLoaderTiny(B=4, T=32)

[93mDataLoaderLite -> len(self.tokens):[0m
338025
calc: 1 epoch = ? batches
[93mDataLoaderLite -> len(self.tokens) // batch_size:[0m
2640


In [10]:
# ok now the loss is lower than 10.82
reset_model_weights(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

pbar = tqdm(range(50), desc="Training")
for i in pbar:
    optimizer.zero_grad()
    x, y = train_loader.next_batch()
    logits, loss = model(x.to(device), y.to(device))
    loss.backward()
    optimizer.step()
    # tqdm.write(f"Step {i}, Loss: {loss.item():.4f}")
    pbar.set_description(f"Step {i}, Loss: {loss.item():.4f}")

Training:   0%|          | 0/50 [00:00<?, ?it/s]

# GPT Improved: Fix Param Sharing
https://www.youtube.com/watch?v=l8pRSuU81PU&t=3974s parameter sharing wte and lm_head

Check ./gpt-2/src/model.py line 154

In hf_gpt.py, simply adding `self.model.transformer.wte.weight = self.lm_head.weight`
leads to a better result and also removes a large number of parameters (for gpt2, 768 x 50257 ≈ 38.6M)!

In [12]:
from transformers import GPT2LMHeadModel
model_type = 'gpt2'

# load the pretrained Hugging Face model and keep its state dict for inspection
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
sd_hf = model_hf.state_dict()

# parameter names of the HF checkpoint (nothing is copied in this cell)
sd_keys_hf = sd_hf.keys()

In [13]:
# the output head and the token embedding have the same shape
cprint(sd_hf["lm_head.weight"].shape == sd_hf["transformer.wte.weight"].shape)

[93m<module> -> sd_hf["lm_head.weight"].shape == sd_hf["transformer.wte.weight"].shape:[0m
True


In [15]:
# element-wise comparison: every value of the two tensors should be identical
cprint((sd_hf["lm_head.weight"] == sd_hf["transformer.wte.weight"]).all())

[93m<module> -> (sd_hf["lm_head.weight"] == sd_hf["transformer.wte.weight"]).all():[0m
tensor(True)


In [16]:
# they are even the same tensor in memory: both entries report the same data pointer
cprint(sd_hf["lm_head.weight"].data_ptr() == sd_hf["transformer.wte.weight"].data_ptr())

[93m<module> -> sd_hf["lm_head.weight"].data_ptr() == sd_hf["transformer.wte.weight"].data_ptr():[0m
True


# GPT Improved: Weight Init
https://www.youtube.com/watch?v=l8pRSuU81PU&t=4427s

Check ./gpt-2/src/model.py line 152

```python
wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                     initializer=tf.random_normal_initializer(stddev=0.01))
wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                     initializer=tf.random_normal_initializer(stddev=0.02))
```

In [17]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # https://www.youtube.com/watch?v=l8pRSuU81PU&t=3974s parameter sharing wte and lm_head
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        '''
        1/sqrt(768) = 0.036 and 1/sqrt(1600) = 0.025
        so the value in gpt2 paper 0.02 is reasonable
        '''
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def forward(self, idx, targets=None):
        # idx shape: (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"input length {T} is longer than block size {self.config.block_size}"
        # pos = torch.arange(T, device=idx.device).unsqueeze(0).expand(B, T)
        pos = torch.arange(0, T, device=idx.device)  # shape: T
        pos_emb = self.transformer.wpe(pos)  # shape: (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # shape: (B, T, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)
        
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # shape: (B, T, Vocab Size)

        if targets is None:
            return logits
        else:
            # logits.view(-1, logits.size(-1)): 
            # flatten: (B, T, Vocab Size) -> (B * T, Vocab Size) 
            loss = F.cross_entropy(
                        logits.view(-1, logits.size(-1)), 
                        targets.view(-1)
                    )
            return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        '''https://youtu.be/l8pRSuU81PU?t=1830

        I insist using pytorch's MHA instead of HF. So I need a key_mapping dict.
        '''
        pass

From GPT2 paper:

A modified initialization which accounts for the accumulation on the residual path with model depth is used. We scale the weights of residual layers **at initialization** by a factor of 1/√N where N is the number of residual layers.

In Block class:

```python
class Block(nn.Module):
    ...

    def forward(self, x):
        x = x + self.attn(...)  # residual connection 1
        x = x + self.mlp(...)   # residual connection 2
        return x
```

In [21]:
# standard deviation grows inside the residual stream

std_list = []

for _ in range(5):
    x = torch.zeros(768)
    n = 100  # e.g. 100 layers

    for i in range(n):
        x += (1 / n ** 0.5) * torch.randn(768)

    std_list.append(x.std())


std_list_2 = []

for _ in range(5):
    x = torch.zeros(768)
    n = 100  # e.g. 100 layers

    for i in range(n):
        x += torch.randn(768)

    std_list_2.append(x.std())

In [22]:
# std_list: contributions scaled by 1/sqrt(n); std_list_2: unscaled contributions
cprint(std_list)
cprint(std_list_2)

[93m<module> -> std_list:[0m
[tensor(1.0170), tensor(0.9991), tensor(1.0267), tensor(1.0165), tensor(0.9875)]
[93m<module> -> std_list_2:[0m
[tensor(10.0053),
 tensor(10.0819),
 tensor(10.0419),
 tensor(9.9803),
 tensor(9.8867)]


In [None]:
class GPT(nn.Module):
    def __init__(self, config, scale_init=True):
        super().__init__()
        self.config = config
        self.scale_init = scale_init

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # https://www.youtube.com/watch?v=l8pRSuU81PU&t=3974s parameter sharing wte and lm_head
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        '''
        1/sqrt(768) = 0.036 and 1/sqrt(1600) = 0.025
        so the value in gpt2 paper 0.02 is reasonable
        '''
        if isinstance(module, nn.Linear):
            std = 0.02
            if self.scale_init:
                # '2 *' is because the two residual connections in the Block:
                # attn and mlp
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def forward(self, idx, targets=None):
        # idx shape: (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"input length {T} is longer than block size {self.config.block_size}"
        # pos = torch.arange(T, device=idx.device).unsqueeze(0).expand(B, T)
        pos = torch.arange(0, T, device=idx.device)  # shape: T
        pos_emb = self.transformer.wpe(pos)  # shape: (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # shape: (B, T, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)
        
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # shape: (B, T, Vocab Size)

        if targets is None:
            return logits
        else:
            # logits.view(-1, logits.size(-1)): 
            # flatten: (B, T, Vocab Size) -> (B * T, Vocab Size) 
            loss = F.cross_entropy(
                        logits.view(-1, logits.size(-1)), 
                        targets.view(-1)
                    )
            return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        '''https://youtu.be/l8pRSuU81PU?t=1830

        I insist using pytorch's MHA instead of HF. So I need a key_mapping dict.
        '''
        pass