In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import math
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Transformer architecture

<p align="center">
<img src="https://lilianweng.github.io/lil-log/assets/images/transformer.png" width="1000px" alt="Zoom in to the Transformer"/>
</p>

## ATTENTION

In [3]:
# default GPT2 values?
hparams = {
    'n_layer':12, # num of blocks?
    'n_head': 16, #
    'n_embd':768, # 16 heads of size 64
    'block_size':1024, # sequences of 1024 tokens
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster. warning! nn.LayerNorm doesn't support bias=False
    'vocab_size':50257,
}

In [36]:
class GPT2Attention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention
    """

    def __init__(self):
        super().__init__()

        self.n_embd = hparams['n_embd'] # embeding dimensionality, includes all heads
        self.n_head = hparams['n_head'] #  num heads
        self.block_size = hparams['block_size']
        assert self.n_embd % self.n_head == 0

        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=hparams['bias'])

        # output projection
        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=hparams['bias'])
        # regularization
        self.attn_dropout = nn.Dropout(hparams['dropout'])
        self.resid_dropout = nn.Dropout(hparams['dropout'])
        # causal mask to ensure that attention is only applied to the left in the input sequence
        # every token only comunicates with the previous ones
        self.register_buffer("tril", torch.tril(torch.ones(self.block_size, self.block_size))
                                     .view(1, 1, self.block_size, self.block_size))

        self._reset_parameters() # uncomment if we need to initialize as the original transformer

    def _reset_parameters(self):
        # Original Transformer initialization
        nn.init.xavier_uniform_(self.c_attn.weight)
        nn.init.xavier_uniform_(self.c_proj.weight)
        self.c_attn.bias.data.fill_(0)
        self.c_proj.bias.data.fill_(0)


    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = self.n_head, C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k ,v  = self.c_attn(x).split(self.n_embd, dim=2)

        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, seqLen, numHeads, headSize) -> (B, numHeads, seqLen, headSize)

        # causal self-attention; Self-attend: (B, numHeads, seqLen, headSize) x (B, numHeads, headSize, seqLen) -> (B, numHeads, seqLen, seqLen)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.tril[:,:,:T,:T] == 0, float('-inf')) # aplying the softmax -inf become 0
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.c_proj(y)
        y = self.resid_dropout(y)
        return y

In [None]:
# attention tester
hparams = {
    'batch_size':10,
    'n_head': 2,
    'n_embd':6, # 2 heads of size 3
    'block_size':3, # seq_len 3 words
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
}

x = torch.randn(hparams['batch_size'], hparams['block_size'], hparams['n_embd']) # batch size, sequence length, embedding dimensionality

mha = GPT2Attention()
optimizer = optim.Adam(mha.parameters())

losses_mha = []
n_epochs = 10000
for i in range(n_epochs):
    optimizer.zero_grad()
    output = mha(x)
    loss = F.mse_loss(output, x) # Reconstruct input
    loss.backward()
    optimizer.step()
    losses_mha.append(loss.item())
    if (i + 1) % 1000 == 0:
        print(f"Loss ({i+1}/{n_epochs}): {loss.item()}")

print(f"\nOutput:\n{output[0,:3,:]}\n")
print(f"Query:\n{x[0,:3,:]}\n")

##MLP

In [37]:
class GPT2MLP(nn.Module):

    def __init__(self):
        super().__init__()

        self.n_embd = hparams['n_embd']

        self.c_fc    = nn.Linear(self.n_embd, 4 * self.n_embd, bias=hparams['bias']) # expand to dim*4
        self.c_proj  = nn.Linear(4 * self.n_embd, self.n_embd, bias=hparams['bias'])
        self.act    = nn.GELU()
        self.dropout = nn.Dropout(hparams['dropout'])

    def forward(self, x):
        x = self.c_fc(x)
        x = self.act(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

In [None]:
# MLP tester
hparams = {
    'batch_size':10,
    'n_head': 2,
    'n_embd':6, # 2 heads of size 3
    'block_size':3, # seq_len 3 words
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
}

x = torch.randn(hparams['batch_size'], hparams['block_size'], hparams['n_embd']) # batch size, sequence length, embedding dimensionality

mlp = GPT2MLP()
optimizer = optim.Adam(mlp.parameters())

losses_mha = []
n_epochs = 10000
for i in range(n_epochs):
    optimizer.zero_grad()
    output = mlp(x)
    loss = F.mse_loss(output, x) # Reconstruct input
    loss.backward()
    optimizer.step()
    losses_mha.append(loss.item())
    if (i + 1) % 1000 == 0:
        print(f"Loss ({i+1}/{n_epochs}): {loss.item()}")

print(" ")
print("input_shape ", x.shape)
print("output_shape ", output.shape)

print(f"\nOutput:\n{output[0,:3,:]}\n")
print(f"Query:\n{x[0,:3,:]}\n")

##BLOCK


<p align="center">
<img src="https://www.researchgate.net/publication/365625866/figure/fig2/AS:11431281098698218@1669051398448/Structure-of-the-applied-GPT-2-medium-architecture_W640.jpg" width="280px" alt="Zoom in to the Transformer"/>
</p>


In [38]:
class GPT2Block(nn.Module):

    def __init__(self):
        super().__init__()

        self.n_embd = hparams['n_embd']

        self.ln_1 = nn.LayerNorm(self.n_embd, bias=hparams['bias'])
        self.attn = GPT2Attention()
        self.ln_2 = nn.LayerNorm(self.n_embd, bias=hparams['bias'])
        self.mlp = GPT2MLP()

    def forward(self, x):
        x = x + self.attn(self.ln_1(x)) # LayerNorm -> attention -> Add ???? no hauria de layerNOrm després de l'attention?
        x = x + self.mlp(self.ln_2(x)) # like x = self.ln_2(x + self.mlp(x))
        return x

In [33]:
# Block tester
hparams = {
    'batch_size':10,
    'n_head': 2,
    'n_embd':6, # 2 heads of size 3
    'block_size':3, # seq_len 3 words
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
}

x = torch.randn(hparams['batch_size'], hparams['block_size'], hparams['n_embd']) # batch size, sequence length, embedding dimensionality

blk = GPT2Block()
optimizer = optim.Adam(blk.parameters())

losses_mha = []
n_epochs = 10000
for i in range(n_epochs):
    optimizer.zero_grad()
    output = blk(x)
    loss = F.mse_loss(output, x) # Reconstruct input
    loss.backward()
    optimizer.step()
    losses_mha.append(loss.item())
    if (i + 1) % 1000 == 0:
        print(f"Loss ({i+1}/{n_epochs}): {loss.item()}")

print(" ")
print("input_shape ", x.shape)
print("output_shape ", output.shape)

print(f"\nOutput:\n{output[0,:3,:]}\n")
print(f"Query:\n{x[0,:3,:]}\n")

Loss (1000/10000): 0.0001341675379080698
Loss (2000/10000): 1.9141029042657465e-05
Loss (3000/10000): 5.915315341553651e-06
Loss (4000/10000): 1.6615333606750937e-06
Loss (5000/10000): 8.235734867412248e-07
Loss (6000/10000): 4.255325336544047e-07
Loss (7000/10000): 2.427399863336177e-07
Loss (8000/10000): 1.7640056171330798e-07
Loss (9000/10000): 1.1159019663864456e-07
Loss (10000/10000): 2.3057913267621188e-07
 
input_shape  torch.Size([10, 3, 6])
output_shape  torch.Size([10, 3, 6])

Output:
tensor([[-1.3583, -0.7305,  1.3442,  1.2064,  2.0346, -0.4798],
        [-1.5790, -0.6084, -0.0171,  0.9759, -0.4144, -0.3798],
        [ 0.8599, -0.1505,  0.3228,  0.4175, -0.7545,  1.2290]],
       grad_fn=<SliceBackward0>)

Query:
tensor([[-1.3579, -0.7306,  1.3436,  1.2054,  2.0343, -0.4795],
        [-1.5795, -0.6085, -0.0176,  0.9756, -0.4143, -0.3797],
        [ 0.8600, -0.1500,  0.3233,  0.4166, -0.7548,  1.2287]])



##TRANSFORMER style GTP2

In [39]:
class GPT(nn.Module):
    """ GPT Language Model """

    def __init__(self):
        super().__init__()

        self.block_size = hparams['block_size']
        self.vocab_size = hparams['vocab_size']
        self.n_embd = hparams['n_embd']

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(self.vocab_size, self.n_embd), # n_embd includes all heads
            wpe = nn.Embedding(self.block_size, self.n_embd),
            drop = nn.Dropout(hparams['dropout']),
            h = nn.ModuleList([GPT2Block() for _ in range(hparams['n_layer'])]),
            ln_f = nn.LayerNorm(self.n_embd),
        ))
        # out linear
        self.lm_head = nn.Linear(self.n_embd, self.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * hparams['n_layer']))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size() # (B, seq_len)
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        # position
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t=seq_len)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t=seq_len, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t=seq_len, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss


    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [None]:
# GPT tester
hparams = {
    'n_layer':12, # num of blocks?
    'batch_size':10,
    'n_head': 2,
    'n_embd':6, # 2 heads of size 3
    'block_size':3, # seq_len 3 words
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    'vocab_size':50257,
}

x = torch.randint(10, (hparams['batch_size'], hparams['block_size'])) # batch size, sequence length

model = GPT()
optimizer = optim.Adam(model.parameters())

losses_mha = []
n_epochs = 10000
for i in range(n_epochs):
    optimizer.zero_grad()
    output, loss = model(x, x) # calculate loss internally

#    loss = F.mse_loss(output, x) # Reconstruct input
    loss.backward()
    optimizer.step()
    losses_mha.append(loss.item())
    if (i + 1) % 1000 == 0:
        print(f"Loss ({i+1}/{n_epochs}): {loss.item()}")

print(" ")
print("input_shape ", x.shape)
print("output_shape ", output.shape)

print(f"\nOutput:\n{output[0,:3,:]}\n")
print(f"Query:\n{x[0,:3]}\n")

## compare with HF GPT2

In [8]:
!pip install transformers



In [40]:
#Initialize a pretrained GPT model by copying over the weights
#from a huggingface/transformers checkpoint.
from transformers import GPT2LMHeadModel

# init a huggingface/transformers model
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
print("HF GPT************** \n", model_hf)

hparams = {
    'n_layer':12, # num of blocks?
    'batch_size':1,
    'n_head': 12,
    'n_embd':768, # 12 heads of size 64
    'block_size':1024, # seq_len 3 words
    'dropout':0.1, # dropout value
    'bias': True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    'vocab_size':50257,
}

# init our transformer model
model = GPT()
print("our GPT************** \n", model)

HF GPT************** 
 GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
number of parameters: 124.44M
our GPT************** 
 GPT(
  (transformer): ModuleDict(
 

In [41]:

sd_hf = model_hf.state_dict()
sd = model.state_dict()

sd_keys = sd.keys()
sd_keys = [k for k in sd_keys if not k.endswith('.attn.tril')] # discard this mask / buffer, not a param



# copy while ensuring all of the parameters are aligned and match in names and shapes
sd_keys_hf = sd_hf.keys()
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
# this means that we have to transpose these weights when we import them
assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
for k in sd_keys_hf:
    if any(k.endswith(w) for w in transposed):
        # special treatment for the Conv1D weights we need to transpose
        assert sd_hf[k].shape[::-1] == sd[k].shape
        with torch.no_grad():
            sd[k].copy_(sd_hf[k].t())
    else:
        # vanilla copy over the other parameters
        assert sd_hf[k].shape == sd[k].shape
        with torch.no_grad():
            sd[k].copy_(sd_hf[k])



In [42]:
# compare the number of parameters
# (note we don't count the decoder parameters in lm_head)
print("Our-params:", sum(p.numel() for p in model.parameters()))
print("GPT-params:", sum(p.numel() for p in model_hf.parameters()))

Our-params: 163037184
GPT-params: 124439808


In [44]:
# checking for same outputs
_ = model.eval()
_ = model_hf.eval()

idx = torch.tensor([[1,123,52,28]], dtype=torch.long)

#logits_hf, _ = model_hf(idx)
logits_hf = model_hf(idx).logits
logits_our, _ = model(idx)

print("hf__logits ", logits_hf)
print("our_logits ", logits_our)
print("")
print("hf__logits_shape ", logits_hf.shape)
print("our_logits_shape ", logits_our.shape)

hf__logits  tensor([[[-32.9012, -31.2025, -34.6623,  ..., -39.4868, -39.8732, -32.2388],
         [-74.0952, -71.2668, -74.4134,  ..., -84.2939, -82.6411, -75.3960],
         [-75.3409, -74.9739, -77.7477,  ..., -86.1547, -83.4554, -78.8507],
         [-78.8787, -80.1651, -80.8338,  ..., -89.4950, -89.4559, -83.1077]]],
       grad_fn=<UnsafeViewBackward0>)
our_logits  tensor([[[-32.8416, -31.1477, -34.6075,  ..., -39.4235, -39.8091, -32.1826],
         [-74.0748, -71.2478, -74.3932,  ..., -84.2734, -82.6181, -75.3790],
         [-75.3062, -74.9425, -77.7129,  ..., -86.1231, -83.4241, -78.8159],
         [-78.8222, -80.1078, -80.7757,  ..., -89.4309, -89.3999, -83.0506]]],
       grad_fn=<UnsafeViewBackward0>)

hf___logits_shape  torch.Size([1, 4, 50257])
our__logits_shape  torch.Size([1, 4, 50257])
