In [2]:
# 1. Self-attention by hand
# 2. Self-attention block in pytorch
# 3. GPT, piece-by-piece
# 4. GPU goes rrrr!

# Original code from https://github.com/karpathy/minGPT/tree/master/mingpt

### Step 1: Self-attention by hand

In [3]:
import torch
import math
import torch.nn.functional as F

In [4]:
#  -- Write the scaled dot product self attention
  # 1. Compute queries, keys, and values
  # 2. Compute dot products
  # 3. Scale the dot products
  # 4. Apply softmax to calculate attentions
  # 5. Weight values by attentions
  # 6. Compute attention weighted features

In [5]:
# Choose random values for the parameters -- sames values as on slide 12, but in pytorch format
# T = 4, C = 6, H = 3
X = torch.tensor([[2,0,0,0,2,1],[0,1,2,0,0,0],[0,0,1,1,0,1],[2,0,0,1,0,1]], dtype=float) # T x C
W_QT = torch.tensor([[1,0,0], [1,1,0], [0,0,1], [0,1,0], [0,0,1], [0,0,1]], dtype=float) # C x H
W_KT = torch.tensor([[0,0,1], [0,1,0], [1,0,0], [0,0,0], [0,0,0], [0,0,-1]], dtype=float) # C x H
W_VT = torch.tensor([[10,0,0], [0,0,10], [0,0,0], [0,10,0], [0,0,0], [0,0,0]], dtype=float) # C x H

In [6]:
Q = X @ W_QT
K = X @ W_KT
V = X @ W_VT

In [7]:
# What does the second dimension of matrices Q and K correspond to?

It correspond to the head size 

In [8]:
V.shape

torch.Size([4, 3])

In [9]:
# compute the weighted attention matrix S
S = Q @ K.T
d_k = Q.shape[1]
S /= math.sqrt(d_k)


In [10]:
# compute the self-attention matrix A
A = F.softmax(S, dim=-1) @ V

In [11]:
# Sanity check. This should return True.
torch.allclose(A.float(), torch.tensor([[10.30759701,  2.83283874,  4.59026201],
        [10.10551833,  2.97334971,  4.50027071],
        [15.03361159,  4.13169018,  2.10990693],
        [ 3.06082018,  1.53041009,  7.70438486]]))

True

### Step 2: Self-attention block in pytorch

In [12]:
import torch
import torch.nn as nn
from torch.functional import F

In [13]:
# do not modify this code

batch_size = 3 # B
block_size = 2 # T
n_embd = 3     # C

In [14]:
torch.set_printoptions(precision=8)

In [16]:
# Build a scaled self-attention head without masked attention and without dropout (i.e. just key, query and values)
# A matrix multiplication is implemented using the nn.Linear() operator with no bias.
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.values = nn.Linear(n_embd, head_size, bias=False)

    def forward(self, x):
        B, T, C = x.shape 

        key = self.key(x)  
        query = self.query(x)  
        values = self.values(x)  

        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(C)  # (B, T, T)
        attention_weights = F.softmax(attention_scores, dim=-1)  # (B, T, T)
        out = attention_weights @ values  # (B, T, head_size)
        
        return out

In [17]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
out

tensor([[[-0.46653441,  0.03306435],
         [-0.47224301,  0.04610375]],

        [[-0.38105935,  0.02397408],
         [-0.39453450,  0.02482043]],

        [[-0.29578221,  0.12158969],
         [-0.30042297,  0.12526262]]], grad_fn=<UnsafeViewBackward0>)

In [18]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.46728206,  0.03477207],
         [-0.47425330,  0.05069541]],
        [[-0.38198256,  0.02403205],
         [-0.39846635,  0.02506737]],
        [[-0.29631630,  0.12201238],
         [-0.30199534,  0.12650707]]]))

False

In [19]:
out - torch.tensor([[[-0.46728206,  0.03477207],
         [-0.47425330,  0.05069541]],
        [[-0.38198256,  0.02403205],
         [-0.39846635,  0.02506737]],
        [[-0.29631630,  0.12201238],
         [-0.30199534,  0.12650707]]]) # values are closed even if the sanity check did not pass 

tensor([[[ 7.47650862e-04, -1.70771778e-03],
         [ 2.01028585e-03, -4.59166244e-03]],

        [[ 9.23216343e-04, -5.79748303e-05],
         [ 3.93185019e-03, -2.46938318e-04]],

        [[ 5.34087420e-04, -4.22686338e-04],
         [ 1.57237053e-03, -1.24445558e-03]]], grad_fn=<SubBackward0>)

In [24]:
# Add weighted masked attention and dropout. Dropout comes after the softmax and before the multiplication with the value matrix.
# Copy the Head class from the previous exercise and expand upon it.

class Head(nn.Module):
    def __init__(self, head_size, dropout=0.01):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.values = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)  
        
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) 

    def forward(self, x, mask=True):
        B, T, C = x.shape  

        key = self.key(x)  
        query = self.query(x)  
        values = self.values(x)  

        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(query.shape[-1])  # (B, T, T)

        if mask:
            attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  

        attention_weights = F.softmax(attention_scores, dim=-1)  # (B, T, T)
        attention_weights = self.dropout(attention_weights)  
        out = attention_weights @ values  # (B, T, head_size)

        return out

In [21]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
out

tensor([[[-0.38323045, -0.16764536],
         [-0.47904372,  0.05120752]],

        [[-0.14327437,  0.00903951],
         [-0.40249124,  0.02532059]],

        [[-0.17476673,  0.02467545],
         [-0.30504578,  0.12778492]]], grad_fn=<UnsafeViewBackward0>)

In [None]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.37939820, -0.16596894],
         [-0.47425330,  0.05069541]],
        [[-0.14184165,  0.00894911],
         [-0.39846635,  0.02506737]],
        [[-0.17301908,  0.02442869],
         [-0.30199534,  0.12650707]]])) 


False

In [None]:
out - torch.tensor([[[-0.37939820, -0.16596894],
         [-0.47425330,  0.05069541]],
        [[-0.14184165,  0.00894911],
         [-0.39846635,  0.02506737]],
        [[-0.17301908,  0.02442869],
         [-0.30199534,  0.12650707]]]) # values are closed even if the sanity check did not pass 

tensor([[[-3.83225083e-03, -1.67642534e-03],
         [-4.79042530e-03,  5.12104481e-04]],

        [[-1.43271685e-03,  9.04006884e-05],
         [-4.02489305e-03,  2.53221020e-04]],

        [[-1.74765289e-03,  2.46765092e-04],
         [-3.05044651e-03,  1.27784908e-03]]], grad_fn=<SubBackward0>)

In [26]:
# A multi-head attention module contains a list of heads and a linear projection layer.
# The heads are applied to the input and then concatenated along the last dimension, then
# the linear layer is applied. Look at the unit test below to determine the dimensions of
# the linear layer.

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, dropout=0.01):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size

        self.heads = nn.ModuleList([Head(head_size, dropout) for _ in range(num_heads)])

        self.proj = nn.Linear(num_heads * head_size, n_embd)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)  # (B, T, num_heads * head_size)

        out = self.proj(out)  # (B, T, n_embd)
        out = self.dropout(out)

        return out

In [27]:
# do not modify
num_heads = 3
head_size = 2
n_embd = 6

In [28]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
sa = MultiHeadAttention(num_heads=3, head_size=head_size)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = sa(x)

In [29]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.03730504, -0.07006130, -0.27096999,  0.13144857, -0.45049590,
          -0.33217290],
         [-0.06818272, -0.04490501, -0.34806073,  0.15622401, -0.45459983,
          -0.33084857]],
        [[-0.08914752, -0.03846309, -0.36569631,  0.09802882, -0.39963537,
          -0.29225215],
         [-0.04541985,  0.01269679, -0.25225419,  0.08241771, -0.41533324,
          -0.30674040]],
        [[ 0.15234883, -0.08591781, -0.10099770,  0.19886394, -0.49236685,
          -0.43605998],
         [ 0.15418015, -0.01837257, -0.00573672,  0.14228639, -0.48172480,
          -0.40757987]]]))


False

In [None]:
out - torch.tensor([[[-0.03730504, -0.07006130, -0.27096999,  0.13144857, -0.45049590,
          -0.33217290],
         [-0.06818272, -0.04490501, -0.34806073,  0.15622401, -0.45459983,
          -0.33084857]],
        [[-0.08914752, -0.03846309, -0.36569631,  0.09802882, -0.39963537,
          -0.29225215],
         [-0.04541985,  0.01269679, -0.25225419,  0.08241771, -0.41533324,
          -0.30674040]],
        [[ 0.15234883, -0.08591781, -0.10099770,  0.19886394, -0.49236685,
          -0.43605998],
         [ 0.15418015, -0.01837257, -0.00573672,  0.14228639, -0.48172480,
          -0.40757987]]]) # values are closed even if the sanity check did not pass 

tensor([[[-0.00138838,  0.00042240, -0.00295746,  0.00235054, -0.00605676,
          -0.00361356],
         [-0.00201530,  0.04490501, -0.00452268,  0.00285356, -0.00614014,
          -0.00358665]],

        [[-0.00244097,  0.00106399, -0.00488073,  0.00167199, -0.00502414,
          -0.00280303],
         [-0.00155315,  0.00210274, -0.00257748,  0.00135501, -0.00534290,
          -0.00309718]],

        [[ 0.00246237,  0.00010044,  0.00049363,  0.00371936, -0.00690696,
          -0.00572288],
         [ 0.00249955,  0.00147188,  0.00242780,  0.00257060, -0.00669086,
          -0.00514466]]], grad_fn=<SubBackward0>)

In [31]:
# Add a classical feedforward module: linear -> ReLU -> linear
# The hidden dimension is four times bigger than the input dimension (see Section 3.3 of Attention is All You Need)
#
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.fc1 = nn.Linear(n_embd, 4 * n_embd)  
        self.fc2 = nn.Linear(4 * n_embd, n_embd)  
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)  
        out = self.relu(out) 
        out = self.fc2(out)  
        return out

In [32]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
ff = FeedForward(n_embd)
torch.manual_seed(123) # do not remove this line
x = torch.rand((3,n_embd))
out = ff(x)
out

tensor([[-0.58034140,  0.04641047, -0.10707692,  0.21581650, -0.30361831,
         -0.07352637],
        [-0.48917407,  0.07879593, -0.15972012,  0.17862341, -0.37070656,
         -0.07852859],
        [-0.48530391,  0.09604470, -0.06524835,  0.16611032, -0.35499069,
         -0.08964306]], grad_fn=<AddmmBackward0>)

In [33]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[-0.58034140,  0.04641046, -0.10707694,  0.21581653, -0.30361831,
         -0.07352637],
        [-0.48917407,  0.07879593, -0.15972012,  0.17862344, -0.37070659,
         -0.07852858],
        [-0.48530388,  0.09604470, -0.06524836,  0.16611034, -0.35499069,
         -0.08964306]]))

True

In [36]:
# Build a self-attention block
#
#   in -----> LayerNorm -------> multi-head attention -- + ----> LayerNorm -----> FeedForward --- + -----> out
#         |                                              |   |                                    |
#          ----------------------------------------------     ------------------------------------                       
#
# This architecture is slightly different from Attention is All You Need (or the UDL textbook):
# the layer norm comes before (not after) the attention or feed-forward
#
class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.ln1 = nn.LayerNorm(n_embd)  
        self.attn = MultiHeadAttention(n_head, head_size)
        self.ln2 = nn.LayerNorm(n_embd)  
        self.ffn = FeedForward(n_embd)
        

    def forward(self, x):
        out = x + self.attn(self.ln1(x))
        out = out + self.ffn(self.ln2(out))
        return out

In [37]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
bk = Block(n_embd, num_heads)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size,block_size,n_embd))
out = bk(x)
out

tensor([[[-0.05804071, -0.11310092, -0.10850900,  0.98114121, -0.55709964,
           0.56820649],
         [-0.21670339, -0.27101421, -0.29645509,  1.13258827, -0.34745026,
           0.38123190]],

        [[-0.42635530, -0.29880306, -0.13436174,  0.65017426, -0.51931226,
           0.56973475],
         [-0.03636354,  0.09184688,  0.64883506,  0.70156789,  0.05891362,
           0.69799930]],

        [[ 0.53390098,  0.33858770,  0.30727547,  1.12015176,  0.37155873,
          -0.03651971],
         [ 1.39353633,  0.59874254,  0.99845648,  0.38251585,  0.61530453,
           0.47056967]]], grad_fn=<AddBackward0>)

In [39]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.05278997, -0.10863629, -0.09458938,  0.97590691, -0.55101192,
           0.57085067],
         [-0.16924502, -0.45394337, -0.25217158,  1.10904062, -0.34593600,
           0.41432184]],
        [[-0.41515028, -0.30126408, -0.11399293,  0.64651299, -0.51579159,
           0.57017863],
         [-0.02535054,  0.08704096,  0.66524690,  0.69768047,  0.05969021,
           0.69993609]],
        [[ 0.52881187,  0.34458166,  0.31130391,  1.11564195,  0.37998506,
          -0.02971917],
         [ 1.38496208,  0.60325992,  0.99346304,  0.38082033,  0.62151432,
           0.47973478]]]))

False

In [40]:
out - torch.tensor([[[-0.05278997, -0.10863629, -0.09458938,  0.97590691, -0.55101192,
           0.57085067],
         [-0.16924502, -0.45394337, -0.25217158,  1.10904062, -0.34593600,
           0.41432184]],
        [[-0.41515028, -0.30126408, -0.11399293,  0.64651299, -0.51579159,
           0.57017863],
         [-0.02535054,  0.08704096,  0.66524690,  0.69768047,  0.05969021,
           0.69993609]],
        [[ 0.52881187,  0.34458166,  0.31130391,  1.11564195,  0.37998506,
          -0.02971917],
         [ 1.38496208,  0.60325992,  0.99346304,  0.38082033,  0.62151432,
           0.47973478]]])

tensor([[[-0.00525074, -0.00446463, -0.01391962,  0.00523430, -0.00608772,
          -0.00264418],
         [-0.04745837,  0.18292916, -0.04428351,  0.02354765, -0.00151426,
          -0.03308994]],

        [[-0.01120502,  0.00246102, -0.02036881,  0.00366127, -0.00352067,
          -0.00044388],
         [-0.01101300,  0.00480592, -0.01641184,  0.00388741, -0.00077659,
          -0.00193679]],

        [[ 0.00508910, -0.00599396, -0.00402844,  0.00450981, -0.00842634,
          -0.00680054],
         [ 0.00857425, -0.00451738,  0.00499344,  0.00169551, -0.00620979,
          -0.00916511]]], grad_fn=<SubBackward0>)

In [None]:
## Step 3: Build a mini GPT
#
# - Start from the gpt-problem.py file
# - Add your Head, MultiHeadAttention, FeedForward and Block classes ok 
# - Fill in the GPT class (__init__ and forward methods)
# - Train the network on CPU
# - Train the network on GPU

# For __init__, the GPT model parameters are:
#   - a token embedding table
#   - a positional embedding table
#   - a sequence of Blocks
#   - a layer norm
#   - a linear layer
#
# For forward(), the model consists in:
#   - applying the token embedding table and positional embedding table to the input tensor
#   - adding the two together
#   - applying the blocks, layer norm and linear layer (in that order)
#
# The code comes from hyperparameters that should work well on GPU.  On CPU, you 
# will need to reduce the model size significantly.
#
# In pytorch, an learnable embedding table is implemented with nn.Embedding(...)
#
# The token embedding table learns an embedding for each item of the vocabulary. The 
# positional embedding table does not depend on the input and learns an embedding
# for each position in the context.