In [7]:
import torch
inputs = torch.tensor([
    [0.43, 0.15, 0.89], # Your
    [0.55, 0.87, 0.66], # journey
    [0.57, 0.85, 0.64], # starts
    [0.22, 0.58, 0.33], # with
    [0.77, 0.25, 0.10], # one
    [0.05, 0.80, 0.55] # step
])

<div class="alert alert-block alert-info">
    
#A The second input element

#B The input embedding size, d=3


#C The output embedding size, d_out=2

</div>

In [11]:
x_2 = inputs[1]
d_in = inputs.shape[1]
d_out = 2
print(x_2)

tensor([0.5500, 0.8700, 0.6600])


<div class="alert alert-block alert-info">
    
Note that in GPT-like models, the input and output dimensions are usually the same. 

But for illustration purposes, to better follow the computation, we choose different input (d_in=3)
and output (d_out=2) dimensions here.

</div>

<div class="alert alert-block alert-success">

Next, we initialize the three weight matrices Wq, Wk and Wv

</div>

In [13]:
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
# print(W_query.shape)

In [18]:
print("Query matrics: ", W_query)
print("---"*15)
print("Key matrics: ", W_key)
print("---"*15)
print("Value matrics: ", W_value)

Query matrics:  Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
---------------------------------------------
Key matrics:  Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
---------------------------------------------
Value matrics:  Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


<div class="alert alert-block alert-info">
    
Note that we are setting requires_grad=False to reduce clutter in the outputs for
illustration purposes. 

If we were to use the weight matrices for model training, we
would set requires_grad=True to update these matrices during model training.

</div>

<div class="alert alert-block alert-success">

Next, we compute the query, key, and value vectors as shown earlier
</div>

In [26]:
query_2 = x_2 @ W_query
print("Input 2 Query value: ", query_2)
print("---"*15)

key_2 = x_2 @ W_key
print("Input 2 key value: ", key_2)
print("---"*15)

value_2 = x_2 @ W_value
print("Input 2  value vector: ", value_2)
print("---"*15)

Input 2 Query value:  tensor([0.4306, 1.4551])
---------------------------------------------
Input 2 key value:  tensor([0.4433, 1.1419])
---------------------------------------------
Input 2  value vector:  tensor([0.3951, 1.0037])
---------------------------------------------


<div class="alert alert-block alert-info">
    
As we can see based on the output for the query, this results in a 2-dimensional vector. 

This is because: we set the number of columns of the corresponding weight matrix, via d_out, to 2:

</div>

<div class="alert alert-block alert-success">

Even though our temporary goal is to only compute the one context vector z(2),  we still
require the key and value vectors for all input elements. 

This is because they are involved in computing the attention weights with respect to the query q(2)
</div>

<div class="alert alert-block alert-success">

We can obtain all keys and values via matrix multiplication:
</div>

In [29]:
keys = inputs @ W_key
queries = inputs @ W_query
values = inputs @ W_value

print("keys.shape:", keys.shape)
print("queries.shape:", queries.shape)
print("values.shape:", values.shape)

keys.shape: torch.Size([6, 2])
queries.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])


In [30]:
print(keys)

tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])


In [31]:
values

tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])

In [32]:
queries

tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])

In [36]:
query_2 = queries[1]
attention_score_22 = query_2 @ keys.T
print(attention_score_22)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [39]:
final_attention_scores = queries @ keys.T
print(final_attention_scores)

tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


In [48]:
d_k = keys.shape[-1]
attention_weights_2 = torch.softmax(attention_score_22 / d_k**0.5, dim=-1)
print(attention_weights_2)
print(d_k)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
2


In [None]:
d_k = inputs.shape[-1]
scaled_scores = final_attention_scores / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
print(scaled_scores)

tensor([[0.5330, 0.7820, 0.7645, 0.4567, 0.2328, 0.6541],
        [0.7335, 1.0695, 1.0456, 0.6233, 0.3220, 0.8914],
        [0.7242, 1.0556, 1.0321, 0.6151, 0.3180, 0.8797],
        [0.4026, 0.5870, 0.5739, 0.3421, 0.1767, 0.4893],
        [0.3530, 0.5092, 0.4980, 0.2957, 0.1563, 0.4219],
        [0.5193, 0.7601, 0.7431, 0.4435, 0.2273, 0.6349]])


In [44]:
attention_weight = torch.softmax(scaled_scores, dim=-1)

In [45]:
attention_weight

tensor([[0.1577, 0.2024, 0.1988, 0.1462, 0.1168, 0.1781],
        [0.1540, 0.2154, 0.2104, 0.1379, 0.1020, 0.1803],
        [0.1542, 0.2148, 0.2098, 0.1383, 0.1027, 0.1802],
        [0.1608, 0.1934, 0.1908, 0.1514, 0.1283, 0.1754],
        [0.1623, 0.1897, 0.1876, 0.1532, 0.1333, 0.1739],
        [0.1582, 0.2013, 0.1979, 0.1467, 0.1182, 0.1776]])

In [46]:
attention_weight[1].sum()

tensor(1.)

In [49]:
d_k= inputs.shape[-1]
attention_weight = torch.softmax(final_attention_scores/d_k**0.5, dim=-1)
print(attention_weight)

tensor([[0.1577, 0.2024, 0.1988, 0.1462, 0.1168, 0.1781],
        [0.1540, 0.2154, 0.2104, 0.1379, 0.1020, 0.1803],
        [0.1542, 0.2148, 0.2098, 0.1383, 0.1027, 0.1802],
        [0.1608, 0.1934, 0.1908, 0.1514, 0.1283, 0.1754],
        [0.1623, 0.1897, 0.1876, 0.1532, 0.1333, 0.1739],
        [0.1582, 0.2013, 0.1979, 0.1467, 0.1182, 0.1776]])


## IMPLEMENTING A COMPACT SELF ATTENTION PYTHON CLASS

<div class="alert alert-block alert-success">
    
In the previous sections, we have gone through a lot of steps to compute the self-attention
outputs. 

This was mainly done for illustration purposes so we could go through one step at
a time. 

In practice, with the LLM implementation in the next chapter in mind, it is helpful to
organize this code into a Python class as follows:
    
</div>

In [50]:
import torch.nn as nn 
class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))


    def forward(self, x):
        query = x @ self.W_query
        key = x @ self.W_key
        value = x @ self.W_value

        attention_scores = query @ key.T
        attention_weights = torch.softmax(
            attention_scores / key.shape[-1]**0.5 , dim=-1
        )

        context_vector = attention_weights @ value
        return context_vector

<div class="alert alert-block alert-warning">

In this PyTorch code, SelfAttention_v1 is a class derived from nn.Module, which is a
fundamental building block of PyTorch models, which provides necessary functionalities for
model layer creation and management.    
</div>

<div class="alert alert-block alert-warning">

The __init__ method initializes trainable weight matrices (W_query, W_key, and
W_value) for queries, keys, and values, each transforming the input dimension d_in to an
output dimension d_out.

</div>

<div class="alert alert-block alert-warning">

During the forward pass, using the forward method, we compute the attention scores
(attn_scores) by multiplying queries and keys, normalizing these scores using softmax.

</div>

<div class="alert alert-block alert-success">
    
Finally, we create a context vector by weighting the values with these normalized attention
scores.
    
</div>

In [51]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


<div class="alert alert-block alert-info">

Since inputs contains six embedding vectors, we get a matrix storing the six
context vectors, as shown in the above result. 
</div>

<div class="alert alert-block alert-info">

As a quick check, notice how the second row ([0.3061, 0.8210]) matches the contents of
context_vec_2 in the previous section.
    
</div>

<div class="alert alert-block alert-warning">

We can improve the SelfAttention_v1 implementation further by utilizing PyTorch's
nn.Linear layers, which effectively perform matrix multiplication when the bias units are
disabled. 

</div>

<div class="alert alert-block alert-warning">

Additionally, a significant advantage of using nn.Linear instead of manually
implementing nn.Parameter(torch.rand(...)) is that nn.Linear has an optimized weight
initialization scheme, contributing to more stable and effective model training.

</div>

In [55]:
class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

<div class="alert alert-block alert-success">

You can use the SelfAttention_v2 similar to SelfAttention_v1:
    
</div>

In [56]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


<div class="alert alert-block alert-info">

Note that SelfAttention_v1 and SelfAttention_v2 give different outputs because they
use different initial weights for the weight matrices since nn.Linear uses a more
sophisticated weight initialization scheme.
    
</div>