In [66]:
%pip install numpy requests torch tiktoken matplotlib pandas

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [67]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import tiktoken
import torch
import torch.nn as nn

In [68]:
# Hyperparameters
batch_size = 4
context_len = 16 # 每个batch的token块长度
d_model = 64 # token embeddings vector 的长度
num_layers = 8
num_heads = 4
learning_rate = 1e-3
dropout = 0.1
max_iters = 5000
eval_interval = 50
eval_iters = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'

TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

<torch._C.Generator at 0x2651b5783d0>

In [69]:
# download a sample txt file from https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt
if not os.path.exists('sales_textbook.txt'):
    url = 'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt'
    with open('sales_textbook.txt', 'w') as f:
        f.write(requests.get(url).text)

with open('sales_textbook.txt', 'r', encoding='utf-8') as f:
    text = f.read()

## STEP1: Tokenization

In [70]:
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = torch.tensor(encoding.encode(text)) # size of tokenized source text is 77,919
vocab_size = len(set(tokenized_text)) # size of vocabulary is 3,771
max_token_value = max(tokenized_text)

print(f"Tokenized text size: {len(tokenized_text)}")
print(f"Vocabulary size: {vocab_size}")
print(f"The maximum value in the tokenized text is: {max_token_value}")

Tokenized text size: 77919
Vocabulary size: 77919
The maximum value in the tokenized text is: 100069


## STEP2: Word Embedding

In [71]:
split_idx = int(len(tokenized_text) * 0.8)
train_data = tokenized_text[:split_idx]
val_data = tokenized_text[split_idx:]

data = train_data
idxs = torch.randint(low=0, high=len(data) - context_len, size=(batch_size,))
x_batch = torch.stack([data[idx:idx + context_len] for idx in idxs])
y_batch = torch.stack([data[idx + 1: idx + context_len + 1] for idx in idxs])
print(x_batch.shape, y_batch.shape)

torch.Size([4, 16]) torch.Size([4, 16])


## STEP3: Positional Encoding

In [72]:
token_embedding_lookup_table = nn.Embedding(max_token_value, d_model)

x = token_embedding_lookup_table(x_batch.data)
y = token_embedding_lookup_table(y_batch.data)

x, y

(tensor([[[ 0.3467, -0.5839,  0.1942,  ...,  1.0553,  0.8381, -0.7491],
          [-0.7265,  0.2796,  0.6138,  ...,  0.4791,  1.4515, -0.3293],
          [-0.8843, -0.4804,  1.0096,  ...,  0.6763,  1.0791,  0.2051],
          ...,
          [ 0.6703, -0.4721,  0.2348,  ...,  1.3074, -1.2579, -1.7545],
          [ 0.4058,  0.7155, -1.2484,  ..., -0.4716, -1.8187,  0.2622],
          [-0.5466,  0.5479, -0.8568,  ..., -2.0618,  0.7363,  1.5494]],
 
         [[ 1.8155, -1.2609,  0.0187,  ...,  2.1571, -1.2867,  1.6914],
          [ 0.3175,  2.1064, -0.0922,  ..., -0.3214, -0.4788, -0.1669],
          [-1.9643,  0.6743, -0.8320,  ..., -0.2654,  0.8217, -0.5584],
          ...,
          [ 0.7099,  1.3693, -0.7076,  ..., -1.3573,  1.6040,  0.9203],
          [ 1.1325, -0.9729,  0.7417,  ...,  1.2694, -0.5743, -1.3812],
          [ 0.4447,  0.9593, -0.4624,  ..., -1.4465,  0.6830,  0.4889]],
 
         [[-0.4976,  0.0382, -0.7235,  ...,  0.9505,  0.9450,  1.8551],
          [ 0.6253, -1.1287,

In [73]:
position_encoding_lookup_table = torch.zeros(context_len, d_model) # 初始化为0 [4, 16, 64]
position = torch.arange(0, context_len, dtype=torch.float).unsqueeze(1) # 位置编码， 升维 => [16, 1]

div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)

position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)

print("Position Encoding Look-up Table", position_encoding_lookup_table.shape)

Position Encoding Look-up Table torch.Size([4, 16, 64])


In [74]:
input_embedding_x = x + position_encoding_lookup_table
input_embedding_y = y + position_encoding_lookup_table

X = input_embedding_x

x_plot = input_embedding_x[0].detach().cpu().numpy()
print("Final Input Embedding of x: \n", pd.DataFrame(x_plot))

Final Input Embedding of x: 
           0         1         2         3         4         5         6   \
0   0.346711  0.416130  0.194183  2.676231  0.104132  0.317640 -0.716680   
1   0.115021  0.819889  1.295355  0.050067  1.152808  0.007295  2.096463   
2   0.025032 -0.896510  2.007128  0.645062  1.861388  1.183611  0.626594   
3   1.540862 -2.196910  1.106263 -0.879057  1.525128 -0.364159  1.305572   
4  -1.026346 -2.520314  0.092276 -1.922128  0.772673 -1.248682  0.811818   
5  -0.333637 -0.845062 -3.047244 -0.539240  0.370664 -0.112349  0.087253   
6  -0.671321  1.424937 -2.025386 -1.031886 -0.385049 -0.656757  0.465220   
7   1.460090  1.632251 -2.407473 -0.108412 -1.577921 -1.534449 -0.767310   
8   0.493083 -2.413755 -1.205481  0.105365 -0.475036  0.941739  0.546560   
9   0.381502 -0.606034  0.249510  1.915294 -0.984113  2.072770 -2.097719   
10 -0.586758 -1.718416  1.647630 -0.431370 -0.642387 -0.391723 -0.296238   
11 -0.625085 -0.037802  1.642752  1.460066  0.417635  2.48

## STEP4: Transformer Block

### Prepare Q, K, V

In [75]:
# Prepare Query, Key, Value for Multi-head Attention

query = key = value = X # [4, 16, 64] [batch_size, context_length, d_model]

# Define Query, Key, Value weight matrices
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(query) #[4, 16, 64]
Q = Q.view(batch_size, context_len, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

K = Wk(key) #[4, 16, 64]
K = K.view(batch_size, context_len, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

V = Wv(value) #[4, 16, 64]
V = V.view(batch_size, context_len, num_heads, d_model // num_heads)  #[4, 16, 4, 16]

In [76]:
# Transpose q,k,v from [batch_size, context_length, num_heads, head_size] to [batch_size, num_heads, context_length, head_size]
# The reason is that treat each batch with "num_heads" as its first dimension.
Q = Q.transpose(1, 2) # [4, 4, 16, 16]
K = K.transpose(1, 2) # [4, 4, 16, 16]
V = V.transpose(1, 2) # [4, 4, 16, 16]

### Calculate Q * K ^ T Attention

In [77]:
# Calculate the attention score betwee Q and K^T
attention_score = torch.matmul(Q, K.transpose(-2, -1)) # 只针对最后两个维度做转置

### Scale

In [78]:
attention_score = attention_score / math.sqrt(d_model // num_heads)

In [79]:
attention_score = torch.matmul(Q, K.transpose(-2, -1) / math.sqrt(d_model // num_heads))
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.271247  0.560849 -0.497451 -0.129847 -0.363947 -0.685937  0.185218   
1  -0.045627  0.335207 -0.261123  0.401420 -0.060369  0.060264  0.264576   
2  -0.062706  0.149768 -0.373623 -0.014540  0.501353  0.051065  0.528086   
3   0.489974  0.889258 -0.396356  0.726592  0.525897 -0.280680 -0.086415   
4   0.366094  0.809745  0.008297  0.004744  0.321938 -0.265212 -0.031628   
5   0.086670  0.772399 -0.768684  0.338177  0.517480  0.012420  0.335387   
6  -0.917973  0.179699 -1.242398  0.295149  0.050819 -0.538681  0.316469   
7   0.485319  1.187169 -0.421335  0.609655  0.178571 -0.377602 -0.150101   
8   0.442679  1.010227  0.541597  0.759292  0.351397  0.068198 -0.331880   
9  -0.378671 -0.184779 -0.172620  0.340819  0.176497  0.439594  0.154800   
10  0.634922  1.067505 -0.084768  1.119899  1.075548  0.098884 -0.157741   
11  0.714039  0.273180  0.165909 -0.122726  0.084472  0.506430 -0.043268   
12  0.522655

### Mask

In [80]:
attention_score = attention_score.masked_fill(torch.triu(torch.ones(attention_score.shape[-2:]), diagonal=1).bool(), float('-inf'))
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   0.271247      -inf      -inf      -inf      -inf      -inf      -inf   
1  -0.045627  0.335207      -inf      -inf      -inf      -inf      -inf   
2  -0.062706  0.149768 -0.373623      -inf      -inf      -inf      -inf   
3   0.489974  0.889258 -0.396356  0.726592      -inf      -inf      -inf   
4   0.366094  0.809745  0.008297  0.004744  0.321938      -inf      -inf   
5   0.086670  0.772399 -0.768684  0.338177  0.517480  0.012420      -inf   
6  -0.917973  0.179699 -1.242398  0.295149  0.050819 -0.538681  0.316469   
7   0.485319  1.187169 -0.421335  0.609655  0.178571 -0.377602 -0.150101   
8   0.442679  1.010227  0.541597  0.759292  0.351397  0.068198 -0.331880   
9  -0.378671 -0.184779 -0.172620  0.340819  0.176497  0.439594  0.154800   
10  0.634922  1.067505 -0.084768  1.119899  1.075548  0.098884 -0.157741   
11  0.714039  0.273180  0.165909 -0.122726  0.084472  0.506430 -0.043268   
12  0.522655

Softmax

In [81]:
# Softmax the attention score
attention_score = torch.softmax(attention_score, dim=-1)  #[4, 4, 16, 16] [batch_size, num_heads, context_length, context_length]
print(pd.DataFrame(attention_score[0][0].detach().cpu().numpy()))

          0         1         2         3         4         5         6   \
0   1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1   0.405926  0.594074  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.336756  0.416478  0.246766  0.000000  0.000000  0.000000  0.000000   
3   0.239815  0.357506  0.098844  0.303836  0.000000  0.000000  0.000000   
4   0.203619  0.317316  0.142373  0.141868  0.194823  0.000000  0.000000   
5   0.139601  0.277139  0.059349  0.179522  0.214777  0.129611  0.000000   
6   0.064035  0.191923  0.046293  0.215410  0.168716  0.093571  0.220052   
7   0.146088  0.294729  0.059001  0.165429  0.107497  0.061639  0.077385   
8   0.104471  0.184279  0.115333  0.143383  0.095357  0.071839  0.048151   
9   0.057362  0.069635  0.070487  0.117786  0.099938  0.130014  0.097793   
10  0.112850  0.173928  0.054947  0.183284  0.175332  0.066024  0.051080   
11  0.131966  0.084918  0.076280  0.057156  0.070315  0.107226  0.061883   
12  0.147472

Calculate V Attention

In [82]:
A = torch.matmul(attention_score, V)
print(A.shape)

torch.Size([4, 4, 16, 16])


### Concatenate and Output

In [83]:
A = A.transpose(1, 2) # [4, 16, 4, 16] [batch_size, context_length, num_heads, head_size]
A = A.reshape(batch_size, -1, d_model) # [4, 16, 64] [batch_size, context_length, d_model]

In [84]:
Wo = nn.Linear(d_model, d_model)
output = Wo(A)

print(output.shape)

torch.Size([4, 16, 64])


## STEP5: Residual Connection and Layer Normalization

In [85]:
output = output + X

layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

## STEP6: Feed Forward Network

In [86]:
output = nn.Linear(d_model, d_model * 4)(output)
output = nn.ReLU()(output)

output = nn.Linear(d_model * 4, d_model)(output)
output = torch.dropout(output, p=dropout, train=True)

In [87]:
output = output + X

layer_norm = nn.LayerNorm(d_model)
output = layer_norm(output)

## STEP7: Repeat step 4 to 6

## STEP8: Output Probabilities

In [88]:
logits = nn.Linear(d_model, max_token_value)(output)
print(pd.DataFrame(logits[0].detach().cpu().numpy()))

      0         1         2         3         4         5         6       \
0  -0.198109 -0.024584 -0.390627 -0.509666 -0.651848 -0.803911  0.021912   
1   0.738735  0.621348 -0.293323  0.693907 -0.620037 -0.390120 -0.576853   
2   0.461873 -0.118239 -0.102895 -0.944985 -0.255342  0.026568  0.926113   
3  -0.594172  0.398664  0.292804 -0.800400 -0.337430 -0.212501  0.310157   
4  -0.627845  0.119776  0.000432 -0.299878  0.699299 -0.479650  0.159084   
5  -0.001701 -0.636552 -0.205869 -0.914698 -0.654249  0.303629  0.728940   
6  -0.556173 -0.015484  0.494321 -0.732024 -0.162457 -0.178356  0.327461   
7  -0.812433  0.003836 -0.903373 -0.397297 -0.450321  0.819017  0.168120   
8  -1.310561 -0.363689 -0.590449 -0.784727 -0.423371  0.130670 -0.028444   
9  -0.341814 -0.570835  0.131305 -0.092621  0.788680  0.772467  0.319421   
10  0.292164 -0.595430 -0.032514 -0.172319  0.704094  0.050517 -0.211032   
11  0.236014 -0.261912 -0.782080 -0.480625 -0.395049  0.332238  0.043047   
12 -0.529474

In [89]:
# torch.softmax usually used during inference, during training we use torch.nn.CrossEntropyLoss
# but for illustration purpose, we'll use torch.softmax here
probabilities = torch.softmax(logits, dim=-1)

probabilities

tensor([[[6.9243e-06, 8.2364e-06, 5.7117e-06,  ..., 1.5728e-05,
          5.5564e-06, 4.8681e-06],
         [1.7647e-05, 1.5692e-05, 6.2870e-06,  ..., 2.0243e-05,
          4.2263e-06, 1.2627e-06],
         [1.3348e-05, 7.4728e-06, 7.5883e-06,  ..., 8.0100e-06,
          4.6040e-06, 1.4888e-05],
         ...,
         [3.7644e-06, 1.5663e-05, 9.6613e-06,  ..., 8.6265e-06,
          1.5848e-05, 6.4252e-06],
         [3.3098e-06, 7.8638e-06, 1.8554e-05,  ..., 4.7886e-06,
          1.3277e-05, 2.1676e-06],
         [5.0956e-06, 9.5217e-06, 7.6047e-06,  ..., 1.4421e-05,
          5.3195e-06, 5.4828e-06]],

        [[1.9918e-05, 5.9267e-06, 3.7478e-06,  ..., 8.6077e-06,
          3.9476e-06, 8.1714e-06],
         [4.3524e-06, 1.5325e-05, 6.8608e-06,  ..., 1.6760e-05,
          8.0599e-06, 6.1646e-06],
         [1.0009e-05, 6.1595e-06, 5.4430e-06,  ..., 1.4058e-05,
          8.3669e-06, 6.3447e-06],
         ...,
         [9.4713e-06, 1.3125e-05, 1.3630e-05,  ..., 8.1968e-06,
          1.291