In [16]:
# Fetch the Tiny Shakespeare dataset used for training. Uncomment and run the
# line below once if input.txt is not already in the working directory.
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# Load the whole training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Report the corpus size in characters.
corpus_length = len(text)
print("数据集的字符长度：", corpus_length)

# Preview the first 500 characters of the corpus.
preview = text[:500]
print(preview)

数据集的字符长度： 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [3]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
# Print the vocabulary on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Character <-> integer lookup tables built from the sorted vocabulary.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, return the list of its integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, return the string."""
    return ''.join(itos[i] for i in l)


# Round-trip a short sample to check the two mappings are inverses.
sample = "hii there"
print(encode(sample))
print(decode(encode(sample)))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [5]:
# Encode the entire corpus and store the token ids in a torch.Tensor.
import torch
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
# The 500 characters previewed earlier, as the model sees them: integer ids.
print(data[:500])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [6]:
# Split the dataset into a training set and a held-out validation set.
n = int(0.9*len(data)) # first 90% is used for training, the remaining 10% for validation
train_data = data[:n]
# The validation split has to be the tail of the data (data[n:]). Slicing
# data[:n] here would make it identical to the training split, so the
# validation loss could never reveal overfitting.
val_data = data[n:]

In [7]:
# A chunk of block_size + 1 tokens packs block_size training examples:
# each prefix of the chunk is a context and the token right after it is the target.
block_size = 8
print(train_data[:block_size + 1])

x = train_data[:block_size]         # inputs
y = train_data[1:block_size + 1]    # targets: the inputs shifted left by one

for end in range(1, block_size + 1):
    context = x[:end]
    target = y[end - 1]
    print(f"when input is {context} the target: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [8]:
torch.manual_seed(1024)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length used for a prediction


def get_batch(split):
    """Sample a random mini-batch of inputs and next-token targets.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        A tuple (x, y) of stacked tensors with batch_size rows of block_size
        tokens each, where y is x shifted one position to the right in the
        source data.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, chosen so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
# Show the shape and contents of the inputs, then of the targets.
for label, batch in (("输入：", xb), ("目标：", yb)):
    print(label)
    print(batch.shape)
    print(batch)

print("-" * 20)

# Spell out every (context, target) training example packed in the batch.
for row_x, row_y in zip(xb, yb):      # batch dimension
    for pos in range(block_size):     # time dimension
        context = row_x[:pos + 1]
        target = row_y[pos]
        print(f"when input is {context.tolist()} the target: {target}")

输入：
torch.Size([4, 8])
tensor([[58, 46, 39, 58,  1, 57, 53,  1],
        [12,  1,  1, 51, 63,  1, 52, 47],
        [46, 44, 59, 50,  1, 57, 53, 59],
        [43, 50, 57, 43,  0, 32, 53,  1]])
目标：
torch.Size([4, 8])
tensor([[46, 39, 58,  1, 57, 53,  1, 50],
        [ 1,  1, 51, 63,  1, 52, 47, 43],
        [44, 59, 50,  1, 57, 53, 59, 50],
        [50, 57, 43,  0, 32, 53,  1, 46]])
--------------------
when input is [58] the target: 46
when input is [58, 46] the target: 39
when input is [58, 46, 39] the target: 58
when input is [58, 46, 39, 58] the target: 1
when input is [58, 46, 39, 58, 1] the target: 57
when input is [58, 46, 39, 58, 1, 57] the target: 53
when input is [58, 46, 39, 58, 1, 57, 53] the target: 1
when input is [58, 46, 39, 58, 1, 57, 53, 1] the target: 50
when input is [12] the target: 1
when input is [12, 1] the target: 1
when input is [12, 1, 1] the target: 51
when input is [12, 1, 1, 51] the target: 63
when input is [12, 1, 1, 51, 63] the target: 1
when input is [12,

In [9]:
# Show the sampled batch: xb holds the input contexts, yb the next-token targets.
print(xb, yb) # our input to the transformer

tensor([[58, 46, 39, 58,  1, 57, 53,  1],
        [12,  1,  1, 51, 63,  1, 52, 47],
        [46, 44, 59, 50,  1, 57, 53, 59],
        [43, 50, 57, 43,  0, 32, 53,  1]]) tensor([[46, 39, 58,  1, 57, 53,  1, 50],
        [ 1,  1, 51, 63,  1, 52, 47, 43],
        [44, 59, 50,  1, 57, 53, 59, 50],
        [50, 57, 43,  0, 32, 53,  1, 46]])


In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1024)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding lookup table.

    The only parameter is a (vocab_size, vocab_size) embedding: the row for a
    token id holds the logits of the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token reads the logits for the next token straight from this table.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is not None:
            B, T, C = logits.shape
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            n_rows = B * T
            logits = logits.view(n_rows, C)
            return logits, F.cross_entropy(logits, targets.view(n_rows))
        return logits, None

    def generate(self, idx, max_new_tokens):
        """Extend idx by sampling max_new_tokens tokens, one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _step in range(max_new_tokens):
            # Calling the module runs forward(); no targets, so loss is None.
            logits, _ = self(idx)
            # Only the last time step predicts the next token.
            last_logits = logits[:, -1]                       # (B, C)
            probs = last_logits.softmax(dim=-1)               # (B, C)
            # Draw one token id per sequence from the predicted distribution.
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, sampled], dim=1)            # (B, T+1)
        return idx

In [12]:
# Build the model and run one forward pass on the current batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)

# Sample 100 tokens from the untrained model, starting from a single token id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 65]) tensor(4.7921, grad_fn=<NllLossBackward0>)

YpD&;JVTO,KBk:nicwp
P-wXPZEtUgtAgM?i3oUU?:'zTokrAVxhireSkj-cjhaqEw:baA,:z.ohoayokMFHHWMf;ujJXoM?HmYk


In [29]:
# Create a PyTorch optimizer over the model parameters.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
num_steps = 100  # raise the number of steps for better results
for step in range(num_steps):
    # Sample a fresh batch of training data.
    xb, yb = get_batch("train")

    # Forward pass, then one gradient update.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last batch. Re-running this cell keeps training the same model,
# so this value should trend downward (with batch-to-batch noise).
print(loss.item())

# Sample 500 tokens from the partially trained model.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx=start_idx, max_new_tokens=500)[0].tolist()
print(decode(sampled_ids))

3.139368772506714

'BYoull,?vxNZfout,WS.WRinblre FRSThe,3IENGLUClsCut'e JdDUvSghyQM.f aTeryreo
LdYXCBfousTYSbovuxxMISjvuFaioUKESMAOyaxjA!wH3l$?jrows.ntqfRanLd
Dmy,ci, CpJ:.ns IfRallFLorBure,
Ge
Bju oud:SPer
TonYYworaxY'of ty.RXMtJEPEJXTMIEmy?
A
B'?ZVwouL$oure n fjjPce,G-c!
U&;Ke;OqAcXc onmaTADihe Wq-oMANo? pollPlmYUNCAs l enFLiF&w'd'sicheryo bRO:ZlP.LICAg.WqYEu lavYmn.V;k, ypurPqkNgempeCKHsWsurias:
.rr WrorcaXTererp;!sm O:
S&ot o b nxzLIhakiBjhe.M.zmNUUUye,;Kxcl;NerotsinkeUgiqY'Pags t'e !XLK'sty I:ESAR:LZFRe wincj


In [46]:
# The math trick behind self-attention: multiplying by a row-normalized
# lower-triangular matrix averages each row of b with all rows above it.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
print("a = ", a)
# Normalize every row so that it sums to 1.
a = a / a.sum(dim=1, keepdim=True)
print("a -> ", a)
b = torch.randint(0, 10, (3, 2)).float()
print("b = ", b)
c = torch.matmul(a, b)
print("c = ", c)

a =  tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
a ->  tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b =  tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c =  tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
x.shape =  torch.Size([4, 8, 2])
xbow.shape =  torch.Size([4, 8, 2])


In [63]:
# Extend the toy example above to a batch of sequences.
torch.manual_seed(1024)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
print("x.shape = ", x.shape)

# Version 1: explicit for loops.
# xbow[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C)) # bow = bag of words
for bi in range(B):
    for ti in range(T):
        # Average every position up to and including ti.
        xbow[bi, ti] = x[bi, :ti + 1].mean(dim=0)

# Version 2: weighted aggregation as a matrix product.
wei = torch.ones(T, T).tril() # (T, T)
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x) # (T, T), broadcast over the batch, @ (B, T, C) --> (B, T, C)
print("方案二输出是否等于方案一输出：", torch.allclose(xbow, xbow2))

# Version 3: softmax over masked scores.
tril = torch.tril(torch.ones(T, T))
# Future positions get -inf, so softmax gives them zero weight.
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = wei @ x
print("方案三输出是否等于方案一输出：", torch.allclose(xbow, xbow3))

# Version 4: self-attention (single head).
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x)  # each (B, T, 16)
# Data-dependent affinities: (B, T, 16) @ (B, 16, T) ---> (B, T, T)
wei = q @ k.transpose(-2, -1)
# Same causal mask as in version 3.
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ v
print(out.shape)

x.shape =  torch.Size([4, 8, 2])
方案二输出是否等于方案一输出： True
方案三输出是否等于方案一输出： True


torch.Size([4, 8, 16])

`Attention`是一种`沟通机制`。可以把各个token看作有向图中的节点，它们相互观察，并对所有指向自己的节点做加权求和(权重依赖于数据)来聚合信息。
- 没有空间的概念。注意力只是作用于一组向量。这就是为什么我们需要对令牌进行位置编码。
- 跨批处理维度的每个示例当然是完全独立处理的，并且永远不会相互“交谈”
- 在`encoder`注意块中，只需删除用`tril`屏蔽的单行，允许所有令牌进行通信。这里的这个块被称为`decoder`注意力块，因为它有三角形掩蔽，通常用于自回归设置，比如语言建模。
- `self-attention`只是意味着键和值与查询来自相同的来源。在“交叉注意力”(`cross-attention`)中，查询仍然由x产生，但键和值来自其他外部源(例如编码器模块)。
- `scaled attention`额外将`wei`乘以1/sqrt(head_size)（即除以sqrt(head_size)）。这使得当输入`Q`,`K`是单位方差时，`wei`也将是单位方差，`Softmax`将保持扩散而不是过度饱和

In [64]:
# LayerNorm，可参考Transformer中更为系统的实现：https://github.com/PhenixZhang/Transformer-Pytorch
class LayerNorm1d:
  """Minimal layer normalization over the last (feature) dimension.

  Each feature vector is normalized independently to zero mean and unit
  variance, then scaled by gamma and shifted by beta.

  Args:
    dim: size of the feature dimension (length of gamma and beta).
    eps: small constant added to the variance for numerical stability.
    momentum: unused; kept only so existing constructor calls keep working.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)   # per-feature scale
    self.beta = torch.zeros(dim)   # per-feature shift

  def __call__(self, x):
    """Normalize x along its last dimension and return the result.

    The statistics are taken over the last axis, so (B, C) and (B, T, C)
    inputs are both normalized across features. For 2-D input this is
    the same as normalizing over dim 1.
    """
    xmean = x.mean(-1, keepdim=True) # per-sample mean over the features
    # torch.var defaults to the unbiased estimator, whereas torch.nn.LayerNorm
    # uses the biased one. Kept as-is so existing 2-D results do not change.
    xvar = x.var(-1, keepdim=True) # per-sample variance over the features
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the parameters as the list [gamma, beta]."""
    return [self.gamma, self.beta]

# Smoke-test the layer on a random batch: 32 samples of 100-dimensional embeddings.
torch.manual_seed(1337)
embed_dim = 100
module = LayerNorm1d(embed_dim)
x = module(torch.randn(32, embed_dim))
x.shape

torch.Size([32, 100])