In [11]:
# torch处理张量，张量即为矩阵格式的数据
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# 划分块大小
block_size = 8
# batch，实际上我们可以并行处理多少个相同的序列，使用GPU，即批大小
batch_size = 4

cuda


In [12]:
with open("./database/wizard_of_oz.txt",'r',encoding='utf-8') as f:
    text = f.read()
print(len(text))
print(text[:200])

232307
﻿DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW Y


In [13]:
# 为后续分词器做铺垫，encoder and decoder
# encoder：character to integer
# decode：integer to character
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [14]:
# 分词器
# 字典，按照字符在枚举里面出现的顺序作为数字编码;字符级标记器
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s:[string_to_int[c] for c in s]
decode = lambda l:''.join([int_to_string[i] for i in l])
print(encode('hello'))
print(decode([50,10,11,12]))

[61, 58, 65, 65, 68]
Z-.0


In [15]:
# prepare data and set it as torch dot tensor
data = torch.tensor(encode(text),dtype=torch.long)
data[:100]

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1])

In [16]:
# validation and training splits
# 将数据分为训练集和测试集
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is',context,'target is',target)

when input is tensor([80]) target is tensor(28)
when input is tensor([80, 28]) target is tensor(39)
when input is tensor([80, 28, 39]) target is tensor(42)
when input is tensor([80, 28, 39, 42]) target is tensor(39)
when input is tensor([80, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80, 28, 39, 42, 39, 44]) target is tensor(32)
when input is tensor([80, 28, 39, 42, 39, 44, 32]) target is tensor(49)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) target is tensor(1)


In [17]:
# batch function
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch (split):
    # 随机得到一个批次的数据，得到批次的块起点
    data = train_data if split =='train' else val_data
    ix = torch.randint(len(data) - block_size,(batch_size,))
    print(ix)
    # 通过块起点，拿取一个块的数据
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # 将tensor向量加载在cuda上
    x, y = x.to(device),y.to(device)
    return x,y

x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

tensor([ 56436, 142547, 160463, 182336])
inputs:
tensor([[ 1, 73, 74, 71, 67, 58, 57,  1],
        [71, 68, 60, 71, 58, 72, 72,  1],
        [74, 58,  1, 58, 67, 68, 74, 60],
        [54,  0, 72, 61, 54, 71, 69,  1]], device='cuda:0')
targets:
tensor([[73, 74, 71, 67, 58, 57,  1, 54],
        [68, 60, 71, 58, 72, 72,  1, 59],
        [58,  1, 58, 67, 68, 74, 60, 61],
        [ 0, 72, 61, 54, 71, 69,  1, 56]], device='cuda:0')


In [18]:
# 传入参数为 nn.Module，说明这个模型的参数都是可学习的，权重矩阵将通过梯度下降而改变
# 梯度下降是为了找到损失函数的最小值，损失函数值越小，说明模型的鲁棒性越好
# 损失函数L(Y,f(x))度量该样本经决策函数f计算后的输出预测值f(x)与样本真实值Y之间的不一致程度，反向传播，更新f(x)中各参数，使得f(x)与Y的差异减小
# 假设：在网络中有w1,w2,w3,w4,w5五个参数权重，最终计算出来的loss为l，那么看每一个参数对l的贡献为多少，求偏导：链式求偏导l/w1，然后按照梯度下降的即loss减小的方向更新w1，即反向传播更新权重
# w1_new = w1 - learningRate*gradient 学习率也就在此发挥作用

# AdamW 相较于 Adam 拥有权重更新，不会处于非常高或者非常低的水平，拥有更好的泛化能力
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embeddings_table = nn.Embedding(vocab_size,vocab_size)

# Embedding table
#    a   b   c   d   e   f   g
# a aa(69) ab(50) ac(70) ad(10)··· 
# b        
# c        
# d
# e
# f
# g
# 从Embedding table中我们关注特定的一行，比如第一行，在出现字母a时，出现字母c的概率比出现a,b,d的概率大，这就是嵌入表的作用，可以对一行进行正则化，就得到后续字符的概率
# 这也是为什么使用vocab_size作为嵌入表的大小

# 不使用默认的前向传递，有助于获取更多的模型的信息，而不是黑盒子   

#######################
# 下述实现仅仅考虑前向传播，而没有考虑反向传播以更新权重
#######################

# logits是正则化，即概率分布，与上面出现次数相对应，概率即百分比
    def forward(self, index, targets = None):
        logits = self.token_embeddings_table(index)
        
        # 前向传播时，不会计算loss，即logits仍然为三维信息
        if targets is None:
            loss = None
        else:
            # Batch,Time sequence,Channel
            # 看另一个notebook进行详细解释
            B, T, C = logits.shape
            # logits的形状为BxTxC，而torch期望的形状为BxT and C。所以我们需要重塑矩阵使之符合torch的期望
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            # 交叉熵损失函数
            loss = F.cross_entropy(logits,targets)
        
        return logits,loss
    
    # 这个模型为二元模型，也就是在ABCD中，我们只关注B--》C，而不是用 AB---》C
    def generate(self, index, max_new_tokens):
        # index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            # 从后往前，仅仅关注最后一个
            logits = logits[:,-1,:] # becomes (B,C)
            # apply softmax to get probabilities
            # dim = -1 ，仅仅关注最后一个维度，应用softmax将所有的数值转换到0~1上作为概率
            probs = F.softmax(logits,dim=-1) # (B,C)
            # sample from the distribution
            # 从probs中按照概率随机生成下一个字符索引
            index_next = torch.multinomial(probs,num_samples =1) # (B,1)
            # append sampled index to the running sequence
            index = torch.cat((index,index_next),dim=1) # (B,T+1)，T为已知信息长度，初始为1
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)
# 首先，我们给定一个起始字符为1
context = torch.zeros((1,1),dtype = torch.long,device=device)
# 然后循环生成后续的500个字符
generate_chars = decode(m.generate(context,max_new_tokens = 500)[0].tolist())
print(generate_chars)


U!fF]e5t1pxY))H29;2Idl5-;um[kS
T)C:A?zXyvJ﻿6a7bpB﻿?7ZMo8Qb?1o.xjdz﻿oLWD9HO: *WowrABW."&]H 1x?&﻿Xqm﻿"LNF79&SWBV  hNqf!DsdI*V.'sX";ifMy.gjUUCp6ankNJv:wD5pHKJ﻿v95b]roiKDZojeG_D;(iZRT[7,﻿?YKlMzeo.wF9)O
C"*vde];2fVGxs:(x*ZRENE!SQXSosXnvV,y sMMJ﻿-&CB.s_[﻿:rxxsaGBt]8!fNkk,G7YCluP'qRce5 
Al09X;:AlE*isdK)ewPsXd0pn6Dm(Umx1I3d6'﻿GJ﻿J]5-R('x-sX:mZ4kUAl:qvGbv)1qYX;WNko7HOYMXx﻿4mPx_um'qh[ERT"[Wxg:
fMPvBjmh1a-(m9_D﻿,UQSl*_?jze:JM0[bg9_dlGZ36o2dUX-Sd&tV&*iOW4kxwlJvc0(2;'﻿5y9_AwLLN_O
LJ9bSy O5sSol_c.-1kJLovl4;NW
