In [30]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))


Downloading movie-corpus to C:\Users\11632\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [31]:
import torch
import torch.nn.functional as F

In [32]:

print("总话语数:", len(corpus.utterances))
print("总对话数（会话数）:", len(corpus.conversations))


for conversation_id in corpus.conversations:
    conversation = corpus.get_conversation(conversation_id)
    print(f"对话ID: {conversation_id}")
    for utterance in conversation.iter_utterances():
        print(f"{utterance.speaker.id}: {utterance.text}")
    break  

总话语数: 304713
总对话数（会话数）: 83097
对话ID: L1044
u0: They do not!
u2: They do to!


In [33]:
conversations_texts = []

# 
for i, conversation_id in enumerate(corpus.conversations):
    if i >= 10000:  
        break
    conversation = corpus.get_conversation(conversation_id)
   
    conversation_text = ' '.join([utterance.text for utterance in conversation.iter_utterances()])
    conversations_texts.append(conversation_text)

In [34]:
import tiktoken
# create a tokenizer

encoding = tiktoken.get_encoding("cl100k_base")

In [35]:
tokenized_text = encoding.encode("".join(conversations_texts))

print(len(tokenized_text))

502498


In [36]:
#convert to tensor
tokenized_text = torch.tensor(tokenized_text)
print(tokenized_text.shape)
max_token_value = tokenized_text.max().item()
print(max_token_value)

torch.Size([502498])
100252


In [37]:
#split the data into training and validation sets
train_idex = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:train_idex]
valid_data = tokenized_text[train_idex:]



In [38]:
batch_size = 8
context_size = 64
d_model = 64

In [39]:
#randomly extract a batch of data from train data
data = train_data
idxs = torch.randint(0 , len(data) - context_size, size = (batch_size,))
x_batch = torch.stack([data[idx:idx + context_size] for idx in idxs])
y_batch = torch.stack([data[idx + 1 :idx + context_size + 1] for idx in idxs])


In [40]:
import pandas as pd
pd.DataFrame(x_batch.numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,1457,13,9930,4359,369,279,15326,1274,13,33739,...,690,584,30,2675,323,264,2763,315,1023,1274
1,11,584,3358,1440,1405,311,5662,499,13,3639,...,430,596,837,11,719,358,20781,1781,1131,10445
2,4726,24532,279,4398,315,1057,5961,1345,5544,13,...,18083,13,2722,300,35283,11,369,499,1198,358
3,11649,11,433,596,6740,3814,323,499,2351,264,...,11,656,499,617,264,2457,18396,30,358,42210
4,292,73302,789,83,4208,1466,0,20524,1037,9608,...,33621,13,220,1102,596,7742,13,23371,13,4800
5,1283,5097,264,3828,2085,264,50169,1198,264,3828,...,358,1781,499,527,264,58725,430,596,4461,311
6,433,13,3639,30,423,3988,10555,13,4800,358,...,2771,499,1524,19937,433,30,88383,433,13,3639
7,323,44202,315,18157,13,1283,374,264,16888,47228,...,315,54242,13,578,89662,374,11604,0,578,3314


In [41]:
encoding.decode(x_batch[1].numpy())

", we'll know where to reach you. What can we do to assist you?Regardless of what you think, Lieutenant, the fact remains that David is missing and that we must find him. The forensic lads seem to feel that some sort of animal was involved, that's true, but I hardly think...Why"

In [42]:
#embedding 层
''' 
embedding层的作用是将单词嵌入为语义向量，它的输入是模型的输入X。输出单词的语义信息。

在gpt使用的Transformer中，语义分为两种，一是单词本身语义，二是单词所处位置的语义。

换句话说，上次的预测结果提供两种信息

1.词语是什么？
2.词语的位置是什么？

'''

token_embedding_table = torch.nn.Embedding(max_token_value + 1, d_model)
#打印embedding层的权重
print(token_embedding_table.weight)
x_batch_embedding = token_embedding_table(x_batch)
y_batch_embedding = token_embedding_table(y_batch)
print(x_batch_embedding.shape)
print(y_batch_embedding.shape)

#形状： X,T,C
#X: batch_size 批次大小
#T: context_size 上下文大小，序列长度，时间步
#C: d_model 词向量维度

'''
position = torch.arange(0, context_size, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
position_encoding = torch.zeros(context_size, d_model)
position_encoding[:, 0::2] = torch.sin(position * div_term)
position_encoding[:, 1::2] = torch.cos(position * div_term)
position_encoding = position_encoding.unsqueeze(0).expand(batch_size, -1, -1)
x = x_batch_embedding + position_encoding
y = y_batch_embedding + position_encoding
'''

'''
预先计算位置编码的值（而不是使用可训练的嵌入）的主要优点是我们的模型最终需要训练的参数更少。参数的减少可以提高训练性能
'''
#获取位置编码
position_encoding = torch.nn.Embedding(context_size, d_model)
print(position_encoding.weight)
print(position_encoding.weight.shape)

Parameter containing:
tensor([[ 0.8084,  0.0639, -0.3472,  ..., -0.2894,  0.4802, -1.0599],
        [ 0.2934, -0.7432, -0.6012,  ..., -0.2541,  0.6344, -0.4346],
        [-0.6278,  0.0846, -0.4237,  ...,  1.2919,  1.1441, -0.1380],
        ...,
        [ 1.0076,  0.8602, -0.5991,  ..., -0.2053, -1.3240, -1.1955],
        [ 0.7903,  0.4072,  1.2660,  ..., -0.7030, -0.7543,  0.9546],
        [ 1.3568, -1.9735,  2.4950,  ..., -0.8447, -0.1116, -0.4855]],
       requires_grad=True)
torch.Size([8, 64, 64])
torch.Size([8, 64, 64])
Parameter containing:
tensor([[-0.7015,  0.1402, -0.6608,  ..., -0.0070,  0.4238, -1.3672],
        [-0.2809,  0.0829,  0.6477,  ...,  1.4171,  0.8764,  0.0279],
        [-0.8238,  0.9602,  0.6292,  ..., -1.3885,  0.3483, -1.1689],
        ...,
        [-0.3615, -0.7396, -0.7769,  ..., -0.3373, -0.5814, -0.6948],
        [-0.4642,  0.1719, -0.9179,  ...,  0.1603,  0.1119, -1.2467],
        [-0.2487, -0.0665,  0.7068,  ...,  0.5214, -0.8784,  0.4992]],
       requir

In [43]:
#multihead attention
'''
batch_size = 8
context_size = 64
d_model = 64
'''
num_heads = 4
head_dim = d_model // num_heads  # 每个头的维度

# 64 * 64
Wq = torch.nn.Linear(d_model, d_model)
Wk = torch.nn.Linear(d_model, d_model)
Wv = torch.nn.Linear(d_model, d_model)

Q = Wq(x_batch_embedding)
K = Wk(x_batch_embedding)
V = Wv(x_batch_embedding)

print(Q.shape, K.shape, V.shape)

# 将Q, K, V按照多头设置进行维度重排
Q_multihead = Q.view(batch_size, context_size, num_heads, head_dim)
Q_multihead = Q_multihead.transpose(1, 2)  # 将头的维度和上下文长度的维度交换

K_multihead = K.view(batch_size, context_size, num_heads, head_dim)
K_multihead = K_multihead.transpose(1, 2)

V_multihead = V.view(batch_size, context_size, num_heads, head_dim)
V_multihead = V_multihead.transpose(1, 2)

torch.Size([8, 64, 64]) torch.Size([8, 64, 64]) torch.Size([8, 64, 64])


In [44]:
wei = Q @ K.transpose(-2 , -1)
wei.shape

torch.Size([8, 64, 64])

In [45]:
#apply mask
mask = torch.triu(torch.ones(context_size, context_size), diagonal=1).bool()
wei = wei.masked_fill(mask, float('-inf'))
pd.DataFrame(wei[0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,3.187471,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
1,-2.004797,-2.234551,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
2,0.993447,-0.542453,1.525857,-inf,-inf,-inf,-inf,-inf,-inf,-inf,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
3,-3.496064,0.938212,-1.086502,-3.696027,-inf,-inf,-inf,-inf,-inf,-inf,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,1.345954,0.483479,-0.616461,-5.035115,-0.393587,-inf,-inf,-inf,-inf,-inf,...,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,-2.508794,0.534034,2.798499,7.071962,1.866140,-2.058599,0.825363,0.200114,0.534034,1.829050,...,3.226939,-4.524257,1.118625,-2.542966,4.873235,-1.918636,-inf,-inf,-inf,-inf
60,4.301505,0.961278,0.726928,-0.021345,-2.444457,-0.957070,3.757560,0.268312,0.961278,2.591392,...,0.139885,1.461952,-2.637837,1.624761,1.352815,-0.004836,0.457659,-inf,-inf,-inf
61,-4.246055,-0.543369,-1.452324,3.743972,3.100425,-2.824816,0.717740,-0.435698,-0.543369,-0.682876,...,0.634515,-3.148952,-0.557993,-0.391002,2.907906,-1.497793,1.820893,-3.245041,-inf,-inf
62,-1.104063,1.111376,-2.176154,-3.728298,-6.003815,-1.022707,-1.283503,-1.320779,1.111376,3.099248,...,2.576442,-2.168800,-0.747793,-3.900452,-1.248537,5.893105,4.446800,-0.172687,6.724002,-inf


In [46]:
#softmax
attention_score = torch.nn.Softmax(dim=-1)(wei)

#计算多头注意力
attention = attention_score @ V

#将多头注意力拼接
output = attention.transpose(1, 2).contiguous().view(batch_size, context_size, d_model)
output.shape

torch.Size([8, 64, 64])

In [47]:
#apply residual connection
output = attention + x_batch_embedding

In [48]:
#apply layer normalization
layer_norm = torch.nn.LayerNorm(d_model)
output = layer_norm(output)


In [49]:
#a feedforward network
'''
feedforward层由两个全连接层组成，这两个全连接层之间有一个ReLU激活函数。
'''
feedforward = torch.nn.Sequential(
    torch.nn.Linear(d_model, 4 * d_model),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * d_model, d_model)
)
output = feedforward(output)
output.shape


torch.Size([8, 64, 64])

In [50]:
#apply next layer normalization
output = layer_norm(output)

In [51]:
#apply final layer linear transformation
output = torch.nn.Linear(d_model, max_token_value + 1)(output)
output.shape

torch.Size([8, 64, 100253])

In [53]:
logits = F.softmax(output, dim=-1)
predicted_index =  torch.argmax(logits[0,0]).item()
encoding.decode([predicted_index])


' mov'