In [1]:
from convokit import Corpus, download
corpus = Corpus(filename=download("movie-corpus"))
import torch

Downloading movie-corpus to C:\Users\11632\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [2]:

print("总话语数:", len(corpus.utterances))
print("总对话数（会话数）:", len(corpus.conversations))


for conversation_id in corpus.conversations:
    conversation = corpus.get_conversation(conversation_id)
    print(f"对话ID: {conversation_id}")
    for utterance in conversation.iter_utterances():
        print(f"{utterance.speaker.id}: {utterance.text}")
    break  

总话语数: 304713
总对话数（会话数）: 83097
对话ID: L1044
u0: They do not!
u2: They do to!


In [3]:
conversations_texts = []

# 
for i, conversation_id in enumerate(corpus.conversations):
    if i >= 10000:  
        break
    conversation = corpus.get_conversation(conversation_id)
   
    conversation_text = ' '.join([utterance.text for utterance in conversation.iter_utterances()])
    conversations_texts.append(conversation_text)

In [4]:
import tiktoken
# create a tokenizer

encoding = tiktoken.get_encoding("cl100k_base")

In [5]:
tokenized_text = encoding.encode("".join(conversations_texts))

print(len(tokenized_text))

502498


In [6]:
#convert to tensor
tokenized_text = torch.tensor(tokenized_text)
print(tokenized_text.shape)
max_token_value = tokenized_text.max().item()
print(max_token_value)

torch.Size([502498])
100252


In [7]:
#split the data into training and validation sets
train_idex = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:train_idex]
valid_data = tokenized_text[train_idex:]



In [8]:
batch_size = 8
context_size = 64
d_model = 64

In [9]:
#randomly extract a batch of data from train data
data = train_data
idxs = torch.randint(0 , len(data) - context_size, size = (batch_size,))
x_batch = torch.stack([data[idx:idx + context_size] for idx in idxs])
y_batch = torch.stack([data[idx + 1 :idx + context_size + 1] for idx in idxs])


In [10]:
import pandas as pd
pd.DataFrame(x_batch.numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,433,13912,757,505,3515,311,2274,459,5150,17743,...,358,2846,2771,499,1053,13,35272,596,2751,430
1,596,4648,21908,3639,30,25797,893,11,374,568,...,311,757,11,1314,994,682,279,17619,574,5108
2,0,5519,499,1440,1405,430,4751,4131,505,30,...,499,649,439,433,5900,555,323,279,6037,315
3,311,6068,279,4359,15770,6166,13,54652,1514,258,...,1603,499,1027,264,1520,311,757,13,4718,11091
4,264,9396,912,22622,1109,872,25015,13,22335,30,...,555,3026,1006,12337,11,36823,42407,13,2435,1051
5,477,499,1253,5387,369,81801,16986,11,389,832,...,30,13440,477,539,499,12265,374,11,315,3388
6,5270,13,8595,30,3639,596,304,81801,30,578,...,389,279,42632,11,36346,30,4438,95950,499,527
7,11471,13,2435,2351,5710,13,2435,4265,387,1618,...,956,1390,499,369,264,4333,76241,922,757,30


In [11]:
encoding.decode(x_batch[1].numpy())

'\'s death..." What?Ah man, is he usin\' that line now on you? What, you think he made that little gem up? Jesus Christ, I used to have to listen to my old man use that every morning. Y\'know what Stephen said to me, right when all the shit was coming'

In [12]:
#embedding 层
''' 
embedding层的作用是将单词嵌入为语义向量，它的输入是模型的输入X。输出单词的语义信息。

在gpt使用的Transformer中，语义分为两种，一是单词本身语义，二是单词所处位置的语义。

换句话说，上次的预测结果提供两种信息

1.词语是什么？
2.词语的位置是什么？

'''

token_embedding_table = torch.nn.Embedding(max_token_value + 1, d_model)
#打印embedding层的权重
print(token_embedding_table.weight)
x_batch_embedding = token_embedding_table(x_batch)
y_batch_embedding = token_embedding_table(y_batch)
print(x_batch_embedding.shape)
print(y_batch_embedding.shape)

#形状： X,T,C
#X: batch_size 批次大小
#T: context_size 上下文大小，序列长度，时间步
#C: d_model 词向量维度

'''
position = torch.arange(0, context_size, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
position_encoding = torch.zeros(context_size, d_model)
position_encoding[:, 0::2] = torch.sin(position * div_term)
position_encoding[:, 1::2] = torch.cos(position * div_term)
position_encoding = position_encoding.unsqueeze(0).expand(batch_size, -1, -1)
x = x_batch_embedding + position_encoding
y = y_batch_embedding + position_encoding
'''

'''
预先计算位置编码的值（而不是使用可训练的嵌入）的主要优点是我们的模型最终需要训练的参数更少。参数的减少可以提高训练性能
'''
#获取位置编码
position_encoding = torch.nn.Embedding(context_size, d_model)
print(position_encoding.weight)
print(position_encoding.weight.shape)

Parameter containing:
tensor([[-0.0175,  1.5166, -0.6578,  ...,  0.9956,  1.1948, -0.6840],
        [-0.0458,  2.1350,  0.2176,  ...,  2.1543,  0.5076,  0.2751],
        [ 0.2905, -0.5032, -0.3581,  ..., -1.0190,  0.2279, -0.7405],
        ...,
        [ 1.4330, -0.2527,  0.7946,  ..., -0.2061,  1.4529,  0.3016],
        [ 0.8533, -0.1278, -2.3567,  ..., -0.6176, -1.3524, -0.0935],
        [ 0.2570,  0.0777, -0.0776,  ..., -0.4184, -0.1594, -0.4233]],
       requires_grad=True)
torch.Size([8, 64, 64])
torch.Size([8, 64, 64])
Parameter containing:
tensor([[ 0.3182, -1.6880, -1.3281,  ..., -0.3215,  0.2587, -0.1867],
        [-0.0200,  1.6375, -0.5828,  ..., -0.5376,  1.1342,  0.6509],
        [ 1.0112,  0.1976, -1.1066,  ..., -1.0429, -0.0309,  1.3762],
        ...,
        [-0.8707, -1.8437, -0.7408,  ...,  0.7956,  2.1323,  0.3430],
        [ 0.3719,  1.7791, -0.6057,  ..., -0.2267, -1.6209, -0.7319],
        [ 1.1740, -0.0482, -2.4479,  ..., -0.8374,  1.2912,  0.8960]],
       requir

In [15]:
#multihead attention
'''
batch_size = 8
context_size = 64
d_model = 64
'''
num_heads = 4
head_dim = d_model // num_heads  # 每个头的维度

# 64 * 64
Wq = torch.nn.Linear(d_model, d_model)
Wk = torch.nn.Linear(d_model, d_model)
Wv = torch.nn.Linear(d_model, d_model)

Q = Wq(x_batch_embedding)
K = Wk(x_batch_embedding)
V = Wv(x_batch_embedding)

# 将Q, K, V按照多头设置进行维度重排
Q_multihead = Q.view(batch_size, context_size, num_heads, head_dim)
Q_multihead = Q_multihead.transpose(1, 2)  # 将头的维度和上下文长度的维度交换

K_multihead = K.view(batch_size, context_size, num_heads, head_dim)
K_multihead = K_multihead.transpose(1, 2)

V_multihead = V.view(batch_size, context_size, num_heads, head_dim)
V_multihead = V_multihead.transpose(1, 2)

In [19]:
wei = Q @ K.transpose(-2 , -1)
wei.shape

torch.Size([8, 64, 64])