In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [11]:
# Locate the chapter-3 main code directory, which sits two levels above the
# current working directory, and make it importable via sys.path.
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch03", "01_main-chapter-code")
print(file_path)
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if file_path not in sys.path:
    sys.path.append(file_path)

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch03/01_main-chapter-code


#### 第二章的数据载入器

In [7]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token ids in steps of ``stride``; for every window the target is
    the same window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a sequence of token ids.
        max_length: Number of tokens in each input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the whole text up front.
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

        # Stopping at len - max_length guarantees that the shifted target
        # window never runs past the end of the token list.
        window_starts = range(0, len(token_ids) - max_length, stride)

        # Inputs: one tensor per window.
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        # Targets: the same windows moved one position to the right.
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    Args:
        txt: Text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the start positions of consecutive windows.
        shuffle: Whether the DataLoader shuffles the windows.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    windows = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(windows, batch_size=batch_size, shuffle=shuffle)

In [13]:
# Load the sample text that lives in the chapter-3 main code directory.
with open(os.path.join(file_path, "small-text-sample.txt"), "r", encoding="utf-8") as f:
    raw_text = f.read()  # full file contents as one string

tokenizer = tiktoken.get_encoding("gpt2")  # "gpt2" tiktoken encoding
encoded_text = tokenizer.encode(raw_text)  # token ids of the whole text

vocab_size = 50257  # vocabulary size (rows of the token embedding)
output_dim = 256  # embedding width
max_len = 1024  # maximum sequence length
context_length = max_len  # number of positions in the position embedding

# Seed the global RNG before anything random happens: both the embedding
# initialisation below and the shuffling dataloader draw from it, so without a
# seed the embedded batch changes on every Restart & Run All.
torch.manual_seed(123)

token_embedding_layer = nn.Embedding(vocab_size, output_dim)  # token id -> vector
pos_embedding_layer = nn.Embedding(context_length, output_dim)  # position -> vector

max_length = 4  # tokens per input window
# stride == max_length, so consecutive windows do not overlap.
dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=max_length)



In [14]:
# Take only the first batch from the dataloader and build its input embeddings.
for batch in dataloader:
    # Each batch holds the input windows and their one-token-shifted targets;
    # only the inputs (x) are used in this cell.
    x, y = batch

    # Look up one vector per token id, and one vector per position 0..max_length-1.
    token_embeddings = token_embedding_layer(x)
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    # The position vectors are broadcast over the batch dimension.
    input_embeddings = token_embeddings + pos_embeddings

    # One batch is enough for the demonstration below.
    break

In [15]:
# Expected shape: (batch_size, max_length, output_dim) = (8, 4, 256).
print(input_embeddings.shape)

torch.Size([8, 4, 256])


#### 第三章实现的多头注意力

In [17]:
# Multi-head attention implemented by splitting the projected Q/K/V tensors
# into heads, rather than keeping separate weights per head.
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size; it is divided evenly among the heads.
        context_length: Side length of the causal mask, i.e. the longest
            sequence the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.

    Raises:
        AssertionError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()

        # Every head must receive the same number of features.
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # One full-width projection each for queries, keys and values.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Final linear layer over the concatenated heads; keeps the width at d_out.
        self.out_proj = nn.Linear(d_out, d_out)

        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the future positions to hide.
        future_positions = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", future_positions)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (b, num_tokens, d_in).

        Returns:
            Tensor of shape (b, num_tokens, d_out).
        """
        batch_size, seq_len, _ = x.shape

        # Project the input and move the head dimension in front of the tokens.
        keys = self._split_heads(self.W_key(x), batch_size, seq_len)
        queries = self._split_heads(self.W_query(x), batch_size, seq_len)
        values = self._split_heads(self.W_value(x), batch_size, seq_len)

        # Raw attention scores, shape (b, num_heads, num_tokens, num_tokens).
        scores = torch.matmul(queries, keys.transpose(-2, -1))

        # Hide future tokens. The mask is cut down to the actual sequence
        # length so inputs shorter than context_length work as well.
        hidden = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(hidden, -torch.inf)

        # Scale by sqrt(head_dim) and normalise over the key positions.
        weights = torch.softmax(scores / (self.head_dim ** 0.5), dim=-1)

        # Dropout on the attention weights.
        weights = self.dropout(weights)

        # Weighted sum of the values, then back to (b, num_tokens, num_heads, head_dim).
        per_head_context = torch.matmul(weights, values).transpose(1, 2)

        # transpose() leaves the tensor non-contiguous, which view() cannot
        # handle, hence contiguous() before merging the heads into d_out.
        merged = per_head_context.contiguous().view(batch_size, seq_len, self.d_out)

        # Extra linear transformation over the merged heads.
        return self.out_proj(merged)

#### 结合：将数据载入器得到的嵌入输入多头注意力

In [18]:
# Fix the RNG state so the attention layer is initialised the same way on every run.
torch.manual_seed(123)

# The causal mask only needs to cover the window length used by the dataloader,
# and the layer keeps the embedding width unchanged (d_out == d_in).
context_length = max_length
d_in = d_out = output_dim

mha = MultiHeadAttention(
    d_in=d_in,
    d_out=d_out,
    context_length=context_length,
    dropout=0.0,
    num_heads=2,
)

# Feed the embedded batch through the attention layer.
batch = input_embeddings
context_vecs = mha(batch)

print("context_vecs.shape:", context_vecs.shape)

context_vecs.shape: torch.Size([8, 4, 256])
