In [2]:
# Make the chapter-5 code directory importable from this notebook.
import sys
import os
# Go two levels above the current working directory, then into
# ch05/01_main-chapter-code.
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch05", "01_main-chapter-code")
# Show the resolved path so a wrong working directory is easy to spot.
print(file_path)
# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if file_path not in sys.path:
    sys.path.append(file_path)

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch05/01_main-chapter-code


## 5.1 评估文本生成大模型

### 5.1.1 用GPT来生成文本

In [3]:
import torch
from previous_chapters import GPTModel

# Configuration of the 124M model
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# Fix the seed before constructing the model.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Evaluation mode: the dropout layers are inactive while predicting
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [4]:
import tiktoken
from previous_chapters import generate_text_simple

# text -> ids
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return its token ids as a batch of one sequence.

    Args:
        text: String to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)``
            method returning a list of integer token ids (the notebook
            passes a tiktoken encoding).

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    # Permit the literal "<|endoftext|>" marker in the input text.
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: for empty text torch.tensor([]) would otherwise infer
    # float32, which cannot be used as indices for an embedding lookup.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
    return encoded_tensor

# ids -> text
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into text.

    Args:
        token_ids: Tensor of token ids, either a batch of one sequence with
            shape ``(1, num_tokens)`` or a single 1-D sequence.
        tokenizer: Object with a ``decode(list_of_ids)`` method.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of ids.
    """
    # Drop the leading batch axis only when there is one. Squeezing a 1-D
    # tensor of length 1 would produce a 0-d tensor whose tolist() is a bare
    # int rather than a list of ids.
    flat = token_ids.squeeze(0) if token_ids.dim() > 1 else token_ids
    return tokenizer.decode(flat.tolist())

# Text-generation smoke test
start_context = "Every effort moves you"
# Use the GPT-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")
# Convert the prompt to token ids and feed them to the model
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    # Generate 10 new tokens
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)
# Convert the ids returned by the model back to text
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


### 5.1.2 计算文本生成的损失：交叉熵（cross-entropy）和困惑度（perplexity）

In [6]:
# Build a small batch of example inputs and targets, given as token ids.
# Each targets row is the matching inputs row shifted one position to the
# left, with the next token appended.
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]
targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [7]:
# Compute the next-token probabilities for the current inputs
# (no gradients are tracked for this forward pass).
with torch.no_grad():
    logits = model(inputs)
# Normalize over the last axis so each position's scores sum to 1.
probas = torch.softmax(logits, dim=-1)
# probas has shape [2, 3, 50257] (see the printed output): for every token
# position of every input row, one probability per vocabulary entry for the
# next token.
# With a causal attention mask, the prediction at a position depends only on
# that token and the ones before it (the mask lives in GPTModel, which is
# not shown here).
print(probas.shape)

torch.Size([2, 3, 50257])


In [8]:
# Take the argmax over the vocabulary axis to get the predicted next token;
# keepdim=True keeps a trailing axis of size 1, as the printed tensor shows.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [9]:
# Convert the predicted ids of the first batch row to text and compare them
# with the targets; flatten() removes the trailing axis kept by argmax.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")


Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [25]:
# Look up the probability the model assigned to each target token.
# The position index is derived from the targets' sequence length instead of
# a hard-coded [0, 1, 2], so the cell keeps working if the example sequences
# change length.
position_idx = torch.arange(targets.shape[1], device=targets.device)

text_idx = 0
target_probas_1 = probas[text_idx, position_idx, targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, position_idx, targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [26]:
# Concatenate the target probabilities of both texts and take their
# natural logarithm
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])


In [27]:
# Average the log-probabilities.
# The closer a log-probability is to 0, the more accurate the prediction.
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-10.7940)


In [28]:
# Negative average log-probability, i.e. the cross-entropy loss
neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas)

tensor(10.7940)


In [29]:
# Inspect the shapes of logits and targets
# (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.shape)
# (batch_size, num_tokens)
print("Targets shape:", targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [30]:
# Merge the batch and token axes of the logits (dims 0 and 1) and flatten
# the targets to 1-D, so there is one row of logits per target id.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [31]:
# Compute the cross-entropy loss with PyTorch's cross_entropy
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
# Same value as neg_avg_log_probas
print(loss)

tensor(10.7940)


In [32]:
# Compute the perplexity, which is the exponential of the cross-entropy
perplexity = torch.exp(loss)
# The lower the perplexity, the more accurate the predictions
print(perplexity)

tensor(48725.8203)


### 5.1.3 计算训练集和验证集的损失