In [1]:
import torch
import torch.nn as nn

## EXERCISE 5.1
Use the print_sampled_tokens function to print the sampling frequencies of the
softmax probabilities scaled with the temperatures shown in Figure 5.13. How often
is the word "pizza" sampled in each case? Can you think of a faster and more
accurate way to determine how often the word "pizza" is sampled?

In [2]:
vocab = { 
    "closer": 0,
    "every": 1, 
    "effort": 2, 
    "forward": 3,
    "inches": 4,
    "moves": 5, 
    "pizza": 6,
    "toward": 7,
    "you": 8,
} 

inverse_vocab = {v: k for k, v in vocab.items()}

# Suppose input is "every effort moves you", and the LLM
# returns the following logits for the next token:
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)

probas = torch.softmax(next_token_logits, dim=0)

In [11]:
def print_sampled_tokens(logits, temperature=1.0):
    torch.manual_seed(123)
    # 从概率分布中采样
    probas = torch.softmax(logits / temperature, dim=0)
    print(f"pizza expected cnt = {probas[6] * 1000:.2f}")
    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]
    sampled_ids = torch.bincount(torch.tensor(sample))
    # 统计单词文本的频率
    for i, freq in enumerate(sampled_ids):
        print(f"{freq} x {inverse_vocab[i]}")

# 统计采样1000次时的单词频率
temperatures = [1, 0.1, 5]
for t in temperatures:
    print("*" * 30)
    print(f"temperatures={t}")
    print_sampled_tokens(next_token_logits, t)

******************************
temperatures=1
pizza expected cnt = 0.10
73 x closer
0 x every
0 x effort
582 x forward
2 x inches
0 x moves
0 x pizza
343 x toward
******************************
temperatures=0.1
pizza expected cnt = 0.00
0 x closer
0 x every
0 x effort
985 x forward
0 x inches
0 x moves
0 x pizza
15 x toward
******************************
temperatures=5
pizza expected cnt = 43.00
165 x closer
75 x every
42 x effort
239 x forward
71 x inches
46 x moves
32 x pizza
227 x toward
103 x you


## EXERCISE 5.2
Play around with different temperatures and top-k settings. Based on your
observations, can you think of applications where lower temperature and top-k
settings are desired? Vice versa, can you think of applications where higher
temperature and top-k settings are preferred? (It's recommended to also revisit this
exercise at the end of the chapter after loading the pretrained weights from OpenAI.)

In [None]:
# 更低的温度系数、更小的top-k值，会使得模型更倾向于生成重复的单词
# 更适合有明确答案的场景，如数学、编程、统计等场景
# 更高的温度系数、更大的top-k值，会使得模型更倾向于生成多样化的单词
# 更适合没有明确答案的场景，如写作、对话等场景

## EXERCISE 5.3
What are the different combinations of settings for the generate function to force
deterministic behavior, that is, disabling the random sampling such that it always
produces the same outputs similar to the generate_simple function?
So far, we covered how to pretrain LLMs and use them to generate text. The last two
sections of this chapter will discuss how we save and load the trained LLM and how
we load pretrained weights from OpenAI.

In [13]:
# 使用sys.path添加上级目录
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch05", "01_main-chapter-code")
print(file_path)
sys.path.append(file_path)

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch05/01_main-chapter-code


In [14]:
import torch
from previous_chapters import GPTModel

# 124M模型配置
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# 预测模式，dropout层不起作用
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [12]:
import tiktoken
# text -> ids
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

# ids -> text
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

# 生成文本测试
start_context = "Every effort moves you"
# 使用gpt2的tokenizer
tokenizer = tiktoken.get_encoding("gpt2")



In [15]:
# 增加topk和temperature参数的文本生成函数
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        # 取context_size个词
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        # 取最后一个词的预测logits
        # 实际的场景下，会通过kv cache的方式来减少冗余计算
        logits = logits[:, -1, :]
        
        # top k 采样
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
        
        # 温度校正
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
        if idx_next == eos_id:
            break
        
        idx = torch.cat((idx, idx_next), dim=1)
    
    return idx

In [18]:
# 新的生成策略测试
torch.manual_seed(123)
token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youEveryiliaralso stabbed OrleansAllowsean 52anche crime winter unbeaten quoteembedreportprint earning


In [None]:
# 如果想要产生确定的结果，可以设置temperature=0.0 or top_k=1 

## EXERCISE 5.4
After saving the weights, load the model and optimizer in a new Python session or
Jupyter notebook file and continue pretraining it for 1 more epoch using the
train_model_simple function.

In [41]:
from previous_chapters import generate_text_simple

In [42]:
# 计算batch loss
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

# 计算数据集的loss
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [43]:
def train_model_simple(model, train_loader, val_loader, optimizer, device,
                       num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
    # training loop
    for epoch in range(num_epochs):
        # 训练模式
        model.train()
        for input_batch, target_batch in train_loader:
            # 梯度清零
            optimizer.zero_grad()
            # 计算batch loss
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            # 反向传播
            loss.backward()
            # 更新参数
            optimizer.step()
            # 更新token计数
            tokens_seen += input_batch.numel()
            # 更新全局步数
            global_step += 1
            # 每eval_freq个batch，计算一次loss, 查看训练效果
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                # 记录loss
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                # 记录token计数
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # 每个epoch结束，生成和打印文本
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen

# 模型评估
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

# 生成和打印文本
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model,
            idx=encoded,
            max_new_tokens=50,
            context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()

In [44]:
if torch.cuda.is_available():
   device = torch.device("cuda")
elif torch.backends.mps.is_available():
   device = torch.device("mps")
else:
   device = torch.device("cpu")

print(f"Using {device} device.")

Using mps device.


In [45]:
# 下载训练数据
import os 
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode("utf-8")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [46]:
# 构建dataloader
from previous_chapters import create_dataloader_v1

# 划分训练集和验证集
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

# 训练集
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# 验证集
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
# 加载模型
# 加载模型参数
model = GPTModel(GPT_CONFIG_124M)
# device = torch.device(device)
model.load_state_dict(torch.load("model.pth", map_location=device, weights_only=True))

<All keys matched successfully>

In [48]:
# 训练模型
import time
start_time = time.time()

torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)
model.to(device)
# 定义优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10
# 训练模型
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

end_time = time.time()
print(f"Total training time: {(end_time - start_time) / 60:.2f} minutes")

Ep 1 (Step 000000): Train loss 1.149, Val loss 6.806
Ep 1 (Step 000005): Train loss 0.523, Val loss 6.581
Every effort moves you?"  "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"  He laughed again, and threw back his glory, and as once one had longed to say: "Be dissatisfied with your
Ep 2 (Step 000010): Train loss 0.375, Val loss 6.496
Ep 2 (Step 000015): Train loss 0.277, Val loss 6.540
Every effort moves you?"  "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"  He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I
Ep 3 (Step 000020): Train loss 0.180, Val loss 6.738
Ep 3 (Step 000025): Train loss 0.152, Val loss 6.773
Every effort moves you?"  "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"  He laughed again, and threw back his head to look up at the sketch of the donkey. "There were days when I
Ep 4 (Step 000030): Train loss 0.122, Val loss 6.872
Ep

## EXERCISE 5.5
Calculate the training and validation set losses of the GPTModel with the pretrained
weights from OpenAI on the "The Verdict" dataset.

In [49]:
# 下载OpenAI的预训练模型
from gpt_download import download_and_load_gpt2


In [50]:
# 加载124M模型
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

In [51]:
# 加载模型参数前，进行shape检查
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [52]:
import numpy as np

# 加载模型参数
def load_weights_into_gpt(gpt, params):
    # 加载embedding层
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
    # 加载transformer层
    for b in range(len(params["blocks"])):
        # 加载带bias的qkv
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1
        )
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)
        
        # 加载输出层
        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        # 加载ff层
        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        # 加载norm层
        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])
    
    # 加载最后的norm层
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # 加载输出层
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [53]:
# 定义模型
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [54]:
# 模型加载
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [60]:
torch.manual_seed(123)

eval_iter = 50
train_loss, val_loss = evaluate_model(gpt, train_loader, val_loader, device, eval_iter)
print(f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

Train loss 3.755, Val loss 3.560


In [61]:
# val loss有明显的下降 6.373 --> 3.560

## EXERCISE 5.6
Readers are encouraged to experiment with GPT-2 models of different sizes, for
example, the largest 1558M parameter model and compare the generated text to the
124M model we loaded in this chapter.

In [None]:
# 加载124M模型
settings, params = download_and_load_gpt2(model_size="1558M", models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 38.1kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 774kiB/s]
hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 30.1kiB/s]
model.ckpt.data-00000-of-00001:  55%|█████▍    | 3.42G/6.23G [17:17<29:46, 1.57MiB/s]  

In [None]:
# 定义模型
model_name = "gpt2-xl (1558M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt_xl = GPTModel(NEW_CONFIG)
gpt_xl.eval()

In [None]:
# 模型加载
load_weights_into_gpt(gpt_xl, params)
gpt_xl.to(device);

In [None]:
torch.manual_seed(123)

eval_iter = 50
train_loss, val_loss = evaluate_model(gpt_xl, train_loader, val_loader, device, eval_iter)
print(f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

In [None]:
torch.manual_seed(123)

token_ids = generate(
    model=gpt_xl,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))