In [1]:
# 使用sys.path添加上级目录
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch06", "01_main-chapter-code")
print(file_path)
sys.path.append(file_path)

import torch
if torch.cuda.is_available():
   device = torch.device("cuda")
elif torch.backends.mps.is_available():
   device = torch.device("mps")
else:
   device = torch.device("cpu")

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch06/01_main-chapter-code


# EXERCISE 6.1 INCREASING THE CONTEXT LENGTH
Pad the inputs to the maximum number of tokens the model supports and observe
how it impacts the predictive performance.

In [2]:
max_length = 1024

In [3]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [4]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        # 编码输入数据
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]
        # 最大长度截断
        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            self.encode_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]
        # 填充
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )
    
    def __len__(self):
        return len(self.data)
    
    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length

In [23]:
data_dir = "../../ch06/02_bonus_additional-experiments/"
train_dataset = SpamDataset(
    csv_file=data_dir+"train.csv",
    max_length=max_length,
    tokenizer=tokenizer
)

val_dataset = SpamDataset(
    csv_file=data_dir+"validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file=data_dir+"test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

print(train_dataset.max_length)

1024


In [24]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [7]:
# print("Train loader:")
# for input_batch, target_batch in train_loader:
#     pass

# print("Input batch dimensions:", input_batch.shape)
# print("Label batch dimensions", target_batch.shape)

In [25]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,     # 词表大小
    "context_length": 1024,  # 上下文长度
    "drop_rate": 0.0,        # Dropout率
    "qkv_bias": True         # qkv向量是否使用Bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# 上下文长度应小于max_length
assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
    f"`max_length={BASE_CONFIG['context_length']}`"
)

In [26]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt

# 下载预训练模型权重
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [27]:
# 冻结参数
for param in model.parameters():
    param.requires_grad = False

In [28]:
# 替换输出层，用于分类
torch.manual_seed(123)

num_classes = 2
model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)

In [29]:
# 除了输出层外，令trf的最后一个模块和final_norm的参数可训练
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True

In [30]:
# 计算分类准备率
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            # 将数据移到指定设备
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            # 模型预估
            with torch.no_grad():
                logits = model(input_batch)[:, -1, :]
            # 获取分类结果
            predicted_labels = torch.argmax(logits, dim=-1)
            # 统计总样本量、分类正确样本量
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

In [31]:
# 定义batch训练损失
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    # 只关注最后一个输出
    logits = model(input_batch)[:, -1, :]
    # 交叉熵损失
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

In [32]:
# 定义loader训练损失
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [33]:
# 定义训练过程
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter):
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1
    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            # 初始化梯度
            optimizer.zero_grad()
            # loss计算
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            # 反向传播
            loss.backward()
            # 权重更新
            optimizer.step()
            
            examples_seen += input_batch.shape[0]
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}) "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
        
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy * 100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy * 100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [34]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [35]:
model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [36]:
# 训练模型
import time 

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000) Train loss 4.888, Val loss 4.461
Ep 1 (Step 000050) Train loss 0.690, Val loss 0.687
Ep 1 (Step 000100) Train loss 0.528, Val loss 0.550
Training accuracy: 82.50% | Validation accuracy: 92.50%
Ep 2 (Step 000150) Train loss 0.479, Val loss 0.386
Ep 2 (Step 000200) Train loss 0.517, Val loss 0.355
Ep 2 (Step 000250) Train loss 0.288, Val loss 0.308
Training accuracy: 77.50% | Validation accuracy: 85.00%
Ep 3 (Step 000300) Train loss 0.502, Val loss 0.287
Ep 3 (Step 000350) Train loss 0.314, Val loss 0.280
Training accuracy: 70.00% | Validation accuracy: 87.50%
Ep 4 (Step 000400) Train loss 0.496, Val loss 0.307
Ep 4 (Step 000450) Train loss 0.464, Val loss 0.274
Ep 4 (Step 000500) Train loss 0.566, Val loss 0.275
Training accuracy: 85.00% | Validation accuracy: 85.00%
Ep 5 (Step 000550) Train loss 0.354, Val loss 0.260
Ep 5 (Step 000600) Train loss 0.484, Val loss 0.299
Training accuracy: 87.50% | Validation accuracy: 87.50%
Training completed in 10.66 minutes.


In [37]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 83.08%
Validation accuracy: 87.92%
Test accuracy: 78.33%


In [38]:
# 上下文长度变化：120 --> 1024
# 训练时间：0.74 mins --> 9.94 mins 
# 准确率变化：(97.21%, 97.32%, 95.67%) --> (83.85%, 82.55%, 84.33%)

In [22]:
# ! cd ..\..\ch06\02_bonus_additional-experiments && python additional_experiments.py --context_length "model_context_length"
! cd ../../ch06/02_bonus_additional-experiments && python additional_experiments.py --context_length "model_context_length"

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
model.ckpt.data-00000-of-00001: 100%|███████| 498M/498M [03:42<00:00, 2.23MiB/s]
model.ckpt.index: 100%|███████████████████| 5.21k/5.21k [00:00<00:00, 3.80MiB/s]
model.ckpt.meta: 100%|███████████████████████| 471k/471k [00:01<00:00, 374kiB/s]
vocab.bpe: 100%|█████████████████████████████| 456k/456k [00:02<00:00, 220kiB/s]
File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv
Ep 1 (Step 000000): Train loss 4.888, Val loss 4.461
^C
Traceback (most recent call last):
  File "/Users/young/project/llmProject/LLMs-from-scratch-CN/ch06/02_bonus_additional-experiments/additional_experiments.py", line 670, in <module>
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
  File "/Users/young/project/llmProject/LLMs-from-scratch-CN/ch06/02_bonus_additional-exp

# EXERCISE 6.2 FINETUNING THE WHOLE MODEL
Instead of finetuning just the final transformer block, finetune the entire model and
assess the impact on predictive performance.

In [22]:
max_length = 120

In [23]:
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=max_length,
    tokenizer=tokenizer
)

val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

print(train_dataset.max_length)

120


In [24]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [25]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt

# 下载预训练模型权重
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.to(device)
model.eval()

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [26]:
# 冻结参数
for param in model.parameters():
    param.requires_grad = True

In [27]:
# 训练模型
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000) Train loss 3.917, Val loss 4.082
Ep 1 (Step 000050) Train loss 0.544, Val loss 0.469
Ep 1 (Step 000100) Train loss 0.128, Val loss 0.204
Training accuracy: 95.00% | Validation accuracy: 87.50%
Ep 2 (Step 000150) Train loss 0.216, Val loss 0.065
Ep 2 (Step 000200) Train loss 0.092, Val loss 0.052
Ep 2 (Step 000250) Train loss 0.039, Val loss 0.036
Training accuracy: 92.50% | Validation accuracy: 100.00%
Ep 3 (Step 000300) Train loss 0.031, Val loss 0.163
Ep 3 (Step 000350) Train loss 0.005, Val loss 0.037
Training accuracy: 100.00% | Validation accuracy: 95.00%
Ep 4 (Step 000400) Train loss 0.005, Val loss 0.081
Ep 4 (Step 000450) Train loss 0.002, Val loss 0.009
Ep 4 (Step 000500) Train loss 0.002, Val loss 0.032
Training accuracy: 100.00% | Validation accuracy: 97.50%
Ep 5 (Step 000550) Train loss 0.001, Val loss 0.151
Ep 5 (Step 000600) Train loss 0.001, Val loss 0.176
Training accuracy: 100.00% | Validation accuracy: 97.50%
Training completed in 3.10 minutes.


In [28]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 100.00%
Validation accuracy: 97.32%
Test accuracy: 97.33%


In [45]:
! cd ..\..\ch06\02_bonus_additional-experiments && python additional_experiments.py --trainable_layers all

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe
Ep 1 (Step 000000): Train loss 2.230, Val loss 2.499
Ep 1 (Step 000050): Train loss 0.247, Val loss 0.136
Ep 1 (Step 000100): Train loss 0.188, Val loss 0.194
Training accuracy: 97.50% | Validation accuracy: 95.00%
Ep 2 (Step 000150): Train loss 0.454, Val loss 0.117
Ep 2 (Step 000200): Train loss 0.165, Val loss 0.126
Ep 2 (Step 000250): Train loss 0.101, Val loss 0.087
Training accuracy: 100.00% | Validation accuracy: 95.00%
Ep 3 (Step 000300): Train loss 0.065, Val loss 0.113
Ep 3 (Step 000350): Train loss 0.031, Val loss 0.147
Training 

2025-04-27 21:27:38.855786: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 21:27:39.895841: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# EXERCISE 6.3 FINETUNING THE FIRST VERSUS LAST TOKEN
Rather than finetuning the last output token, try finetuning the first output token and
observe the changes in predictive performance when finetuning the model in later
sections.

In [29]:
max_length = 120

In [30]:
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=max_length,
    tokenizer=tokenizer
)

val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

print(train_dataset.max_length)

120


In [31]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [32]:
# 计算分类准备率
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            # 将数据移到指定设备
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            # 模型预估
            with torch.no_grad():
                logits = model(input_batch)[:, 0, :]
            # 获取分类结果
            predicted_labels = torch.argmax(logits, dim=-1)
            # 统计总样本量、分类正确样本量
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

In [33]:
# 定义batch训练损失
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    # 只关注最后一个输出
    logits = model(input_batch)[:, 0, :]
    # 交叉熵损失
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

In [34]:
# 定义loader训练损失
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i < num_batches:
                loss = calc_loss_batch(input_batch, target_batch, model, device)
                total_loss += loss.item()
            else:
                break
    return total_loss / num_batches

In [35]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt

# 下载预训练模型权重
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.to(device)
model.eval()

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [36]:
# 冻结参数
for param in model.parameters():
    param.requires_grad = False

In [37]:
# 除了输出层外，令trf的最后一个模块和final_norm的参数可训练
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True

In [38]:
# 训练模型
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000) Train loss 5.973, Val loss 5.965
Ep 1 (Step 000050) Train loss 0.781, Val loss 0.870
Ep 1 (Step 000100) Train loss 0.616, Val loss 0.757
Training accuracy: 72.50% | Validation accuracy: 50.00%
Ep 2 (Step 000150) Train loss 0.636, Val loss 0.748
Ep 2 (Step 000200) Train loss 0.564, Val loss 0.681
Ep 2 (Step 000250) Train loss 0.560, Val loss 0.729
Training accuracy: 72.50% | Validation accuracy: 60.00%
Ep 3 (Step 000300) Train loss 0.557, Val loss 0.656
Ep 3 (Step 000350) Train loss 0.492, Val loss 0.687
Training accuracy: 75.00% | Validation accuracy: 52.50%
Ep 4 (Step 000400) Train loss 0.492, Val loss 0.634
Ep 4 (Step 000450) Train loss 0.457, Val loss 0.639
Ep 4 (Step 000500) Train loss 0.458, Val loss 0.642
Training accuracy: 75.00% | Validation accuracy: 67.50%
Ep 5 (Step 000550) Train loss 0.516, Val loss 0.624
Ep 5 (Step 000600) Train loss 0.388, Val loss 0.609
Training accuracy: 77.50% | Validation accuracy: 55.00%
Training completed in 1.32 minutes.


In [39]:
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 76.83%
Validation accuracy: 73.15%
Test accuracy: 73.33%


In [40]:
! cd ..\..\ch06\02_bonus_additional-experiments && python additional_experiments.py --trainable_token first

zsh:cd:1: no such file or directory: ....ch0602_bonus_additional-experiments
