# EXERCISE 4.1 NUMBER OF PARAMETERS IN FEED FORWARD AND ATTENTION MODULES

Calculate and compare the number of parameters that are contained in the feed
forward module and those that are contained in the multi-head attention module.

In [30]:
# Define the hyperparameters needed to initialize the 124M-parameter GPT model.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=1024,   # Context length
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of layers
    drop_rate=0.1,         # Dropout rate
    qkv_bias=False,        # Query-Key-Value bias
)

In [31]:
# Make the ch04/02_performance-analysis directory importable by adding it to
# sys.path. The directory is resolved relative to the grandparent of the
# current working directory, so the notebook must be started from its own
# folder for the path to be correct.
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch04", "02_performance-analysis")
print(file_path)
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if file_path not in sys.path:
    sys.path.append(file_path)

f:\project\LLMs-from-scratch-CN\ch04\02_performance-analysis


In [32]:
import torch
from previous_chapters import GPTModel

# Build the GPT model from the GPT_CONFIG_124M hyperparameter dictionary.
model = GPTModel(GPT_CONFIG_124M)

In [33]:
# Sum the parameter counts of every feed forward and every attention module
# in the model, selecting parameters by their dotted names.
ff_numel = 0
attn_numel = 0
for name, p in model.named_parameters():
    if "ff" in name:
        ff_numel += p.numel()
    elif "att" in name:
        attn_numel += p.numel()

# The totals above cover all transformer blocks; divide by the configured
# number of layers (instead of a hard-coded 12) to get the count per block.
n_layers = GPT_CONFIG_124M["n_layers"]
print(f"Feed forward module: {ff_numel / n_layers:,} parameters")
print(f"Attention module: {attn_numel / n_layers:,} parameters")


Feed forward module: 4,722,432.0 parameters
Attention module: 2,360,064.0 parameters


# EXERCISE 4.2 INITIALIZING LARGER GPT MODELS

In this chapter, we initialized a 124 million parameter GPT model, which is known as
"GPT-2 small." Without making any code modifications besides updating the
configuration file, use the GPTModel class to implement GPT-2 medium (using
1024-dimensional embeddings, 24 transformer blocks, 16 multi-head attention heads),
GPT-2 large (1280-dimensional embeddings, 36 transformer blocks, 20 multi-head
attention heads), and GPT-2 XL (1600-dimensional embeddings, 48 transformer
blocks, 25 multi-head attention heads). As a bonus, calculate the total number of
parameters in each GPT model.

In [13]:
# Settings shared by every model size.
BASE_CONFIG = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=1024,   # Context length
    drop_rate=0.0,         # Dropout rate
    qkv_bias=True,         # Whether to use a Query-Key-Value bias
)

# Size-specific settings for each GPT model variant, keyed by a display label.
# Each row is (label, embedding dimension, number of layers, number of heads).
model_configs = {
    label: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for label, emb_dim, n_layers, n_heads in (
        ("gpt-small (124M)", 768, 12, 12),    # Small model
        ("gpt-medium (355M)", 1024, 24, 16),  # Medium model
        ("gpt-large (774M)", 1280, 36, 20),   # Large model
        ("gpt-xl (1558M)", 1600, 48, 25),     # Extra-large model
    )
}

In [18]:
# Instantiate each GPT variant in turn and report its total parameter count.
# NOTE(review): the printed totals are larger than the sizes in the labels;
# presumably the labels assume the output head shares its weights with the
# token embedding, while this count includes both — confirm against GPTModel.
for name, config in model_configs.items():
    # Merge into a fresh dict rather than calling BASE_CONFIG.update(), so the
    # shared base settings are never mutated and each model gets its own
    # independent configuration.
    model_config = {**BASE_CONFIG, **config}
    model = GPTModel(model_config)
    # A generator expression avoids building a throwaway list of counts.
    numel = sum(p.numel() for p in model.parameters())
    print(f"{name} parameters is {numel}.")

gpt-small (124M) parameters is 163037184.
gpt-medium (355M) parameters is 406286336.
gpt-large (774M) parameters is 838359040.
gpt-xl (1558M) parameters is 1638022400.


# EXERCISE 4.3 USING SEPARATE DROPOUT PARAMETERS

At the beginning of this chapter, we defined a global "drop_rate" setting in the
GPT_CONFIG_124M dictionary to set the dropout rate in various places throughout the
GPTModel architecture. Change the code to specify a separate dropout value for the
various dropout layers throughout the model architecture. (Hint: there are three
distinct places where we used dropout layers: the embedding layer, shortcut layer,
and multi-head attention module.)

In [20]:
from previous_chapters_mod import GPTModel as GPTModel_mod

In [21]:
# 124M GPT configuration extended with one dropout rate per dropout location.
# The global "drop_rate" entry is kept alongside the three separate rates.
GPT_CONFIG_124M = dict(
    vocab_size=50257,         # Vocabulary size
    context_length=1024,      # Context length
    emb_dim=768,              # Embedding dimension
    n_heads=12,               # Number of attention heads
    n_layers=12,              # Number of layers
    drop_rate=0.1,            # Dropout rate (global setting)
    emb_drop_rate=0.1,        # Dropout rate for the embedding layer
    att_drop_rate=0.2,        # Dropout rate for the multi-head attention module
    shortcut_drop_rate=0.3,   # Dropout rate for the shortcut layer
    qkv_bias=False,           # Query-Key-Value bias
)

In [26]:
# Build the modified model and list the dropout probability of each of its
# Dropout layers, one "<module name>, <p>" line per layer.
model_mod = GPTModel_mod(GPT_CONFIG_124M)
dropout_cls = torch.nn.Dropout
for name, m in model_mod.named_modules():
    # Skip everything that is not a Dropout layer.
    if not isinstance(m, dropout_cls):
        continue
    print(f"{name}, {m.p}")

drop_emb, 0.1
trf_blocks.0.att.dropout, 0.2
trf_blocks.0.drop_shortcut, 0.3
trf_blocks.1.att.dropout, 0.2
trf_blocks.1.drop_shortcut, 0.3
trf_blocks.2.att.dropout, 0.2
trf_blocks.2.drop_shortcut, 0.3
trf_blocks.3.att.dropout, 0.2
trf_blocks.3.drop_shortcut, 0.3
trf_blocks.4.att.dropout, 0.2
trf_blocks.4.drop_shortcut, 0.3
trf_blocks.5.att.dropout, 0.2
trf_blocks.5.drop_shortcut, 0.3
trf_blocks.6.att.dropout, 0.2
trf_blocks.6.drop_shortcut, 0.3
trf_blocks.7.att.dropout, 0.2
trf_blocks.7.drop_shortcut, 0.3
trf_blocks.8.att.dropout, 0.2
trf_blocks.8.drop_shortcut, 0.3
trf_blocks.9.att.dropout, 0.2
trf_blocks.9.drop_shortcut, 0.3
trf_blocks.10.att.dropout, 0.2
trf_blocks.10.drop_shortcut, 0.3
trf_blocks.11.att.dropout, 0.2
trf_blocks.11.drop_shortcut, 0.3


In [27]:
# Baseline for comparison: build the unmodified GPTModel from the same config
# and list the dropout probability of each of its Dropout layers.
# It is stored under its own name so the modified model held in `model_mod`
# is not overwritten by an instance of the original class.
model_orig = GPTModel(GPT_CONFIG_124M)
for name, m in model_orig.named_modules():
    if isinstance(m, torch.nn.Dropout):
        print(f"{name}, {m.p}")

drop_emb, 0.1
trf_blocks.0.att.dropout, 0.1
trf_blocks.0.drop_shortcut, 0.1
trf_blocks.1.att.dropout, 0.1
trf_blocks.1.drop_shortcut, 0.1
trf_blocks.2.att.dropout, 0.1
trf_blocks.2.drop_shortcut, 0.1
trf_blocks.3.att.dropout, 0.1
trf_blocks.3.drop_shortcut, 0.1
trf_blocks.4.att.dropout, 0.1
trf_blocks.4.drop_shortcut, 0.1
trf_blocks.5.att.dropout, 0.1
trf_blocks.5.drop_shortcut, 0.1
trf_blocks.6.att.dropout, 0.1
trf_blocks.6.drop_shortcut, 0.1
trf_blocks.7.att.dropout, 0.1
trf_blocks.7.drop_shortcut, 0.1
trf_blocks.8.att.dropout, 0.1
trf_blocks.8.drop_shortcut, 0.1
trf_blocks.9.att.dropout, 0.1
trf_blocks.9.drop_shortcut, 0.1
trf_blocks.10.att.dropout, 0.1
trf_blocks.10.drop_shortcut, 0.1
trf_blocks.11.att.dropout, 0.1
trf_blocks.11.drop_shortcut, 0.1
