# 前馈与注意力模块中的参数

In [1]:
from GPT import TransformerBlock

# Hyperparameters of the 124M-parameter GPT-2 configuration.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    embedding_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Build one Transformer block from the 124M configuration; the cells below
# count the parameters of its `ffn` and `attention` sub-modules.
block = TransformerBlock(GPT_CONFIG_124M)

In [3]:
# Add up the element counts of every parameter tensor in the block's
# feed-forward sub-module.
total_params = sum(
    weight.numel()
    for weight in block.ffn.parameters()
)
print(f"Total number of parameters in feed forward module: {total_params:,}")

Total number of parameters in feed forward module: 4,722,432


In [4]:
# Add up the element counts of every parameter tensor in the block's
# attention sub-module.
total_params = sum(
    weight.numel()
    for weight in block.attention.parameters()
)
print(f"Total number of parameters in attention module: {total_params:,}")

Total number of parameters in attention module: 2,360,064


- 以上结果适用于单个Transformer Block
- 如需统计124M GPT模型中全部12个Transformer Block的参数量，可将上述结果乘以12

# 初始化大型GPT模型

- GPT2-small（我们已经实现的124M配置）：
    - emb_dim=768
    - n_layers=12
    - n_heads=12
- GPT2-medium
    - emb_dim=1024
    - n_layers=24
    - n_heads=16
- GPT2-large
    - emb_dim=1280
    - n_layers=36
    - n_heads=20
- GPT2-XL
    - emb_dim=1600
    - n_layers=48
    - n_heads=25

In [9]:
# Base (124M) configuration; get_config copies it and overrides the
# size-related entries for each GPT-2 variant.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    embedding_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

def get_config(base_config, model_name="gpt2-small"):
    """Return a copy of ``base_config`` resized for a named GPT-2 variant.

    Args:
        base_config: Configuration mapping. It is duplicated with its
            ``copy()`` method, so the object passed in is not reassigned.
        model_name: One of "gpt2-small", "gpt2-medium", "gpt2-large" or
            "gpt2-xl".

    Returns:
        The copied configuration with its "embedding_dim", "n_layers" and
        "n_heads" entries set to the values of the requested variant.

    Raises:
        ValueError: If ``model_name`` matches none of the supported names.
    """
    # variant name -> (embedding_dim, n_layers, n_heads)
    variants = (
        ("gpt2-small", (768, 12, 12)),
        ("gpt2-medium", (1024, 24, 16)),
        ("gpt2-large", (1280, 36, 20)),
        ("gpt2-xl", (1600, 48, 25)),
    )

    config = base_config.copy()

    for variant, (width, depth, heads) in variants:
        if model_name == variant:
            config['embedding_dim'] = width
            config['n_layers'] = depth
            config['n_heads'] = heads
            return config

    raise ValueError(f"Incorrect model name {model_name}")

def calculate_size(model):
    """Print the parameter counts and the parameter memory footprint of ``model``.

    Three lines are printed:
      1. the total number of parameter elements in ``model``;
      2. that total minus the elements of ``model.out_head`` (the count
         reported as "considering weight tying");
      3. the memory taken by all parameters, in MB (1 MB = 1024 * 1024 bytes).

    The memory figure uses each parameter's own element size rather than a
    fixed 4 bytes, so it is also correct for models whose parameters are not
    float32 (e.g. after ``model.half()``). For float32 models the output is
    unchanged.

    Args:
        model: Object exposing ``parameters()`` and an ``out_head`` attribute
            that also exposes ``parameters()``. Every yielded parameter must
            provide ``numel()`` and ``element_size()`` (as torch tensors do).

    Returns:
        None. The results are only printed.
    """
    total_params = 0
    total_size_bytes = 0
    for param in model.parameters():
        n_elements = param.numel()
        total_params += n_elements
        # Bytes per element depend on the parameter dtype (2 for fp16, 4 for fp32, ...).
        total_size_bytes += n_elements * param.element_size()
    print(f"Total number of parameters: {total_params:,}")

    # Leave out the output head's parameters to get the weight-tied count.
    out_head_params = sum(p.numel() for p in model.out_head.parameters())
    total_params_gpt2 = total_params - out_head_params
    print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

    total_size_mb = total_size_bytes / (1024 * 1024)

    print(f"Total size of the model: {total_size_mb:.2f} MB")

In [None]:
from GPT import GPTModel

# Instantiate each GPT-2 variant in turn and print its size report.
# NOTE(review): `model` is rebound on every iteration, so the previous model
# is only released after the next one has been constructed; the larger
# variants may need a lot of memory. Consider freeing it explicitly if this
# cell runs out of RAM.
for model_abbrev in ("small", "medium", "large", "xl"):
    model_name = f"gpt2-{model_abbrev}"
    # Derive the variant's configuration from the 124M base configuration.
    CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)
    model = GPTModel(CONFIG)
    print(f"\n\n{model_name}:")
    calculate_size(model)



gpt2-small:
Total number of parameters: 163,009,536
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB


gpt2-medium:
Total number of parameters: 406,212,608
Number of trainable parameters considering weight tying: 354,749,440
Total size of the model: 1549.58 MB
