In [1]:
!pip install transformers torch

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from transformers import GPT2Model
model = GPT2Model.from_pretrained('gpt2')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def count_params(model, ishuman=True):
    params = sum( p.numel() for p in model.parameters() if p.requires_grad)
    return f"{params/1e6:.2f}M" if ishuman else params

print(model)
print("Total # of params:", count_params(model, ishuman=True))

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
Total # of params: 124.44M


In [4]:
print(f"wte | {model.config.vocab_size * 768}") # V * E
print(f"wpe | {model.config.n_embd * 1024}") # P * E

print(f"{count_params(model._modules['wte'],False)}")

print(f"{count_params(model._modules['wpe'],False)}")

wte | 38597376
wpe | 786432
38597376
786432


In [None]:
# the size of parameter the layer normalization = the embedding dimension with beta and gama addded (2*Embedding)

In [None]:
from transformers import AutoConfig, Qwen3Model, Qwen3Config

config = AutoConfig.from_pretrained("Qwen/Qwen3-8B")

model = Qwen3Model(config)

configuration = model.config
print(config)



Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
   

In [44]:
E = config.hidden_size
H = config.intermediate_size
V= config.vocab_size
L = config.num_hidden_layers
P = config.max_position_embeddings

In [5]:
print(model)

Qwen3Model(
  (embed_tokens): Embedding(151936, 4096)
  (layers): ModuleList(
    (0-35): 36 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (down_proj): Linear(in_features=12288, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
    )
  )
  (norm): Qwen3RMSNorm((

Parameter Count

In [6]:
count_params(model, ishuman=True)

'7568.41M'

Layer by Layer Parameter Count

In [None]:
V = config.vocab_size
E = config.hidden_size
P= config.max_position_embeddings
L = config.num_hidden_layers
print(f"Vocab size (V): {V} Embedding size (E): {E} Max Position Embeddings (P): {P}")

Vocab size (V): 151936 Embedding size (E): 4096 Max Position Embeddings (P): 40960


### Embedding Layers 
we've only one Embedding layer

In [9]:
E = config.hidden_size

In [11]:
print(count_params(model._modules['embed_tokens']))

622.33M


In [None]:
Embedding_tokens = E * V
print('Expected',Embedding_tokens)

Expected 622329856


### Self Attention

Qwen3 uses group-query attention (GQA) q_proj and o_proj are full E X E matrices 

but k_proj, v_proj are only E x E/h_qkv_ratio

In [32]:
self_attn = E * E + E * E/4 + E * E/ 4 + E*E

In [33]:
print("Expected self attention:",self_attn)

Expected self attention: 41943040.0


In [34]:
print("Actual",count_params(model._modules['layers'][0].self_attn))

Actual 41.94M


### MLP

In [37]:

intermidate_size = config.intermediate_size
gated_proj = E * intermidate_size
up_proj = E * intermidate_size
down_proj = intermidate_size * E

mlp = gated_proj + up_proj + down_proj

In [38]:
print("MLP Expected:",mlp)

MLP Expected: 150994944


In [39]:
print("Actual",count_params(model._modules['layers'][0].mlp))

Actual 150.99M


### LayerNorms

In [18]:
print(count_params(model._modules['layers'][0].input_layernorm))

0.00M


In [21]:
print(count_params(model._modules['layers'][0].post_attention_layernorm))

0.00M


In [24]:
print(count_params(model._modules['norm']))

0.00M


In [40]:
print(count_params(model._modules['rotary_emb']))

0.00M


In [45]:
Total = Embedding_tokens + L * (self_attn + mlp)

In [46]:
print("Expected Total:", Total)

Expected Total: 7568097280.0


In [47]:
print("Actual Toatal:",count_params(model, ishuman=False))

Actual Toatal: 7568405504
