In [2]:
import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
TOKEN = "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"

In [3]:
def print_MB_size(output):
    num_elements = output.numel()
    element_size = output.element_size()  # Returns the size in bytes of each element
    total_memory_MB = num_elements * element_size / 1024**2
    return f"Size in MB: {total_memory_MB}"

def print_model_intermediate_sizes(model_name, size=512):
    model = AutoModelForCausalLM.from_pretrained(model_name, token=TOKEN)
    
    # This will hold tuples of layer info and outputs for later sorting and printing
    layer_outputs = []

    def hook(module, input, output, prefix=""):
        layer_num = len(layer_outputs) + 1  # Sequential layer number
        layer_info = f"Layer {layer_num}: {module.__class__.__name__}"

        if output is None:
            layer_outputs.append((layer_num, f"{layer_info} Output is None"))
            return

        if isinstance(output, tuple):
            for i, o in enumerate(output):
                if hasattr(o, 'size'):
                    size_info = f"{layer_info} Output {i}: {o.size()} {print_MB_size(o)}"
                    layer_outputs.append((layer_num, size_info))
                else:
                    layer_outputs.append((layer_num, f"{layer_info} Output {i}: Output does not have a 'size' attribute"))
        else:
            if hasattr(output, 'size'):
                size_info = f"{layer_info} Output: {output.size()} {print_MB_size(output)}"
                layer_outputs.append((layer_num, size_info))
            else:
                layer_outputs.append((layer_num, f"{layer_info} Output does not have a 'size' attribute"))

    for layer in model.modules():
        if hasattr(layer, 'forward'):
            layer.register_forward_hook(hook)

    input_ids = torch.randint(0, 20000, (1, size))
    attention_mask = torch.ones(1, size)

    with torch.no_grad():
        model(input_ids=input_ids, attention_mask=attention_mask)

    # Sort by layer number and print
    layer_outputs.sort(key=lambda x: x[0])
    for _, output in layer_outputs:
        print(output)


In [4]:
def calculate_memory_MB(tensor):
    num_elements = tensor.numel()
    element_size = tensor.element_size()  # Returns the size in bytes of each element
    total_memory_MB = num_elements * element_size / 1024**2
    return total_memory_MB

def print_model_weights_size(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, token=TOKEN)
    
    total_size_MB = 0
    for name, param in model.named_parameters():
        layer_size_MB = calculate_memory_MB(param)
        total_size_MB += layer_size_MB
        print(f"{name} size: {layer_size_MB:.2f} MB")
    total_size_GB = total_size_MB / 1024
    print(f"Total model size: {total_size_GB:.2f} GB")

In [43]:
# Example usage for Mistral 7B 
print_model_intermediate_sizes('mistralai/Mistral-7B-v0.1', size=8000)  

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 2: MistralRMSNorm Output: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 3: Linear Output: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 4: Linear Output: torch.Size([1, 8000, 1024]) Size in MB: 31.25
Layer 5: Linear Output: torch.Size([1, 8000, 1024]) Size in MB: 31.25
Layer 6: MistralRotaryEmbedding Output 0: torch.Size([8000, 128]) Size in MB: 3.90625
Layer 6: MistralRotaryEmbedding Output 1: torch.Size([8000, 128]) Size in MB: 3.90625
Layer 8: Linear Output: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 9: MistralSdpaAttention Output 0: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 9: MistralSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: MistralSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: MistralRMSNorm Output: torch.Size([1, 8000, 4096]) Size in MB: 125.0
Layer 13: Linear Output: torch.Size([1, 8000, 14336]) Size in MB: 437.5


In [45]:
print_model_weights_size('mistralai/Mistral-7B-v0.1')

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

embed_tokens.weight size: 500.00 MB
layers.0.self_attn.q_proj.weight size: 64.00 MB
layers.0.self_attn.k_proj.weight size: 16.00 MB
layers.0.self_attn.v_proj.weight size: 16.00 MB
layers.0.self_attn.o_proj.weight size: 64.00 MB
layers.0.mlp.gate_proj.weight size: 224.00 MB
layers.0.mlp.up_proj.weight size: 224.00 MB
layers.0.mlp.down_proj.weight size: 224.00 MB
layers.0.input_layernorm.weight size: 0.02 MB
layers.0.post_attention_layernorm.weight size: 0.02 MB
layers.1.self_attn.q_proj.weight size: 64.00 MB
layers.1.self_attn.k_proj.weight size: 16.00 MB
layers.1.self_attn.v_proj.weight size: 16.00 MB
layers.1.self_attn.o_proj.weight size: 64.00 MB
layers.1.mlp.gate_proj.weight size: 224.00 MB
layers.1.mlp.up_proj.weight size: 224.00 MB
layers.1.mlp.down_proj.weight size: 224.00 MB
layers.1.input_layernorm.weight size: 0.02 MB
layers.1.post_attention_layernorm.weight size: 0.02 MB
layers.2.self_attn.q_proj.weight size: 64.00 MB
layers.2.self_attn.k_proj.weight size: 16.00 MB
layers.2.s

In [56]:
print_model_intermediate_sizes('meta-llama/Llama-2-7b-hf', size=8000)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 2: LlamaRMSNorm Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 3: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 4: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 5: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 6: LlamaRotaryEmbedding Output 0: torch.Size([1, 512, 128]) Size in MB: 0.25
Layer 6: LlamaRotaryEmbedding Output 1: torch.Size([1, 512, 128]) Size in MB: 0.25
Layer 8: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 9: LlamaSdpaAttention Output 0: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 9: LlamaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: LlamaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: LlamaRMSNorm Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 13: Linear Output: torch.Size([1, 512, 11008]) Size in MB: 21.5
Layer 14: SiLU Output: torch.Size([1, 512,

In [57]:
print_model_weights_size('meta-llama/Llama-2-7b-hf')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

embed_tokens.weight size: 500.00 MB
layers.0.self_attn.q_proj.weight size: 64.00 MB
layers.0.self_attn.k_proj.weight size: 64.00 MB
layers.0.self_attn.v_proj.weight size: 64.00 MB
layers.0.self_attn.o_proj.weight size: 64.00 MB
layers.0.mlp.gate_proj.weight size: 172.00 MB
layers.0.mlp.up_proj.weight size: 172.00 MB
layers.0.mlp.down_proj.weight size: 172.00 MB
layers.0.input_layernorm.weight size: 0.02 MB
layers.0.post_attention_layernorm.weight size: 0.02 MB
layers.1.self_attn.q_proj.weight size: 64.00 MB
layers.1.self_attn.k_proj.weight size: 64.00 MB
layers.1.self_attn.v_proj.weight size: 64.00 MB
layers.1.self_attn.o_proj.weight size: 64.00 MB
layers.1.mlp.gate_proj.weight size: 172.00 MB
layers.1.mlp.up_proj.weight size: 172.00 MB
layers.1.mlp.down_proj.weight size: 172.00 MB
layers.1.input_layernorm.weight size: 0.02 MB
layers.1.post_attention_layernorm.weight size: 0.02 MB
layers.2.self_attn.q_proj.weight size: 64.00 MB
layers.2.self_attn.k_proj.weight size: 64.00 MB
layers.2.s

In [51]:
print_model_intermediate_sizes('google/gemma-7b')

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 512, 3072]) Size in MB: 6.0
Layer 2: GemmaRMSNorm Output: torch.Size([1, 512, 3072]) Size in MB: 6.0
Layer 3: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 4: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 5: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 6: GemmaRotaryEmbedding Output 0: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 6: GemmaRotaryEmbedding Output 1: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 8: Linear Output: torch.Size([1, 512, 3072]) Size in MB: 6.0
Layer 9: GemmaSdpaAttention Output 0: torch.Size([1, 512, 3072]) Size in MB: 6.0
Layer 9: GemmaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: GemmaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: GemmaRMSNorm Output: torch.Size([1, 512, 3072]) Size in MB: 6.0
Layer 13: Linear Output: torch.Size([1, 512, 24576]) Size in MB: 48.0
Layer 14: GELUActivation Output: torch.Size(

In [52]:
print_model_weights_size('google/gemma-7b')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

embed_tokens.weight size: 3000.00 MB
layers.0.self_attn.q_proj.weight size: 48.00 MB
layers.0.self_attn.k_proj.weight size: 48.00 MB
layers.0.self_attn.v_proj.weight size: 48.00 MB
layers.0.self_attn.o_proj.weight size: 48.00 MB
layers.0.mlp.gate_proj.weight size: 288.00 MB
layers.0.mlp.up_proj.weight size: 288.00 MB
layers.0.mlp.down_proj.weight size: 288.00 MB
layers.0.input_layernorm.weight size: 0.01 MB
layers.0.post_attention_layernorm.weight size: 0.01 MB
layers.1.self_attn.q_proj.weight size: 48.00 MB
layers.1.self_attn.k_proj.weight size: 48.00 MB
layers.1.self_attn.v_proj.weight size: 48.00 MB
layers.1.self_attn.o_proj.weight size: 48.00 MB
layers.1.mlp.gate_proj.weight size: 288.00 MB
layers.1.mlp.up_proj.weight size: 288.00 MB
layers.1.mlp.down_proj.weight size: 288.00 MB
layers.1.input_layernorm.weight size: 0.01 MB
layers.1.post_attention_layernorm.weight size: 0.01 MB
layers.2.self_attn.q_proj.weight size: 48.00 MB
layers.2.self_attn.k_proj.weight size: 48.00 MB
layers.2.

In [58]:
print_model_intermediate_sizes('google/gemma-2b')

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 2: GemmaRMSNorm Output: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 3: Linear Output: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 4: Linear Output: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 5: Linear Output: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 6: GemmaRotaryEmbedding Output 0: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 6: GemmaRotaryEmbedding Output 1: torch.Size([1, 512, 256]) Size in MB: 0.5
Layer 8: Linear Output: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 9: GemmaSdpaAttention Output 0: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 9: GemmaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: GemmaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: GemmaRMSNorm Output: torch.Size([1, 512, 2048]) Size in MB: 4.0
Layer 13: Linear Output: torch.Size([1, 512, 16384]) Size in MB: 32.0
Layer 14: GELUActivation Output: torch.Size([1

In [59]:
print_model_weights_size('google/gemma-2b')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

embed_tokens.weight size: 2000.00 MB
layers.0.self_attn.q_proj.weight size: 16.00 MB
layers.0.self_attn.k_proj.weight size: 2.00 MB
layers.0.self_attn.v_proj.weight size: 2.00 MB
layers.0.self_attn.o_proj.weight size: 16.00 MB
layers.0.mlp.gate_proj.weight size: 128.00 MB
layers.0.mlp.up_proj.weight size: 128.00 MB
layers.0.mlp.down_proj.weight size: 128.00 MB
layers.0.input_layernorm.weight size: 0.01 MB
layers.0.post_attention_layernorm.weight size: 0.01 MB
layers.1.self_attn.q_proj.weight size: 16.00 MB
layers.1.self_attn.k_proj.weight size: 2.00 MB
layers.1.self_attn.v_proj.weight size: 2.00 MB
layers.1.self_attn.o_proj.weight size: 16.00 MB
layers.1.mlp.gate_proj.weight size: 128.00 MB
layers.1.mlp.up_proj.weight size: 128.00 MB
layers.1.mlp.down_proj.weight size: 128.00 MB
layers.1.input_layernorm.weight size: 0.01 MB
layers.1.post_attention_layernorm.weight size: 0.01 MB
layers.2.self_attn.q_proj.weight size: 16.00 MB
layers.2.self_attn.k_proj.weight size: 2.00 MB
layers.2.self_

In [60]:
print_model_intermediate_sizes('lmsys/vicuna-7b-v1.5')

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 2: LlamaRMSNorm Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 3: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 4: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 5: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 6: LlamaRotaryEmbedding Output 0: torch.Size([1, 512, 128]) Size in MB: 0.25
Layer 6: LlamaRotaryEmbedding Output 1: torch.Size([1, 512, 128]) Size in MB: 0.25
Layer 8: Linear Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 9: LlamaSdpaAttention Output 0: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 9: LlamaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: LlamaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: LlamaRMSNorm Output: torch.Size([1, 512, 4096]) Size in MB: 8.0
Layer 13: Linear Output: torch.Size([1, 512, 11008]) Size in MB: 21.5
Layer 14: SiLU Output: torch.Size([1, 512,

In [61]:
print_model_weights_size('lmsys/vicuna-7b-v1.5')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

embed_tokens.weight size: 500.00 MB
layers.0.self_attn.q_proj.weight size: 64.00 MB
layers.0.self_attn.k_proj.weight size: 64.00 MB
layers.0.self_attn.v_proj.weight size: 64.00 MB
layers.0.self_attn.o_proj.weight size: 64.00 MB
layers.0.mlp.gate_proj.weight size: 172.00 MB
layers.0.mlp.up_proj.weight size: 172.00 MB
layers.0.mlp.down_proj.weight size: 172.00 MB
layers.0.input_layernorm.weight size: 0.02 MB
layers.0.post_attention_layernorm.weight size: 0.02 MB
layers.1.self_attn.q_proj.weight size: 64.00 MB
layers.1.self_attn.k_proj.weight size: 64.00 MB
layers.1.self_attn.v_proj.weight size: 64.00 MB
layers.1.self_attn.o_proj.weight size: 64.00 MB
layers.1.mlp.gate_proj.weight size: 172.00 MB
layers.1.mlp.up_proj.weight size: 172.00 MB
layers.1.mlp.down_proj.weight size: 172.00 MB
layers.1.input_layernorm.weight size: 0.02 MB
layers.1.post_attention_layernorm.weight size: 0.02 MB
layers.2.self_attn.q_proj.weight size: 64.00 MB
layers.2.self_attn.k_proj.weight size: 64.00 MB
layers.2.s

In [8]:
print_model_intermediate_sizes('microsoft/phi-2')

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Layer 1: Embedding Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 2: Dropout Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 3: LayerNorm Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 4: Linear Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 5: Linear Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 6: Linear Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 7: PhiRotaryEmbedding Output 0: torch.Size([512, 32]) Size in MB: 0.0625
Layer 7: PhiRotaryEmbedding Output 1: torch.Size([512, 32]) Size in MB: 0.0625
Layer 9: Linear Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 10: PhiAttention Output 0: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 10: PhiAttention Output 1: Output does not have a 'size' attribute
Layer 10: PhiAttention Output 2: Output does not have a 'size' attribute
Layer 13: Dropout Output: torch.Size([1, 512, 2560]) Size in MB: 5.0
Layer 14: Linear Output: torch.Size([1, 512, 10240]) Size in MB: 20.0
Layer

In [9]:
print_model_weights_size('microsoft/phi-2')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model.embed_tokens.weight size: 500.00 MB
model.layers.0.self_attn.q_proj.weight size: 25.00 MB
model.layers.0.self_attn.q_proj.bias size: 0.01 MB
model.layers.0.self_attn.k_proj.weight size: 25.00 MB
model.layers.0.self_attn.k_proj.bias size: 0.01 MB
model.layers.0.self_attn.v_proj.weight size: 25.00 MB
model.layers.0.self_attn.v_proj.bias size: 0.01 MB
model.layers.0.self_attn.dense.weight size: 25.00 MB
model.layers.0.self_attn.dense.bias size: 0.01 MB
model.layers.0.mlp.fc1.weight size: 100.00 MB
model.layers.0.mlp.fc1.bias size: 0.04 MB
model.layers.0.mlp.fc2.weight size: 100.00 MB
model.layers.0.mlp.fc2.bias size: 0.01 MB
model.layers.0.input_layernorm.weight size: 0.01 MB
model.layers.0.input_layernorm.bias size: 0.01 MB
model.layers.1.self_attn.q_proj.weight size: 25.00 MB
model.layers.1.self_attn.q_proj.bias size: 0.01 MB
model.layers.1.self_attn.k_proj.weight size: 25.00 MB
model.layers.1.self_attn.k_proj.bias size: 0.01 MB
model.layers.1.self_attn.v_proj.weight size: 25.00 M

In [6]:
print_model_intermediate_sizes('meta-llama/Meta-Llama-3-8B', size=1)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 2: LlamaRMSNorm Output: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 3: Linear Output: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 4: Linear Output: torch.Size([1, 1, 1024]) Size in MB: 0.00390625
Layer 5: Linear Output: torch.Size([1, 1, 1024]) Size in MB: 0.00390625
Layer 6: LlamaRotaryEmbedding Output 0: torch.Size([1, 1, 128]) Size in MB: 0.00048828125
Layer 6: LlamaRotaryEmbedding Output 1: torch.Size([1, 1, 128]) Size in MB: 0.00048828125
Layer 8: Linear Output: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 9: LlamaSdpaAttention Output 0: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 9: LlamaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: LlamaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: LlamaRMSNorm Output: torch.Size([1, 1, 4096]) Size in MB: 0.015625
Layer 13: Linear Output: torch.Size([1, 1, 14336]) Size in MB: 0.05468

In [12]:
print_model_intermediate_sizes('meta-llama/Meta-Llama-3-8B', size=100)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 2: LlamaRMSNorm Output: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 3: Linear Output: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 4: Linear Output: torch.Size([1, 100, 1024]) Size in MB: 0.390625
Layer 5: Linear Output: torch.Size([1, 100, 1024]) Size in MB: 0.390625
Layer 6: LlamaRotaryEmbedding Output 0: torch.Size([1, 100, 128]) Size in MB: 0.048828125
Layer 6: LlamaRotaryEmbedding Output 1: torch.Size([1, 100, 128]) Size in MB: 0.048828125
Layer 8: Linear Output: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 9: LlamaSdpaAttention Output 0: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 9: LlamaSdpaAttention Output 1: Output does not have a 'size' attribute
Layer 9: LlamaSdpaAttention Output 2: Output does not have a 'size' attribute
Layer 12: LlamaRMSNorm Output: torch.Size([1, 100, 4096]) Size in MB: 1.5625
Layer 13: Linear Output: torch.Size([1, 100, 14336]) Size in MB: 5.468

In [7]:
print_model_weights_size('meta-llama/Meta-Llama-3-8B')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model.embed_tokens.weight size: 2004.00 MB
model.layers.0.self_attn.q_proj.weight size: 64.00 MB
model.layers.0.self_attn.k_proj.weight size: 16.00 MB
model.layers.0.self_attn.v_proj.weight size: 16.00 MB
model.layers.0.self_attn.o_proj.weight size: 64.00 MB
model.layers.0.mlp.gate_proj.weight size: 224.00 MB
model.layers.0.mlp.up_proj.weight size: 224.00 MB
model.layers.0.mlp.down_proj.weight size: 224.00 MB
model.layers.0.input_layernorm.weight size: 0.02 MB
model.layers.0.post_attention_layernorm.weight size: 0.02 MB
model.layers.1.self_attn.q_proj.weight size: 64.00 MB
model.layers.1.self_attn.k_proj.weight size: 16.00 MB
model.layers.1.self_attn.v_proj.weight size: 16.00 MB
model.layers.1.self_attn.o_proj.weight size: 64.00 MB
model.layers.1.mlp.gate_proj.weight size: 224.00 MB
model.layers.1.mlp.up_proj.weight size: 224.00 MB
model.layers.1.mlp.down_proj.weight size: 224.00 MB
model.layers.1.input_layernorm.weight size: 0.02 MB
model.layers.1.post_attention_layernorm.weight size:

In [8]:
print_model_intermediate_sizes('microsoft/Phi-3-mini-128k-instruct', size=1)

config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

You are not running the flash-attention implementation, expect numerical differences.


Layer 1: Embedding Output: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 2: Phi3RMSNorm Output: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 3: Linear Output: torch.Size([1, 1, 9216]) Size in MB: 0.03515625
Layer 4: Phi3SuScaledRotaryEmbedding Output 0: torch.Size([1, 1, 96]) Size in MB: 0.0003662109375
Layer 4: Phi3SuScaledRotaryEmbedding Output 1: torch.Size([1, 1, 96]) Size in MB: 0.0003662109375
Layer 6: Linear Output: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 7: Phi3Attention Output 0: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 7: Phi3Attention Output 1: Output does not have a 'size' attribute
Layer 7: Phi3Attention Output 2: Output does not have a 'size' attribute
Layer 10: Dropout Output: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 11: Phi3RMSNorm Output: torch.Size([1, 1, 3072]) Size in MB: 0.01171875
Layer 12: Linear Output: torch.Size([1, 1, 16384]) Size in MB: 0.0625
Layer 13: SiLU Output: torch.Size([1, 1, 8192]) Size in MB:

In [9]:
print_model_weights_size('microsoft/Phi-3-mini-128k-instruct')

The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model.embed_tokens.weight size: 375.75 MB
model.layers.0.self_attn.o_proj.weight size: 36.00 MB
model.layers.0.self_attn.qkv_proj.weight size: 108.00 MB
model.layers.0.mlp.gate_up_proj.weight size: 192.00 MB
model.layers.0.mlp.down_proj.weight size: 96.00 MB
model.layers.0.input_layernorm.weight size: 0.01 MB
model.layers.0.post_attention_layernorm.weight size: 0.01 MB
model.layers.1.self_attn.o_proj.weight size: 36.00 MB
model.layers.1.self_attn.qkv_proj.weight size: 108.00 MB
model.layers.1.mlp.gate_up_proj.weight size: 192.00 MB
model.layers.1.mlp.down_proj.weight size: 96.00 MB
model.layers.1.input_layernorm.weight size: 0.01 MB
model.layers.1.post_attention_layernorm.weight size: 0.01 MB
model.layers.2.self_attn.o_proj.weight size: 36.00 MB
model.layers.2.self_attn.qkv_proj.weight size: 108.00 MB
model.layers.2.mlp.gate_up_proj.weight size: 192.00 MB
model.layers.2.mlp.down_proj.weight size: 96.00 MB
model.layers.2.input_layernorm.weight size: 0.01 MB
model.layers.2.post_attention

In [11]:
print_model_intermediate_sizes('microsoft/Phi-3-mini-128k-instruct', size=100)

The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Layer 1: Embedding Output: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 2: Phi3RMSNorm Output: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 3: Linear Output: torch.Size([1, 100, 9216]) Size in MB: 3.515625
Layer 4: Phi3SuScaledRotaryEmbedding Output 0: torch.Size([1, 100, 96]) Size in MB: 0.03662109375
Layer 4: Phi3SuScaledRotaryEmbedding Output 1: torch.Size([1, 100, 96]) Size in MB: 0.03662109375
Layer 6: Linear Output: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 7: Phi3Attention Output 0: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 7: Phi3Attention Output 1: Output does not have a 'size' attribute
Layer 7: Phi3Attention Output 2: Output does not have a 'size' attribute
Layer 10: Dropout Output: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 11: Phi3RMSNorm Output: torch.Size([1, 100, 3072]) Size in MB: 1.171875
Layer 12: Linear Output: torch.Size([1, 100, 16384]) Size in MB: 6.25
Layer 13: SiLU Output: torch.Size([1, 100, 8192]) Size in M