# Model Parameters Inspector

This notebook allows you to inspect the configuration and parameters of the models used in the project without loading the full weights into memory.

In [None]:
# Placeholder value: replace `your_token_here` with a real access token before
# running, and avoid committing the real value with the notebook.
# NOTE(review): presumably needed for the gated meta-llama repositories - confirm.
%env HF_TOKEN=your_token_here

In [None]:
import json
import os
from pprint import pprint

import pandas as pd
from transformers import AutoConfig, AutoTokenizer, GenerationConfig, PretrainedConfig

In [None]:
# Model cache directory (same as in base_3models.ipynb).
# Passed as `cache_dir` to every `from_pretrained` call in this notebook.
CACHE_DIR = os.path.abspath("/data/cat/ws/albu670g-qa-model/models")

# List of models to inspect.
# Each entry is handed unchanged to `from_pretrained`; the part after the last
# "/" is used as the column header in the comparison tables.
MODEL_IDS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Meta-Llama-3-8B-Instruct",
]

## 1. Model Architecture Configuration

Load only the config (no weights) to inspect model architecture parameters.

In [None]:
def load_model_config(model_id: str) -> "PretrainedConfig":
    """Load a model's configuration without loading its weights.

    Args:
        model_id: Identifier passed to ``AutoConfig.from_pretrained``.

    Returns:
        The configuration object itself (not a plain dict); call
        ``.to_dict()`` on it to get a dictionary view.

    Raises:
        Whatever ``AutoConfig.from_pretrained`` raises when the config
        cannot be resolved; nothing is caught here.
    """
    # Files are resolved through the notebook-wide cache directory.
    return AutoConfig.from_pretrained(model_id, cache_dir=CACHE_DIR)

# Collect one config per model id; a model whose config cannot be loaded is
# kept in the mapping with a None value so later cells can report it.
model_configs = {}
for model_id in MODEL_IDS:
    print(f"Loading config for: {model_id}")
    try:
        loaded_config = load_model_config(model_id)
    except Exception as err:
        print(f"  ✗ Failed: {err}\n")
        loaded_config = None
    else:
        print("  ✓ Loaded successfully\n")
    model_configs[model_id] = loaded_config

In [None]:
# Dump every model's complete configuration under a banner header.
for model_id, config in model_configs.items():
    print("=" * 80, f"MODEL: {model_id}", "=" * 80, sep="\n")
    if config is None:
        # Loading failed earlier; there is nothing to pretty-print.
        print("Config not available")
    else:
        pprint(config.to_dict())
    print("\n")

## 2. Side-by-Side Comparison of Key Parameters

In [None]:
# Key parameters to compare across models.
# Each name is looked up in a config's `to_dict()` output; a name that a given
# config does not define is shown as "N/A" in the comparison table.
KEY_PARAMS = [
    "architectures",
    "hidden_size",
    "intermediate_size",
    "num_hidden_layers",
    "num_attention_heads",
    "num_key_value_heads",
    "vocab_size",
    "max_position_embeddings",
    "rope_theta",
    "rms_norm_eps",
    "hidden_act",
    "tie_word_embeddings",
    "torch_dtype",
]

def extract_key_params(config, params: list) -> dict:
    """Pick the requested entries out of a model config.

    Args:
        config: Object exposing ``to_dict()``, or ``None`` when the config
            could not be loaded.
        params: Names of the config entries to extract.

    Returns:
        A dict keyed by the requested names. With ``config=None`` every value
        is ``None``; otherwise a name absent from the config maps to ``"N/A"``.
    """
    if config is None:
        return dict.fromkeys(params)

    available = config.to_dict()
    selected = {}
    for name in params:
        selected[name] = available.get(name, "N/A")
    return selected

# One column per model (headed by the part of the id after the last "/"),
# one row per entry of KEY_PARAMS.
comparison_data = {
    model_id.split("/")[-1]: extract_key_params(config, KEY_PARAMS)
    for model_id, config in model_configs.items()
}

comparison_df = pd.DataFrame(comparison_data)
comparison_df.index.name = "Parameter"
comparison_df

## 3. Tokenizer Configuration

In [None]:
def inspect_tokenizer(model_id: str) -> tuple:
    """Load a model's tokenizer and summarise its main settings.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A 2-tuple ``(info, tokenizer)``. ``info`` is a dict with the keys
        ``vocab_size``, ``model_max_length``, ``is_fast``, ``padding_side``,
        ``truncation_side``, ``bos_token``, ``eos_token``, ``pad_token``,
        ``unk_token`` and ``has_chat_template``; ``tokenizer`` is the loaded
        tokenizer object.

    Raises:
        Whatever ``AutoTokenizer.from_pretrained`` raises when the tokenizer
        cannot be loaded; nothing is caught here.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)

    info = {
        "vocab_size": tokenizer.vocab_size,
        "model_max_length": tokenizer.model_max_length,
        "is_fast": tokenizer.is_fast,
        "padding_side": tokenizer.padding_side,
        "truncation_side": tokenizer.truncation_side,
        # repr() keeps an unset token visible as 'None' and shows quotes
        # around the token text.
        "bos_token": repr(tokenizer.bos_token),
        "eos_token": repr(tokenizer.eos_token),
        "pad_token": repr(tokenizer.pad_token),
        "unk_token": repr(tokenizer.unk_token),
        # True whenever the attribute exists and is not None (an empty
        # template still counts as present here).
        "has_chat_template": getattr(tokenizer, "chat_template", None) is not None,
    }
    return info, tokenizer

# Load every tokenizer once, keeping both the summary dict and the tokenizer
# object. A model whose tokenizer fails to load gets None in both mappings.
tokenizer_infos = {}
tokenizers = {}

for model_id in MODEL_IDS:
    print(f"Loading tokenizer for: {model_id}")
    try:
        info, tok = inspect_tokenizer(model_id)
    except Exception as err:
        print(f"  ✗ Failed: {err}\n")
        info = tok = None
    else:
        print("  ✓ Loaded successfully\n")
    tokenizer_infos[model_id] = info
    tokenizers[model_id] = tok

In [None]:
# One column per model (short name), one row per tokenizer property.
# A tokenizer that failed to load contributes an empty column.
tok_comparison = {
    model_id.split("/")[-1]: (info or {})
    for model_id, info in tokenizer_infos.items()
}

tok_df = pd.DataFrame(tok_comparison)
tok_df.index.name = "Property"
tok_df

## 4. Chat Templates

View the chat template used by each model for formatting conversations.

In [None]:
# Print each tokenizer's chat template under a banner header.
for model_id, tok in tokenizers.items():
    print("=" * 80, f"CHAT TEMPLATE: {model_id}", "=" * 80, sep="\n")
    # Covers a tokenizer that failed to load (None), a tokenizer without the
    # attribute, and an unset or empty template.
    template = getattr(tok, "chat_template", None)
    if template:
        print(template)
    else:
        print("No chat template defined")
    print("\n")

## 5. Generation Configuration

Default generation parameters for each model.

In [None]:
# Print each model's default generation settings under a banner header.
for model_id in MODEL_IDS:
    print("=" * 80, f"GENERATION CONFIG: {model_id}", "=" * 80, sep="\n")
    try:
        gen_config = GenerationConfig.from_pretrained(model_id, cache_dir=CACHE_DIR)
        pprint(gen_config.to_dict())
    except Exception as err:
        # Report the failure and carry on with the remaining models.
        print(f"Could not load generation config: {err}")
    print("\n")

## 6. Special Tokens Details

In [None]:
# Print each tokenizer's special tokens under a banner header.
for model_id, tok in tokenizers.items():
    print("=" * 80, f"SPECIAL TOKENS: {model_id}", "=" * 80, sep="\n")
    if tok is None:
        # The tokenizer failed to load earlier in the notebook.
        print("Tokenizer not available")
    else:
        print(f"All special tokens: {tok.all_special_tokens}")
        print(f"Special tokens map: {tok.special_tokens_map}")
        # Only printed when the attribute exists and is non-empty.
        added = getattr(tok, "added_tokens_encoder", None)
        if added:
            print(f"Added tokens: {list(added.keys())}")
    print("\n")

## 7. Estimated Model Size

In [None]:
def estimate_model_params(config) -> int:
    """Estimate the total number of parameters from a model config.

    This is an approximation for a decoder-style transformer with
    Q/K/V/O attention projections, a three-matrix (gate/up/down) MLP and
    two norm vectors per layer. Only weight matrices and norm vectors are
    counted; bias terms are not.

    Args:
        config: Object exposing ``to_dict()``, or ``None`` when the config
            could not be loaded.

    Returns:
        The estimated parameter count. ``0`` when ``config`` is ``None`` or
        when the config defines none of the size fields.
    """
    if config is None:
        return 0

    cfg = config.to_dict()

    def _get(key, default=0):
        # A key stored with an explicit null is treated like a missing key,
        # so the arithmetic below never sees None.
        value = cfg.get(key)
        return default if value is None else value

    vocab_size = _get("vocab_size")
    hidden_size = _get("hidden_size")
    num_layers = _get("num_hidden_layers")
    intermediate_size = _get("intermediate_size")
    num_heads = _get("num_attention_heads")
    # Without a separate key/value head count, assume one per attention head.
    num_kv_heads = _get("num_key_value_heads", num_heads)

    # Embedding parameters
    embed_params = vocab_size * hidden_size

    # Attention parameters per layer (Q, K, V, O projections).
    # An explicit head_dim takes precedence because num_heads * head_dim
    # need not equal hidden_size; otherwise Q and O are hidden x hidden.
    explicit_head_dim = cfg.get("head_dim")
    if explicit_head_dim:
        head_dim = explicit_head_dim
        q_width = num_heads * head_dim
    else:
        head_dim = hidden_size // num_heads if num_heads > 0 else 0
        q_width = hidden_size
    kv_width = num_kv_heads * head_dim

    q_params = hidden_size * q_width
    k_params = hidden_size * kv_width
    v_params = hidden_size * kv_width
    o_params = q_width * hidden_size
    attn_params = q_params + k_params + v_params + o_params

    # MLP parameters per layer
    mlp_params = 3 * hidden_size * intermediate_size  # gate, up, down projections

    # Norm parameters per layer (two norm vectors)
    ln_params = 2 * hidden_size

    # Total per layer
    layer_params = attn_params + mlp_params + ln_params

    # Total model
    total = embed_params + (num_layers * layer_params) + hidden_size  # final norm

    # If not tie_word_embeddings, add output projection
    if not cfg.get("tie_word_embeddings", True):
        total += vocab_size * hidden_size

    return total

# Estimate each model's parameter count, then format one table row per model.
# Memory columns assume 2 bytes (FP16) or 4 bytes (FP32) per parameter.
param_estimates = [
    (model_id.split("/")[-1], estimate_model_params(config))
    for model_id, config in model_configs.items()
]

size_data = [
    {
        "Model": short_name,
        "Estimated Parameters": f"{count:,}",
        "Size (Billions)": f"{count / 1e9:.2f}B",
        "FP16 Memory (GB)": f"{count * 2 / 1e9:.2f}",
        "FP32 Memory (GB)": f"{count * 4 / 1e9:.2f}",
    }
    for short_name, count in param_estimates
]

pd.DataFrame(size_data)