<a href="https://colab.research.google.com/github/Shravani018/llm-audit-bench/blob/main/01_extracting_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### 01: Extracting model metadata

**Loading 5 small LLMs from HuggingFace, extracting their metadata and architecture for further analysis**

In [1]:
# Installing the libraries listed in requirements.txt (path is relative to the working directory)
!pip install -q -r requirements.txt

In [2]:
# Importing necessary libraries
import json
import os
from dataclasses import dataclass, asdict
from typing import Optional
import pandas as pd
from transformers import AutoConfig, AutoTokenizer
from huggingface_hub import HfApi, ModelCard
from huggingface_hub.utils import EntryNotFoundError
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Hugging Face Hub ids of the five small LLMs analysed in this notebook;
# every id in this list is passed to load_model_meta further down
models=[
    "gpt2",
    "distilgpt2",
    "facebook/opt-125m",
    "EleutherAI/gpt-neo-125m",
    "bigscience/bloom-560m",
]

In [4]:
# Defining data structure
@dataclass
class ModelMetadata:
    """Flat metadata record describing one Hugging Face model.

    Most fields are best-effort: they are ``None`` when the value could not be
    read from the Hub or from the model config, so they are annotated as
    ``Optional``. Field order is part of the interface (positional
    construction and ``asdict`` output order depend on it).

    Attributes:
        model_id: Hub id the record was built from.
        author: Owner/namespace of the model.
        license: License identifier from the model card metadata, if any.
        has_model_card: Whether a model card could be loaded.
        architecture: First entry of the config's ``architectures`` list, if any.
        num_parameters_estimate: Rough parameter count computed from config fields.
        num_layers: Number of transformer layers from the config.
        hidden_size: Hidden/embedding width from the config.
        num_attention_heads: Attention head count from the config.
        vocab_size: Vocabulary size from the config.
        max_position_embeddings: Maximum context length from the config.
        tokenizer_class: Class name of the loaded tokenizer.
        tags: Hub tags attached to the model (empty list when unavailable).
    """

    model_id: str
    author: Optional[str]
    license: Optional[str]
    has_model_card: bool
    architecture: Optional[str]
    num_parameters_estimate: Optional[int]
    num_layers: Optional[int]
    hidden_size: Optional[int]
    num_attention_heads: Optional[int]
    vocab_size: Optional[int]
    max_position_embeddings: Optional[int]
    tokenizer_class: Optional[str]
    tags: list

In [5]:
# Initializing the Hugging Face Hub API client; get_license and get_tags
# below call api.model_info on this shared instance
api = HfApi()

In [6]:
# Looking up a model's license in its Hugging Face model card metadata
def get_license(model_id):
    """Return the ``license`` entry of a model's card metadata.

    Args:
        model_id: Hub id passed to ``api.model_info``.

    Returns:
        The value of ``cardData.get("license")`` when card metadata is
        present; ``None`` when it is absent or when any exception is raised
        during the lookup (best-effort, errors are swallowed).
    """
    found = None
    try:
        card_data = api.model_info(model_id).cardData
        if card_data:
            found = card_data.get("license")
    except Exception:
        # Best-effort lookup: any failure is reported as "no license".
        found = None
    return found

In [7]:
# Fetching the tags associated with a model in order to understand the framework it uses
def get_tags(model_id):
    """Return the Hub tags of a model as a new list.

    Args:
        model_id: Hub id passed to ``api.model_info``.

    Returns:
        A list copy of ``info.tags``; an empty list when the tags are
        missing/empty or when any exception is raised during the lookup.
    """
    try:
        hub_tags = api.model_info(model_id).tags
        return list(hub_tags) if hub_tags else []
    except Exception:
        # Best-effort lookup: any failure is reported as "no tags".
        return []

In [8]:
# Checking for a model card, since cards hold information such as training data, limitations etc
def check_model_card(model_id):
    """Report whether a model card can be loaded for ``model_id``.

    Args:
        model_id: Hub id passed to ``ModelCard.load``.

    Returns:
        ``True`` when ``ModelCard.load`` completes, ``False`` when it raises
        any exception.
    """
    try:
        ModelCard.load(model_id)
    except Exception:
        return False
    else:
        return True

In [9]:
# Rough total parameter count computed from config fields only (no weights downloaded)
# Formula: embedding layer (vocab x hidden) + transformer blocks (12 x hidden^2 x layers)
def estimate_parameters(config):
    """Estimate a model's parameter count from its config object.

    Args:
        config: Object exposing the width as ``hidden_size``, ``n_embd`` or
            ``d_model``, the depth as ``num_hidden_layers`` or ``n_layer``,
            and ``vocab_size``. The first truthy name in each group wins.

    Returns:
        ``vocab * hidden + layers * 12 * hidden**2``, or ``None`` when any of
        the three quantities is missing or falsy.
    """
    def first_truthy(*attr_names):
        # Mirrors an `a or b or c` chain: stop at the first truthy attribute,
        # otherwise hand back whatever the last lookup produced.
        candidate = None
        for attr_name in attr_names:
            candidate = getattr(config, attr_name, None)
            if candidate:
                break
        return candidate

    width = first_truthy("hidden_size", "n_embd", "d_model")
    depth = first_truthy("num_hidden_layers", "n_layer")
    vocab = getattr(config, "vocab_size", None)

    if not (width and depth and vocab):
        return None

    embedding_params = vocab * width
    block_params = depth * (12 * width * width)
    return embedding_params + block_params

In [10]:
# Loading the models and extracting metadata
def _first_config_attr(config, *names):
    """Return the first truthy attribute of ``config`` among ``names``.

    Behaves like ``getattr(config, a, None) or getattr(config, b, None) ...``:
    when no attribute is truthy, the result of the last lookup is returned.
    """
    value = None
    for name in names:
        value = getattr(config, name, None)
        if value:
            return value
    return value


def _resolve_author(model_id):
    """Return the owning namespace for ``model_id``, or None if unknown."""
    if "/" in model_id:
        return model_id.split("/")[0]
    # Ids without a namespace (e.g. "gpt2", "distilgpt2") do not all belong to
    # the same organisation, so ask the Hub instead of assuming a fixed owner.
    try:
        return api.model_info(model_id).author
    except Exception:
        # Best-effort, like the other Hub lookups in this notebook.
        return None


def load_model_meta(model_id):
    """Build a ``ModelMetadata`` record for one Hub model.

    Only the config and tokenizer are loaded; no model weights are requested
    by this function.

    Args:
        model_id: Hub id, either namespaced ("org/name") or bare ("gpt2").

    Returns:
        A ``ModelMetadata`` instance. Fields that cannot be determined are
        ``None`` (``tags`` falls back to an empty list via ``get_tags``).

    Raises:
        Whatever ``AutoConfig.from_pretrained`` raises; the config load is not
        guarded, unlike the tokenizer and Hub lookups.
    """
    print(f"loading: {model_id}")
    config = AutoConfig.from_pretrained(model_id)

    # The tokenizer is loaded only to record its class name.
    try:
        tok = AutoTokenizer.from_pretrained(model_id)
        tokenizer_class = type(tok).__name__
    except Exception:
        tokenizer_class = None

    # Config classes name the same quantity differently, so try each known
    # spelling in order and keep the first truthy one.
    hidden_size = _first_config_attr(config, "hidden_size", "n_embd", "d_model")
    num_layers = _first_config_attr(config, "num_hidden_layers", "n_layer")
    num_heads = _first_config_attr(config, "num_attention_heads", "n_head")
    # Configs exposing neither of these two fields yield None here.
    max_ctx = _first_config_attr(config, "max_position_embeddings", "n_positions")

    return ModelMetadata(
        model_id=model_id,
        author=_resolve_author(model_id),
        license=get_license(model_id),
        has_model_card=check_model_card(model_id),
        architecture=config.architectures[0] if config.architectures else None,
        num_parameters_estimate=estimate_parameters(config),
        num_layers=num_layers,
        hidden_size=hidden_size,
        num_attention_heads=num_heads,
        vocab_size=getattr(config, "vocab_size", None),
        max_position_embeddings=max_ctx,
        tokenizer_class=tokenizer_class,
        tags=get_tags(model_id),
    )

In [11]:
# Building one metadata record per entry in `models`, in list order
all_models = list(map(load_model_meta, models))
print(f"done. loaded {len(all_models)} models.")

loading: gpt2
loading: distilgpt2
loading: facebook/opt-125m
loading: EleutherAI/gpt-neo-125m
loading: bigscience/bloom-560m
done. loaded 5 models.


In [12]:
# Building one flat row per model for the summary DataFrame
rows = []
for model in all_models:
    rows.append(dict(
        model_id=model.model_id,
        architecture=model.architecture,
        # Thousands-separated string for readability; "N/A" when no estimate exists
        params_estimate="N/A" if not model.num_parameters_estimate else f"{model.num_parameters_estimate:,}",
        layers=model.num_layers,
        hidden_size=model.hidden_size,
        attn_heads=model.num_attention_heads,
        vocab_size=model.vocab_size,
        max_ctx=model.max_position_embeddings,
        tokenizer=model.tokenizer_class,
        license=model.license,
        model_card=model.has_model_card,
    ))

In [13]:
# Tabulating the per-model rows: one DataFrame row per model, one column per row key
meta_df=pd.DataFrame(rows)

In [14]:
# Displaying the summary table (bare last expression, so the notebook renders it)
meta_df

Unnamed: 0,model_id,architecture,params_estimate,layers,hidden_size,attn_heads,vocab_size,max_ctx,tokenizer,license,model_card
0,gpt2,GPT2LMHeadModel,123532032,12,768,12,50257,1024.0,GPT2TokenizerFast,mit,True
1,distilgpt2,GPT2LMHeadModel,81064704,6,768,12,50257,1024.0,GPT2TokenizerFast,apache-2.0,True
2,facebook/opt-125m,OPTForCausalLM,123543552,12,768,12,50272,2048.0,GPT2TokenizerFast,other,True
3,EleutherAI/gpt-neo-125m,GPTNeoForCausalLM,123532032,12,768,12,50257,2048.0,GPT2TokenizerFast,mit,True
4,bigscience/bloom-560m,BloomForCausalLM,558891008,24,1024,16,250880,,BloomTokenizerFast,bigscience-bloom-rail-1.0,True


In [15]:
# Exporting the full records (every ModelMetadata field, via asdict) as JSON
# to ./results/model_metadata.json; the directory is created if missing
os.makedirs("./results",exist_ok=True)
with open("./results/model_metadata.json","w") as f:
    # NOTE(review): the top-level key is the literal string "./results/models",
    # which looks like a path pasted in by mistake (presumably meant "models") —
    # confirm what downstream readers of this file expect before renaming it
    json.dump({"./results/models": [asdict(m) for m in all_models]}, f, indent=2)

Next: 02_transparency.ipynb

Scoring each model's transparency.