<a href="https://colab.research.google.com/github/Shahrukhzx/CSE465_project/blob/main/shortgpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
!pip install datasets



In [2]:
# importing the required libraries
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, Trainer, TrainingArguments
from collections import OrderedDict
from typing import List, Optional
import numpy as np
from tqdm.notebook import tqdm
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
)


# Support functions

In [3]:
def layer_removal(
    model: nn.Module,
    layers_to_remove: OrderedDict
):
    """
    Generic removal implementation
    """

    for layer_name, layer_idx in layers_to_remove.items():
        modules = layer_name.split(".") # this splits the layer name into parts
        mod = model # starts with the base model
        for m in modules[:-1]: # nagivate through the parent module
            mod = getattr(mod, m)

        if layer_idx is None: # If layer_idx is None, delete the whole attribute
            delattr(mod, modules[-1])
        else: # else delete the specific part within the attribute
            del getattr(mod, modules[-1])[layer_idx]

In [4]:
# calculates the influence of a block(a layer or a set of layer)
# calculated based on the cosine similarity
def block_influence(
    input_hidden_state: torch.Tensor,
    output_hidden_state: torch.Tensor,
    angular=False,
):
    """
    input_hidden_state: B, S, D
    output_hidden_state: B, S, D
    """
    _, _, d = input_hidden_state.shape # get the hidden size('d') from the hidden state
    input_hidden_state = input_hidden_state.reshape(-1, d) # reshape the input to d
    output_hidden_state = output_hidden_state.reshape(-1, d) # reshape the output to d
    # Calculate the L2 norm of the input and output
    norm_input = input_hidden_state.norm(dim=-1, keepdim=True)
    norm_output = output_hidden_state.norm(dim=-1, keepdim=True)
    # Calculate the cosine similarity between the input and output
    sim = (input_hidden_state @ output_hidden_state.T) / (norm_input * norm_output)
    sim = sim.diagonal().nan_to_num(nan=0.5) # extracts the diagonal element of the matrix
    # it calculates the angular distance it its true
    if angular:
        return (torch.arccos(sim) / torch.pi)
    # else returns the dissimilarities
    return 1 - sim

In [5]:
class ShortHFModel():

    def __init__(self, model_name: str, layers_path: str, n_prune_layers: Optional[int] = None):
        """
        HuggingFace Model Wrapper

        Args:
            model_name (str): HuggingFace model name
            layers_path (str): String in dot notation demonstrating how to access layers of the model. Ex: "model.layers"
            (Optional) n_prune_layers (int): Number of layers to prune. Defaults to None.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
        # self.model.params = self.model.to_fp16(self.model.params)
        self.model.to("cuda")

        modules = layers_path.split(".")
        mod = self.model
        for m in modules:
            mod = getattr(mod, m)
        self.layers = mod

        self.n_prune_layers = n_prune_layers
        self.importances = [0 for _ in self.layers]  # layer-wise importance scores

    def remove_layers(
        self,
        layers_to_remove: Optional[List[int]] = [],
        angular: Optional[bool] = False
    ):
        if angular:
            assert self.importances, "Need to compute importances with eval_importance()"
            assert self.n_prune_layers, "Need number of layers to prune, set `n_prune_layers`"
            start_layer = np.argsort(np.array(self.importances[:-self.n_prune_layers+1]))[0]
            layers_to_remove = list(range(start_layer, start_layer + self.n_prune_layers))
        elif not layers_to_remove and self.n_prune_layers:
            assert self.importances, "Need to compute importances with eval_importance()"
            layers_to_remove = np.argsort(np.array(self.importances))[:self.n_prune_layers].tolist()

        # remove layers in reverse to avoid indexing errors
        for layer_idx in sorted(layers_to_remove, reverse=True):
            try:
                del self.layers[layer_idx]
            except IndexError:
                print(f"layer {layer_idx} does not exist, function may have already been called")
                return []

        return layers_to_remove

    def compute_bi(self, hiddens: List[torch.Tensor], angular: bool):
        n = 1
        if angular:
            assert self.n_prune_layers is not None, "Set number of layers to prune to use angular importance"
            n = self.n_prune_layers

        for i in range(len(hiddens) - n):
            in_hidden = hiddens[i]
            out_hidden = hiddens[i+n]
            if angular:
                # use only last token for angular distance as described in section 3.2
                # https://arxiv.org/pdf/2403.17887.pdf
                in_hidden = in_hidden[:,-1:]
                out_hidden = out_hidden[:,-1:]

            self.importances[i] += block_influence(
                in_hidden,
                out_hidden,
                angular=angular
            ).sum().cpu().item()

    @torch.inference_mode()
    def eval_importance(
        self,
        prompts: List[str],
        max_seq_len: int,
        stride: int = 256,
        max_gen_len: int = 0,
        temperature: float = 0.6,
        top_p: float = 0.9,
        angular: Optional[bool] = False
    ):
        """
        Computes layer-wise importances over input texts.

        NOTE: ShortGPT paper performs no generation during importance computation, which suggests a `max_gen_len`= 0.

        Args:
            prompts (List[str]): List of prompts.
            max_seq_len (int): Maximum sequence length for model input, the sliding window size.
            (Optional) stride (int): Number of tokens to skip/shift between each window inference.
            (Optional) max_gen_len (int): Maximum length of the generated text sequence.
            (Optional) temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            (Optional) top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            (Optional) angular (bool): Whether to ues angular distance. Defaults to False.

        Returns:
            None
        """
        prompt_tokens = self.tokenizer(
            prompts,
            padding=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = prompt_tokens.input_ids
        attn_mask = prompt_tokens.attention_mask

        max_prompt_len = max(len(t) for t in input_ids)

        # authors use a sliding window of size 1024 with a shift of 256
        for start in range(0, max_prompt_len, stride):
            seq_ids = (attn_mask.sum(dim=-1) > start).nonzero().squeeze()
            seq_ids = seq_ids.unsqueeze(0) if seq_ids.dim() == 0 else seq_ids  # ensure 2d
            inputs = input_ids[seq_ids, start:start+max_seq_len]
            attn = attn_mask[seq_ids, start:start+max_seq_len]

            if max_gen_len == 0:
                outputs = self.model(
                    input_ids=inputs.to("cuda"),
                    attention_mask=attn.to("cuda"),
                    output_hidden_states=True,
                )
            else:
                outputs = self.model.generate(
                    input_ids=inputs.to("cuda"),
                    attention_mask=attn.to("cuda"),
                    max_new_tokens=max_gen_len,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    output_hidden_states=True,
                    return_dict_in_generate=True,
                )

            self.compute_bi(outputs.hidden_states, angular=angular)

        return

# Load the data

In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

In [7]:
data = load_dataset("pg19",split="validation",streaming=True).take(5) # loading the dataset.
dataloader = DataLoader(
    data,
    batch_size=1,
    #shuffle=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Load the model

In [8]:
MAX_SEQ_LEN = 1024
short_model = ShortHFModel(
    model_name="meta-llama/Llama-3.2-3B",
    layers_path="model.layers",
    n_prune_layers=5,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
short_model.model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [10]:
short_model.layers[0]

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
    (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
    (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
)

In [11]:
# sample generation
gen = short_model.model.generate(
    short_model.tokenizer(["I am an avid fan of "], return_tensors='pt').input_ids.to("cuda"),
    max_new_tokens=20
)
short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['I am an avid fan of 3D animation and I love to create 3D animations for my clients. I love to create']

# Compute the importances

In [12]:
for i, batch in enumerate(tqdm(dataloader)):
    prompts = batch['text']

    short_model.eval_importance(
        prompts=prompts,
        max_seq_len=MAX_SEQ_LEN,
        stride=256,
        max_gen_len=0
    )

0it [00:00, ?it/s]

In [13]:
short_model.importances

[994099.203125,
 421453.3203125,
 337679.955078125,
 332749.5576171875,
 350797.275390625,
 342770.3857421875,
 327574.5859375,
 321823.41015625,
 271661.43359375,
 262673.771484375,
 251812.03857421875,
 284246.2197265625,
 247053.53564453125,
 267426.22802734375,
 256803.025390625,
 202656.91845703125,
 162444.90380859375,
 135945.55859375,
 127603.7177734375,
 131424.0146484375,
 97748.361328125,
 80488.69775390625,
 75904.65478515625,
 78370.91943359375,
 86251.6103515625,
 98231.9482421875,
 130557.31298828125,
 486319.056640625]

# Remove the unimportant layers

In [14]:
short_model.remove_layers()

[22, 23, 21, 24, 20]

In [15]:
short_model.layers

ModuleList(
  (0-22): 23 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
      (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
      (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
      (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
      (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
      (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
    (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
  )
)

In [16]:
# reassign layer_idx to attentions for caching
for layer_idx, module in enumerate(short_model.layers):
    module.self_attn.layer_idx = layer_idx

In [24]:
gen = short_model.model.generate(
    short_model.tokenizer(["I am an avid fan of "], return_tensors='pt').input_ids.to("cuda"),
    max_new_tokens=20,
    use_cache=True
)
short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['I am an avid fan of 2AM\'s new album "2MADRLYNSOFTWKRSNTNWNX']

# Compute the angular importances

In [18]:
"""for i, batch in enumerate(tqdm(dataloader)):
    prompts = batch['text']

    short_model.eval_importance(
        prompts=prompts,
        max_seq_len=MAX_SEQ_LEN,
        stride=256,
        max_gen_len=0,
        angular=True
    )"""

"for i, batch in enumerate(tqdm(dataloader)):\n    prompts = batch['text']\n\n    short_model.eval_importance(\n        prompts=prompts,\n        max_seq_len=MAX_SEQ_LEN,\n        stride=256,\n        max_gen_len=0,\n        angular=True\n    )"

In [19]:
#short_model.importances

In [20]:
#short_model.remove_layers(angular=True)

In [21]:
#short_model.layers

In [22]:
# reassign layer_idx to attentions for caching
#for layer_idx, module in enumerate(short_model.layers):
#    module.self_attn.layer_idx = layer_idx

In [23]:
"""gen = short_model.model.generate(
    short_model.tokenizer(["I am an avid fan of "], return_tensors='pt').input_ids.to("cuda"),
    max_new_tokens=20,
    use_cache=True
)
short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"""

'gen = short_model.model.generate(\n    short_model.tokenizer(["I am an avid fan of "], return_tensors=\'pt\').input_ids.to("cuda"),\n    max_new_tokens=20,\n    use_cache=True\n)\nshort_model.tokenizer.batch_decode(gen, skip_special_tokens=True)'

# Upload the model to hf

In [26]:
import os
new_model_name = 'short-llama-3.2-3B'
output_dir = './'+new_model_name
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

short_model.model.save_pretrained(output_dir)
short_model.tokenizer.save_pretrained(output_dir)
#new_config.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

Pruned model saved to ./short-llama-3.2-3B


In [27]:
# Push the model to your Hugging Face repository

short_model.model.push_to_hub(new_model_name, private=False)
short_model.tokenizer.push_to_hub(new_model_name)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/453M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Shahrukh0/short-llama-3.2-3B/commit/064c4f5fbba830654b418d9f42a9825402d2dd14', commit_message='Upload tokenizer', commit_description='', oid='064c4f5fbba830654b418d9f42a9825402d2dd14', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Shahrukh0/short-llama-3.2-3B', endpoint='https://huggingface.co', repo_type='model', repo_id='Shahrukh0/short-llama-3.2-3B'), pr_revision=None, pr_num=None)

# Model evaluation

In [28]:
!pip install lm_eval

Collecting lm_eval
  Downloading lm_eval-0.4.7-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from lm_eval)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jsonlines (from lm_eval)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pybind11>=2.6.2 (from lm_eval)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pytablewriter (from lm_eval)
  Downloading pytablewriter-1.2.1-py3-none-any.whl.metadata (38 kB)
Collecting rouge-score>=0.0.4 (from lm_eval)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu>=1.5.0 (from lm_eval)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━

In [29]:
from lm_eval import evaluator, tasks, models

In [30]:
def evaluate_hf_model(model_name, tasks, num_fewshot=0):
    """
    It calls the evaluator to evaluate a model available on Hugging Face.

    Args:
    - model_name: The model name in hugging Face.
    - tasks: Tasks to evaluate.
    - num_fewshot: Number of examples of few-shot learning

    Returns:
    - metrics.
    """
    model_args = f"pretrained={model_name},device=cuda"
    tasks = tasks

    results = evaluator.simple_evaluate(
      model="hf",
      model_args=model_args,
      tasks=tasks,
      num_fewshot=0,  # Number of few-shot smaples.
      limit=None,  # Use all the samples in the Evaluate Dataset.
      bootstrap_iters=10
    )

    metrics = results.get('results', {})
    return metrics

In [31]:
# Select tasks to evaluate.
tasks = ['lambada', 'boolq', 'arc_easy']

In [32]:
metrics_pruned = evaluate_hf_model("Shahrukh0/short-llama-3.2-3B", tasks=tasks)
metrics_pruned

INFO:lm-eval:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
INFO:lm-eval:Initializing hf model, with arguments: {'pretrained': 'Shahrukh0/short-llama-3.2-3B', 'device': 'cuda'}
INFO:lm-eval:Using device 'cuda'


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

INFO:lm-eval:Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}


model.safetensors.index.json:   0%|          | 0.00/17.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/453M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at Shahrukh0/short-llama-3.2-3B and are newly initialized: ['model.layers.23.input_layernorm.weight', 'model.layers.23.mlp.down_proj.weight', 'model.layers.23.mlp.gate_proj.weight', 'model.layers.23.mlp.up_proj.weight', 'model.layers.23.post_attention_layernorm.weight', 'model.layers.23.self_attn.k_proj.weight', 'model.layers.23.self_attn.o_proj.weight', 'model.layers.23.self_attn.q_proj.weight', 'model.layers.23.self_attn.v_proj.weight', 'model.layers.24.input_layernorm.weight', 'model.layers.24.mlp.down_proj.weight', 'model.layers.24.mlp.gate_proj.weight', 'model.layers.24.mlp.up_proj.weight', 'model.layers.24.post_attention_layernorm.weight', 'model.layers.24.self_attn.k_proj.weight', 'model.layers.24.self_attn.o_proj.weight', 'model.layers.24.self_attn.q_proj.weight', 'model.layers.24.self_attn.v_proj.weight', 'model.layers.25.input_layernorm.weight', 'model.layers.25.mlp.down_proj.weight', 'model.layer

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

lambada_openai.py:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/5153 [00:00<?, ? examples/s]



README.md:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/281M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2662 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5153 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4869 [00:00<?, ? examples/s]



README.md:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

super_glue.py:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

INFO:lm-eval:Building contexts for arc_easy on rank 0...
100%|██████████| 2376/2376 [00:02<00:00, 1002.56it/s]
INFO:lm-eval:Building contexts for boolq on rank 0...
100%|██████████| 3270/3270 [00:02<00:00, 1420.13it/s]
INFO:lm-eval:Building contexts for lambada_standard on rank 0...
100%|██████████| 5153/5153 [00:09<00:00, 547.56it/s]
INFO:lm-eval:Building contexts for lambada_openai on rank 0...
100%|██████████| 5153/5153 [00:10<00:00, 513.39it/s]
INFO:lm-eval:Running loglikelihood requests
Running loglikelihood requests: 100%|██████████| 26347/26347 [18:41<00:00, 23.50it/s]


bootstrapping for stddev: perplexity


100%|██████████| 1/1 [00:00<00:00, 414.38it/s]


bootstrapping for stddev: perplexity


100%|██████████| 1/1 [00:00<00:00, 96.88it/s]


{'arc_easy': {'alias': 'arc_easy',
  'acc,none': 0.25084175084175087,
  'acc_stderr,none': 0.008895183010487386,
  'acc_norm,none': 0.25084175084175087,
  'acc_norm_stderr,none': 0.008895183010487386},
 'boolq': {'alias': 'boolq',
  'acc,none': 0.3782874617737003,
  'acc_stderr,none': 0.008482001133931},
 'lambada_openai': {'alias': 'lambada_openai',
  'perplexity,none': nan,
  'perplexity_stderr,none': nan,
  'acc,none': 0.0,
  'acc_stderr,none': 0.0},
 'lambada_standard': {'alias': 'lambada_standard',
  'perplexity,none': nan,
  'perplexity_stderr,none': nan,
  'acc,none': 0.0,
  'acc_stderr,none': 0.0}}

In [33]:
metrics_base= evaluate_hf_model("meta-llama/Llama-3.2-3B", tasks=tasks)
metrics_base

INFO:lm-eval:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
INFO:lm-eval:Initializing hf model, with arguments: {'pretrained': 'meta-llama/Llama-3.2-3B', 'device': 'cuda'}
INFO:lm-eval:Using device 'cuda'
INFO:lm-eval:Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:lm-eval:Building contexts for arc_easy on rank 0...
100%|██████████| 2376/2376 [00:02<00:00, 1005.96it/s]
INFO:lm-eval:Building contexts for boolq on rank 0...
100%|██████████| 3270/3270 [00:01<00:00, 1863.30it/s]
INFO:lm-eval:Building contexts for lambada_standard on rank 0...
100%|██████████| 5153/5153 [00:10<00:00, 514.21it/s]
INFO:lm-eval:Building contexts for lambada_openai on rank 0...
100%|██████████| 5153/5153 [00:09<00:00, 525.47it/s]
INFO:lm-eval:Running loglikelihood requests
Running loglikelihood requests:   0%|          | 45/26347 [00:34<4:20:46,  1.68it/s]

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_metrics(metrics_base, metrics_pruned):
    """
    Visualizes accuracy and perplexity before and after pruning.

    Parameters:
    - metrics_base: Dictionary of metrics for the base model
    - metrics_pruned: Dictionary of metrics for the pruned model
    """
    datasets = list(metrics_base.keys())

    # Extract accuracy values
    acc_base = [metrics_base[d].get('acc,none', None) for d in datasets]
    acc_pruned = [metrics_pruned[d].get('acc,none', None) for d in datasets]

    # Extract perplexity values (ignoring None values)
    datasets_ppl = [d for d in datasets if 'perplexity,none' in metrics_base[d]]
    ppl_base = [metrics_base[d]['perplexity,none'] for d in datasets_ppl]
    ppl_pruned = [metrics_pruned[d]['perplexity,none'] for d in datasets_ppl]

    # Bar Width
    bar_width = 0.35
    x = np.arange(len(datasets))

    # Plot Accuracy Comparison
    plt.figure(figsize=(8, 5))
    plt.bar(x - bar_width/2, acc_base, bar_width, label="Base Model", color='blue', alpha=0.7)
    plt.bar(x + bar_width/2, acc_pruned, bar_width, label="Pruned Model", color='red', alpha=0.7)
    plt.xticks(x, datasets, rotation=20)
    plt.ylabel("Accuracy")
    plt.title("Accuracy Comparison (Base vs. Pruned)")
    plt.legend()
    plt.show()

    # Plot Perplexity Comparison (Only for relevant datasets)
    if datasets_ppl:
        x_ppl = np.arange(len(datasets_ppl))
        plt.figure(figsize=(8, 5))
        plt.bar(x_ppl - bar_width/2, ppl_base, bar_width, label="Base Model", color='green', alpha=0.7)
        plt.bar(x_ppl + bar_width/2, ppl_pruned, bar_width, label="Pruned Model", color='orange', alpha=0.7)
        plt.xticks(x_ppl, datasets_ppl)
        plt.ylabel("Perplexity")
        plt.title("Perplexity Comparison (Base vs. Pruned)")
        plt.legend()
        plt.show()

# Call the function
visualize_metrics(metrics_base, metrics_pruned)