In [None]:
# Install libraries
!pip install transformers
!pip install torch
!git clone https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness && pip install -e .
!pip install deepspeed

In [None]:
# Import and deploy Qwen models
import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import sys
import deepspeed

sys.path.append("/content/lm-evaluation-harness")
from lm_eval import evaluator, tasks
from lm_eval.models.huggingface import HFLM

ds_config = {
    "fp16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu"
        }
    },
    "train_batch_size": 8,
    "train_micro_batch_size_per_gpu": 8
}

MODELS = [
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-7B"
]

TASKS = {
    "NLI": ["hellaswag"],
    "understanding": ["mmlu"],
    "code_generation": ["mbpp"]
}

results = {}

In [3]:
# Benchmark for Qwen
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

for model_name in MODELS:
    print(f"Evaluating {model_name}...")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float16
    )

    ds_engine = deepspeed.init_inference(
        model=model,
        mp_size=1,
        dtype=torch.float16,
        replace_with_kernel_inject=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    hf_model = HFLM(
        pretrained=ds_engine.module,
        tokenizer=tokenizer,
        batch_size=8,
        device="cuda"
    )

    model_results = {}

    for category, task_list in TASKS.items():
        print(f"Evaluating {category} tasks...")

        num_fewshot = 2 if category == "code_generation" else 0

        results_dict = evaluator.simple_evaluate(
            model=hf_model,
            tasks=task_list,
            num_fewshot=num_fewshot,
            batch_size=8,
            device="cuda",
            confirm_run_unsafe_code=True,
            gen_kwargs="temperature=0.1,top_p=0.95,max_length=512",
            random_seed=42,
            torch_random_seed=42,
            fewshot_random_seed=42
        )

        model_results[category] = results_dict

    results[model_name] = model_results

    del hf_model
    del ds_engine
    del model
    torch.cuda.empty_cache()

Evaluating Qwen/Qwen2.5-1.5B...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

[2025-04-15 11:16:39,161] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.5, git-hash=unknown, git-branch=unknown
[2025-04-15 11:16:39,164] [INFO] [logging.py:107:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]



Evaluating NLI tasks...


README.md:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

hellaswag.py:   0%|          | 0.00/4.36k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/2.53k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

Map:   0%|          | 0/39905 [00:00<?, ? examples/s]

Map:   0%|          | 0/10042 [00:00<?, ? examples/s]

100%|██████████| 10042/10042 [00:04<00:00, 2311.81it/s]
Running loglikelihood requests: 100%|██████████| 40168/40168 [03:23<00:00, 196.94it/s]


Evaluating understanding tasks...


README.md:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

mmlu_no_train.py:   0%|          | 0.00/5.86k [00:00<?, ?B/s]

data.tar:   0%|          | 0.00/166M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

100%|██████████| 270/270 [00:00<00:00, 658.05it/s]
100%|██████████| 100/100 [00:00<00:00, 663.17it/s]
100%|██████████| 235/235 [00:00<00:00, 661.25it/s]
100%|██████████| 100/100 [00:00<00:00, 663.30it/s]
100%|██████████| 145/145 [00:00<00:00, 663.64it/s]
100%|██████████| 100/100 [00:00<00:00, 673.03it/s]
100%|██████████| 203/203 [00:00<00:00, 670.33it/s]
100%|██████████| 135/135 [00:00<00:00, 658.53it/s]
100%|██████████| 310/310 [00:00<00:00, 673.39it/s]
100%|██████████| 100/100 [00:00<00:00, 671.21it/s]
100%|██████████| 378/378 [00:00<00:00, 665.45it/s]
100%|██████████| 144/144 [00:00<00:00, 672.82it/s]
100%|██████████| 151/151 [00:00<00:00, 670.33it/s]
100%|██████████| 100/100 [00:00<00:00, 665.92it/s]
100%|██████████| 102/102 [00:00<00:00, 651.08it/s]
100%|██████████| 152/152 [00:00<00:00, 658.43it/s]
100%|██████████| 112/112 [00:00<00:00, 668.07it/s]
100%|██████████| 216/216 [00:00<00:00, 657.88it/s]
100%|██████████| 100/100 [00:00<00:00, 660.75it/s]
100%|██████████| 223/223 [00:00

Evaluating code_generation tasks...


Downloading builder script:   0%|          | 0.00/9.18k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.06k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/87.2k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/116k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

prompt-00000-of-00001.parquet:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/374 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/90 [00:00<?, ? examples/s]

Generating prompt split:   0%|          | 0/10 [00:00<?, ? examples/s]

100%|██████████| 500/500 [00:02<00:00, 188.64it/s]
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:   0%|          | 1/500 [00:08<1:12:06,  8.67s/it]Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:   2%|▏         | 9/500 [00:12<09:15,  1.13s/it]  Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:

Evaluating Qwen/Qwen2.5-7B...


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

[2025-04-15 11:33:15,331] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.5, git-hash=unknown, git-branch=unknown
[2025-04-15 11:33:15,334] [INFO] [logging.py:107:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]



Evaluating NLI tasks...


100%|██████████| 10042/10042 [00:03<00:00, 2532.99it/s]
Running loglikelihood requests: 100%|██████████| 40168/40168 [05:01<00:00, 133.16it/s]


Evaluating understanding tasks...


100%|██████████| 270/270 [00:00<00:00, 648.86it/s]
100%|██████████| 100/100 [00:00<00:00, 667.76it/s]
100%|██████████| 235/235 [00:00<00:00, 658.56it/s]
100%|██████████| 100/100 [00:00<00:00, 664.79it/s]
100%|██████████| 145/145 [00:00<00:00, 668.10it/s]
100%|██████████| 100/100 [00:00<00:00, 665.01it/s]
100%|██████████| 203/203 [00:00<00:00, 662.94it/s]
100%|██████████| 135/135 [00:00<00:00, 655.40it/s]
100%|██████████| 310/310 [00:00<00:00, 660.59it/s]
100%|██████████| 100/100 [00:00<00:00, 659.95it/s]
100%|██████████| 378/378 [00:00<00:00, 666.42it/s]
100%|██████████| 144/144 [00:00<00:00, 670.09it/s]
100%|██████████| 151/151 [00:00<00:00, 658.77it/s]
100%|██████████| 100/100 [00:00<00:00, 672.18it/s]
100%|██████████| 102/102 [00:00<00:00, 657.56it/s]
100%|██████████| 152/152 [00:00<00:00, 670.19it/s]
100%|██████████| 112/112 [00:00<00:00, 675.95it/s]
100%|██████████| 216/216 [00:00<00:00, 643.51it/s]
100%|██████████| 100/100 [00:00<00:00, 641.93it/s]
100%|██████████| 223/223 [00:00

Evaluating code_generation tasks...


100%|██████████| 500/500 [00:02<00:00, 187.55it/s]
Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:   0%|          | 1/500 [00:19<2:38:02, 19.00s/it]Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:   2%|▏         | 9/500 [00:22<16:01,  1.96s/it]  Both `max_new_tokens` (=2048) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Running generate_until requests:

In [4]:
# Value Function
def get_metric_value(task_results, metric_name):
    formats = [
        f"{metric_name},none",
        metric_name
    ]

    for fmt in formats:
        if fmt in task_results:
            return task_results[fmt]
    return None

In [5]:
# Print Model Evaluation Results
print("\n============= Model Evaluation Results with 7B vs 1.5B Improvement =============")

# Determine all task categories
all_categories = set()
for model_results in results.values():
    all_categories.update(model_results.keys())

# Get models in correct order for comparison
models = list(results.keys())
model_headers = [model.split('/')[-1] for model in models]  # Only take the last part of model names

# Ensure we have exactly 2 models for comparison
if len(models) != 2:
    print("Warning: Expected exactly 2 models for comparison")

# Find the indices for the 1.5B and 7B models
model_1_5B_idx = -1
model_7B_idx = -1
for i, header in enumerate(model_headers):
    if "1.5B" in header:
        model_1_5B_idx = i
    elif "7B" in header:
        model_7B_idx = i

# Print headers
header = "Task/Metric".ljust(25)
for model_header in model_headers:
    header += model_header.ljust(20)
header += "Improvement(pp)".ljust(20)  # Add improvement column
print(header)
print("-" * (25 + 20 * len(models) + 20))  # Extend line for new column

# Process and print results for all categories
for category in sorted(all_categories):
    print(f"\n【{category}】")

    # Collect all tasks in this category
    category_tasks = set()
    for model in models:
        if category in results[model] and 'results' in results[model][category]:
            category_tasks.update(results[model][category]['results'].keys())

    # Print results for each task
    for task in sorted(category_tasks):
        # Special handling for hellaswag - distinguish between standard and normalized
        if task == "hellaswag":
            # Standard acc
            task_line = f"  {task} (standard)".ljust(25)
            model_values = []

            for model in models:
                if (category in results[model] and
                    'results' in results[model][category] and
                    task in results[model][category]['results']):
                    task_results = results[model][category]['results'][task]

                    # Get value and stderr
                    value = get_metric_value(task_results, "acc")
                    stderr = get_metric_value(task_results, "acc_stderr")

                    if value is not None:
                        # Store raw value for improvement calculation
                        model_values.append(value)
                        # Convert to percentage for display
                        score = f"{value*100:.2f}% ± {stderr*100:.2f}%" if stderr else f"{value*100:.2f}%"
                    else:
                        model_values.append(None)
                        score = "N/A"
                    task_line += score.ljust(20)
                else:
                    model_values.append(None)
                    task_line += "N/A".ljust(20)

            # Calculate improvement (in percentage points)
            if len(model_values) >= 2 and model_values[model_1_5B_idx] is not None and model_values[model_7B_idx] is not None:
                improvement = (model_values[model_7B_idx] - model_values[model_1_5B_idx]) * 100
                task_line += f"+{improvement:.2f}pp".ljust(20) if improvement >= 0 else f"{improvement:.2f}pp".ljust(20)
            else:
                task_line += "N/A".ljust(20)

            print(task_line)

            # Normalized acc
            task_line = f"  {task} (normalized)".ljust(25)
            model_values = []

            for model in models:
                if (category in results[model] and
                    'results' in results[model][category] and
                    task in results[model][category]['results']):
                    task_results = results[model][category]['results'][task]

                    # Get value and stderr
                    value = get_metric_value(task_results, "acc_norm")
                    stderr = get_metric_value(task_results, "acc_norm_stderr")

                    if value is not None:
                        # Store raw value for improvement calculation
                        model_values.append(value)
                        # Convert to percentage for display
                        score = f"{value*100:.2f}% ± {stderr*100:.2f}%" if stderr else f"{value*100:.2f}%"
                    else:
                        model_values.append(None)
                        score = "N/A"
                    task_line += score.ljust(20)
                else:
                    model_values.append(None)
                    task_line += "N/A".ljust(20)

            # Calculate improvement (in percentage points)
            if len(model_values) >= 2 and model_values[model_1_5B_idx] is not None and model_values[model_7B_idx] is not None:
                improvement = (model_values[model_7B_idx] - model_values[model_1_5B_idx]) * 100
                task_line += f"+{improvement:.2f}pp".ljust(20) if improvement >= 0 else f"{improvement:.2f}pp".ljust(20)
            else:
                task_line += "N/A".ljust(20)

            print(task_line)

        # Special handling for mbpp
        elif task == "mbpp":
            task_line = f"  {task} (pass@1)".ljust(25)
            model_values = []

            for model in models:
                if (category in results[model] and
                    'results' in results[model][category] and
                    task in results[model][category]['results']):
                    task_results = results[model][category]['results'][task]

                    # Get value and stderr
                    value = get_metric_value(task_results, "pass_at_1")
                    stderr = get_metric_value(task_results, "pass_at_1_stderr")

                    if value is not None:
                        # Store raw value for improvement calculation
                        model_values.append(value)
                        # Convert to percentage for display
                        score = f"{value*100:.2f}% ± {stderr*100:.2f}%" if stderr else f"{value*100:.2f}%"
                    else:
                        model_values.append(None)
                        score = "N/A"
                    task_line += score.ljust(20)
                else:
                    model_values.append(None)
                    task_line += "N/A".ljust(20)

            # Calculate improvement (in percentage points)
            if len(model_values) >= 2 and model_values[model_1_5B_idx] is not None and model_values[model_7B_idx] is not None:
                improvement = (model_values[model_7B_idx] - model_values[model_1_5B_idx]) * 100
                task_line += f"+{improvement:.2f}pp".ljust(20) if improvement >= 0 else f"{improvement:.2f}pp".ljust(20)
            else:
                task_line += "N/A".ljust(20)

            print(task_line)

        # Handle MMLU and other tasks that use acc
        else:
            task_line = f"  {task}".ljust(25)
            model_values = []

            for model in models:
                if (category in results[model] and
                    'results' in results[model][category] and
                    task in results[model][category]['results']):
                    task_results = results[model][category]['results'][task]

                    # Get value and stderr
                    value = get_metric_value(task_results, "acc")
                    stderr = get_metric_value(task_results, "acc_stderr")

                    if value is not None:
                        # Store raw value for improvement calculation
                        model_values.append(value)
                        # Convert to percentage for display
                        score = f"{value*100:.2f}% ± {stderr*100:.2f}%" if stderr else f"{value*100:.2f}%"
                    else:
                        model_values.append(None)
                        score = "N/A"
                    task_line += score.ljust(20)
                else:
                    model_values.append(None)
                    task_line += "N/A".ljust(20)

            # Calculate improvement (in percentage points)
            if len(model_values) >= 2 and model_values[model_1_5B_idx] is not None and model_values[model_7B_idx] is not None:
                improvement = (model_values[model_7B_idx] - model_values[model_1_5B_idx]) * 100
                task_line += f"+{improvement:.2f}pp".ljust(20) if improvement >= 0 else f"{improvement:.2f}pp".ljust(20)
            else:
                task_line += "N/A".ljust(20)

            print(task_line)


Task/Metric              Qwen2.5-1.5B        Qwen2.5-7B          Improvement(pp)     
-------------------------------------------------------------------------------------

【NLI】
  hellaswag (standard)   50.24% ± 0.50%      60.01% ± 0.49%      +9.77pp             
  hellaswag (normalized) 67.75% ± 0.47%      78.93% ± 0.41%      +11.18pp            

【code_generation】
  mbpp (pass@1)          46.00% ± 2.23%      62.20% ± 2.17%      +16.20pp            

【understanding】
  mmlu                   59.74% ± 0.39%      71.90% ± 0.35%      +12.16pp            
  mmlu_abstract_algebra  35.00% ± 4.79%      54.00% ± 5.01%      +19.00pp            
  mmlu_anatomy           51.85% ± 4.32%      71.85% ± 3.89%      +20.00pp            
  mmlu_astronomy         71.05% ± 3.69%      83.55% ± 3.02%      +12.50pp            
  mmlu_business_ethics   61.00% ± 4.90%      76.00% ± 4.29%      +15.00pp            
  mmlu_clinical_knowledge68.30% ± 2.86%      76.98% ± 2.59%      +8.68pp             
  mmlu_col