# Preparation

In [None]:
"""
# Download model
huggingface-cli download tiiuae/Falcon3-3B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-3B-Instruct-1.58bit
# Compile
cmake -S . -B build -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
  -DGGML_NATIVE=OFF \
  -DGGML_AVX=ON -DGGML_AVX2=ON \
  -DGGML_AVX512=ON \
  -DGGML_AVX512_VNNI=ON \
  -DGGML_AVX512_BF16=ON \
  -DGGML_AVX512_VBMI=OFF
python setup_env.py -md ~/models/tiiuae/Falcon3-3B-Instruct-1.58bit -q i2_s --quant-embd
"""

# Importing Libraries

In [None]:
import gc
from dataclasses import dataclass
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import set_seed, user_prompt, system_prompt
from speculative_decoding import BitNet

# Configuration

In [None]:
@dataclass
class CONFIG:
    """Tunable settings for this notebook: model locations, generation and runtime.

    Every setting is a real dataclass field, so any of them can be overridden
    at construction time, e.g. ``CONFIG(seed=0, device="cuda")``.
    """

    # Debug
    debug: bool = True    # gates the two baseline generation cells
    verbose: bool = True  # forwarded as `verbose=` to several BitNet calls

    # Model
    ## Tokenizer
    tokenizer_id: str = "tiiuae/Falcon3-1B-Instruct"
    ## HuggingFace
    # NOTE(review): absolute, user-specific paths; adjust for your machine.
    model_path: str       = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct"  # 3B
    model_small_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct"  # 1B
    ## GGUF (1bit)
    bitnet_path: str = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct-1.58bit/ggml-model-i2_s.gguf" # 3B
    ctx_size: int = 256   # passed to BitNet.start_server

    # Generation
    max_new_tokens: int = 100
    ## Speculative Decoding
    num_assistant_tokens: int = 10
    assistant_early_exit: int = 4

    # Device
    n_threads: int = 12   # passed to BitNet.start_server
    # Annotated so it is a dataclass field. Declared after n_threads so the
    # positional order of the previously existing fields does not change.
    device: str = "cpu"

    # Seed
    # Annotated for the same reason as `device`; without the annotation it was
    # a plain class attribute and could not be set through the constructor.
    seed: int = 42

config = CONFIG()

In [None]:
# Seed with the project's set_seed helper (imported from utils) before any model
# is loaded or any text is generated.
# NOTE(review): which RNGs set_seed covers is not visible from this notebook.
set_seed(config.seed)

# Model

In [None]:
# Create the project-local BitNet wrapper and initialise it in three steps.
# The calls are kept in this order; later cells use all three pieces.
bitnet = BitNet()
# 1) GGUF side: receives the 1.58-bit GGUF file path, context size and thread count.
bitnet.start_server(
    bitnet_path=config.bitnet_path,
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    verbose=config.verbose
)
# 2) Tokenizer: loaded from the tokenizer id in the config (verbosity forced off).
bitnet.init_tokenizer(
    tokenizer_id=config.tokenizer_id,
    verbose=False
)
# 3) HuggingFace side: the 3B checkpoint at config.model_path on config.device
#    (verbosity forced on, independent of config.verbose).
bitnet.init_model(
    model_path=config.model_path,
    device=config.device,
    verbose=True
)

# Generation

In [None]:
# Encode the system and user prompts (both imported from utils) into token ids
# with the wrapper's Falcon prompt encoder, and show them.
input_ids = bitnet.encode_falcon_prompt(
    system_prompt=system_prompt,
    user_prompt=user_prompt
)
print(input_ids)
# Load a standalone tokenizer from the same tokenizer id and decode the ids back,
# keeping special tokens, so the full prompt can be inspected.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_id)
# NOTE(review): despite its name, `input_tokens` is the output of tokenizer.decode
# (presumably the prompt as text); it is what the generate_hf cells receive.
input_tokens = tokenizer.decode(input_ids, skip_special_tokens=False)
print(input_tokens)

In [None]:
# 3B (1bit)
# Baseline run through the wrapper's GGUF generation path; only executed when
# config.debug is set. Takes the token ids, not the decoded prompt.
if config.debug:
    bitnet.generate_gguf(
        input_ids=input_ids,
        max_new_tokens=config.max_new_tokens,
        verbose=config.verbose
    )
    # Emit a newline after the generation call's output.
    print()

In [None]:
# 3B (fp32)
# Baseline run through the wrapper's HuggingFace generation path with streaming
# enabled; only executed when config.debug is set. Takes the decoded prompt
# (`input_tokens`), not the token ids.
if config.debug:
    bitnet.generate_hf(
        input_tokens=input_tokens,
        max_new_tokens=config.max_new_tokens,
        stream=True,
        verbose=config.verbose
    )

In [None]:
# Free memory between experiments: run Python garbage collection, then ask
# PyTorch to release its cached CUDA memory.
# NOTE(review): config.device is "cpu" here, so the CUDA call presumably only
# matters if the device is switched to a GPU.
gc.collect()
torch.cuda.empty_cache()

## Speculative Decoding

In [None]:
# 32bit-1bit
# Speculative decoding without passing `small_model`.
# NOTE(review): the sweep cell under "Graph" labels small_model=None as the
# "GGUF draft" configuration; this call presumably relies on that default.
bitnet.speculative_decode(
    input_ids=input_ids,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    verbose=True
)
# Free memory before the next experiment.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the 1B HuggingFace checkpoint in float32 on config.device with the eager
# attention implementation; the following cells pass it as `small_model`.
small_model = AutoModelForCausalLM.from_pretrained(
    config.model_small_path,
    device_map=config.device,
    dtype=torch.float32,
    attn_implementation="eager"
)

In [None]:
# 3B-1B
# Same speculative-decoding call as the previous experiment, but with the 1B
# HuggingFace model supplied as `small_model`.
bitnet.speculative_decode(
    small_model=small_model,
    input_ids=input_ids,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    verbose=True
)
# Free memory before the next experiment.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# 3B-1B, HuggingFace
# HuggingFace generation path with the 1B model passed as `small_model` and the
# configured number of assistant tokens; output is streamed. Takes the decoded
# prompt (`input_tokens`) rather than token ids.
bitnet.generate_hf(
    small_model=small_model,
    input_tokens=input_tokens,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    stream=True,
    verbose=True
)
# Free memory before the next experiment.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# 3B(early exit), HuggingFace
# HuggingFace generation path without a separate small model; instead
# `assistant_early_exit` is passed from the config.
# NOTE(review): presumably the 3B model drafts from an early layer; confirm in
# BitNet.generate_hf.
bitnet.generate_hf(
    input_tokens=input_tokens,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    assistant_early_exit=config.assistant_early_exit,
    stream=True,
    verbose=True
)
# Free memory before the next experiment.
gc.collect()
torch.cuda.empty_cache()

## Graph

In [None]:
# ---------------------------------------------------
# Experiment setup
# ---------------------------------------------------
num_assistant_tokens_list = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# Use the notebook-wide generation budget instead of a duplicated literal.
MAX_NEW_TOKENS = config.max_new_tokens


def sweep_num_assistant_tokens(draft_model=None):
    """Run speculative decoding once per value in ``num_assistant_tokens_list``.

    Uses the notebook-level ``bitnet`` wrapper and ``input_ids`` prompt.

    Args:
        draft_model: Forwarded as ``small_model`` to ``bitnet.speculative_decode``.
            ``None`` is the "GGUF draft" configuration of this notebook.

    Returns:
        A list with one dict per run, in sweep order, holding the keys
        ``"num_assistant_tokens"``, ``"latency"`` and ``"acceptance"``
        (the latter two read from the ``"latency"`` and ``"acceptance_rate"``
        entries of the decode result and converted to ``float``).
    """
    rows = []
    for n_draft in num_assistant_tokens_list:
        out = bitnet.speculative_decode(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            num_assistant_tokens=n_draft,
            small_model=draft_model,
            verbose=False
        )
        rows.append({
            "num_assistant_tokens": n_draft,
            "latency": float(out["latency"]),
            "acceptance": float(out["acceptance_rate"])
        })
    return rows


# ---------------------------------------------------
# 1) GGUF draft (no small_model)
# ---------------------------------------------------
results_no_small = sweep_num_assistant_tokens(None)

# ---------------------------------------------------
# 2) HF small model draft
# ---------------------------------------------------
results_small = sweep_num_assistant_tokens(small_model)

# ---------------------------------------------------
# Summarise results
# ---------------------------------------------------
print("\n=== GGUF Draft Results ===")
pprint(results_no_small)
print("\n=== HF Small Model Draft Results ===")
pprint(results_small)

# Convert to numpy arrays for plotting
xs = np.array(num_assistant_tokens_list)
lat_no_small = np.array([r["latency"] for r in results_no_small])
lat_small = np.array([r["latency"] for r in results_small])
acc_no_small = np.array([r["acceptance"] for r in results_no_small])
acc_small = np.array([r["acceptance"] for r in results_small])

# ---------------------------------------------------
# Two stacked plots (Latency / Acceptance Rate)
# ---------------------------------------------------
# NOTE(review): the axis units (ms/token, %) come from the original labels;
# confirm them against what speculative_decode actually returns.
fig, (ax_latency, ax_accept) = plt.subplots(2, 1, figsize=(9, 8))

# Latency plot
ax_latency.plot(xs, lat_no_small, marker='o', label="GGUF Draft (no small model)")
ax_latency.plot(xs, lat_small, marker='s', linestyle='--', label="HF Small Model Draft")
ax_latency.set_title("Latency vs. num_assistant_tokens")
ax_latency.set_xlabel("num_assistant_tokens")
ax_latency.set_ylabel("Latency (ms/token)")
ax_latency.legend()
ax_latency.grid(True, linestyle='--', linewidth=0.5)

# Acceptance-rate plot
ax_accept.plot(xs, acc_no_small, marker='o', label="GGUF Draft (no small model)")
ax_accept.plot(xs, acc_small, marker='s', linestyle='--', label="HF Small Model Draft")
ax_accept.set_title("Acceptance Rate vs. num_assistant_tokens")
ax_accept.set_xlabel("num_assistant_tokens")
ax_accept.set_ylabel("Acceptance Rate (%)")
ax_accept.legend()
ax_accept.grid(True, linestyle='--', linewidth=0.5)

fig.tight_layout()
plt.show()

# Evaluation

In [None]:
"""
lm_eval --model hf \
    --model_args pretrained=/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct,trust_remote_code=True \
    --tasks mmlu,hellaswag,gsm8k_cot,arc_easy \
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size 4 \
    --seed 1234 \
    --apply_chat_template \
    --output_path results \
    --log_samples \
    --wandb_args project=lm-eval-harness-integration

lm_eval --model hf \
    --model_args pretrained=/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct,trust_remote_code=True \
    --tasks mmlu,hellaswag,gsm8k_cot,arc_easy \
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size 4 \
    --seed 1234 \
    --apply_chat_template \
    --output_path results \
    --log_samples \
    --wandb_args project=lm-eval-harness-integration

lm_eval --model hf \
    --model_args pretrained=tiiuae/Falcon3-3B-Instruct-1.58bit,trust_remote_code=True \
    --tasks mmlu,hellaswag,gsm8k_cot,arc_easy \
    --num_fewshot 5 \
    --device cuda:0 \
    --batch_size 4 \
    --seed 1234 \
    --apply_chat_template \
    --output_path results \
    --log_samples \
    --wandb_args project=lm-eval-harness-integration
"""

In [None]:
"""
hf (pretrained=/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 4
|                 Tasks                 |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_easy                               |      1|none            |     5|acc        |↑  |0.7639|±  |0.0087|
|                                       |       |none            |     5|acc_norm   |↑  |0.7719|±  |0.0086|
|gsm8k_cot                              |      3|flexible-extract|     5|exact_match|↑  |0.7013|±  |0.0126|
|                                       |       |strict-match    |     5|exact_match|↑  |0.7005|±  |0.0126|
|hellaswag                              |      1|none            |     5|acc        |↑  |0.5227|±  |0.0050|
|                                       |       |none            |     5|acc_norm   |↑  |0.6846|±  |0.0046|
|mmlu                                   |      2|none            |      |acc        |↑  |0.5631|±  |0.0040|
| - humanities                          |      2|none            |      |acc        |↑  |0.4869|±  |0.0068|
|  - formal_logic                       |      1|none            |     5|acc        |↑  |0.4524|±  |0.0445|
|  - high_school_european_history       |      1|none            |     5|acc        |↑  |0.6606|±  |0.0370|
|  - high_school_us_history             |      1|none            |     5|acc        |↑  |0.7157|±  |0.0317|
|  - high_school_world_history          |      1|none            |     5|acc        |↑  |0.7300|±  |0.0289|
|  - international_law                  |      1|none            |     5|acc        |↑  |0.7190|±  |0.0410|
|  - jurisprudence                      |      1|none            |     5|acc        |↑  |0.7037|±  |0.0441|
|  - logical_fallacies                  |      1|none            |     5|acc        |↑  |0.7239|±  |0.0351|
|  - moral_disputes                     |      1|none            |     5|acc        |↑  |0.6416|±  |0.0258|
|  - moral_scenarios                    |      1|none            |     5|acc        |↑  |0.2235|±  |0.0139|
|  - philosophy                         |      1|none            |     5|acc        |↑  |0.6463|±  |0.0272|
|  - prehistory                         |      1|none            |     5|acc        |↑  |0.6049|±  |0.0272|
|  - professional_law                   |      1|none            |     5|acc        |↑  |0.3748|±  |0.0124|
|  - world_religions                    |      1|none            |     5|acc        |↑  |0.7661|±  |0.0325|
| - other                               |      2|none            |      |acc        |↑  |0.6154|±  |0.0086|
|  - business_ethics                    |      1|none            |     5|acc        |↑  |0.5700|±  |0.0498|
|  - clinical_knowledge                 |      1|none            |     5|acc        |↑  |0.5962|±  |0.0302|
|  - college_medicine                   |      1|none            |     5|acc        |↑  |0.5954|±  |0.0374|
|  - global_facts                       |      1|none            |     5|acc        |↑  |0.3700|±  |0.0485|
|  - human_aging                        |      1|none            |     5|acc        |↑  |0.6457|±  |0.0321|
|  - management                         |      1|none            |     5|acc        |↑  |0.6990|±  |0.0454|
|  - marketing                          |      1|none            |     5|acc        |↑  |0.7821|±  |0.0270|
|  - medical_genetics                   |      1|none            |     5|acc        |↑  |0.6500|±  |0.0479|
|  - miscellaneous                      |      1|none            |     5|acc        |↑  |0.7011|±  |0.0164|
|  - nutrition                          |      1|none            |     5|acc        |↑  |0.6111|±  |0.0279|
|  - professional_accounting            |      1|none            |     5|acc        |↑  |0.4681|±  |0.0298|
|  - professional_medicine              |      1|none            |     5|acc        |↑  |0.5257|±  |0.0303|
|  - virology                           |      1|none            |     5|acc        |↑  |0.4940|±  |0.0389|
| - social sciences                     |      2|none            |      |acc        |↑  |0.6422|±  |0.0085|
|  - econometrics                       |      1|none            |     5|acc        |↑  |0.5088|±  |0.0470|
|  - high_school_geography              |      1|none            |     5|acc        |↑  |0.6869|±  |0.0330|
|  - high_school_government_and_politics|      1|none            |     5|acc        |↑  |0.6736|±  |0.0338|
|  - high_school_macroeconomics         |      1|none            |     5|acc        |↑  |0.5513|±  |0.0252|
|  - high_school_microeconomics         |      1|none            |     5|acc        |↑  |0.6681|±  |0.0306|
|  - high_school_psychology             |      1|none            |     5|acc        |↑  |0.7468|±  |0.0186|
|  - human_sexuality                    |      1|none            |     5|acc        |↑  |0.6183|±  |0.0426|
|  - professional_psychology            |      1|none            |     5|acc        |↑  |0.5441|±  |0.0201|
|  - public_relations                   |      1|none            |     5|acc        |↑  |0.5818|±  |0.0472|
|  - security_studies                   |      1|none            |     5|acc        |↑  |0.6980|±  |0.0294|
|  - sociology                          |      1|none            |     5|acc        |↑  |0.7413|±  |0.0310|
|  - us_foreign_policy                  |      1|none            |     5|acc        |↑  |0.7300|±  |0.0446|
| - stem                                |      2|none            |      |acc        |↑  |0.5480|±  |0.0086|
|  - abstract_algebra                   |      1|none            |     5|acc        |↑  |0.3000|±  |0.0461|
|  - anatomy                            |      1|none            |     5|acc        |↑  |0.5333|±  |0.0431|
|  - astronomy                          |      1|none            |     5|acc        |↑  |0.6908|±  |0.0376|
|  - college_biology                    |      1|none            |     5|acc        |↑  |0.7222|±  |0.0375|
|  - college_chemistry                  |      1|none            |     5|acc        |↑  |0.5500|±  |0.0500|
|  - college_computer_science           |      1|none            |     5|acc        |↑  |0.5200|±  |0.0502|
|  - college_mathematics                |      1|none            |     5|acc        |↑  |0.3800|±  |0.0488|
|  - college_physics                    |      1|none            |     5|acc        |↑  |0.4412|±  |0.0494|
|  - computer_security                  |      1|none            |     5|acc        |↑  |0.7500|±  |0.0435|
|  - conceptual_physics                 |      1|none            |     5|acc        |↑  |0.6340|±  |0.0315|
|  - electrical_engineering             |      1|none            |     5|acc        |↑  |0.6345|±  |0.0401|
|  - elementary_mathematics             |      1|none            |     5|acc        |↑  |0.4815|±  |0.0257|
|  - high_school_biology                |      1|none            |     5|acc        |↑  |0.6968|±  |0.0261|
|  - high_school_chemistry              |      1|none            |     5|acc        |↑  |0.5665|±  |0.0349|
|  - high_school_computer_science       |      1|none            |     5|acc        |↑  |0.6700|±  |0.0473|
|  - high_school_mathematics            |      1|none            |     5|acc        |↑  |0.3704|±  |0.0294|
|  - high_school_physics                |      1|none            |     5|acc        |↑  |0.4636|±  |0.0407|
|  - high_school_statistics             |      1|none            |     5|acc        |↑  |0.5139|±  |0.0341|
|  - machine_learning                   |      1|none            |     5|acc        |↑  |0.4464|±  |0.0472|

|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      2|none  |      |acc   |↑  |0.5631|±  |0.0040|
| - humanities     |      2|none  |      |acc   |↑  |0.4869|±  |0.0068|
| - other          |      2|none  |      |acc   |↑  |0.6154|±  |0.0086|
| - social sciences|      2|none  |      |acc   |↑  |0.6422|±  |0.0085|
| - stem           |      2|none  |      |acc   |↑  |0.5480|±  |0.0086|
"""

In [None]:
"""
hf (pretrained=/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 4
|                 Tasks                 |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_easy                               |      1|none            |     5|acc        |↑  |0.7378|±  |0.0090|
|                                       |       |none            |     5|acc_norm   |↑  |0.7285|±  |0.0091|
|gsm8k_cot                              |      3|flexible-extract|     5|exact_match|↑  |0.4071|±  |0.0135|
|                                       |       |strict-match    |     5|exact_match|↑  |0.4026|±  |0.0135|
|hellaswag                              |      1|none            |     5|acc        |↑  |0.4648|±  |0.0050|
|                                       |       |none            |     5|acc_norm   |↑  |0.6009|±  |0.0049|
|mmlu                                   |      2|none            |      |acc        |↑  |0.4474|±  |0.0041|
| - humanities                          |      2|none            |      |acc        |↑  |0.4049|±  |0.0069|
|  - formal_logic                       |      1|none            |     5|acc        |↑  |0.3333|±  |0.0422|
|  - high_school_european_history       |      1|none            |     5|acc        |↑  |0.6000|±  |0.0383|
|  - high_school_us_history             |      1|none            |     5|acc        |↑  |0.5588|±  |0.0348|
|  - high_school_world_history          |      1|none            |     5|acc        |↑  |0.6160|±  |0.0317|
|  - international_law                  |      1|none            |     5|acc        |↑  |0.5537|±  |0.0454|
|  - jurisprudence                      |      1|none            |     5|acc        |↑  |0.4630|±  |0.0482|
|  - logical_fallacies                  |      1|none            |     5|acc        |↑  |0.4847|±  |0.0393|
|  - moral_disputes                     |      1|none            |     5|acc        |↑  |0.5231|±  |0.0269|
|  - moral_scenarios                    |      1|none            |     5|acc        |↑  |0.2346|±  |0.0142|
|  - philosophy                         |      1|none            |     5|acc        |↑  |0.4502|±  |0.0283|
|  - prehistory                         |      1|none            |     5|acc        |↑  |0.5031|±  |0.0278|
|  - professional_law                   |      1|none            |     5|acc        |↑  |0.3357|±  |0.0121|
|  - world_religions                    |      1|none            |     5|acc        |↑  |0.5789|±  |0.0379|
| - other                               |      2|none            |      |acc        |↑  |0.4976|±  |0.0087|
|  - business_ethics                    |      1|none            |     5|acc        |↑  |0.5000|±  |0.0503|
|  - clinical_knowledge                 |      1|none            |     5|acc        |↑  |0.5396|±  |0.0307|
|  - college_medicine                   |      1|none            |     5|acc        |↑  |0.4277|±  |0.0377|
|  - global_facts                       |      1|none            |     5|acc        |↑  |0.2600|±  |0.0441|
|  - human_aging                        |      1|none            |     5|acc        |↑  |0.4574|±  |0.0334|
|  - management                         |      1|none            |     5|acc        |↑  |0.6602|±  |0.0469|
|  - marketing                          |      1|none            |     5|acc        |↑  |0.7009|±  |0.0300|
|  - medical_genetics                   |      1|none            |     5|acc        |↑  |0.5100|±  |0.0502|
|  - miscellaneous                      |      1|none            |     5|acc        |↑  |0.5785|±  |0.0177|
|  - nutrition                          |      1|none            |     5|acc        |↑  |0.5261|±  |0.0286|
|  - professional_accounting            |      1|none            |     5|acc        |↑  |0.3475|±  |0.0284|
|  - professional_medicine              |      1|none            |     5|acc        |↑  |0.3162|±  |0.0282|
|  - virology                           |      1|none            |     5|acc        |↑  |0.4217|±  |0.0384|
| - social sciences                     |      2|none            |      |acc        |↑  |0.5180|±  |0.0088|
|  - econometrics                       |      1|none            |     5|acc        |↑  |0.3246|±  |0.0440|
|  - high_school_geography              |      1|none            |     5|acc        |↑  |0.5960|±  |0.0350|
|  - high_school_government_and_politics|      1|none            |     5|acc        |↑  |0.5959|±  |0.0354|
|  - high_school_macroeconomics         |      1|none            |     5|acc        |↑  |0.4154|±  |0.0250|
|  - high_school_microeconomics         |      1|none            |     5|acc        |↑  |0.4790|±  |0.0324|
|  - high_school_psychology             |      1|none            |     5|acc        |↑  |0.6239|±  |0.0208|
|  - human_sexuality                    |      1|none            |     5|acc        |↑  |0.5191|±  |0.0438|
|  - professional_psychology            |      1|none            |     5|acc        |↑  |0.4020|±  |0.0198|
|  - public_relations                   |      1|none            |     5|acc        |↑  |0.5909|±  |0.0471|
|  - security_studies                   |      1|none            |     5|acc        |↑  |0.5510|±  |0.0318|
|  - sociology                          |      1|none            |     5|acc        |↑  |0.6368|±  |0.0340|
|  - us_foreign_policy                  |      1|none            |     5|acc        |↑  |0.6600|±  |0.0476|
| - stem                                |      2|none            |      |acc        |↑  |0.3923|±  |0.0086|
|  - abstract_algebra                   |      1|none            |     5|acc        |↑  |0.2200|±  |0.0416|
|  - anatomy                            |      1|none            |     5|acc        |↑  |0.4296|±  |0.0428|
|  - astronomy                          |      1|none            |     5|acc        |↑  |0.4868|±  |0.0407|
|  - college_biology                    |      1|none            |     5|acc        |↑  |0.4167|±  |0.0412|
|  - college_chemistry                  |      1|none            |     5|acc        |↑  |0.3500|±  |0.0479|
|  - college_computer_science           |      1|none            |     5|acc        |↑  |0.4500|±  |0.0500|
|  - college_mathematics                |      1|none            |     5|acc        |↑  |0.3300|±  |0.0473|
|  - college_physics                    |      1|none            |     5|acc        |↑  |0.3137|±  |0.0462|
|  - computer_security                  |      1|none            |     5|acc        |↑  |0.5700|±  |0.0498|
|  - conceptual_physics                 |      1|none            |     5|acc        |↑  |0.4043|±  |0.0321|
|  - electrical_engineering             |      1|none            |     5|acc        |↑  |0.4966|±  |0.0417|
|  - elementary_mathematics             |      1|none            |     5|acc        |↑  |0.3122|±  |0.0239|
|  - high_school_biology                |      1|none            |     5|acc        |↑  |0.5516|±  |0.0283|
|  - high_school_chemistry              |      1|none            |     5|acc        |↑  |0.3793|±  |0.0341|
|  - high_school_computer_science       |      1|none            |     5|acc        |↑  |0.4800|±  |0.0502|
|  - high_school_mathematics            |      1|none            |     5|acc        |↑  |0.2741|±  |0.0272|
|  - high_school_physics                |      1|none            |     5|acc        |↑  |0.2649|±  |0.0360|
|  - high_school_statistics             |      1|none            |     5|acc        |↑  |0.3843|±  |0.0332|
|  - machine_learning                   |      1|none            |     5|acc        |↑  |0.3839|±  |0.0462|

|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      2|none  |      |acc   |↑  |0.4474|±  |0.0041|
| - humanities     |      2|none  |      |acc   |↑  |0.4049|±  |0.0069|
| - other          |      2|none  |      |acc   |↑  |0.4976|±  |0.0087|
| - social sciences|      2|none  |      |acc   |↑  |0.5180|±  |0.0088|
| - stem           |      2|none  |      |acc   |↑  |0.3923|±  |0.0086|
"""

In [None]:
"""
hf (pretrained=tiiuae/Falcon3-3B-Instruct-1.58bit,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 4
|                 Tasks                 |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_easy                               |      1|none            |     5|acc        |↑  |0.5741|±  |0.0101|
|                                       |       |none            |     5|acc_norm   |↑  |0.5303|±  |0.0102|
|gsm8k_cot                              |      3|flexible-extract|     5|exact_match|↑  |0.0591|±  |0.0065|
|                                       |       |strict-match    |     5|exact_match|↑  |0.3427|±  |0.0131|
|hellaswag                              |      1|none            |     5|acc        |↑  |0.3919|±  |0.0049|
|                                       |       |none            |     5|acc_norm   |↑  |0.5008|±  |0.0050|
|mmlu                                   |      2|none            |      |acc        |↑  |0.2425|±  |0.0036|
| - humanities                          |      2|none            |      |acc        |↑  |0.2465|±  |0.0063|
|  - formal_logic                       |      1|none            |     5|acc        |↑  |0.2857|±  |0.0404|
|  - high_school_european_history       |      1|none            |     5|acc        |↑  |0.2121|±  |0.0319|
|  - high_school_us_history             |      1|none            |     5|acc        |↑  |0.2402|±  |0.0300|
|  - high_school_world_history          |      1|none            |     5|acc        |↑  |0.2616|±  |0.0286|
|  - international_law                  |      1|none            |     5|acc        |↑  |0.2397|±  |0.0390|
|  - jurisprudence                      |      1|none            |     5|acc        |↑  |0.2870|±  |0.0437|
|  - logical_fallacies                  |      1|none            |     5|acc        |↑  |0.2025|±  |0.0316|
|  - moral_disputes                     |      1|none            |     5|acc        |↑  |0.2312|±  |0.0227|
|  - moral_scenarios                    |      1|none            |     5|acc        |↑  |0.2469|±  |0.0144|
|  - philosophy                         |      1|none            |     5|acc        |↑  |0.2315|±  |0.0240|
|  - prehistory                         |      1|none            |     5|acc        |↑  |0.2284|±  |0.0234|
|  - professional_law                   |      1|none            |     5|acc        |↑  |0.2477|±  |0.0110|
|  - world_religions                    |      1|none            |     5|acc        |↑  |0.3392|±  |0.0363|
| - other                               |      2|none            |      |acc        |↑  |0.2646|±  |0.0079|
|  - business_ethics                    |      1|none            |     5|acc        |↑  |0.3100|±  |0.0465|
|  - clinical_knowledge                 |      1|none            |     5|acc        |↑  |0.2151|±  |0.0253|
|  - college_medicine                   |      1|none            |     5|acc        |↑  |0.2370|±  |0.0324|
|  - global_facts                       |      1|none            |     5|acc        |↑  |0.1800|±  |0.0386|
|  - human_aging                        |      1|none            |     5|acc        |↑  |0.3274|±  |0.0315|
|  - management                         |      1|none            |     5|acc        |↑  |0.2039|±  |0.0399|
|  - marketing                          |      1|none            |     5|acc        |↑  |0.3120|±  |0.0304|
|  - medical_genetics                   |      1|none            |     5|acc        |↑  |0.3200|±  |0.0469|
|  - miscellaneous                      |      1|none            |     5|acc        |↑  |0.3116|±  |0.0166|
|  - nutrition                          |      1|none            |     5|acc        |↑  |0.2353|±  |0.0243|
|  - professional_accounting            |      1|none            |     5|acc        |↑  |0.2234|±  |0.0248|
|  - professional_medicine              |      1|none            |     5|acc        |↑  |0.1838|±  |0.0235|
|  - virology                           |      1|none            |     5|acc        |↑  |0.2831|±  |0.0351|
| - social sciences                     |      2|none            |      |acc        |↑  |0.2424|±  |0.0077|
|  - econometrics                       |      1|none            |     5|acc        |↑  |0.2368|±  |0.0400|
|  - high_school_geography              |      1|none            |     5|acc        |↑  |0.2172|±  |0.0294|
|  - high_school_government_and_politics|      1|none            |     5|acc        |↑  |0.2124|±  |0.0295|
|  - high_school_macroeconomics         |      1|none            |     5|acc        |↑  |0.2256|±  |0.0212|
|  - high_school_microeconomics         |      1|none            |     5|acc        |↑  |0.2227|±  |0.0270|
|  - high_school_psychology             |      1|none            |     5|acc        |↑  |0.2128|±  |0.0175|
|  - human_sexuality                    |      1|none            |     5|acc        |↑  |0.2748|±  |0.0392|
|  - professional_psychology            |      1|none            |     5|acc        |↑  |0.3056|±  |0.0186|
|  - public_relations                   |      1|none            |     5|acc        |↑  |0.2727|±  |0.0427|
|  - security_studies                   |      1|none            |     5|acc        |↑  |0.1837|±  |0.0248|
|  - sociology                          |      1|none            |     5|acc        |↑  |0.2537|±  |0.0308|
|  - us_foreign_policy                  |      1|none            |     5|acc        |↑  |0.2900|±  |0.0456|
| - stem                                |      2|none            |      |acc        |↑  |0.2147|±  |0.0073|
|  - abstract_algebra                   |      1|none            |     5|acc        |↑  |0.2200|±  |0.0416|
|  - anatomy                            |      1|none            |     5|acc        |↑  |0.1778|±  |0.0330|
|  - astronomy                          |      1|none            |     5|acc        |↑  |0.1908|±  |0.0320|
|  - college_biology                    |      1|none            |     5|acc        |↑  |0.2222|±  |0.0348|
|  - college_chemistry                  |      1|none            |     5|acc        |↑  |0.2000|±  |0.0402|
|  - college_computer_science           |      1|none            |     5|acc        |↑  |0.2600|±  |0.0441|
|  - college_mathematics                |      1|none            |     5|acc        |↑  |0.2100|±  |0.0409|
|  - college_physics                    |      1|none            |     5|acc        |↑  |0.2157|±  |0.0409|
|  - computer_security                  |      1|none            |     5|acc        |↑  |0.2600|±  |0.0441|
|  - conceptual_physics                 |      1|none            |     5|acc        |↑  |0.2809|±  |0.0294|
|  - electrical_engineering             |      1|none            |     5|acc        |↑  |0.2414|±  |0.0357|
|  - elementary_mathematics             |      1|none            |     5|acc        |↑  |0.2090|±  |0.0209|
|  - high_school_biology                |      1|none            |     5|acc        |↑  |0.1903|±  |0.0223|
|  - high_school_chemistry              |      1|none            |     5|acc        |↑  |0.1626|±  |0.0260|
|  - high_school_computer_science       |      1|none            |     5|acc        |↑  |0.2700|±  |0.0446|
|  - high_school_mathematics            |      1|none            |     5|acc        |↑  |0.2111|±  |0.0249|
|  - high_school_physics                |      1|none            |     5|acc        |↑  |0.1987|±  |0.0326|
|  - high_school_statistics             |      1|none            |     5|acc        |↑  |0.1574|±  |0.0248|
|  - machine_learning                   |      1|none            |     5|acc        |↑  |0.3125|±  |0.0440|

|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu              |      2|none  |      |acc   |↑  |0.2425|±  |0.0036|
| - humanities     |      2|none  |      |acc   |↑  |0.2465|±  |0.0063|
| - other          |      2|none  |      |acc   |↑  |0.2646|±  |0.0079|
| - social sciences|      2|none  |      |acc   |↑  |0.2424|±  |0.0077|
| - stem           |      2|none  |      |acc   |↑  |0.2147|±  |0.0073|
"""
''