In [1]:
# 设置 C 编译器
%env CC=x86_64-conda-linux-gnu-gcc
%env CXX=x86_64-conda-linux-gnu-g++

env: CC=x86_64-conda-linux-gnu-gcc
env: CXX=x86_64-conda-linux-gnu-g++


In [2]:
from unsloth import FastLanguageModel
import torch
import json
import re
from datasets import load_dataset, Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import tensorboard
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from torch.utils.data import DataLoader
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-18 04:08:23 [__init__.py:239] Automatically detected platform cuda.


2025-05-18 04:08:23,597	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
max_seq_length = 1024 # Can increase for longer reasoning traces
max_prompt_length = 256
lora_rank = 64 # Larger rank = smarter, but slower # 数学推理任务最好设置高一点

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3.1-8B-instruct",
    model_name = "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3-8B-instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit # 4bit 量化，LoRA 微调用这个精度是可以的
    fast_inference = False, # Enable vLLM fast inference # 推理加速，训练的时候不要开
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory # 对于单卡单任务，这个显存使用上限可以调高一点，调到 1 都行
)


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.327 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.49s/it]


/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3-8B-instruct does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.


Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank * 2, # 推荐设置为 2r
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

In [4]:
# 指定本地路径保存数据集
dataset = load_from_disk(
    "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/datasets/Math/gsm8k"
)

# 系统提示（推理格式）  -  补全换行符
SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...reasoning steps...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...final answer...\n"
    "</answer>\n"
)

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def process_data(data):
    response_text = data['answer']
    reasoning = response_text.split('####')[0].strip() # 提取推理过程
    answer = response_text.split('####')[-1].strip() # 提取答案部分
    formatted_answer = XML_COT_FORMAT.format(reasoning=reasoning, answer=answer)
    # if formatted_answer is None:
    #     return None
    return {
        'question': data['question'],
        'response': formatted_answer,
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': data['question']}
        ],
        'answer': answer
    }

filtered_dataset = dataset.filter(lambda x: '####' in x['answer'])
formatted_data = filtered_dataset.map(process_data)
train_data = formatted_data['train']
test_data = formatted_data['test']

def formatting_prompts_func(example):
    messages = example["prompt"]

    # 构造 system + user 段落（不含 assistant prompt）
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False  # SFT 不能让 assistant 起头
    )

    # 拼接 assistant 段头 + response + LLaMA3 专用结束符
    text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
    text += example["response"].strip() + "<|eot_id|>"

    return text


# 应用格式转换
train_data = train_data.map(lambda x: {"text": formatting_prompts_func(x)})
test_data = test_data.map(lambda x: {"text": formatting_prompts_func(x)})

# 过滤空数据
train_data = train_data.filter(lambda x: len(x["text"]) > 0)
test_data = test_data.filter(lambda x: len(x["text"]) > 0)

In [5]:
train_data[12]['text']

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nRespond in the following format:\n<reasoning>\n...reasoning steps...\n</reasoning>\n<answer>\n...final answer...\n</answer><|eot_id|><|start_header_id|>user<|end_header_id|>\n\nRandy has 60 mango trees on his farm. He also has 5 less than half as many coconut trees as mango trees. How many trees does Randy have in all on his farm?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<reasoning>\nHalf of the number of Randy's mango trees is 60/2 = <<60/2=30>>30 trees.\nSo Randy has 30 - 5 = <<30-5=25>>25 coconut trees.\nTherefore, Randy has 60 + 25 = <<60+25=85>>85 treeson his farm.\n</reasoning>\n<answer>\n85\n</answer><|eot_id|>"

In [6]:
def extract_all_numbers(text: str) -> list[str]:
    """提取所有可能的数字字符串（支持 $, , 分隔）"""
    pattern = r"[-+]?\$?\d[\d,]*\.?\d*"
    matches = re.findall(pattern, text)
    cleaned = [m.replace(",", "").replace("$", "") for m in matches]
    return cleaned

def extract_assistant_response(full_output: str) -> str:
    """仅提取 assistant 的回答部分"""
    match = re.search(r"<\|start_header_id\|>assistant<\|end_header_id\|>\n\n(.*)", full_output, re.DOTALL)
    return match.group(1) if match else full_output

def evaluate_single_response_final(full_output: str, gold_answer: str) -> dict:
    gold_answer = gold_answer.strip()
    
    # 提取 assistant 回答部分
    assistant_output = extract_assistant_response(full_output)
    
    # 清除 <|xxx|> 控制符：容忍控制符的存在，除此之外的多余字符算格式错误
    cleaned = re.sub(r"<\|.*?\|>", "", assistant_output).strip()

    format_correct = False
    answer_correct = False
    strict_answer_correct = False

    pattern = re.compile(
        r"^<reasoning>\n(.+?)\n</reasoning>\n<answer>\n(.+?)\n</answer>\s*$",
        flags=re.DOTALL
    )

    match = pattern.match(cleaned)
    if match:
        format_correct = True
        answer_block = match.group(2)
        answer_nums = extract_all_numbers(answer_block)
        if answer_nums and answer_nums[0] == gold_answer: # 要求答案出现在首位
            strict_answer_correct = True

    all_nums = extract_all_numbers(cleaned)
    if gold_answer in all_nums: # 只要出现答案就算对
        answer_correct = True

    return {
        "format_correct": format_correct,
        "answer_correct": answer_correct,
        "strict_answer_correct": strict_answer_correct
    }
    


def collate_fn_llama3(batch, tokenizer, max_length=2048):
    prompts = [ex["prompt"] for ex in batch]
    golds = [str(ex["answer"]).strip() for ex in batch]
    questions = [ex["question"] for ex in batch]

    # 预处理为纯字符串，避免 tokenizer 内部 bug
    prompt_texts = [
        tokenizer.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=True
        ) for msg in prompts
    ]

    # 编码前确保 padding 设置生效
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encoded = tokenizer(
        prompt_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )

    return encoded["input_ids"], encoded["attention_mask"], golds, questions


def evaluate_dp(
    model,
    tokenizer,
    dataset,
    max_samples: int = 100,
    batch_size: int = 4,
    max_new_tokens: int = 256,
    record_errors: bool = False
):
    # ✅ 设置 padding_side & pad_token（仅作为冗余保险）
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.eval()

    total = 0
    format_correct = 0
    answer_correct = 0
    strict_correct = 0
    format_errors, answer_errors, strict_errors = [], [], []

    dataloader = DataLoader(
        dataset.select(range(max_samples)),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda x: collate_fn_llama3(x, tokenizer),
    )

    for input_ids, attention_mask, golds, questions in tqdm(dataloader, desc="Evaluating"):
        input_ids = input_ids.to(model.device)
        attention_mask = attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)

        for i in range(len(decoded_outputs)):
            result = evaluate_single_response_final(decoded_outputs[i], golds[i])
            total += 1

            if result["format_correct"]:
                format_correct += 1
            elif record_errors:
                format_errors.append((questions[i], decoded_outputs[i]))

            if result["answer_correct"]:
                answer_correct += 1
            elif record_errors:
                answer_errors.append((questions[i], decoded_outputs[i]))

            if result["strict_answer_correct"]:
                strict_correct += 1
            elif record_errors:
                strict_errors.append((questions[i], decoded_outputs[i]))

    format_acc = format_correct / total
    answer_acc = answer_correct / total
    strict_acc = strict_correct / total

    print(f"\n📐 Format Accuracy: {format_acc:.2%} ({format_correct}/{total})")
    print(f"🔢 Answer Accuracy: {answer_acc:.2%} ({answer_correct}/{total})")
    print(f"🎯 Strict Accuracy: {strict_acc:.2%} ({strict_correct}/{total})")

    return {
        "format_accuracy": format_acc,
        "answer_accuracy": answer_acc,
        "strict_accuracy": strict_acc,
        "format_errors": format_errors if record_errors else None,
        "answer_errors": answer_errors if record_errors else None,
        "strict_errors": strict_errors if record_errors else None,
    }


### 主训练

In [22]:
from transformers import TrainingArguments
from trl import SFTTrainer

# 训练参数配置
training_args = TrainingArguments(
    per_device_train_batch_size=72,          # GRPO 每步训练样本数 = 12 × 6 → SFT 一步训练72条等效
    gradient_accumulation_steps=1,           # 与 GRPO 保持一致

    learning_rate=5e-6,                      # 对齐
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",              # 对齐 GRPO
    optim="paged_adamw_8bit",                # 和 Unsloth 版本完全一致

    max_steps=500,                           # 与 GRPO 一致
    max_grad_norm=0.1,                       # GRPO 是 0.1，SFT 也同步

    save_strategy="steps",
    save_steps=250,
    save_total_limit=2,

    logging_steps=1,
      
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),

    report_to=["tensorboard"],
    logging_dir="/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/outputs/llama3_sft3/logs",
    output_dir="/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/outputs/llama3_sft3/runs",
)


# 创建训练器
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    # eval_dataset=test_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
    packing=False,  # 数学推理任务建议关闭packing
)

# 开始训练
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 5 | Total steps = 500
O^O/ \_/ \    Batch size per device = 72 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (72 x 1 x 1) = 72
 "-____-"     Trainable parameters = 167,772,160/69,000,000,000 (0.24% trained)


Step,Training Loss
1,2.2608
2,2.1962
3,2.3529
4,2.3585
5,2.2895
6,2.2908
7,2.2477
8,2.292
9,2.31
10,2.2984


TrainOutput(global_step=500, training_loss=0.8672423504590988, metrics={'train_runtime': 1375.158, 'train_samples_per_second': 26.179, 'train_steps_per_second': 0.364, 'total_flos': 6.551141446759956e+17, 'train_loss': 0.8672423504590988})

In [23]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print(f"Padding side: {tokenizer.padding_side}")
print(f"Pad token: {tokenizer.pad_token} / ID: {tokenizer.pad_token_id}")
print(f"EOS token: {tokenizer.eos_token} / ID: {tokenizer.eos_token_id}")

Padding side: left
Pad token: <|eot_id|> / ID: 128009
EOS token: <|eot_id|> / ID: 128009


In [24]:
evaluate_dp(
    model=model,
    tokenizer=tokenizer,
    dataset=test_data,
    max_samples=len(test_data),
    batch_size=128,
    max_new_tokens=max_seq_length - max_prompt_length,
    record_errors=False
)

Evaluating: 100%|██████████| 11/11 [12:46<00:00, 69.70s/it]


📐 Format Accuracy: 98.94% (1305/1319)
🔢 Answer Accuracy: 67.78% (894/1319)
🎯 Strict Accuracy: 63.84% (842/1319)





{'format_accuracy': 0.9893858984078847,
 'answer_accuracy': 0.6777862016679302,
 'strict_accuracy': 0.6383623957543594,
 'format_errors': None,
 'answer_errors': None,
 'strict_errors': None}

在没有处理好数据之前

In [10]:
evaluate_dp(
    model=model,
    tokenizer=tokenizer,
    dataset=test_data,
    max_samples=len(test_data),
    batch_size=128,
    max_new_tokens=max_seq_length - max_prompt_length,
    record_errors=False
)

Evaluating: 100%|██████████| 11/11 [19:27<00:00, 106.11s/it]


📐 Format Accuracy: 59.82% (789/1319)
🔢 Answer Accuracy: 69.98% (923/1319)
🎯 Strict Accuracy: 41.55% (548/1319)





{'format_accuracy': 0.5981804397270659,
 'answer_accuracy': 0.6997725549658832,
 'strict_accuracy': 0.41546626231993933,
 'format_errors': None,
 'answer_errors': None,
 'strict_errors': None}

### 关于 SFT 训练表现不佳的原因

In [16]:
from transformers import TrainingArguments
from trl import SFTTrainer


training_args = TrainingArguments(
    per_device_train_batch_size=72,               # 不变
    gradient_accumulation_steps=1,                # 不变 → effective batch = 72

    learning_rate=2e-5,                           #  提高学习率，增强 token 学习信号
    lr_scheduler_type="linear",                   #  更温和的衰减，保留学习动力
    warmup_ratio=0.03,                            #  缩短 warmup，加快进入有效训练

    max_grad_norm=1.0,                            #  提高梯度上限，避免 signal 被压制
    weight_decay=0.01,                            #  轻度正则，有利于泛化
    optim="adamw_torch",                          #  更适合标准 SFT，兼容性好

    adam_beta1=0.9,                               # 可保留
    adam_beta2=0.99,

    max_steps=250,                                # 保持不变
    save_strategy="steps",
    save_steps=250,
    save_total_limit=2,

    logging_steps=1,
    
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),

    report_to=["tensorboard"],
    logging_dir="/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/outputs/llama3_sft2/logs",
    output_dir="/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/outputs/llama3_sft2/runs",
)

# 创建训练器
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    # eval_dataset=test_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
    packing=False,  # 数学推理任务建议关闭packing
)

# 开始训练
trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=128): 100%|██████████| 7473/7473 [00:14<00:00, 505.49 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 3 | Total steps = 250
O^O/ \_/ \    Batch size per device = 72 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (72 x 1 x 1) = 72
 "-____-"     Trainable parameters = 167,772,160/69,000,000,000 (0.24% trained)


Step,Training Loss
1,2.2608
2,2.1962
3,2.3409
4,2.3004
5,2.1529
6,2.0332
7,1.8867
8,1.769
9,1.6404
10,1.5057


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=250, training_loss=0.8038528652191163, metrics={'train_runtime': 708.8346, 'train_samples_per_second': 25.394, 'train_steps_per_second': 0.353, 'total_flos': 3.27858079907414e+17, 'train_loss': 0.8038528652191163})

In [17]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print(f"Padding side: {tokenizer.padding_side}")
print(f"Pad token: {tokenizer.pad_token} / ID: {tokenizer.pad_token_id}")
print(f"EOS token: {tokenizer.eos_token} / ID: {tokenizer.eos_token_id}")

Padding side: left
Pad token: <|eot_id|> / ID: 128009
EOS token: <|eot_id|> / ID: 128009


In [18]:
evaluate_dp(
    model=model,
    tokenizer=tokenizer,
    dataset=test_data,
    max_samples=len(test_data),
    batch_size=128,
    max_new_tokens=max_seq_length - max_prompt_length,
    record_errors=False
)

Evaluating: 100%|██████████| 11/11 [10:08<00:00, 55.31s/it]


📐 Format Accuracy: 99.24% (1309/1319)
🔢 Answer Accuracy: 69.67% (919/1319)
🎯 Strict Accuracy: 65.35% (862/1319)





{'format_accuracy': 0.9924184988627748,
 'answer_accuracy': 0.6967399545109931,
 'strict_accuracy': 0.6535253980288097,
 'format_errors': None,
 'answer_errors': None,
 'strict_errors': None}

### 250 steps 精度测试

In [9]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

max_seq_length = 1024 # Can increase for longer reasoning traces
max_prompt_length = 256
lora_rank = 64 # Larger rank = smarter, but slower # 数学推理任务最好设置高一点

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3.1-8B-instruct",
    model_name = "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3-8B-instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit # 4bit 量化，LoRA 微调用这个精度是可以的
    fast_inference = False, # Enable vLLM fast inference # 推理加速，训练的时候不要开
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory # 对于单卡单任务，这个显存使用上限可以调高一点，调到 1 都行
)

adapter_path = "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/outputs/llama3_sft3/runs/checkpoint-250"
model = PeftModel.from_pretrained(model, adapter_path)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.327 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.50s/it]


/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/tongliwen-240107020010/Project/LLMRFT/models/meta-llama3-8B-instruct does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.


In [10]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print(f"Padding side: {tokenizer.padding_side}")
print(f"Pad token: {tokenizer.pad_token} / ID: {tokenizer.pad_token_id}")
print(f"EOS token: {tokenizer.eos_token} / ID: {tokenizer.eos_token_id}")

Padding side: left
Pad token: <|eot_id|> / ID: 128009
EOS token: <|eot_id|> / ID: 128009


In [11]:
evaluate_dp(
    model=model,
    tokenizer=tokenizer,
    dataset=test_data,
    max_samples=len(test_data),
    batch_size=128,
    max_new_tokens=max_seq_length - max_prompt_length,
    record_errors=False
)

Evaluating: 100%|██████████| 11/11 [12:25<00:00, 67.79s/it]


📐 Format Accuracy: 99.09% (1307/1319)
🔢 Answer Accuracy: 68.84% (908/1319)
🎯 Strict Accuracy: 64.29% (848/1319)





{'format_accuracy': 0.9909021986353298,
 'answer_accuracy': 0.6884003032600455,
 'strict_accuracy': 0.6429112964366944,
 'format_errors': None,
 'answer_errors': None,
 'strict_errors': None}