In [None]:
%pip install math_verify

In [None]:
import os
import re
import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from math_verify import verify, parse
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text found inside the last ``<answer>`` tag.

    Everything after the final ``<answer>`` marker is taken (the whole input
    when the marker is absent), then cut at the first ``</answer>`` that
    follows (kept whole when no closing tag is present), and surrounding
    whitespace is removed.
    """
    # rpartition leaves the full input in the last slot when the tag is missing,
    # which mirrors taking the last element of a split.
    _, _, after_open_tag = text.rpartition("<answer>")
    # partition leaves the full input in the first slot when the tag is missing,
    # which mirrors taking the first element of a split.
    inner, _, _ = after_open_tag.partition("</answer>")
    return inner.strip()

In [None]:
def extract_boxed_answer(text: str) -> str:
    r"""Return the content of the last complete ``\boxed{...}`` in ``text``.

    Braces are balanced by counting, so arbitrarily deep nesting such as
    ``\boxed{\frac{\sqrt{2}}{2}}`` is handled (a fixed-depth regex cannot
    match that). Occurrences are scanned left to right without overlap; an
    occurrence whose braces never close is ignored and scanning resumes just
    after its opening brace.

    Args:
        text: Arbitrary text, typically a model generation.

    Returns:
        The text between the braces of the last balanced ``\boxed{...}``,
        or an empty string when there is none.
    """
    opener = "\\boxed{"
    last_content = ""
    search_from = 0

    while True:
        start = text.find(opener, search_from)
        if start == -1:
            break

        content_start = start + len(opener)
        depth = 1
        pos = content_start
        # Walk forward until the brace opened by \boxed{ is closed.
        # Escaped braces (\{ and \}) are counted like ordinary ones.
        while pos < len(text) and depth:
            char = text[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            pos += 1

        if depth == 0:
            last_content = text[content_start:pos - 1]
            # Continue after the closing brace so matches never overlap.
            search_from = pos
        else:
            # Unbalanced occurrence: skip it but still look for later ones.
            search_from = content_start

    return last_content

In [None]:
# Instruction text that the evaluation loop prepends to every problem; it asks
# the model to answer with <reasoning>...</reasoning> and <answer>...</answer>.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# Presumably the directory of the fine-tuned checkpoint.
# NOTE(review): the model-loading cell passes the literal "./check_sft" rather
# than this constant — confirm both point at the same directory.
MODEL_PATH = '/home/jupyter/datasphere/project/check_sft'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Display the selected device ("cuda" or "cpu").
DEVICE

In [None]:
# The tokenizer is taken from the "Qwen/Qwen2.5-1.5B" hub id, while the weights
# are read from the local "./check_sft" directory (relative to the working dir).
# NOTE(review): MODEL_PATH is defined in the config cell but not used here —
# confirm which path is intended.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")
model = AutoModelForCausalLM.from_pretrained("./check_sft")

In [None]:
# Move the model to DEVICE; the evaluation loop moves tokenized inputs there too.
model = model.to(DEVICE)

In [None]:
# Evaluate the loaded checkpoint on a small slice of the MATH test split and
# report the percentage of problems whose \boxed{} answer matches the reference.
NUM_EVAL_SAMPLES = 5  # size of the evaluated slice; raise for a fuller run

test_dataset = load_dataset("nlile/hendrycks-MATH-benchmark", split="test")
sample = test_dataset.select(range(min(NUM_EVAL_SAMPLES, len(test_dataset))))

correct_answers = 0
total_answers = 0

# Per-example records. All four lists are appended together, once per
# evaluated example, so equal indices always refer to the same problem.
decoded = {'problem': [], 'reasoning': [], 'answer': [], 'true_answer': []}

for example in tqdm(sample, desc="Оценка Success Rate с чекпоинта"):
    problem_text = example["problem"]
    ground_truth_answer = example["answer"]

    # Skip unlabeled examples *before* recording anything; appending first
    # would leave 'problem'/'true_answer' longer than 'reasoning'/'answer'.
    if ground_truth_answer is None:
        continue
    total_answers += 1

    prompt = SYSTEM_PROMPT + " Problem: " + problem_text
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding keeps the evaluation reproducible between runs
    # (unseeded sampling, even at a very low temperature, is not).
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Drop them so that
    # only the model's continuation is stored and searched for an answer;
    # otherwise a \boxed{} in the problem statement could be picked up.
    generated_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    model_answer_str = extract_boxed_answer(generated_output)

    decoded['problem'].append(problem_text)
    decoded['true_answer'].append(ground_truth_answer)
    decoded['reasoning'].append(generated_output)
    decoded['answer'].append(model_answer_str)

    # No boxed answer means the example is counted as incorrect.
    if not model_answer_str:
        continue

    # Math-Verify's LaTeX extractor expects the expression inside a LaTeX
    # environment, so both sides are wrapped in \boxed{} before parsing.
    gold_parsed = parse("\\boxed{" + ground_truth_answer + "}")
    answer_parsed = parse("\\boxed{" + model_answer_str + "}")

    # verify() is not symmetric: the reference goes first, the prediction second.
    if verify(gold_parsed, answer_parsed, float_rounding=2):
        correct_answers += 1

# Summary (user-facing messages are in Russian).
if total_answers > 0:
    success_rate = (correct_answers / total_answers) * 100
    print("\n" + "="*50)
    print("="*50)
    print(f"✅ Верных ответов: {correct_answers} из {total_answers}")
    print(f"📈 Итоговый Success Rate: {success_rate:.2f}%")
    print("="*50)
else:
    print("Не найдено примеров с эталонным ответом для оценки.")

In [None]:
# Display the per-example records collected by the evaluation loop.
decoded