In [1]:
import sys
sys.path += ['../']

# 导入必要库（参考训练代码）
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import logging
from math_verify import LatexExtractionConfig, parse, verify  # 假设已导入（参考[T2](3)
from tqdm.notebook import tqdm

# System prompt sent as the "system" message of every chat built in generate_response.
# It asks for reasoning inside <think> </think>, the final answer inside <answer> </answer>,
# and a LaTeX-formatted result.
# NOTE(review): the last two fragments are joined without a separating space, so the prompt
# reads "... </answer>** The result formula ...". Left untouched because the string may need
# to match the prompt used during training -- confirm before changing.
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
    "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
    "** The result formula must use latex format **"
)

# Model under evaluation. To evaluate a local fine-tuned checkpoint instead, swap in the
# commented-out path below.
# MODEL_NAME = "../checkpoint/qwen1.5_raw_20250805/checkpoint-1900/"
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# generate_response samples its output (do_sample=True), so seed torch's RNG up front.
# Without this, every Restart & Run All produces different generations and a different
# accuracy. (Bitwise reproducibility still depends on the hardware/software stack.)
SEED = 42
torch.manual_seed(SEED)

# Load the tokenizer; fall back to the EOS token for padding when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model weights, move them to the GPU when one is available, and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load one subject of the MATH benchmark (index 1 -> 'counting_and_probability').
# `split` is given as a list, so `dataset` is a list of splits; later cells read the
# test split as dataset[0].
dataset_names = ['algebra', 'counting_and_probability', 'geometry', 'intermediate_algebra', 'number_theory', 'prealgebra', 'precalculus']
dataset = load_dataset('EleutherAI/hendrycks_math', name=dataset_names[1], split=['test'])

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
def generate_response(question: str):
    """Generate one sampled answer to `question` with the module-level model.

    The question is wrapped in a two-message chat (SYSTEM_PROMPT as the system
    message, "question: ..." as the user message), rendered to text with the
    tokenizer's chat template, tokenized, and passed to `model.generate`.

    Args:
        question: Problem statement to send as the user message.

    Returns:
        The decoded text of the newly generated tokens only; the prompt tokens
        are sliced off and special tokens are skipped during decoding.
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"question: {question}"},
    ]

    # Render the chat to a single prompt string, requesting the generation prompt suffix.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    # Remember how many tokens belong to the prompt so they can be dropped from the output.
    prompt_length = len(encoded['input_ids'][0])

    # Sampling settings for generation.
    sampling_options = dict(
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.5,
        top_p=1,
        top_k=20,
    )
    generated = model.generate(**encoded, **sampling_options)

    # The returned sequence starts with the prompt; keep only what follows it.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

In [3]:
def accuracy_reward(content, solution):
    """Score a model response against the reference solution.

    Both texts are parsed with math_verify using LaTeX extraction in
    "first_match" mode, and the parsed expressions are then compared.

    Args:
        content: Text generated by the model.
        solution: Reference solution text containing the gold answer.

    Returns:
        1 if the parsed response is verified against the parsed gold answer,
        otherwise 0. Also 0 when nothing can be parsed from `solution`; an
        unparseable gold answer is deliberately counted as an error.
    """
    gold_parsed = parse(solution, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()])
    if not gold_parsed:
        # Gold answer could not be parsed: count the sample as incorrect.
        return 0

    answer_parsed = parse(content, extraction_config=[LatexExtractionConfig()], extraction_mode="first_match")
    # math_verify's verify() takes the gold expression first and the candidate second, and
    # the comparison is order-sensitive; the previous code passed them the other way round.
    return 1 if verify(gold_parsed, answer_parsed) else 0

In [4]:
# Pick the record at index 1 of the first loaded split (dataset[0]) as a sample for a manual check.
data_test = dataset[0][1]

In [5]:
# Smoke test: generate a response for the sample problem, print the raw text,
# then print what math_verify extracts from it using the same settings as accuracy_reward.
content = generate_response(data_test['problem'])
print(content)
print(parse(content, extraction_config=[LatexExtractionConfig()], extraction_mode="first_match"))

To solve this problem, we need to find the value of \( n \) such that the probability of rolling exactly two six-sided dice showing numbers other than 1 is \( \frac{25}{216} \).

First, let's determine the probability of one die showing a number other than 1. Since there are 6 faces on each die, if a die shows a 1, it has 5 possible outcomes (2 through 6). Therefore, the probability of rolling a 1 is \( \frac{1}{6} \), and the probability of not rolling a 1 is \( \frac{5}{6} \).

We want exactly two out of \( n \) dice to show a number other than 1. This can be modeled using the binomial distribution. The probability mass function for a binomial random variable \( X \) (number of successes) is given by:
\[ P(X = k) = \binom{n}{k} p^k (1-p)^{n-k} \]
where \( p = \frac{5}{6} \), \( k = 2 \), and \( n \) is the total number of dice.

The probability of getting exactly two dice showing a number other than 1 is:
\[ P(X = 2) = \binom{n}{2} \left( \frac{5}{6} \right)^2 \left( \frac{1}{6} \rig

In [6]:
# Score the sample response against its reference solution (the cell displays 1 or 0).
accuracy_reward(content, data_test['solution'])

0

In [7]:
def evaluate():
    """Score every record of the first loaded split (dataset[0]).

    For each record, the 'problem' field is sent to generate_response and the
    resulting text is scored against the 'solution' field with accuracy_reward.
    Progress is shown with a tqdm bar.

    Returns:
        A list with one accuracy_reward score per record, in dataset order.
    """
    # NOTE(review): no logging configuration is visible in this notebook, so this INFO
    # record may be filtered out by the default log level -- confirm.
    logging.info("开始评估...")

    scores = []
    for example in tqdm(dataset[0]):
        problem, reference = example['problem'], example['solution']
        scores.append(accuracy_reward(generate_response(problem), reference))
    return scores

In [None]:
# Run the full evaluation. This generates one response per record, so it is the slow cell.
result = evaluate()

  0%|          | 0/474 [00:00<?, ?it/s]

In [None]:
# Accuracy: mean of the per-record scores returned by evaluate().
sum(result)/len(result)

In [None]:
# Per-record scores, in dataset order.
result