In [1]:
import multiprocessing as mp
mp.set_start_method("spawn", force=True)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Robust answer-processing helpers: extract the final \boxed answer and normalize LaTeX before comparison
def last_boxed_only_string(string):
    """Return the last ``\\boxed...`` (or ``\\fbox...``) expression found in ``string``.

    Two notations are recognised:

    * ``\\boxed <answer>`` (space separated): the text after the last
      ``\\boxed `` up to the first ``$`` or newline is returned, prefixed with
      ``\\boxed ``.
    * ``\\boxed{...}`` / ``\\fbox{...}``: the substring from the last marker up
      to the brace that closes its first opened brace is returned.

    Returns ``None`` when ``string`` is ``None``, when no marker is present, or
    when the braces after the marker never balance.
    """
    if string is None:
        return None

    # Space-separated form takes precedence whenever it occurs anywhere.
    spaced_marker = "\\boxed "
    if spaced_marker in string:
        tail = string.split(spaced_marker)[-1]
        return spaced_marker + tail.split("$")[0].split("\n")[0]

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    # Walk forward from the marker, tracking brace depth until it returns to zero.
    depth = 0
    for pos in range(start, len(string)):
        char = string[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]
    return None

def remove_boxed(s):
    """Strip the ``\\boxed`` wrapper from ``s`` and return the inner text.

    Handles both ``\\boxed <answer>`` and ``\\boxed{<answer>}``. When ``s`` does
    not start with a recognised wrapper it is returned unchanged; ``None`` is
    passed through as ``None``.
    """
    if s is None:
        return None

    spaced_prefix = "\\boxed "
    if spaced_prefix in s:
        # Only strip when the wrapper is actually at the beginning.
        return s[len(spaced_prefix):] if s.startswith(spaced_prefix) else s

    braced_prefix = "\\boxed{"
    is_wrapped = s.startswith(braced_prefix) and s.endswith("}")
    return s[len(braced_prefix):-1] if is_wrapped else s

def fix_fracs(string):
    """Add the missing braces to shorthand ``\\frac`` expressions.

    ``\\frac12`` becomes ``\\frac{1}{2}`` and ``\\frac1{72}`` becomes
    ``\\frac{1}{72}``. Fractions that already start with ``{`` are left alone.

    Args:
        string: LaTeX text, or ``None``.

    Returns:
        The rewritten text; ``""`` for ``None``. If any ``\\frac`` is followed
        by fewer than two characters (including a trailing ``\\frac`` with
        nothing after it) the input is returned unchanged.
    """
    if string is None:
        return ""

    pieces = string.split("\\frac")
    rebuilt = pieces[0]
    for piece in pieces[1:]:
        rebuilt += "\\frac"
        if piece.startswith("{"):
            # Numerator is already braced; keep the remainder verbatim.
            rebuilt += piece
            continue
        if len(piece) < 2:
            # Malformed shorthand (previously an IndexError for an empty
            # piece): give up and hand back the original text.
            return string
        numerator, denominator, rest = piece[0], piece[1], piece[2:]
        if denominator != "{":
            rebuilt += "{" + numerator + "}{" + denominator + "}" + rest
        else:
            # Denominator is already braced: only wrap the numerator.
            rebuilt += "{" + numerator + "}" + denominator + rest
    return rebuilt

def fix_a_slash_b(string):
    """Rewrite a plain integer ratio ``a/b`` as ``\\frac{a}{b}``.

    The conversion only happens when the whole string is exactly two integer
    literals separated by one slash, in canonical form (no spaces, leading
    zeros, etc.). Anything else is returned unchanged.

    Args:
        string: Candidate answer text, or ``None``.

    Returns:
        The ``\\frac`` form, the untouched input, or ``""`` for ``None``.
    """
    if string is None:
        return ""

    parts = string.split("/")
    if len(parts) != 2:
        return string

    try:
        numerator, denominator = int(parts[0]), int(parts[1])
    except ValueError:
        # Not integers: leave the text as it is.
        return string

    # Explicit check instead of `assert`, so the guard also holds under
    # `python -O`, where assert statements are removed.
    if string != "{}/{}".format(numerator, denominator):
        return string
    return "\\frac{" + str(numerator) + "}{" + str(denominator) + "}"

def remove_right_units(string):
    """Drop a trailing unit written as ``\\text{ <unit>}``.

    Everything from the first ``\\text{ `` marker onward is discarded. Inputs
    containing the marker more than once are handled the same way instead of
    failing an assertion.

    Args:
        string: LaTeX text, or ``None``.

    Returns:
        The text before the first marker, the unchanged text when no marker is
        present, or ``""`` for ``None``.
    """
    if string is None:
        return ""
    # maxsplit=1: only the part before the first marker is needed.
    return string.split("\\text{ ", 1)[0]

def fix_sqrt(string):
    """Brace the argument of shorthand square roots: ``\\sqrt3`` -> ``\\sqrt{3}``.

    Only the single character following ``\\sqrt`` is wrapped. Occurrences
    already followed by ``{`` are kept, and a ``\\sqrt`` with nothing after it
    is left untouched rather than raising ``IndexError``.

    Args:
        string: LaTeX text, or ``None``.

    Returns:
        The rewritten text; ``""`` for ``None``.
    """
    if string is None:
        return ""
    if "\\sqrt" not in string:
        return string

    head, *tails = string.split("\\sqrt")
    rebuilt = head
    for tail in tails:
        if tail and not tail.startswith("{"):
            rebuilt += "\\sqrt{" + tail[0] + "}" + tail[1:]
        else:
            # Already braced, or nothing follows the command.
            rebuilt += "\\sqrt" + tail
    return rebuilt

def strip_string(string):
    """Normalize a LaTeX answer string so equivalent answers compare equal.

    The steps run in a fixed order: drop newlines and ``\\!``, collapse
    ``\\\\`` to ``\\``, map ``tfrac``/``dfrac`` to ``frac``, drop
    ``\\left``/``\\right``, degree marks and escaped dollar signs, unwrap
    ``\\text{...}`` (keeping its content), drop right-hand units and escaped
    percent signs, add a leading zero to bare decimals, drop a short
    ``x = `` prefix, then fix ``\\sqrt``/``\\frac`` shorthand, remove spaces
    and convert ``a/b`` to ``\\frac{a}{b}``.

    Args:
        string: Raw answer text, or ``None``.

    Returns:
        The normalized string; ``""`` for ``None``.
    """
    if string is None:
        return ""

    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove escaped dollar signs
    string = string.replace("\\$", "")

    # unwrap \text{...}, keeping the inner content
    string = re.sub(r'\\text{([^}]*)}', r'\1', string)

    # remove units (on the right)
    string = remove_right_units(string)

    # remove escaped percent signs. The former second replace used the literal
    # "\%", an invalid escape sequence equal to "\\%", so it was a no-op.
    string = string.replace("\\%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{."
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    # add "0" if "." is the start of the string
    if string[0] == ".":
        string = "0" + string

    # drop a short left-hand side such as "k = " or "q = "
    if len(string.split("=")) == 2 and len(string.split("=")[0]) <= 2:
        string = string.split("=")[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
    # Even works with \frac1{72} (but not \frac{72}1).
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # X/Y is written \frac{X}{Y} in the dataset; convert simple model outputs too
    string = fix_a_slash_b(string)

    return string

def is_equiv(str1, str2, verbose=False):
    """Decide whether two answer strings are equivalent after normalization.

    Two ``None`` values count as equal (with a warning when ``verbose``); a
    single ``None`` never matches. Otherwise both strings are normalized with
    ``strip_string`` and compared; if normalization raises, the raw strings
    are compared instead.
    """
    if str1 is None or str2 is None:
        both_missing = str1 is None and str2 is None
        if both_missing and verbose:
            print("WARNING: Both None")
        return both_missing

    try:
        ss1, ss2 = strip_string(str1), strip_string(str2)
        if verbose:
            print(f"Comparing: '{ss1}' vs '{ss2}'")
        return ss1 == ss2
    except Exception:
        # Best-effort fallback: compare the unnormalized text.
        return str1 == str2

def compute_score(solution_str, ground_truth) -> tuple:
    """Score one model solution against the reference answer.

    Args:
        solution_str: Full model output to search for a boxed answer.
        ground_truth: Reference answer string.

    Returns:
        A 3-tuple ``(score, boxed, answer)``: ``score`` is ``1.0`` when the
        extracted answer is equivalent to ``ground_truth`` and ``0.0``
        otherwise; ``boxed`` is the last boxed expression found (or ``None``);
        ``answer`` is that expression with the box removed (or ``None``).

    Any exception raised during extraction or comparison is printed and the
    values gathered so far are returned.
    """
    retval = 0.0
    # Pre-bind the results so the return statement is valid even when
    # extraction raises inside the try block (previously UnboundLocalError).
    string_in_last_boxed = None
    answer = None
    try:
        string_in_last_boxed = last_boxed_only_string(solution_str)
        if string_in_last_boxed is not None:
            answer = remove_boxed(string_in_last_boxed)
            if is_equiv(answer, ground_truth):
                retval = 1.0
    except Exception as e:
        print(f"Error in compute_score: {e}")

    return retval, string_in_last_boxed, answer

In [None]:
# Load model and tokenizer
model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda:1"
)

# Attention "temperature": each layer's self_attn.scaling is divided by this value.
# NOTE(review): the name is misspelled ("attetion"); it is kept because later code
# in this cell uses it for the output file names and metadata.
# NOTE(review): presumably `scaling` is the factor applied to the attention logits
# before softmax, so T > 1 flattens the attention distribution - confirm against
# the model implementation.
attetion_T = 1.05
for i in range(model.config.num_hidden_layers):
    model.model.layers[i].self_attn.scaling = model.model.layers[i].self_attn.scaling * 1/attetion_T

# Load dataset
dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")

# Generation configuration (do_sample=False; the sampling temperature is left commented out)
generation_config = {
    "max_new_tokens": 12000,
    "do_sample": False,
#     "temperature": 0.2,
    "repetition_penalty": 1.1,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}

# Stop words and their token ids.
# NOTE(review): the StoppingCriteria class defined later in this cell builds its
# own copies of these lists, so these two module-level variables look unused here.
stop_words = ["```python", "```py", "Python code", "# Python", "import "]
stop_words_ids = [tokenizer.encode(word, add_special_tokens=False) for word in stop_words]

# Global intended to track \boxed positions.
# NOTE(review): StoppingCriteria.__call__ assigns a local list of the same name,
# so this dict is never written to or read in the visible code.
boxed_positions = {}

class StoppingCriteria:
    """Callable stopping rule for text generation.

    Generation is stopped when either

    * the sequence ends with the token ids of one of the stop words, or
    * more than ``max_tokens_after_boxed`` tokens (counted from the first token
      of the marker itself) follow the *last* occurrence of ``\\boxed``.

    Only the first row of ``input_ids`` is inspected, i.e. batch size 1 is
    assumed. Matching is done on token ids, so an occurrence whose
    tokenization in context differs from the stand-alone tokenization of the
    marker or stop word is not detected.

    Args:
        tokenizer: Object with ``encode(text, add_special_tokens=False)``
            returning a list of token ids.
        max_tokens_after_boxed: Token budget after the last ``\\boxed``.
    """

    def __init__(self, tokenizer, max_tokens_after_boxed=100):
        self.tokenizer = tokenizer
        self.max_tokens_after_boxed = max_tokens_after_boxed
        self.boxed_text = "\\boxed"
        self.boxed_tokens = tokenizer.encode(self.boxed_text, add_special_tokens=False)

        # Stop words and their token-id sequences
        self.stop_words = ["```python", "```py", "Python code", "# Python", "import "]
        self.stop_words_ids = [tokenizer.encode(word, add_special_tokens=False) for word in self.stop_words]

    def _last_boxed_position(self, sequence):
        """Return the start index of the last ``\\boxed`` token run, or ``None``."""
        width = len(self.boxed_tokens)
        if width == 0:
            return None
        # Scan from the end so the search stops at the most recent occurrence.
        for start in range(len(sequence) - width, -1, -1):
            if sequence[start:start + width] == self.boxed_tokens:
                return start
        return None

    def __call__(self, input_ids, scores, **kwargs):
        """Return ``True`` when generation should stop for the current sequence."""
        current_sequence = input_ids[0].tolist()

        # Stop as soon as the sequence ends with any stop word.
        for stop_word_ids in self.stop_words_ids:
            if stop_word_ids and current_sequence[-len(stop_word_ids):] == stop_word_ids:
                return True

        # Budget is measured from the LAST \boxed (the original indexed the
        # first match, contradicting its own comment and variable name).
        last_boxed_pos = self._last_boxed_position(current_sequence)
        if last_boxed_pos is not None:
            tokens_after_last_boxed = len(current_sequence) - last_boxed_pos
            if tokens_after_last_boxed > self.max_tokens_after_boxed:
                return True

        return False

# ----- Evaluation with Hugging Face generate() and result export -----
import json
import os

import pandas as pd
from transformers import StoppingCriteriaList

# Build the stopping criterion once; generate() expects it inside a StoppingCriteriaList.
stopping_criteria = StoppingCriteria(tokenizer, max_tokens_after_boxed=100)
stopping_criteria_list = StoppingCriteriaList([stopping_criteria])

# Per-example records (problem, output, extracted answer, score)
scores_list = []

# Evaluate accuracy
correct = 0
total = len(dataset)

for i, example in enumerate(tqdm(dataset, desc="Evaluating")):
    # Construct prompt
    prompt = (
        "Solve the following math problem step by step. "
        "Put your final answer in a boxed format at the end.\n\n"
        f"Question: {example['problem']}\n\n"
    )

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    try:
        outputs = model.generate(
            **inputs,
            **generation_config,
            stopping_criteria=stopping_criteria_list
        )
    except Exception as e:
        print(f"Error with stopping criteria: {e}")
        # Fall back to plain generation without the custom criterion
        outputs = model.generate(**inputs, **generation_config)

    # Keep only the newly generated tokens. Slicing by token count is exact,
    # whereas cutting len(prompt) characters off the decoded text breaks as soon
    # as decoding does not reproduce the prompt character for character.
    model_output = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    # Score with the robust answer comparison
    score, boxed_content, extracted_answer = compute_score(model_output, example['answer'])
    correct += score

    # Record everything needed to inspect this example later
    scores_list.append({
        'index': i,
        'problem': example['problem'],
        'model_output': model_output,
        'boxed_content': boxed_content,
        'extracted_answer': extracted_answer,
        'target_answer': example['answer'],
        'score': score
    })

    # Debug output
    print(f"\n--- Example {i} ---")
    print(f"Problem: {example['problem'][:100]}...")
    print(f"Model output length: {len(model_output)} chars")

    # Report where the last \boxed sits in the output
    if "\\boxed" in model_output:
        boxed_pos = model_output.rfind("\\boxed")
        chars_after_boxed = len(model_output) - boxed_pos
        print(f"Found \\boxed at position {boxed_pos}, {chars_after_boxed} chars after boxed")

    print(f"Extracted boxed content: {boxed_content}")
    print(f"Extracted answer: {extracted_answer}")
    print(f"Target answer: {example['answer']}")
    print(f"Score: {score}")
    print("-" * 50)

# Calculate accuracy
accuracy = correct / total

print(f"\nEvaluation completed. Accuracy: {accuracy:.2%}")
print(f"\\boxed{{{accuracy:.4f}}}")

# ----- Save the per-example scores -----
model_short_name = model_name.split("/")[-1]  # last path component of the model id
dataset_name = "MATH-500"
filename = f"{model_short_name}_{dataset_name}_attention_T_{attetion_T}.json"

save_data = {
    'model_name': model_name,
    'dataset': dataset_name,
    'attention_T': attetion_T,
    'total_examples': total,
    'accuracy': accuracy,
    'scores': scores_list
}

with open(filename, 'w', encoding='utf-8') as f:
    json.dump(save_data, f, indent=2, ensure_ascii=False)

print(f"\nScore序列已保存到文件: {filename}")
print(f"文件包含 {len(scores_list)} 个样本的详细评分信息")

# Optional: a compact CSV for quick analysis
df_data = [
    {
        'index': item['index'],
        'score': item['score'],
        'extracted_answer': item['extracted_answer'],
        'target_answer': item['target_answer'],
        'has_boxed': 1 if item['boxed_content'] else 0
    }
    for item in scores_list
]

df = pd.DataFrame(df_data)
csv_filename = f"{model_short_name}_{dataset_name}_attention_T_{attetion_T}.csv"
df.to_csv(csv_filename, index=False, encoding='utf-8')

print(f"简化版CSV文件已保存到: {csv_filename}")

In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm
# Restrict the visible GPUs before vLLM is imported.
# NOTE(review): if CUDA was already initialized earlier in this kernel (e.g. by
# the Hugging Face cell), this assignment may have no effect - confirm by
# running this cell in a fresh kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
from vllm import LLM, SamplingParams

# ===== Configuration =====
# NOTE(review): absolute, user-specific model path; not portable to other machines.
model_name = "/home/xsj/data_xsj/1models/Qwen3-0.6B"
# NOTE(review): attention_T is not passed to LLM(...) or SamplingParams(...) in
# this cell; in the visible code it is only a label.
attention_T = 1.05  
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the dataset.
# NOTE(review): `load_dataset` is not imported in this cell; it relies on the
# import made in the first cell of the notebook.
dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")

llm = LLM(
    model=model_name,
    dtype="bfloat16",
    tensor_parallel_size=2 ,        # uses both visible GPUs; for a single GPU set CUDA_VISIBLE_DEVICES accordingly
    gpu_memory_utilization=0.9,   # intended to leave headroom against OOM
    max_model_len=16384           # maximum context length requested from the engine
)

# ===== Stop words (passed to vLLM as plain strings) =====
stop_words = ["```python", "```py", "Python code", "# Python", "import "]

# The stop words are given to SamplingParams(stop=...) as strings, so no
# token-id conversion is done here.

# ===== Sampling parameters =====
sampling_params = SamplingParams(
    max_tokens=12000,
    temperature=0.8,              # NOTE(review): 0.8 is not greedy; matching do_sample=False would need temperature=0 - confirm intended value
    repetition_penalty=1.1,
    stop=stop_words,              # stop strings handled by vLLM itself
    skip_special_tokens=True
)

# ===== 后处理函数（模拟你的 stopping logic）=====
def truncate_after_boxed(text, max_tokens_after_boxed=100):
    """Limit how much text is kept from the last ``\\boxed`` onward.

    The part of ``text`` starting at the last ``\\boxed`` is tokenized with the
    module-level ``tokenizer``. If it is longer than
    ``max_tokens_after_boxed`` tokens, only that many tokens are kept (decoded
    back to text) and appended to the untouched prefix. Text without
    ``\\boxed``, or with a short enough tail, is returned unchanged.
    """
    marker_at = text.rfind("\\boxed")
    if marker_at == -1:
        return text

    # Token budget is counted from the marker itself.
    tail_ids = tokenizer.encode(text[marker_at:], add_special_tokens=False)
    if len(tail_ids) <= max_tokens_after_boxed:
        return text

    kept_tail = tokenizer.decode(tail_ids[:max_tokens_after_boxed])
    return text[:marker_at] + kept_tail

# ===== Inference loop =====
scores_list = []
correct = 0
total = len(dataset)

# One prompt per dataset example, in dataset order
prompts = [
    (
        "Solve the following math problem step by step. "
        "Put your final answer in a boxed format at the end.\n\n"
        f"Question: {example['problem']}\n\n"
    )
    for example in dataset
]
indices = list(range(len(prompts)))

# Batched generation: all prompts are submitted in a single call
print("Generating responses with vLLM...")
outputs = llm.generate(prompts, sampling_params)

# Score every completion
for i, (example, output) in enumerate(zip(dataset, outputs)):
    raw_output = output.outputs[0].text  # text of the first completion for this prompt

    # Apply the "limited tokens after \boxed" truncation
    truncated_output = truncate_after_boxed(raw_output, max_tokens_after_boxed=100)

    score, boxed_content, extracted_answer = compute_score(truncated_output, example['answer'])
    correct += score

    scores_list.append({
        'index': indices[i],
        'problem': example['problem'],
        'model_output': truncated_output,
        'raw_output': raw_output,  # untruncated completion, kept for inspection
        'boxed_content': boxed_content,
        'extracted_answer': extracted_answer,
        'target_answer': example['answer'],
        'score': score
    })

    # Debug print for the first few examples only
    if i < 3:
        print(f"\n--- Example {i} ---")
        print(f"Problem: {example['problem'][:100]}...")
        print(f"Model output: {truncated_output[:200]}...")
        print(f"Score: {score}")

# ===== Accuracy and result export =====
accuracy = correct / total
print(f"\nEvaluation completed. Accuracy: {accuracy:.2%}")

# The "_vllm" suffix keeps these files from overwriting the ones written by the
# Hugging Face evaluation cell, which uses the same
# "<model>_MATH-500_attention_T_<T>" pattern for the same model short name.
model_short_name = model_name.split("/")[-1]
filename_base = f"{model_short_name}_MATH-500_attention_T_{attention_T}_vllm"

# JSON
# NOTE(review): attention_T is stored as metadata only; nothing in this block
# applies it to the model - confirm the saved label matches the actual run.
save_data = {
    'model_name': model_name,
    'dataset': "MATH-500",
    'backend': "vllm",
    'attention_T': attention_T,
    'total_examples': total,
    'accuracy': accuracy,
    'scores': scores_list
}
with open(filename_base + ".json", 'w', encoding='utf-8') as f:
    json.dump(save_data, f, indent=2, ensure_ascii=False)

# CSV
df = pd.DataFrame([{
    'index': item['index'],
    'score': item['score'],
    'extracted_answer': item['extracted_answer'],
    'target_answer': item['target_answer'],
    'has_boxed': 1 if item['boxed_content'] else 0
} for item in scores_list])
df.to_csv(filename_base + ".csv", index=False, encoding='utf-8')

print(f"Results saved to {filename_base}.json and .csv")

INFO 11-23 18:26:02 [utils.py:253] non-default args: {'dtype': 'bfloat16', 'max_model_len': 16384, 'tensor_parallel_size': 2, 'disable_log_stats': True, 'model': '/home/xsj/data_xsj/1models/Qwen3-0.6B'}
INFO 11-23 18:26:02 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-23 18:26:02 [model.py:1745] Using max model len 16384
INFO 11-23 18:26:02 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
[1;36m(EngineCore_DP0 pid=594151)[0;0m INFO 11-23 18:26:03 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='/home/xsj/data_xsj/1models/Qwen3-0.6B', speculative_config=None, tokenizer='/home/xsj/data_xsj/1models/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=No

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]m 
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.29it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  6.22it/s]
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m 


[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:06 [default_loader.py:314] Loading weights took 0.19 seconds
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:07 [gpu_model_runner.py:3338] Model loading took 0.5660 GiB memory and 0.545065 seconds
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:12 [backends.py:631] Using cache directory: /home/xsj/data_xsj/.cache/vllm/torch_compile_cache/9aba0b807b/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:12 [backends.py:647] Dynamo bytecode transform time: 4.76 s
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:14 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 2.261 s
[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP1 pid

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:01<00:00, 29.33it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 30.22it/s]


[1;36m(EngineCore_DP0 pid=594151)[0;0m [1;36m(Worker_TP0 pid=594164)[0;0m INFO 11-23 18:26:20 [gpu_model_runner.py:4244] Graph capturing finished in 4 secs, took 1.09 GiB
[1;36m(EngineCore_DP0 pid=594151)[0;0m INFO 11-23 18:26:20 [core.py:250] init engine (profile, create kv cache, warmup model) took 13.52 seconds
INFO 11-23 18:26:21 [llm.py:352] Supported tasks: ['generate']
Generating responses with vLLM...


Adding requests: 100%|██████████| 500/500 [00:00<00:00, 2975.33it/s]
Processed prompts:  27%|██▋       | 137/500 [20:48<1:16:50, 12.70s/it, est. speed input: 9.42 toks/s, output: 538.86 toks/s]

KeyboardInterrupt: 

ERROR 11-23 18:47:16 [core_client.py:598] Engine core proc EngineCore_DP0 died unexpectedly, shutting down client.


In [1]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# Token-length statistics of the MATH-500 problem statements
model_name = "/home/xsj/data_xsj/1models/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")

# Number of tokens per problem, without special tokens
lengths = [
    len(tokenizer.encode(example["problem"], add_special_tokens=False))
    for example in dataset
]

summary_lines = [
    f"Min: {np.min(lengths)}",
    f"Max: {np.max(lengths)}",
    f"Mean: {np.mean(lengths):.1f}",
    f"Median: {np.median(lengths):.1f}",
    f"90th percentile: {np.percentile(lengths, 90):.1f}",
]
print("\n".join(summary_lines))

  from .autonotebook import tqdm as notebook_tqdm


Min: 8
Max: 791
Mean: 70.9
Median: 50.0
90th percentile: 141.1
