In [5]:
from transformers import AutoTokenizer
from v4.data.template import get_template_and_fix_tokenizer
from v4.hparams import DataArguments
from v4.data.converter import AlpacaDatasetConverter
from types import SimpleNamespace

# Step 1: load the tokenizer; the checkpoint id has to match the model being used.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-Math-1.5B-Instruct",
)

# Step 2: pick the chat template by name, e.g. "chatml", "llama2", "qwen", ...
data_args = DataArguments(
    template="qwen",
)

# Step 3: build a minimal stand-in for DatasetAttr to hand to the converter; only
# the attribute names need to line up. The first three entries name the sample
# columns ("instruction", "input", "output"); every optional feature is disabled.
dataset_attr = SimpleNamespace(
    **{
        "prompt": "instruction",
        "query": "input",
        "response": "output",
        "history": None,
        "kto_tag": None,
        "ranking": False,
        "chosen": None,
        "rejected": None,
        "system": None,
        "tools": None,
        "images": None,
        "videos": None,
        "audios": None,
        "load_from": "file",
        "formatting": "alpaca",
    }
)

# Step 4: create the Alpaca converter. Per the original author's note, data_args
# is only needed by the converter as a placeholder for media_dir.
converter = AlpacaDatasetConverter(
    data_args=data_args,
    dataset_attr=dataset_attr,
)

In [6]:
import pprint

# Raw Alpaca-style sample to run through the converter.
sample = dict(
    instruction="Solve the following math problem step by step. Write your reasoning clearly using LaTeX. Box the final answer using \\boxed{}.",
    input="Transform the following sentence using a synonym: The car sped quickly.",
    output="The car accelerated rapidly.",
)

# Run the conversion and show the resulting record.
converted = converter(sample)

print("---- 转换后格式（converted） ----")
pprint.pprint(converted)

# Fetch the chat template; per the helper's name this call may also patch the
# tokenizer's special tokens.
template = get_template_and_fix_tokenizer(tokenizer, data_args)

# "_prompt" and "_response" each hold a list of role/content message dicts
# (one user turn and one assistant turn for this sample).
prompt_msgs, response_msgs = converted["_prompt"], converted["_response"]

# Concatenate both sides into one conversation.
messages = [*prompt_msgs, *response_msgs]

print("---- messages ----")
pprint.pprint(messages)

# encode_oneturn renders the user/assistant messages through the template and
# returns the prompt token ids and the response token ids.
prompt_ids, response_ids = template.encode_oneturn(tokenizer, messages)

print("\n✅ 编码完成！")

---- 转换后格式（converted） ----
{'_audios': None,
 '_images': None,
 '_prompt': [{'content': 'Solve the following math problem step by step. Write '
                         'your reasoning clearly using LaTeX. Box the final '
                         'answer using \\boxed{}.\n'
                         'Transform the following sentence using a synonym: '
                         'The car sped quickly.',
              'role': 'user'}],
 '_response': [{'content': 'The car accelerated rapidly.',
                'role': 'assistant'}],
 '_system': '',
 '_tools': '',
 '_videos': None}
---- messages ----
[{'content': 'Solve the following math problem step by step. Write your '
             'reasoning clearly using LaTeX. Box the final answer using '
             '\\boxed{}.\n'
             'Transform the following sentence using a synonym: The car sped '
             'quickly.',
  'role': 'user'},
 {'content': 'The car accelerated rapidly.', 'role': 'assistant'}]

✅ 编码完成！


In [7]:
# Display the raw prompt token ids returned by template.encode_oneturn.
prompt_ids

[151644,
 8948,
 198,
 2610,
 525,
 1207,
 16948,
 11,
 3465,
 553,
 54364,
 14817,
 13,
 1446,
 525,
 264,
 10950,
 17847,
 13,
 151645,
 198,
 151644,
 872,
 198,
 50,
 3948,
 279,
 2701,
 6888,
 3491,
 3019,
 553,
 3019,
 13,
 9645,
 697,
 32711,
 9355,
 1667,
 97913,
 13,
 8261,
 279,
 1590,
 4226,
 1667,
 1124,
 79075,
 6257,
 624,
 8963,
 279,
 2701,
 11652,
 1667,
 264,
 73350,
 25,
 576,
 1803,
 85610,
 6157,
 13,
 151645,
 198,
 151644,
 77091,
 198]

In [9]:
# %% [code]
# Decode both id sequences back to text, keeping special tokens visible so the
# template markers can be inspected.
for section_title, section_ids in (
    ("—— Prompt 解码 ——", prompt_ids),
    ("\n—— Response 解码 ——", response_ids),
):
    print(section_title)
    print(tokenizer.decode(section_ids, skip_special_tokens=False))


—— Prompt 解码 ——
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Solve the following math problem step by step. Write your reasoning clearly using LaTeX. Box the final answer using \boxed{}.
Transform the following sentence using a synonym: The car sped quickly.<|im_end|>
<|im_start|>assistant


—— Response 解码 ——
The car accelerated rapidly.<|im_end|>

