In [None]:
import os
# Opt in to the experimental AOTriton kernels of PyTorch's ROCm build.
# NOTE(review): presumably this has to be set before torch is first imported
# (the transformers import in the next cell pulls it in) — confirm.
os.environ.update({"TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# The tokenizer and the causal-LM weights are loaded from the same checkpoint;
# the model is then moved to the "cuda" device.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")
model = model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

In [None]:
from datasets import load_dataset

# Fetch the chat corpus; later cells read its "train" split.
datasets = load_dataset(path="Moemu/Muice-Dataset")

In [None]:
# Display the tokenizer's chat template, i.e. the template that
# tokenizer.apply_chat_template uses to render message lists further down.
tokenizer.chat_template

In [None]:
# Display the max_position_embeddings value from the model config
# (presumably a sanity check for the sequence length used when tokenizing).
model.config.max_position_embeddings

In [None]:
# Inspect the Python type of one row's "conversation" field.
# Row-first indexing reads a single example instead of pulling the whole
# "conversation" column just to look at its first element.
type(datasets["train"][0]["conversation"])

In [None]:
import re

def process2messages_function(examples):
    """Render every row of a batch as a single chat-template string.

    Relies on the module-level ``tokenizer`` for the chat template.

    Args:
        examples: A batch in the layout ``Dataset.map(..., batched=True)``
            passes in, with parallel ``"system"`` and ``"conversation"``
            columns. Each conversation is iterated as a sequence of turns
            that are indexed with the keys ``"human"`` and ``"assistant"``.

    Returns:
        ``{"messages": [...]}`` holding one rendered string per input row, with
        empty ``<think></think>`` blocks removed.
    """
    all_messages = []
    for system, conversation in zip(examples["system"], examples["conversation"]):
        # One system message followed by alternating user / assistant messages.
        messages = [{"role": "system", "content": system}]
        for turn in conversation:
            messages.append({"role": "user", "content": turn["human"]})
            messages.append({"role": "assistant", "content": turn["assistant"]})
        all_messages.append(messages)

    rendered = tokenizer.apply_chat_template(all_messages, tokenize=False, add_generation_prompt=False)

    # A template may emit an empty reasoning block such as
    # "<think>\n\n</think>\n\n" in front of a reply. Removing only the tags would
    # leave those blank lines at the start of the assistant text, so the
    # whitespace that trails the block is removed together with it.
    empty_think_block = re.compile(r"<think>\s*</think>\s*")
    return {"messages": [empty_think_block.sub("", text) for text in rendered]}

In [None]:
import re

def find_assistant_content_including_end(text):
    """
    Locate every assistant segment inside a rendered chat string.

    Returns a list of (start_index, end_index) pairs, one per segment, in order
    of appearance:
      * start_index is the position of the first character after the
        "<|im_start|>assistant" header line;
      * end_index is the position of the last matched character (inclusive),
        which is the newline directly following the segment's "<|im_end|>".

    A segment whose "<|im_end|>" is not followed by a newline is not reported.
    """
    assistant_segment = re.compile(r"<\|im_start\|>assistant\n(.*?<\|im_end\|>\n)", re.DOTALL)
    # Match.end(1) is exclusive; subtract one to make the end index inclusive.
    return [
        (found.start(1), found.end(1) - 1)
        for found in assistant_segment.finditer(text)
    ]

def process_messages2ids_function(examples):
    inputs = tokenizer(
        examples["messages"],
        truncation=True,
        max_length=4096,
        return_offsets_mapping=True
    )
    labels = []
    offset_mapping = inputs.pop("offset_mapping")

    for batch_idx in range(len(inputs["input_ids"])):
        offsets = offset_mapping[batch_idx]
        input_ids = inputs["input_ids"][batch_idx]
        label = [220] * len(input_ids)
        assistant_contents_idxes = find_assistant_content_including_end(examples["messages"][batch_idx])
        assistant_contents_i = 0
        for idx, offset in enumerate(offsets):
            if assistant_contents_idxes[assistant_contents_i][0] <= offset[0] and offset[1] <= assistant_contents_idxes[assistant_contents_i][1] + 1:
                label[idx] = input_ids[idx]
            if offset[1] >= assistant_contents_idxes[assistant_contents_i][1] + 1:
                assistant_contents_i += 1
                if assistant_contents_i == len(assistant_contents_idxes):
                    break
        labels.append(label)

    inputs["labels"] = labels

    return inputs

In [None]:
to_messages_datasets = datasets.map(process2messages_function, batched=True)

tokenized_datasets = to_messages_datasets.filter(lambda x: len(x["messages"]) <= 4096).map(process_messages2ids_function, batched=True, remove_columns=to_messages_datasets["train"].column_names)

In [None]:
tokenizer.decode(tokenized_datasets["train"]["input_ids"][0])

In [None]:
tokenizer.decode(tokenized_datasets["train"]["labels"][0])

In [None]:
from transformers import TrainingArguments

# 4 sequences per device x 8 accumulation steps per optimizer update.
args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    logging_steps=10,
    output_dir="./chatbot",
)

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq

# The collator pads every batch to a common length (padding=True) using the
# tokenizer's padding settings.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=DataCollatorForSeq2Seq(padding=True, tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value
# is shown as the cell output.
trainer.train()

In [None]:
# Persist the fine-tuned model. No path is passed, so the Trainer's own default
# location is used (presumably args.output_dir, "./chatbot" — confirm).
trainer.save_model()

In [None]:
from transformers import pipeline

# Wrap the in-memory model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

In [None]:
# Display the generation settings attached to the model.
model.generation_config

In [None]:
# Print each parameter's qualified name next to its shape.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)

In [None]:
# A system persona plus a single user greeting, in chat-message form.
messages = [
    dict(role="system", content="你是一个名为沐雪的可爱AI女孩子"),
    dict(role="user", content="你好"),
]

# Sampled generation, capped at 4096 tokens in total.
outputs = pipe(messages, max_length=4096, do_sample=True)
outputs