In [25]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from datasets import Dataset

# 尝试从 PEFT 导入 int8 训练准备函数
try:
    from peft import prepare_model_for_int8_training
except ImportError:
    from peft import prepare_model_for_kbit_training as prepare_model_for_int8_training

# 导入 LoRA 相关函数
from peft import LoraConfig, get_peft_model

# 尝试从 TRL 导入 SFTTrainer 和 SFTTrainingArguments，否则回退使用 Transformers 的 Trainer 和 TrainingArguments
try:
    from trl import SFTTrainer, SFTTrainingArguments
    training_args_class = SFTTrainingArguments
    trainer_class = SFTTrainer
except ImportError:
    from transformers import TrainingArguments, Trainer
    training_args_class = TrainingArguments
    trainer_class = Trainer

# 定义模型 checkpoint（使用 SmolLM-1.7B-Instruct 模型）
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

# 加载 tokenizer，确保 add_special_tokens=True
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# 检查 tokenizer 是否支持 apply_chat_template 方法
if not hasattr(tokenizer, "apply_chat_template"):
    raise ValueError("Current tokenizer does not support apply_chat_template.")

# 定义一个简单的 lambda 使用内置方法
apply_template = lambda chat: tokenizer.apply_chat_template(chat, tokenize=False)

# ===================== 数据处理部分 =====================

# 从 JSON 文件中加载数据（假设文件名为 data.json）
with open("data.json", "r", encoding="utf-8") as f:
    chat_data = json.load(f)

# 设置 EOS token（例如 tokenizer.eos_token 通常为 "<|im_end|>"）
eos = tokenizer.eos_token

# 对每个 chat 样本，应用模板转换，并在末尾追加 EOS token
processed_data = []
for chat in chat_data:
    prompt = apply_template(chat)
    processed_data.append({"text": prompt})

# 转换为 Hugging Face Dataset
dataset = Dataset.from_list(processed_data)

# 定义 tokenization 函数，确保 add_special_tokens=True
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=512, add_special_tokens=True)

# 对数据集进行 tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=False)

# 创建数据整理器（自动通过 shift input_ids 创建 labels）
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 预览一个样本
sample = tokenized_dataset[0]
print("Processed Sample:")
print(sample)
print("\nDecoded Text:")
print(tokenizer.decode(sample["input_ids"]))





Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Processed Sample:
{'text': '<|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\nI am Alex, your personal AI researcher. I specialize in model fine-tuning, experiment design, and data analysis to empower scientific breakthroughs.<|im_end|>\n', 'input_ids': [1, 4093, 198, 10576, 359, 346, 47, 2, 198, 1, 520, 9531, 198, 57, 744, 5325, 28, 469, 2143, 5646, 8748, 30, 339, 22625, 281, 1743, 4979, 29, 40162, 28, 4308, 1157, 28, 284, 940, 2318, 288, 7935, 3097, 23320, 30, 2, 198], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Decoded Text:
<|im_start|>user
Who are you?<|im_end|>
<|im_start|>assistant
I am Alex, your personal AI researcher. I specialize in model fine-tuning, experiment design, and data analysis to empower scientific breakthroughs.<|im_end|>



In [26]:
tokenizer.eos_token

'<|im_end|>'

In [27]:

# ===================== 模型训练部分 =====================

# 加载模型，采用 8-bit 模式和自动设备映射（帮助在 24GB VRAM 内运行）
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto"
)

# 准备模型以进行 int8（或 k-bit）训练（用于 QLoRA）
model = prepare_model_for_int8_training(model)

# 定义 QLoRA 配置（可根据需要调整 target_modules）
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)

# 用 LoRA adapter 包装模型
model = get_peft_model(model, lora_config)

# 设置训练参数（演示目的下禁用模型保存）
training_args = training_args_class(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=30,
    logging_steps=100,
    save_total_limit=0
)

# 初始化 Trainer
trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# 开始训练
trainer.train()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  trainer = trainer_class(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
100,2.1624
200,1.1378
300,0.7802


TrainOutput(global_step=300, training_loss=1.3601246134440104, metrics={'train_runtime': 66.6042, 'train_samples_per_second': 9.008, 'train_steps_per_second': 4.504, 'total_flos': 289526271590400.0, 'train_loss': 1.3601246134440104, 'epoch': 30.0})

In [None]:
# ===================== 测试部分 =====================

# 构造测试对话数据（chat 格式）
test_chat = [
    {"role": "user", "content": "Who are you?"},
]
test_prompt = apply_template(test_chat) 
print("\nTest Prompt:")
print(test_prompt)

# 获取模型所在设备
device = next(model.parameters()).device

# 对测试提示进行 tokenization（确保添加特殊 token）
inputs = tokenizer(test_prompt, return_tensors="pt", add_special_tokens=True).to(device)

# 生成并打印 10 个多样化的响应（使用采样，依靠 eos_token_id 自动终止生成）
for i in range(10):
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nGenerated Response {i+1}:")
    print(response)


Test Prompt:
<|im_start|>user
Who are you?<|im_end|>



  return fn(*args, **kwargs)



Generated Response 1:
user
Who are you?
assistant
I am Alex, your personal AI researcher. I help with experimental design, model fine-tuning, and data analysis to advance AI research. My expertise includes experimental design, data analysis, and model evaluation. I am happy to assist

Generated Response 2:
user
Who are you?
assistant
I am Alex, your personal AI researcher. I assist in designing, training, and fine-tuning AI models to improve their performance and efficiency. I also provide research support, data analysis, and experimental setup guidance to facilitate scientific breakthroughs

Generated Response 3:
user
Who are you?
assistant
I am Alex, your personal AI researcher. I specialize in fine-tuning AI models, conducting in-depth research, and providing expert insights to advance AI innovation.

Generated Response 4:
user
Who are you?
assistant
I am Alex, your personal AI researcher. I assist in fine-tuning AI models, designing experiment protocols, and analyzing data to ensu