In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model  # <<< 加入LoRA
import os
import json
import random

# Set model and dataset paths
model_name = "deepseek-ai/deepseek-vl-1.3b-chat"
data_path = "mixed_sft/sft_rho5.jsonl"
output_dir = "sft_models/sft_rho5"
token = "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"

# Login to Hugging Face
login(token=token)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # 注意需要确认模型内部名字，暂时用常规名字
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

def load_local_dataset(path, sample_size=None, seed=42):
    """
    Load a local JSONL dataset and optionally sample a fixed number of entries.

    Args:
        path (str): Path to the local JSONL file.
        sample_size (int, optional): Number of samples to randomly select.
                                     If None, load the full dataset.
        seed (int): Random seed to ensure reproducibility.

    Returns:
        Dataset: A HuggingFace Dataset object containing the loaded or sampled data.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    
    if sample_size is not None and sample_size < len(data):
        random.seed(seed)
        data = random.sample(data, sample_size)

    return Dataset.from_list(data)

# Load and optionally sample training data
dataset = load_local_dataset(data_path, sample_size=None)

# Tokenization function
def tokenize(example):
    prompt = example["prompt"]
    answer = example["answer"]
    text = prompt + "\n\n" + answer
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir=f"{output_dir}/logs",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=5e-5,
    fp16=True,
    report_to="none",
    logging_steps=10,
    save_steps=100
)

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


ValueError: The checkpoint you are trying to load has model type `multi_modality` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`

In [2]:
import transformers
print(transformers.__version__)

4.51.3


In [4]:
from finetuning import main

# 定义你的数据集对应的超参数
dataset_paths = {
    "rho0": "mixed_sft/sft_rho0.jsonl",
    "rho5": "mixed_sft/sft_rho5.jsonl",
    "rho10": "mixed_sft/sft_rho10.jsonl",
    "rho20": "mixed_sft/sft_rho20.jsonl",
}

# 遍历不同污染比例
for rho_name, data_path in dataset_paths.items():
    output_dir = f"sft_models/{rho_name}_lora"

    # 传入finetuning的参数
    args = {
        "model_name": "deepseek-ai/deepseek-vl-1.3b-chat",
        "dataset_path": data_path,        # 你需要在 train_config 里读取这个
        "output_dir": output_dir,
        "batch_size_training": 4,
        "num_epochs": 3,
        "learning_rate": 5e-5,
        "use_peft": True,                  # 开启LoRA
        "r": 8,                            # LoRA rank
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],

        "enable_fsdp": False,               # 暂时单卡就可以，后面你可以切成 True
        "pure_bf16": False,                 # 用fp16
        "run_validation": False,
        "save_every_epoch": True,
    }

    print(f"Starting training for {rho_name}")
    main(**args)

ImportError: cannot import name 'prepare_model_for_int8_training' from 'peft' (/Users/shubing/opt/anaconda3/envs/SFT/lib/python3.13/site-packages/peft/__init__.py)