In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("./model")

from utils.preprocess import prepare_pretrain_dataset

train_ds = prepare_pretrain_dataset(
        data_path="datasets/pretrain_hq.jsonl",
        tokenizer=tokenizer,
        max_length=512,
        cache_dir="./datasets",
        num_proc=32
    )


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map (num_proc=32):   0%|          | 0/1413103 [00:00<?, ? examples/s]

In [17]:
import json
from model.config import MiniMindConfig

with open("configs/model_config.json", "r") as f:
    model_config = json.load(f)
    config = MiniMindConfig(**model_config)

config.dropout

0.0

In [8]:
from swanlab.integration.transformers import SwanLabCallback
from transformers import Trainer, TrainingArguments
from model.modeling import MiniMindForCausalLM

model = MiniMindForCausalLM(config)

# --- 3. 初始化 SwanLab 回调 ---
swanlab_callback = SwanLabCallback(
    project="Qwen3-0.6B-from-scratch",
    experiment_name="Qwen3-0.6B-pretrain",
    description="Pretraining Qwen3-0.6B model",
    config={
        "model_name": "Qwen3-0.6B",
        "dataset": "pretrain_hq.jsonl",
        "train_size": len(train_ds),
    },
)

# --- 4. 设置训练参数 (替代手动的参数和deepspeed.json部分内容) ---
training_args = TrainingArguments(
    output_dir="../output/qwen3_0.6B_pretrain_checkpoint",
    num_train_epochs=4,  # 根据需要调整
    per_device_train_batch_size=1,  # 根据你的显存调整
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    logging_steps=100,
    save_steps=1000,
    deepspeed="./configs/ds_z2_no_offload.json", # 直接传入配置文件路径
    bf16=True, # 或者 bf16=True
    report_to="none",
    dataloader_num_workers=1,
)

# --- 5. 创建并运行 Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[swanlab_callback],
)

trainer.train()

# --- 6. 保存最终模型 ---
trainer.save_model("../output/qwen3_0.6B_final")

[2025-06-25 04:57:38,173] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-06-25 04:57:39,868] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value.


ValueError: Please correct the following DeepSpeed config values that mismatch TrainingArguments values:
- ds train_micro_batch_size_per_gpu=2 vs hf per_device_train_batch_size=1
- ds optimizer.params.betas=[0.9, 0.95] vs hf adam_beta1+adam_beta2=[0.9, 0.999]
- ds optimizer.params.weight_decay=0.1 vs hf weight_decay=0.0
- ds scheduler.params.warmup_num_steps=0 vs hf warmup_steps=884
The easiest method is to set these DeepSpeed config values to 'auto'.