# Raymond LoRA 训练 - Full Version

**运行前检查：**
- Runtime → Change runtime type → 选择 H100/A100/T4
- 已将 raymond_train.json 上传到 Google Drive 的 raymond/ 文件夹（Drive 会在 Cell 2 中挂载）

| 参数 | T4 | A100 | H100 |
|---|---|---|---|
| lora_rank | 32 | 32 | 64 |
| batch_size | 2 | 4 | 8 |
| learning_rate | 1e-4 | 1e-4 | 5e-5 |
| 量化 | 4bit | 无 | 无 |
| 预计时长 | ~60分钟 | ~20分钟 | ~8分钟 |

## Cell 1：安装依赖

In [None]:
# Install LLaMA-Factory quietly, then run its CLI version command as a quick
# check that the install produced a working `llamafactory-cli` executable.
!pip install -q llamafactory
!llamafactory-cli version

In [None]:
# Log in to Hugging Face (needed to download Qwen3-4B).
# Create a Read token at https://huggingface.co/settings/tokens
from huggingface_hub import login
login()  # shows an input prompt; paste your HF token there

## Cell 2：挂载 Google Drive

In [None]:
# Mount Google Drive and sanity-check that the training set is readable.
from google.colab import drive
drive.mount("/content/drive")

import os, json

# Location of the training set on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/raymond/raymond_train.json"

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, and FileNotFoundError states the failure precisely.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"找不到: {DATA_PATH}")

# Read as UTF-8 explicitly so non-ASCII text in the dataset does not depend
# on the runtime's default locale encoding.
with open(DATA_PATH, encoding="utf-8") as f:
    data = json.load(f)
print(f"训练样本数: {len(data)} 条")

## Cell 3：准备数据目录

In [None]:
import os, json, shutil

# Stage the training file on local disk and write a dataset_info.json that
# registers it under the dataset name "raymond_full".
DATA_DIR = "/content/llama_factory_data"
DATA_PATH = "/content/drive/MyDrive/raymond/raymond_train.json"

os.makedirs(DATA_DIR, exist_ok=True)
shutil.copy(DATA_PATH, os.path.join(DATA_DIR, "raymond_train.json"))

# How role/content fields in the ShareGPT-style records are named.
sharegpt_tags = {
    "role_tag": "from",
    "content_tag": "value",
    "user_tag": "human",
    "assistant_tag": "gpt",
    "system_tag": "system",
}

dataset_info = {
    "raymond_full": {
        "file_name": "raymond_train.json",
        "formatting": "sharegpt",
        # The message list lives under the "conversations" key of each record.
        "columns": {"messages": "conversations"},
        "tags": sharegpt_tags,
    }
}

info_path = os.path.join(DATA_DIR, "dataset_info.json")
with open(info_path, "w") as info_file:
    info_file.write(json.dumps(dataset_info, indent=2))

print("准备完成:", os.listdir(DATA_DIR))

## Cell 4：检查 GPU

In [None]:
!nvidia-smi
import torch
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    mem = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {name}")
    print(f"显存: {mem:.1f} GB")
    if mem >= 70:
        print("→ H100 模式")
    elif mem >= 35:
        print("→ A100 模式")
    else:
        print("→ T4 模式")

## Cell 5：生成训练配置并开始训练

脚本会根据显存自动选择最优参数，无需手动改。

In [None]:
import torch, yaml, os

# Build the LLaMA-Factory training config for the detected GPU and write it
# to /content/raymond_train_config.yaml.
OUTPUT_DIR = "/content/drive/MyDrive/raymond/train_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fail early with a readable message instead of an opaque CUDA error from
# the device queries below.
if not torch.cuda.is_available():
    raise RuntimeError("未检测到 GPU，请先在 Runtime → Change runtime type 中选择 GPU")

mem_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

# bf16 is only native on compute capability >= 8.0 (Ampere and newer).
# On older cards such as the T4 (7.5), requesting bf16 is either rejected by
# the trainer or silently emulated, depending on library versions, so fall
# back to fp16 mixed precision there. The capability is checked directly
# because torch.cuda.is_bf16_supported() may report emulated support.
cc_major, _ = torch.cuda.get_device_capability(0)
use_bf16 = cc_major >= 8

# Presets are selected by GPU memory size. Each preset keeps the effective
# batch size (batch_size * grad_accum) at 16.
if mem_gb >= 70:  # H100
    quant_config = {}
    batch_size = 8
    grad_accum = 2
    lora_rank = 64
    lora_alpha = 128
    learning_rate = 5e-5
    print("H100 模式: bf16 全精度")
elif mem_gb >= 35:  # A100
    quant_config = {}
    batch_size = 4
    grad_accum = 4
    lora_rank = 32
    lora_alpha = 64
    learning_rate = 1e-4
    print("A100 模式: bf16 全精度")
else:  # T4 (and any other GPU below 35 GB)
    quant_config = {
        "quantization_bit": 4,
        "quantization_method": "bnb",
        "double_quantization": True,
    }
    batch_size = 2
    grad_accum = 8
    lora_rank = 32
    lora_alpha = 64
    learning_rate = 1e-4
    print("T4 模式: 4bit 量化")

print(f"混合精度: {'bf16' if use_bf16 else 'fp16'}")

train_config = {
    # Model and chat template
    "model_name_or_path": "Qwen/Qwen3-4B-Instruct-2507",
    "template": "qwen3_nothink",
    "trust_remote_code": True,
    "flash_attn": "auto",
    # Dataset
    "dataset": "raymond_full",
    "dataset_dir": "/content/llama_factory_data",
    "cutoff_len": 2048,
    "max_samples": 100000,
    "preprocessing_num_workers": 4,
    # LoRA fine-tuning
    "stage": "sft",
    "do_train": True,
    "finetuning_type": "lora",
    "lora_rank": lora_rank,
    "lora_alpha": lora_alpha,
    "lora_dropout": 0.05,
    "lora_target": "all",
    # Optimisation
    "num_train_epochs": 4,
    "per_device_train_batch_size": batch_size,
    "gradient_accumulation_steps": grad_accum,
    "learning_rate": learning_rate,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 50,
    "max_grad_norm": 1.0,
    "optim": "adamw_torch",
    # Exactly one of bf16 / fp16 is enabled, based on hardware support.
    "bf16": use_bf16,
    "fp16": not use_bf16,
    # Output and logging
    "output_dir": OUTPUT_DIR,
    "logging_steps": 10,
    "save_steps": 100,
    "plot_loss": True,
    "report_to": "none",
    **quant_config,
}

config_path = "/content/raymond_train_config.yaml"
with open(config_path, "w", encoding="utf-8") as f:
    yaml.dump(train_config, f, default_flow_style=False, allow_unicode=True)

print(f"lora_rank={lora_rank}, lora_alpha={lora_alpha}")
print(f"等效 batch size: {batch_size * grad_accum}")
print(f"learning_rate: {learning_rate}")
print("配置已生成，开始训练...")

In [None]:
# Launch training through the LLaMA-Factory CLI with the YAML config at
# /content/raymond_train_config.yaml.
!llamafactory-cli train /content/raymond_train_config.yaml

## Cell 6：验证训练结果

In [None]:
import os, json

# List the training output directory and summarise all_results.json.
OUTPUT_DIR = "/content/drive/MyDrive/raymond/train_output"
print("输出文件:", os.listdir(OUTPUT_DIR))

results_path = f"{OUTPUT_DIR}/all_results.json"
if os.path.exists(results_path):
    with open(results_path, encoding="utf-8") as f:
        results = json.load(f)

    # Use None rather than a numeric sentinel so a missing metric is
    # reported as missing instead of being printed as a real (huge) loss.
    loss = results.get("train_loss")
    if loss is None:
        print("all_results.json 中没有 train_loss，无法评估 loss")
    else:
        print(f"最终 loss: {loss:.4f}")
    print(f"训练时长: {results.get('train_runtime', 0)/60:.1f} 分钟")

    if loss is not None:
        if loss < 0.8:
            print("loss 良好")
        elif loss < 1.2:
            print("loss 一般，效果可接受")
        else:
            print("loss 偏高，可能需要调参")
else:
    # Say so explicitly rather than printing nothing when the results file is
    # absent (for example, training was interrupted or failed).
    print(f"未找到 {results_path}，训练可能未完成或失败，请检查训练日志")

## Cell 7：快速测试模型效果

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load the tokenizer and base model, then attach the LoRA adapter stored in
# the training output directory and switch the result to eval mode.
OUTPUT_DIR = "/content/drive/MyDrive/raymond/train_output"
BASE_MODEL = "Qwen/Qwen3-4B-Instruct-2507"

print("加载模型...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

base_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_load_kwargs)

model = PeftModel.from_pretrained(base, OUTPUT_DIR)
model.eval()
print("模型加载完成")

# System prompt describing the Raymond persona; chat() sends it as the
# system message of every request.
# NOTE(review): presumably this should match the system prompt used in the
# training data — confirm against raymond_train.json.
SYSTEM = "你是Raymond，一个在美国宾夕法尼亚留学的中国研究生，计算机专业，本科国内双非。你说话短而碎，喜欢连发多条短消息，像微信聊天一样。你常用的口头禅有66、哈、f、说白了、不好说、俺、无敌了、我真谢了。你的幽默方式是自嘲和反讽，表面毒舌但实际很关心朋友。"

def chat(user_input):
    """Generate one sampled reply to *user_input* from the fine-tuned model.

    The request is a two-message conversation: the module-level SYSTEM
    prompt followed by the user's text. Only the newly generated tokens
    (everything after the prompt) are decoded and returned as a string.
    """
    conversation = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user_input},
    ]
    encoded = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    # Move every input tensor onto the device the model lives on.
    target_device = model.device
    encoded = {key: tensor.to(target_device) for key, tensor in encoded.items()}

    sampling = dict(max_new_tokens=200, temperature=0.8, do_sample=True, top_p=0.9)
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling)

    # generate() returns prompt + continuation; drop the prompt tokens.
    reply_ids = generated[0][encoded["input_ids"].shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# Smoke test: send a few sample messages and print each exchange, separated
# by a 30-character dashed line.
sample_prompts = ("你在干嘛", "今天吃什么", "铲吗", "你后悔出国吗")
divider = "-" * 30
for q in sample_prompts:
    print(f"朋友: {q}")
    reply = chat(q)
    print(f"Raymond: {reply}")
    print(divider)

## Cell 8（训练完成后运行）：合并 LoRA 为完整模型

合并后的完整模型用于导入 Ollama；本单元等到 Step 4 时再运行。运行前需先执行 Cell 7（本单元复用其中加载的 model 和 tokenizer）。

In [None]:
import os
# Fold the LoRA weights into the base model and save the merged model plus
# the tokenizer to Drive. Relies on `model` and `tokenizer` already being
# defined in the notebook session.
MERGED_DIR = "/content/drive/MyDrive/raymond/merged_model"
os.makedirs(MERGED_DIR, exist_ok=True)

print("合并 LoRA adapter...")
merged = model.merge_and_unload()
# Model first, then tokenizer, both into the same directory.
for artifact in (merged, tokenizer):
    artifact.save_pretrained(MERGED_DIR)
print(f"合并完成: {MERGED_DIR}")
print(os.listdir(MERGED_DIR))