In [None]:
import torch
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

# Base model.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Step 1: configuration of the shared LoRA adapter (the common base layer).
# LoraConfig has no `name` field; the adapter name is given when the adapter
# is attached to the model (see Step 2).
shared_lora_config = LoraConfig(
    r=8,  # higher rank, intended to capture features common to all tasks
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # attention projections
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
)

# Step 2: wrap the base model and register the shared adapter under the name
# the rest of the notebook filters on ("shared_lora").
model = get_peft_model(model, shared_lora_config, adapter_name="shared_lora")

# Step 3: one lower-rank adapter per task. They target the same modules as the
# shared adapter but own independent parameters.
task1_lora_config = LoraConfig(
    r=4,  # lower rank, intended to model only the task-specific difference
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    layers_to_transform=shared_lora_config.layers_to_transform,
)
# PeftModel.add_adapter takes (adapter_name, peft_config), in that order.
model.add_adapter("task1_lora", task1_lora_config)

task2_lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    layers_to_transform=shared_lora_config.layers_to_transform,
)
model.add_adapter("task2_lora", task2_lora_config)

# Activate the shared adapter together with one task adapter.
def activate_combined_adapter(task_name):
    """Make "shared_lora" and the adapter ``task_name`` the active adapters.

    Args:
        task_name: Name of an adapter already registered on the global
            ``model`` (for example "task1_lora").

    Raises:
        ValueError: If ``task_name`` is not a registered adapter.
    """
    if task_name not in model.peft_config:
        raise ValueError(
            f"Adapter {task_name!r} not found in {list(model.peft_config.keys())}."
        )
    # PeftModel.set_adapter accepts a single name only; the wrapped tuner model
    # (model.base_model) is the one that accepts a list of adapter names.
    model.base_model.set_adapter(["shared_lora", task_name])

In [None]:
from torch.utils.data import DataLoader
from datasets import load_dataset

# Assumes task1_data and task2_data have already been loaded.
train_loader1 = DataLoader(task1_data, batch_size=4, shuffle=True)
train_loader2 = DataLoader(task2_data, batch_size=4, shuffle=True)

# Group parameters by the adapter name embedded in each parameter name, so the
# shared adapter and the task adapters can use different learning rates.
named_params = list(model.named_parameters())
shared_params = [param for pname, param in named_params if "shared_lora" in pname]
task1_params = [param for pname, param in named_params if "task1_lora" in pname]
task2_params = [param for pname, param in named_params if "task2_lora" in pname]

param_groups = [
    {"params": shared_params, "lr": 1e-5},
    {"params": task1_params, "lr": 1e-4},
    {"params": task2_params, "lr": 1e-4},
]
optimizer = torch.optim.AdamW(param_groups)

# Alternating training: each iteration takes one batch per task and performs
# one optimizer step for task 1 followed by one for task 2.
for epoch in range(10):
    # zip stops as soon as the shorter of the two loaders is exhausted.
    for batch1, batch2 in zip(train_loader1, train_loader2):
        for adapter_name, batch in (("task1_lora", batch1), ("task2_lora", batch2)):
            activate_combined_adapter(adapter_name)
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

In [None]:
# Stage 1: train only the shared adapter.
# named_parameters() is required here: the filter below needs each parameter's
# name, which model.parameters() does not provide.
for name, param in model.named_parameters():
    if "lora" in name and "shared_lora" not in name:
        param.requires_grad = False  # freeze the task-specific adapters

# shared_params is the list of shared-adapter parameters built in the
# alternating-training cell.
optimizer = torch.optim.AdamW(shared_params, lr=1e-5)
for epoch in range(5):
    # combined_data_loader is expected to mix batches from all tasks.
    for batch in combined_data_loader:
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Stage 2: unfreeze the task adapters and fine-tune shared + task adapters.
# Only LoRA weights (parameter names containing "lora_", i.e. lora_A/lora_B)
# are re-enabled; the base model weights stay frozen.
for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True

# Build the optimizer over trainable parameters only.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)
for epoch in range(5):
    # Regular multi-task training, here the same alternating scheme as in the
    # alternating-training cell: one step per task per iteration.
    for batch1, batch2 in zip(train_loader1, train_loader2):
        for task_name, batch in (("task1_lora", batch1), ("task2_lora", batch2)):
            activate_combined_adapter(task_name)
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

In [None]:
# Assumes a pre-trained teacher checkpoint is available under "teacher_model".
teacher_model = AutoModelForCausalLM.from_pretrained("teacher_model")
teacher_model.eval()

# Student model: the PEFT model carrying the shared + task adapters that was
# built in the first cell of this notebook.
student_model = model

# Distillation loss.
def compute_distill_loss(student_logits, teacher_logits, labels, alpha=0.5,
                         temperature=1.0, ignore_index=-100):
    """Blend hard-label cross-entropy with a teacher/student KL term.

    Accepts either per-example logits of shape (N, V) with labels (N,), or
    sequence logits of shape (B, T, V) with labels (B, T). Sequence inputs are
    treated as causal-LM outputs: logits at position t are scored against the
    label at position t + 1.

    Args:
        student_logits: Student logits, (N, V) or (B, T, V).
        teacher_logits: Teacher logits with the same shape as the student's.
        labels: Integer class indices, (N,) or (B, T).
        alpha: Weight of the cross-entropy term; the KL term gets 1 - alpha.
        temperature: Softmax temperature applied to both models in the KL
            term. The KL term is multiplied by temperature ** 2.
        ignore_index: Label value marking positions excluded from both terms.

    Returns:
        Scalar tensor ``alpha * ce + (1 - alpha) * kl``. Both terms are
        averaged over the positions whose label is not ``ignore_index``. If no
        such position exists, a zero that is still attached to the student's
        graph is returned.
    """
    if student_logits.dim() > 2:
        # Causal shift: drop the last prediction and the first label so that
        # position t is compared with token t + 1.
        student_logits = student_logits[..., :-1, :]
        teacher_logits = teacher_logits[..., :-1, :]
        labels = labels[..., 1:]

    # Flatten to (positions, vocab): F.cross_entropy expects the class
    # dimension at index 1, which (B, T, V) logits do not satisfy.
    vocab_size = student_logits.size(-1)
    student_flat = student_logits.reshape(-1, vocab_size)
    teacher_flat = teacher_logits.reshape(-1, vocab_size)
    labels_flat = labels.reshape(-1)

    # Apply the same mask to both terms so they average over the same positions.
    valid = labels_flat != ignore_index
    if not valid.any():
        return student_flat.sum() * 0.0
    student_flat = student_flat[valid]
    teacher_flat = teacher_flat[valid]
    labels_flat = labels_flat[valid]

    ce_loss = F.cross_entropy(student_flat, labels_flat)
    kl_loss = F.kl_div(
        F.log_softmax(student_flat / temperature, dim=-1),
        F.softmax(teacher_flat / temperature, dim=-1),
        reduction="batchmean",
    ) * temperature ** 2
    return alpha * ce_loss + (1 - alpha) * kl_loss

# Training steps.
# Build the optimizer from the student's own trainable parameters instead of
# relying on an optimizer left over from an earlier cell.
optimizer = torch.optim.AdamW(
    [param for param in student_model.parameters() if param.requires_grad],
    lr=1e-5,
)

student_model.train()
# data_loader is expected to yield dict batches that include a "labels" entry.
for batch in data_loader:
    # Teacher forward pass; no gradients are needed for the teacher.
    with torch.no_grad():
        teacher_outputs = teacher_model(**batch)

    # Student forward pass.
    student_outputs = student_model(**batch)

    # Combined hard-label + distillation loss.
    loss = compute_distill_loss(
        student_outputs.logits,
        teacher_outputs.logits,
        batch["labels"]
    )
    # Clear gradients from the previous step; without this they accumulate
    # across iterations.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Assumes a trained shared adapter named "shared_lora" is already on `model`.
# The new task adapter "task3_lora" is initialised by reusing shared weights.

# Snapshot the shared adapter's weights.
shared_state_dict = {
    k: v.clone() for k, v in model.state_dict().items()
    if "shared_lora" in k
}

# Add the new task adapter. It mirrors the shared adapter's configuration
# (in particular the rank r) so the lora_A matrices have identical shapes and
# can be copied over directly.
shared_config = model.peft_config["shared_lora"]
task3_lora_config = LoraConfig(
    r=shared_config.r,
    lora_alpha=shared_config.lora_alpha,
    target_modules=shared_config.target_modules,
    lora_dropout=shared_config.lora_dropout,
    task_type=shared_config.task_type,
    layers_to_transform=shared_config.layers_to_transform,
)
# PeftModel.add_adapter takes (adapter_name, peft_config), in that order. The
# name must be "task3_lora" because the loop below filters on that substring.
model.add_adapter("task3_lora", task3_lora_config)

# Initialise the new adapter: inherit A from the shared adapter, keep B at 0.
with torch.no_grad():
    for name, param in model.named_parameters():
        if "task3_lora" not in name:
            continue
        if "lora_A" in name:
            shared_key = name.replace("task3_lora", "shared_lora")
            param.copy_(shared_state_dict[shared_key])
        elif "lora_B" in name:
            # With B = 0 the adapter's update B @ A is zero, so adding task3
            # does not change the model's outputs before training. A random B
            # combined with an inherited A would inject a random perturbation.
            param.zero_()

In [None]:
# Save the adapter weights produced during training.
# PeftModel.save_pretrained writes each selected adapter to its own
# sub-directory, adapters/<adapter_name>/. safe_serialization=False writes
# adapter_model.bin (pickle) instead of the default adapter_model.safetensors.
model.save_pretrained(
    "adapters/",
    selected_adapters=["shared_lora", "task1_lora", "task2_lora"],
    safe_serialization=False,
)

# Weighted fusion of adapters at inference time.
def fuse_adapters(weights, fused_name="fused"):
    """Combine several LoRA adapters into one weighted adapter and activate it.

    The fusion is delegated to PEFT's ``add_weighted_adapter`` with
    ``combination_type="cat"``, which concatenates the low-rank factors and
    therefore supports adapters of different ranks. Adding the saved A and B
    matrices elementwise would fail for mismatched ranks and, even for equal
    ranks, is not the weighted sum of the adapters' updates B @ A.

    Args:
        weights: Mapping of adapter name to its fusion weight, e.g.
            ``{"shared_lora": 0.5, "task1_lora": 0.3}``.
        fused_name: Name under which the fused adapter is registered on the
            global ``model``. An existing adapter with this name is replaced.

    Raises:
        ValueError: If ``weights`` is empty.
    """
    if not weights:
        raise ValueError("weights must name at least one adapter")

    adapter_names = list(weights)
    for adapter_name in adapter_names:
        # Adapters that are not on the model yet are loaded from the
        # adapters/<adapter_name> directory.
        if adapter_name not in model.peft_config:
            model.load_adapter(f"adapters/{adapter_name}", adapter_name=adapter_name)

    # Remove a fused adapter left by an earlier call so that the new weights
    # actually take effect.
    if fused_name in model.peft_config:
        model.delete_adapter(fused_name)

    model.add_weighted_adapter(
        adapters=adapter_names,
        weights=[weights[adapter_name] for adapter_name in adapter_names],
        adapter_name=fused_name,
        combination_type="cat",
    )
    model.set_adapter(fused_name)

# Example: fuse the three adapters with weights 0.5 / 0.3 / 0.2.
example_weights = {
    "shared_lora": 0.5,
    "task1_lora": 0.3,
    "task2_lora": 0.2,
}
fuse_adapters(example_weights)