### 加载模型和数据集

In [1]:
import torch

print(torch.cuda.is_available())

from modelscope import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-0.5B-Instruct", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="cuda", torch_dtype="auto")

True


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from datasets import Dataset
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
import swanlab

  from swanlab.integration.huggingface import SwanLabCallback


In [3]:
train_df = pd.read_json('multi-class-1000x10.json')
train_ds = Dataset.from_pandas(train_df)

In [4]:
train_df

Unnamed: 0,文本,风险类别
0,您好，我是辉县消防队的采购负责人，需要订购一批鱼和牛蛙，请添加我的微信好友，并按指示转账34...,冒充军警购物类诈骗
1,尊敬的快手分期用户，您的账户因资料填写错误被临时冻结，请立即缴纳6000元认证金以解冻账户，...,虚假信用服务类
2,您被邀请加入了一个QQ群，只需为公众号点赞并发送截图即可领取红包。接下来，请下载Ruby A...,虚假网络投资理财类
3,您在知乎APP上看到的贷款信息可能不实，请谨慎对待网络贷款信息，建议通过正规渠道申请贷款。,无风险
4,您好，我们已获得您的视频聊天内容，如果不希望视频泄露给您的亲友，请立即转账至12345678...,网黑案件
...,...,...
9995,您好，我是银监会的工作人员，发现您的网上贷款存在违规记录，需要您立即处理以避免影响信用。请访...,虚假购物、服务类
9996,尊敬的用户，您好！通过斯达特软件加入的国风购物平台，现推出新品投资机会，选择品牌Ad+1、臻...,虚假网络投资理财类
9997,您好，我是京东客服，检测到您的京东APP借贷服务利率高于国家标准，请立即下载小鱼易连APP进...,冒充电商物流客服类
9998,您好，我是微信金融的客服，检测到您的微信贷款额度存在异常，需要立即注销，否则将影响您的个人征...,虚假信用服务类


In [4]:
category = "短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件"
prompt = '''在这个任务中，你是一位资深的反诈骗网络安全分析师，你的职责是利用你的专业知识和对网络诈骗行为的深刻理解，从短信文本中识别出可能存在的欺诈行为和风险类别。你的工作对于提前预警潜在的网络诈骗，保护用户财产安全和个人信息不被侵犯具有重要意义。现在，请仔细审查以下短信文本，并运用你的专业判断，给出短信的风险类别判断结果。(%s)'''%category

def process_func(example):
    """
    将数据集进行预处理
    """
    MAX_LENGTH = 384 
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|im_start|>system\n{prompt}<|im_end|>\
        \n<|im_start|>user\n{example['文本']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['风险类别']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}  


In [8]:
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
train_dataset.save_to_disk("multi-class-1000x10")

  obj.co_lnotab,  # for < python 3.10 [not counted in args]
Map: 100%|██████████| 10000/10000 [00:19<00:00, 519.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 162665.76 examples/s]


In [5]:
from datasets import load_from_disk
train_dataset = load_from_disk("multi-class-1000x10")

In [6]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

### 模型微调

In [7]:
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="auto", torch_dtype="auto")
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=32,  # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1,  # Dropout 比例
)

# for param in model.parameters():
#     param.requires_grad = True
    
model = get_peft_model(model, config)

In [8]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
   

In [9]:
model.print_trainable_parameters()

trainable params: 5,621,760 || all params: 499,654,528 || trainable%: 1.1251


In [10]:
args = TrainingArguments(
    output_dir="./output/multi-class-20epoch",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=2500,
    num_train_epochs=20,
    save_steps=2500,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

swanlab_callback = SwanLabCallback(
    project="Qwen2.5-0.5B-Instruct-Finetuning",
    experiment_name="Qwen2.5-0.5B-Instruct-multi-class",
    description="Qwen2.5-0.5B-Instruct模型微调。",
    config={
        "model": "qwen/Qwen2.5-0.5B-Instruct",
        "dataset": "FGRC-SCD电信诈骗数据集",
    }
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

In [11]:
trainer.train()

[1m[34mswanlab[0m[0m: swanlab version 0.4.2 is available!  Upgrade: `pip install -U swanlab`
[1m[34mswanlab[0m[0m: \ Getting project...                                                      

  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.21                                  
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1md:\fudan\LLM\FineTune-Qwen\swanlog\run-20250116_223835-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mtangerine[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen2.5-0.5B-Instruct-multi-class0116[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch d:\fudan\LLM\FineTune-Qwen\swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning/runs/4k860rnrg1gvdy1b7vsmn[0m[0m


  0%|          | 0/12500 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.3195, 'grad_norm': 5.1099629402160645, 'learning_rate': 8e-05, 'epoch': 4.0}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.1568, 'grad_norm': 4.552680969238281, 'learning_rate': 6e-05, 'epoch': 8.0}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.1388, 'grad_norm': 4.409473896026611, 'learning_rate': 4e-05, 'epoch': 12.0}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.1327, 'grad_norm': 5.450846195220947, 'learning_rate': 2e-05, 'epoch': 16.0}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.1294, 'grad_norm': 5.7159833908081055, 'learning_rate': 0.0, 'epoch': 20.0}


100%|██████████| 12500/12500 [3:54:09<00:00,  1.12s/it]

[1m[33mswanlab[0m[0m: Step 12500 on key train/epoch already exists, ignored.
{'train_runtime': 14055.3094, 'train_samples_per_second': 14.229, 'train_steps_per_second': 0.889, 'train_loss': 0.17544189697265625, 'epoch': 20.0}





TrainOutput(global_step=12500, training_loss=0.17544189697265625, metrics={'train_runtime': 14055.3094, 'train_samples_per_second': 14.229, 'train_steps_per_second': 0.889, 'total_flos': 1.0339122852646502e+17, 'train_loss': 0.17544189697265625, 'epoch': 20.0})

  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


In [12]:
swanlab.finish()

[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch d:\fudan\LLM\FineTune-Qwen\swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning/runs/4k860rnrg1gvdy1b7vsmn[0m[0m
[1m[34mswanlab[0m[0m: \ Updating experiment status...                                           

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


ApiError: (409, 'Conflict')