### 加载模型和数据集

In [1]:
import torch

print(torch.cuda.is_available())

from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# 在modelscope上下载Qwen模型到本地目录下
# model_dir = snapshot_download("./Qwen2.5-0.5B-Instruct", cache_dir="./", revision="master")

# Transformers加载模型权重
tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-0.5B-Instruct", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="auto", torch_dtype="auto")

True


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import pandas as pd
from datasets import Dataset
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
import os
import swanlab

  from swanlab.integration.huggingface import SwanLabCallback


In [3]:
train_df = pd.read_json('finetuning_small.json')
train_ds = Dataset.from_pandas(train_df)

In [4]:
train_df

Unnamed: 0,文本,风险类别,评价
0,欢迎加入‘关爱之家’微信群，我们提供丰富的交流平台，分享生活点滴，不涉及任何金融投资活动，请...,无风险,1.504883
1,您好，感谢您使用交易猫APP进行游戏账号交易，如有任何交易疑问，请直接联系平台客服，确保交易安全。,无风险,1.504883
2,欢迎加入我们的赚钱群，只需按照群内任务操作，即可轻松获得收益。请下载“悠娱文化”APP，联系...,虚假网络投资理财类,1.504883
3,您好，我是您在美国的朋友，因为疫情原因买不到回国机票，请帮我联系东方航空的李经理，邮箱账号为...,冒充领导、熟人类,1.504883
4,您好，我们是一家专业的信贷服务公司，看到您在寻找贷款服务，我们可以提供10万元的贷款，请添加...,虚假信用服务类,1.504883
...,...,...,...
7560,嗨，最近我在农银理财上投资赚了不少，每天都能享受骑马和高尔夫的奢侈生活，你要不要试试？我给你...,虚假网络投资理财类,2.564453
7561,您好，我是微核网友，最近发现一个华夏基金的静态投资项目，收益高且快，一小时就能分红，投资10...,虚假网络投资理财类,2.574219
7562,你好，感谢你在抖音上的积极参与，我们鼓励用户在平台上的正向互动，希望你继续享受抖音带来的乐趣。,无风险,2.591797
7563,尊敬的店主，我是武警支队的采购员，我们即将进行野外拉练，急需大量水果和牛肉罐头，请您尽快联系...,冒充军警购物类诈骗,2.601562


In [5]:
s = [(len(i),i) for i in list(train_ds['文本'])]
s.sort()

In [3]:
category = "短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件"
prompt = '''在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(%s)'''%category

def process_func(example):
    """
    将数据集进行预处理
    """
    MAX_LENGTH = 384 
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|im_start|>system\n{prompt}<|im_end|>\
        \n<|im_start|>user\n{example['文本']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['风险类别']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}  


In [7]:
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
train_dataset.save_to_disk("trainset_ft_small.json")

  obj.co_lnotab,  # for < python 3.10 [not counted in args]
Map: 100%|██████████| 7565/7565 [00:12<00:00, 591.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7565/7565 [00:00<00:00, 327480.47 examples/s]


In [4]:
from datasets import load_from_disk
train_dataset = load_from_disk("trainset_ft_small.json")

In [4]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 7565
})

### 模型微调

In [5]:
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="auto", torch_dtype="auto")
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=32,  # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1,  # Dropout 比例
)

# for param in model.parameters():
#     param.requires_grad = True
    
model = get_peft_model(model, config)

In [6]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
   

In [7]:
model.print_trainable_parameters()

trainable params: 5,621,760 || all params: 499,654,528 || trainable%: 1.1251


In [8]:
args = TrainingArguments(
    output_dir="./output/ft-small-0113-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=1000,
    num_train_epochs=30,
    save_steps=1000,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

swanlab_callback = SwanLabCallback(
    project="Qwen2.5-0.5B-Instruct-Finetuning",
    experiment_name="Qwen2.5-0.5B-Instruct-lora",
    description="Qwen2.5-0.5B-Instruct模型在FGRC-SCD电信诈骗数据集finetuning_small上微调。",
    config={
        "model": "qwen/Qwen2.5-0.5B-Instruct",
        "dataset": "FGRC-SCD电信诈骗数据集 finetuning_small",
    }
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

In [9]:
trainer.train()

[1m[34mswanlab[0m[0m: \ Getting project...                                                      

  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.21                                  
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1md:\fudan\LLM\FineTune-Qwen\swanlog\run-20250113_003153-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mtangerine[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen2.5-0.5B-Instruct-lora[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch d:\fudan\LLM\FineTune-Qwen\swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@tangerine/Qwen2.5-0.5B-Instruct-Finetuning/runs/8rahi0kpsnzg56mm2d1mp[0m[0m


  0%|          | 0/14190 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.2495, 'grad_norm': 1.6866148710250854, 'learning_rate': 9.295278365045807e-05, 'epoch': 2.11}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0568, 'grad_norm': 5.606316566467285, 'learning_rate': 8.590556730091615e-05, 'epoch': 4.23}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0402, 'grad_norm': 2.8436224460601807, 'learning_rate': 7.885835095137421e-05, 'epoch': 6.34}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0338, 'grad_norm': 3.221371650695801, 'learning_rate': 7.181113460183228e-05, 'epoch': 8.46}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0291, 'grad_norm': 3.1219642162323, 'learning_rate': 6.476391825229034e-05, 'epoch': 10.57}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0271, 'grad_norm': 0.5470854043960571, 'learning_rate': 5.7716701902748415e-05, 'epoch': 12.68}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0257, 'grad_norm': 3.0590219497680664, 'learning_rate': 5.066948555320649e-05, 'epoch': 14.8}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0243, 'grad_norm': 5.499304294586182, 'learning_rate': 4.362226920366456e-05, 'epoch': 16.91}


  return fn(*args, **kwargs)


{'loss': 0.0235, 'grad_norm': 0.11410917341709137, 'learning_rate': 3.6575052854122623e-05, 'epoch': 19.03}


  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:
  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0228, 'grad_norm': 2.4317262172698975, 'learning_rate': 2.9527836504580692e-05, 'epoch': 21.14}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0219, 'grad_norm': 0.8026647567749023, 'learning_rate': 2.2480620155038764e-05, 'epoch': 23.26}


  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0217, 'grad_norm': 4.74454927444458, 'learning_rate': 1.543340380549683e-05, 'epoch': 25.37}


  return fn(*args, **kwargs)


{'loss': 0.0219, 'grad_norm': 1.1086962223052979, 'learning_rate': 8.386187455954899e-06, 'epoch': 27.48}


  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:
  return fn(*args, **kwargs)
  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'loss': 0.0208, 'grad_norm': 2.539900779724121, 'learning_rate': 1.338971106412967e-06, 'epoch': 29.6}


  return fn(*args, **kwargs)
100%|██████████| 14190/14190 [2:28:19<00:00,  1.59it/s]

{'train_runtime': 8903.373, 'train_samples_per_second': 25.49, 'train_steps_per_second': 1.594, 'train_loss': 0.04388894807964752, 'epoch': 30.0}





TrainOutput(global_step=14190, training_loss=0.04388894807964752, metrics={'train_runtime': 8903.373, 'train_samples_per_second': 25.49, 'train_steps_per_second': 1.594, 'total_flos': 1.0214027158326298e+17, 'train_loss': 0.04388894807964752, 'epoch': 30.0})

  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


In [13]:
swanlab.finish()

[1m[31mswanlab[0m[0m: After experiment is finished, you can't call finish() again.


### 测试模型

In [10]:
# 用测试集的前10条，测试模型
# test_df = pd.read_json('finetuning_small.json')[-10:]
test_df = pd.read_json('eval_small.json')[:100]

In [11]:
def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # print(response)
    return response
 

In [12]:
test_text_list = []
for index, row in test_df.iterrows():
    input_value = row['文本']
    messages = [
        {"role": "system", "content": f"{prompt}"},
        {"role": "user", "content": f"{input_value}"}
    ]
    # print(messages)
    response = predict(messages, model, tokenizer)
    messages.append({"role": "assistant", "content": f"{response}"})
    result_text = f"{messages[0]}\n{messages[1]}\n{messages[2]}\n"
    print(result_text)
    test_text_list.append(swanlab.Text(result_text, caption=response))

swanlab.log({"Prediction": test_text_list})
swanlab.finish()


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)


{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件)'}
{'role': 'user', 'content': '您好，抖音上发现有人提供替考驾驶证服务，每科仅需1400元，微信号为wei1in12345，如有需要请联系，我们保证快速通过考试。'}
{'role': 'assistant', 'content': '虚假购物、服务类'}

{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件)'}
{'role': 'user', 'content': '请注意，保护个人隐私非常重要，不要轻易在社交平台上透露个人信息，避免不必要的风险。'}
{'role': 'assistant', 'content': '无风险'}

{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件)'}
{'

  if (self.sid_expired_at - datetime.utcnow()).total_seconds() <= self.REFRESH_TIME:


{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件)'}
{'role': 'user', 'content': '尊敬的用户，如果您在京东平台有任何疑问或需要帮助，请通过官方渠道联系客服，确保您的账户安全。'}
{'role': 'assistant', 'content': '无风险'}

{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗|网黑案件)'}
{'role': 'user', 'content': '您好，我们是专业借贷服务团队，为了提高您的借贷额度，请协助我们完成银行卡流水验证，即刻转账至指定账户，我们将确保您的资金安全并提升您的信用等级。'}
{'role': 'assistant', 'content': '虚假信用服务类'}

{'role': 'system', 'content': '在这个任务中，你是一位经验丰富的网络安全分析师，你的任务是通过你的专业知识和敏锐的洞察力，从短信文本中识别出潜在的风险类别。你的工作对于保护用户免受网络诈骗和风险的侵害至关重要。现在，请阅读以下短信文本，并给出你的风险类别判断结果。(短信风险类别选项有：无风险|冒充电商物流客服类|虚假网络投资理财类|虚假信用服务类|虚假购物、服务类|冒充公检法及政府机关类|冒充领导、熟人类|网络婚恋、交友类|冒充军警购物类诈骗

  now = datetime.utcnow() + timedelta(hours=8)


[1m[34mswanlab[0m[0m: \ Updating experiment status...                                           

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


ApiError: (409, 'Conflict')