In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-0.5B-Instruct", use_fast=False, trust_remote_code=True)
# 加载基础模型
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="cuda", torch_dtype="auto")

# 加载 LoRA 适配器权重
model = PeftModel.from_pretrained(
    model,
    model_id="./output/binary-class0113/checkpoint-2000/",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_df = pd.read_json('eval-binary.json')

def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # print(response)
    return response


In [3]:
from tqdm import tqdm
prompt = '''在这个任务中，你是一位资深的反诈骗网络安全分析师，你的职责是利用你的专业知识和对网络诈骗行为的深刻理解，从短信文本中识别出可能存在的欺诈行为和风险类别。你的工作对于提前预警潜在的网络诈骗，保护用户财产安全和个人信息不被侵犯具有重要意义。现在，请仔细审查以下短信文本，并运用你的专业判断该短信是否有风险，回答“无风险”或“有风险”'''

tp, tn, fp, fn = 0,0,0,0
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    input_value = row['文本']
    messages = [
        {"role": "system", "content": f"{prompt}"},
        {"role": "user", "content": f"{input_value}"}
    ]
    # print(messages)
    response = predict(messages, model, tokenizer)
    answer = row['风险类别']
    if response.strip() == answer:
        if response.strip()[0] == "无":
            tn += 1
        elif response.strip()[0] == "有":
            tp += 1
    elif response.strip()[0] == "无" and answer[0] == "有":
        fn += 1
    elif response.strip()[0] == "有" and answer[0] == "无":
        fp += 1
        
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / test_df.shape[0]

print("accuracy: ", accuracy)
print("tp:%d, tn:%d, fp:%d, fn:%d"%(tp, tn, fp, fn))
print("f1_score: ", f1_score)


  0%|          | 0/7734 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 7734/7734 [20:11<00:00,  6.39it/s]

accuracy:  0.7848461339539695
tp:3763, tn:2307, fp:1, fn:1663
f1_score:  0.8189336235038085





In [4]:
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-0.5B-Instruct", use_fast=False, trust_remote_code=True)
# 加载基础模型
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="cuda", torch_dtype="auto")

# 加载 LoRA 适配器权重
model = PeftModel.from_pretrained(
    model,
    model_id="./output/binary-class0113/checkpoint-4000/",
)

tp, tn, fp, fn = 0,0,0,0
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    input_value = row['文本']
    messages = [
        {"role": "system", "content": f"{prompt}"},
        {"role": "user", "content": f"{input_value}"}
    ]
    # print(messages)
    response = predict(messages, model, tokenizer)
    answer = row['风险类别']
    if response.strip() == answer:
        if response.strip()[0] == "无":
            tn += 1
        elif response.strip()[0] == "有":
            tp += 1
    elif response.strip()[0] == "无" and answer[0] == "有":
        fn += 1
    elif response.strip()[0] == "有" and answer[0] == "无":
        fp += 1
        
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / test_df.shape[0]

print("accuracy: ", accuracy)
print("tp:%d, tn:%d, fp:%d, fn:%d"%(tp, tn, fp, fn))
print("f1_score: ", f1_score)

100%|██████████| 7734/7734 [19:54<00:00,  6.48it/s]

accuracy:  0.9013447116627877
tp:4664, tn:2307, fp:1, fn:762
f1_score:  0.9243880685759588





In [5]:
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-0.5B-Instruct", use_fast=False, trust_remote_code=True)
# 加载基础模型
model = AutoModelForCausalLM.from_pretrained("./Qwen2.5-0.5B-Instruct", device_map="cuda", torch_dtype="auto")

# 加载 LoRA 适配器权重
model = PeftModel.from_pretrained(
    model,
    model_id="./output/binary-class0113/checkpoint-6000/",
)

tp, tn, fp, fn = 0,0,0,0
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    input_value = row['文本']
    messages = [
        {"role": "system", "content": f"{prompt}"},
        {"role": "user", "content": f"{input_value}"}
    ]
    # print(messages)
    response = predict(messages, model, tokenizer)
    answer = row['风险类别']
    if response.strip() == answer:
        if response.strip()[0] == "无":
            tn += 1
        elif response.strip()[0] == "有":
            tp += 1
    elif response.strip()[0] == "无" and answer[0] == "有":
        fn += 1
    elif response.strip()[0] == "有" and answer[0] == "无":
        fp += 1
        
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy = (tp + tn) / test_df.shape[0]

print("accuracy: ", accuracy)
print("tp:%d, tn:%d, fp:%d, fn:%d"%(tp, tn, fp, fn))
print("f1_score: ", f1_score)

100%|██████████| 7734/7734 [19:39<00:00,  6.55it/s]

accuracy:  0.8797517455391777
tp:4497, tn:2307, fp:1, fn:929
f1_score:  0.9062877871825876



