In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import json
import pandas as pd
import os

In [4]:
def dataset_jsonl_transfer(origin_path, new_path):
    """
    Convert the raw NER JSONL file into the instruction-tuning format.

    Every non-blank line of ``origin_path`` must be a JSON object with a
    "text" string and an "entities" list; each entity carries "entity_text"
    and a list of candidate "entity_names". For each line one record with the
    keys "instruction", "input" and "output" is written to ``new_path``.

    The "output" value is the back-to-back concatenation of one JSON object
    per kept entity, or the sentence "没有找到任何实体" when nothing is kept.

    Args:
        origin_path: path of the source JSONL file (read as UTF-8).
        new_path: path of the converted JSONL file (overwritten, UTF-8).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record or entity lacks one of the expected keys.
    """
    match_names = ["地点", "人名", "地理实体", "组织"]
    instruction = """你是一个文本实体识别领域的专家，你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体. 以 json 格式输出, 如 {"entity_text": "南京", "entity_label": "地理实体"} 注意: 1. 输出的每一行都必须是正确的 json 字符串. 2. 找不到任何实体时, 输出"没有找到任何实体". """
    messages = []

    # Read the original JSONL file.
    with open(origin_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            # Parse the JSON payload of this line.
            data = json.loads(line)
            input_text = data["text"]

            entity_sentence = ""
            for entity in data["entities"]:
                entity_json = dict(entity)
                # First candidate name that is one of the supported labels.
                entity_label = next(
                    (name for name in entity_json["entity_names"] if name in match_names),
                    None,
                )
                if entity_label is None:
                    # No supported label: skip the entity instead of reusing
                    # the label left over from a previous entity.
                    continue

                # json.dumps escapes quotes/backslashes inside the entity text,
                # so every emitted object is valid JSON.
                entity_sentence += json.dumps(
                    {
                        "entity_text": entity_json["entity_text"],
                        "entity_label": entity_label,
                    },
                    ensure_ascii=False,
                )

            if entity_sentence == "":
                entity_sentence = "没有找到任何实体"

            messages.append(
                {
                    "instruction": instruction,
                    "input": f"文本:{input_text}",
                    "output": entity_sentence,
                }
            )

    # Save the restructured JSONL file.
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")


# Load the raw corpus, convert it once, and split it into train / test frames.
train_dataset_path = "ccfbdci.jsonl"
train_jsonl_new_path = "ccf_train.jsonl"

# The converted file doubles as a cache: it is only regenerated when missing,
# so delete it by hand if the conversion logic or the raw data changes.
if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)

total_df = pd.read_json(train_jsonl_new_path, lines=True)

# The first 10% of the rows are held out; the remaining 90% are used for training.
holdout_size = int(len(total_df) * 0.1)
train_df = total_df[holdout_size:]

# Draw (at most) 20 held-out rows as the test set. The fixed random_state makes
# the sample identical on every run, and min() avoids the ValueError that
# sample() raises when asked for more rows than the holdout contains.
test_df = total_df[:holdout_size].sample(n=min(20, holdout_size), random_state=42)

In [9]:
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
# import torch

2024-11-04 22:49:58,177 - modelscope - INFO - PyTorch version 2.4.1+cu118 Found.
2024-11-04 22:49:58,179 - modelscope - INFO - Loading ast index from C:\Users\fhawk\.cache\modelscope\ast_indexer
2024-11-04 22:49:58,245 - modelscope - INFO - Loading done! Current index file version is 1.14.0, with md5 d7fef9503ef10c41dd0d7e9f814f7513 and a total number of 976 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model_id = "qwen/Qwen2-0.5B-Instruct"    
# NOTE(review): this assignment is overwritten by the snapshot_download() call
# below before it is ever read.
model_dir = "./qwen/Qwen2-0___5B-Instruct"

# Download the Qwen model from ModelScope into the current directory and keep
# the value snapshot_download() returns as the directory to load from.
model_dir = snapshot_download(model_id, cache_dir="./", revision="master")

# Load the tokenizer and the model weights with Transformers.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # This method must be called when gradient checkpointing is enabled.

Downloading: 100%|█████████████████████████████████████████████████████████████████████| 659/659 [00:00<00:00, 220kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 48.0/48.0 [00:00<00:00, 15.8kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████| 242/242 [00:00<00:00, 100kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 11.1k/11.1k [00:00<00:00, 5.66MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 1.59M/1.59M [00:00<00:00, 13.4MB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████▉| 942M/942M [00:46<00:00, 21.1MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 3.47k/3.47k [00:00<00:00, 2.09MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████| 6.70M/6.70M [00:00<00:00, 17.6MB/s]
Downloading: 100%|██████████████████████

In [15]:
def process_func(example):
    """
    Tokenize one dataset row into the fields used for training.

    The prompt (system prompt plus ``example['input']`` in chat markup) and the
    answer (``example['output']``) are tokenized separately with the
    notebook-level ``tokenizer``. ``tokenizer.pad_token_id`` is appended after
    the answer. Prompt positions are set to -100 in ``labels``; the answer and
    the appended token keep their ids. All three sequences are cut to 384
    tokens when the combined sequence is longer than that.

    Returns:
        dict with "input_ids", "attention_mask" and "labels" lists.
    """
    max_length = 384
    system_prompt = """你是一个文本实体识别领域的专家，你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体. 以 json 格式输出, 如 {"entity_text": "南京", "entity_label": "地理实体"} 注意: 1. 输出的每一行都必须是正确的 json 字符串. 2. 找不到任何实体时, 输出"没有找到任何实体"."""

    prompt_text = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n"
    prompt_enc = tokenizer(prompt_text, add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]

    features = {
        "input_ids": prompt_ids + answer_ids + [tokenizer.pad_token_id],
        "attention_mask": prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1],
        # Only the answer part (and the appended token) contributes label ids.
        "labels": [-100] * len(prompt_ids) + answer_ids + [tokenizer.pad_token_id],
    }

    # Truncate every field when the combined sequence is too long.
    if len(features["input_ids"]) > max_length:
        features = {key: values[:max_length] for key, values in features.items()}
    return features

In [17]:
from datasets import Dataset

# Wrap the training frame in a datasets.Dataset and run process_func over every
# row; the original columns are removed so only process_func's fields remain.
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map: 100%|███████████████████████████████████████████████████████████████| 14152/14152 [00:16<00:00, 871.01 examples/s]


In [18]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for causal-LM fine-tuning; adapters are attached to the
# attention projections (q/k/v/o) and the MLP projections (gate/up/down).
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    inference_mode=False,  # Training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA paper for its exact role
    lora_dropout=0.1,  # Dropout ratio
)

# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# passes the already-wrapped model back into get_peft_model() -- reload the
# base model first if this cell has to be executed again.
model = get_peft_model(model, config)

In [22]:
# Trainer hyper-parameters. With a per-device batch size of 4 and 4 gradient
# accumulation steps, 16 samples per device feed each optimizer step.
# Checkpoints go to output_dir every 100 steps; loss is logged every 10 steps.
args = TrainingArguments(
    output_dir="./output/Qwen2-0.5B-NER",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

In [30]:
from swanlab.integration.huggingface import SwanLabCallback
import swanlab

# SwanLab experiment-tracking callback; the `config` dict is metadata attached
# to the run.
# NOTE(review): the data loaded earlier in this notebook comes from
# "ccfbdci.jsonl" -- confirm that the "dataset" entry below names its source.
swanlab_callback = SwanLabCallback(
    project="Qwen2-0__5B-NER-fintune",
    experiment_name="Qwen2-0.5B-Instruct",
    description="使用通义千问Qwen2-0.5B-Instruct模型在NER数据集上微调，实现关键实体识别任务。",
    config={
        "model": model_id,
        "model_dir": model_dir,
        "dataset": "qgyd2021/chinese_ner_sft",
    },
)

In [32]:
# Build the Trainer from the LoRA-wrapped model, the tokenized training set and
# the SwanLab callback. DataCollatorForSeq2Seq with padding=True pads the
# variable-length examples of each batch.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

# Run the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

[1m[34mswanlab[0m[0m: swanlab version 0.3.23 is available!  Upgrade: `pip install -U swanlab`
[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.9                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1mD:\qwen2\swanlog\run-20241104_225341-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mWXYS1209[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen2-0.5B-Instruct_Nov04_22-53-41[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch -l D:\qwen2\swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@WXYS1209/Qwen2-0__5B-NER-fintune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@WXYS1209/Qwen2-0__5B-NER-fintune/runs/gbaadx7mq7cebgvd69jxf[0m[0m


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,0.7916
20,0.1302
30,0.1412
40,0.0935
50,0.115
60,0.0536
70,0.1203
80,0.1027
90,0.1075
100,0.0886


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

TrainOutput(global_step=1768, training_loss=0.048037479509393015, metrics={'train_runtime': 1710.1913, 'train_samples_per_second': 16.55, 'train_steps_per_second': 1.034, 'total_flos': 1.064636454615552e+16, 'train_loss': 0.048037479509393015, 'epoch': 1.9988694177501414})

[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab will resume uploads when the network improves
[33mswanlab[0m: network error, swanlab wi

D:\anaconda3\envs\pytorch\lib\site-packages\torch\utils\checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
D:\anaconda3\envs\pytorch\lib\site-packages\peft\utils\save_and_load.py:195: UserWarning: Could not find a config file in ./qwen\Qwen2-1___5B-Instruct - will assume that the vocabulary was not modified.
  warnings.warn(
D:\anaconda3\envs\pytorch\lib\site-packages\torch\_dynamo\eval_frame.py:600: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)

In [72]:
import json
import time

from sklearn.metrics import accuracy_score


def _entities_to_label(raw_text):
    """Normalise a string of back-to-back JSON entity objects.

    The objects are decoded one after another with JSONDecoder.raw_decode, so
    no separator character has to be injected and entity texts containing
    "}{" or "|" are handled correctly.

    Args:
        raw_text: either the sentence "没有找到任何实体" or a concatenation of
            objects carrying "entity_text" and "entity_label".

    Returns:
        "text label, text label, ..." for entity lists, or the no-entity
        sentence unchanged.

    Raises:
        json.JSONDecodeError: if the text is not a sequence of JSON values.
        KeyError / TypeError: if a decoded value is not an entity object.
    """
    text = raw_text.strip()
    if text == "没有找到任何实体":
        return text

    decoder = json.JSONDecoder()
    labels = []
    pos = 0
    while pos < len(text):
        if text[pos].isspace():
            # raw_decode does not skip leading whitespace by itself.
            pos += 1
            continue
        item, pos = decoder.raw_decode(text, pos)
        labels.append(f"{item['entity_text']} {item['entity_label']}")
    return ", ".join(labels)


model = model.to("cuda")

# Record the start time
start_time = time.time()

# Ground truth (normalised), raw model responses, and normalised predictions.
# The accuracy is computed on the two normalised lists so both sides use the
# same representation.
true_labels = []
predicted_labels = []
predicted_label_keys = []

for index, row in test_df.iterrows():
    instruction = row["instruction"]
    input_value = row["input"]
    label_str = row["output"]  # ground-truth label string

    # Parse the ground truth first; rows that cannot be parsed are skipped
    # before any list is touched, which keeps the lists aligned.
    try:
        true_label = _entities_to_label(label_str)
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"Error decoding JSON for row {index}: {e}")
        continue

    # Build the chat messages
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"},
    ]

    # Run the model
    response = predict(messages, model, tokenizer)

    # Normalise the prediction exactly like the ground truth; a malformed
    # response is kept verbatim and therefore simply counts as a miss.
    try:
        predicted_key = _entities_to_label(response)
    except (json.JSONDecodeError, KeyError, TypeError):
        predicted_key = response.strip()

    true_labels.append(true_label)
    predicted_labels.append(response.strip())
    predicted_label_keys.append(predicted_key)

# Record the end time
end_time = time.time()

# Total elapsed time
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")

# Exact-match accuracy over the normalised entity lists
accuracy = accuracy_score(true_labels, predicted_label_keys)
print(f"Accuracy: {accuracy:.2f}")


  return fn(*args, **kwargs)


In [80]:
# Inspect one raw ground-truth string.
# NOTE(review): 1041 is an index label; it only exists in test_df when that row
# was drawn by the random sample that builds test_df.
test_df.output[1041]

'{"entity_text": "台湾", "entity_label": "地理实体"}{"entity_text": "台湾", "entity_label": "地理实体"}{"entity_text": "许朝清", "entity_label": "人名"}{"entity_text": "王贯仁", "entity_label": "人名"}{"entity_text": "加州", "entity_label": "地理实体"}'

In [74]:
# Display the model responses collected by the evaluation loop.
predicted_labels

['{"entity_text": "台湾", "entity_label": "地理实体"}{"entity_text": "许朝清", "entity_label": "人名"}{"entity_text": "王贯仁", "entity_label": "人名"}{"entity_text": "加州中部", "entity_label": "地点"}',
 '{"entity_text": "内政部警政署刑事警察局", "entity_label": "组织"}{"entity_text": "大陆", "entity_label": "地理实体"}{"entity_text": "中国", "entity_label": "地理实体"}{"entity_text": "海峡两岸", "entity_label": "地点"}{"entity_text": "杨光南", "entity_label": "人名"}{"entity_text": "台湾", "entity_label": "地理实体"}{"entity_text": "大陆", "entity_label": "地理实体"}',
 '{"entity_text": "科什图尼察", "entity_label": "人名"}{"entity_text": "反对运动", "entity_label": "组织"}{"entity_text": "塞尔维亚民主党", "entity_label": "组织"}',
 '{"entity_text": "法学院", "entity_label": "组织"}{"entity_text": "左丽卡", "entity_label": "人名"}',
 '{"entity_text": "彭春燕", "entity_label": "人名"}{"entity_text": "彭春燕", "entity_label": "人名"}',
 '{"entity_text": "越南", "entity_label": "地理实体"}{"entity_text": "克林顿", "entity_label": "人名"}',
 '{"entity_text": "亚太经合会", "entity_label": "组织"}',
 '{"entity_text"

In [66]:
 # Failed parsing attempt: splitting on '}' removes the closing brace from
 # every fragment, so json.loads raises JSONDecodeError (see the output below).
 # NOTE(review): `label_str` is presumably left over from the evaluation loop.
 [json.loads(label) for label in label_str.split('}') if label]

JSONDecodeError: Expecting ',' delimiter: line 1 column 44 (char 43)

In [33]:
# ====== 训练结束后的预测 ===== #

def predict(messages, model, tokenizer):
    """Generate the assistant reply for a chat-formatted list of messages.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts; rendered into a
            single prompt with ``tokenizer.apply_chat_template``.
        model: Object exposing ``generate(...)``. Its ``device`` attribute, when
            present, decides where the tokenized inputs are placed.
        tokenizer: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed)
        for the single prompt. The text is also printed.
    """
    # Follow the model's own device instead of assuming "cuda"; fall back to the
    # previous hard-coded value for objects that do not expose `.device`.
    device = getattr(model, "device", "cuda")
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # Forward the attention mask produced by the tokenizer so generate() does not
    # have to guess it from the input ids.
    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=getattr(model_inputs, "attention_mask", None),
        max_new_tokens=512,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)

    return response

# Run predict() on every row of test_df and keep the full conversation as text.
model = model.to("cuda")
import time

# Record the start time.
start_time = time.time()

test_text_list = []  # only referenced by the commented-out swanlab calls below
result_list = []  # one formatted conversation string per test row
for index, row in test_df.iterrows():
    instruction = row["instruction"]
    input_value = row["input"]

    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"},
    ]

    response = predict(messages, model, tokenizer)
    messages.append({"role": "assistant", "content": f"{response}"})
    # The three message dicts are formatted into one string (their dict text,
    # separated by blank lines), so result_list holds text rather than dicts.
    result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
    result_list.append(result_text)
    # test_text_list.append(swanlab.Text(result_text, caption=response))

# Record the end time.
end_time = time.time()

# Total elapsed time in seconds.
execution_time = end_time - start_time

# swanlab.log({"Prediction": test_text_list})
# swanlab.finish()



In [40]:
# Display the elapsed time held in execution_time (a difference of time.time() values, i.e. seconds).
f"代码运行时间: {execution_time} 秒"

'代码运行时间: 37.08256244659424 秒'

In [42]:
import ast
import re

import pandas as pd

# Rows of the final entity table: one dict per extracted entity.
data = []

# Matches one {"entity_text": "...", "entity_label": "..."} object in a model reply.
entity_pattern = re.compile(r"\{\"entity_text\": \"(.*?)\", \"entity_label\": \"(.*?)\"\}")

# Walk through result_list and parse every result_text.
for idx, result_text in enumerate(result_list):
    # Each entry is the text form of the system, user and assistant message dicts
    # joined by blank lines. literal_eval restores the real dicts, so content that
    # contains quotes or escapes is recovered intact instead of being cut off at
    # the first quote character, as a regex over the dict text would do.
    system_message, user_message, assistant_message = (
        ast.literal_eval(part) for part in result_text.split("\n\n")
    )

    # Text of the user turn without its "文本:" prefix (empty if the prefix is absent).
    user_content = user_message["content"]
    input_text = user_content[len("文本:"):] if user_content.startswith("文本:") else ""

    # Raw reply produced by the model for this row.
    assistant_content = assistant_message["content"]

    # Pull every (entity_text, entity_label) pair out of the reply.
    entities = entity_pattern.findall(assistant_content)
    for entity_text, entity_label in entities:
        # Store the row index together with the entity text and label.
        data.append({"id": idx, "text": entity_text, "label": entity_label})

# Build the DataFrame; fixed columns keep the schema stable even when no entity was found.
df_entities = pd.DataFrame(data, columns=["id", "text", "label"])

# Show the result.
print(df_entities)


In [44]:
# Display the extracted-entity table.
df_entities

Unnamed: 0,id,text,label
0,0,台湾,地理实体
1,0,许朝清,人名
2,0,王贯仁,人名
3,0,加州中部,地点
4,1,内政部警政署刑事警察局,组织
5,1,大陆,地理实体
6,1,中国,地理实体
7,1,海峡两岸,地点
8,1,杨光南,人名
9,1,台湾,地理实体


In [82]:
# Assumes fine-tuning of the model has already finished.
# Save the model to a local directory.

output_dir = "./fine_tuned_qwen2"

# Save the model and the tokenizer side by side in output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"微调后的模型已保存到: {output_dir}")




In [11]:
# model_id is only referenced by the commented-out download call below.
model_id = "qwen/Qwen2-1.5B-Instruct"    
model_dir = "./fine_tuned_qwen2"

# Download the Qwen model from modelscope into a local directory (disabled).
# model_dir = snapshot_download(model_id, cache_dir="./", revision="master")

# Load the tokenizer and the model weights from model_dir with Transformers.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Display the module tree of the loaded model to inspect its layers.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1536, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=1536, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=1536, out_features=256, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
     