In [None]:
import os
os.environ['MODELSCOPE_CACHE'] = "/root/autodl-tmp/.cache/hub"
os.environ['HF_HUB_CACHE'] = "/root/autodl-tmp/.cache/hub"
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"

In [1]:
def convert_txt_to_json(txt_path, json_path, names):
    import json
    data = []
    with open(txt_path, 'r', encoding='utf-8') as f:
        for idx, text in enumerate(f.read().split('\n\n')):
            ner_tags = []
            sample = {}
            tokens = []
            for line in text.split('\n'):
                if not line.strip():
                    continue
                token_tag = line.split()
                if len(token_tag) != 2:
                    continue
                token, tag = token_tag
                tokens.append(token)
                if tag not in names:
                    names.append(tag)
                ner_tags.append(names.index(tag))
            sample['id'] = idx
            sample['tokens'] = tokens
            sample['ner_tags'] = ner_tags
            data.append(sample)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return names

In [2]:
# 用法示例
names = []
names = convert_txt_to_json('../data/medical/train.txt', '../data/medical/train.json', names)
print(names)
names = convert_txt_to_json('../data/medical/dev.txt', '../data/medical/dev.json', names)
print(names)
names = convert_txt_to_json('../data/medical/test.txt', '../data/medical/test.json', names)
print(names)

['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']
['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']
['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']


In [3]:
from datasets import load_dataset, Value, Features, ClassLabel, Sequence

data_files = {'train': '../data/medical/train.json', 'dev': '../data/medical/dev.json', 'test': '../data/medical/test.json'}
features = Features({
    'id': Value('int32'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(num_classes=21, names=names))
})

raw_dataset = load_dataset('json', data_files=data_files, features=features)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 5259 examples [00:00, 98998.45 examples/s]
Generating dev split: 657 examples [00:00, 90370.17 examples/s]
Generating test split: 658 examples [00:00, 91528.27 examples/s]


In [4]:
raw_dataset['dev'].features

{'id': Value(dtype='int32', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗'], id=None), length=-1, id=None)}

In [5]:
id2label = {i: label for i, label in enumerate(names)}
label2id = {label: i for i, label in enumerate(names)}

print("id2label:", id2label)
print("label2id:", label2id)

id2label: {0: 'O', 1: 'B-临床表现', 2: 'I-临床表现', 3: 'B-中医治疗', 4: 'I-中医治疗', 5: 'B-西医诊断', 6: 'I-西医诊断', 7: 'B-方剂', 8: 'I-方剂', 9: 'B-中药', 10: 'I-中药', 11: 'B-中医诊断', 12: 'I-中医诊断', 13: 'B-西医治疗', 14: 'I-西医治疗', 15: 'B-中医证候', 16: 'I-中医证候', 17: 'B-中医治则', 18: 'I-中医治则', 19: 'B-其他治疗', 20: 'I-其他治疗'}
label2id: {'O': 0, 'B-临床表现': 1, 'I-临床表现': 2, 'B-中医治疗': 3, 'I-中医治疗': 4, 'B-西医诊断': 5, 'I-西医诊断': 6, 'B-方剂': 7, 'I-方剂': 8, 'B-中药': 9, 'I-中药': 10, 'B-中医诊断': 11, 'I-中医诊断': 12, 'B-西医治疗': 13, 'I-西医治疗': 14, 'B-中医证候': 15, 'I-中医证候': 16, 'B-中医治则': 17, 'I-中医治则': 18, 'B-其他治疗': 19, 'I-其他治疗': 20}


In [6]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


checkpoint = 'Qwen/Qwen2.5-7B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=len(names), id2label=id2label, label2id=label2id, torch_dtype=torch.float16).to(device)

Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.75s/it]
Some weights of Qwen2ForTokenClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-7B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 获取可训练参数
trainable_params = [p for p in model.parameters() if p.requires_grad]

# 打印可训练参数的数量
print("Number of trainable parameters:", sum(p.numel() for p in trainable_params))

Number of trainable parameters: 7070694421


In [9]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 2,598,421 || all params: 7,073,292,842 || trainable%: 0.0367


In [10]:
def process_func(examples):
    tokenized_examples = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, max_length=512)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
        labels.append(label_ids)
    tokenized_examples['labels'] = labels
    return tokenized_examples

In [11]:
tokenized_dataset = raw_dataset.map(process_func, batched=True)
tokenized_dataset

Map: 100%|██████████| 5259/5259 [00:00<00:00, 15442.08 examples/s]
Map: 100%|██████████| 657/657 [00:00<00:00, 15852.05 examples/s]
Map: 100%|██████████| 658/658 [00:00<00:00, 16141.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5259
    })
    dev: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 657
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 658
    })
})

In [12]:
tokenized_dataset['dev'][0]

{'id': 0,
 'tokens': ['投',
  '活',
  '络',
  '效',
  '灵',
  '丹',
  '加',
  '味',
  '：',
  '当',
  '归',
  '、',
  '丹',
  '参',
  '各',
  '１',
  '５',
  'ｇ',
  '，',
  '生',
  '乳',
  '香',
  '、',
  '生',
  '没',
  '药',
  '各',
  '６',
  'ｇ',
  '，',
  '柴',
  '胡',
  '１',
  '２',
  'ｇ',
  '，',
  '白',
  '芍',
  '、',
  '黄',
  '芩',
  '、',
  '大',
  '黄',
  '各',
  '１',
  '０',
  'ｇ',
  '，',
  '蒲',
  '公',
  '英',
  '３',
  '０',
  'ｇ',
  '，',
  '甘',
  '草',
  '５',
  'ｇ'],
 'ner_tags': [0,
  7,
  8,
  8,
  8,
  8,
  0,
  0,
  0,
  9,
  10,
  0,
  9,
  10,
  0,
  0,
  0,
  0,
  0,
  9,
  10,
  10,
  0,
  9,
  10,
  10,
  0,
  0,
  0,
  0,
  9,
  10,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  9,
  10,
  0,
  9,
  10,
  0,
  0,
  0,
  0,
  0,
  9,
  10,
  10,
  0,
  0,
  0,
  0,
  9,
  10,
  0,
  0],
 'input_ids': [79072,
  75606,
  68065,
  59355,
  99677,
  100721,
  20929,
  99375,
  5122,
  39165,
  100040,
  5373,
  100721,
  73743,
  99200,
  20109,
  43497,
  144227,
  3837,
  21287,
  100489,
  99662,
  5373,
  21287,
  70

In [13]:
tokenized_dataset['dev'].shuffle(42)

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 657
})

In [15]:
tokenized_dataset['dev'].shuffle(42)[0]

{'id': 159,
 'tokens': ['茅', '根', '１', '０', '克'],
 'ner_tags': [9, 10, 0, 0, 0],
 'input_ids': [101708, 99408, 20109, 26022, 99316],
 'attention_mask': [1, 1, 1, 1, 1],
 'labels': [9, 10, 0, 0, 0]}

In [16]:
tokenized_dataset.save_to_disk('medical-ner')

Saving the dataset (1/1 shards): 100%|██████████| 5259/5259 [00:00<00:00, 294371.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 657/657 [00:00<00:00, 99160.05 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 658/658 [00:00<00:00, 150187.86 examples/s]


In [11]:
import evaluate

seqeval_matric = evaluate.load('seqeval')
# seqeval_matric

In [12]:
import numpy as np
label_list = names
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    result = seqeval_matric.compute(predictions=true_predictions, references=true_labels, mode='strict', scheme='IOB2')
    # return {
    #     'precision': result['overall_precision'],
    #     'recall': result['overall_recall'],
    #     'f1': result['overall_f1'],
    #     'accuracy': result['overall_accuracy']
    # }
    return dict(
        precision = result['overall_precision'],
        recall = result['overall_recall'],
        f1 = result['overall_f1'],
        accuracy = result['overall_accuracy']
    )

In [13]:
# 假设names和seqeval_matric已定义，compute_metrics已在上面实现

# 构造标签列表
label_list = names

# 用例1：完全正确
preds = np.array([[[0.1, 0.9, 0.0], [0.8, 0.1, 0.1], [0.0, 0.2, 0.8]]])  # shape: (1, 3, 3)
labels = np.array([[1, 0, 2]])  # shape: (1, 3)
print("Case 1:", compute_metrics((preds, labels)))

# 用例2：全部错误
preds = np.array([[[0.9, 0.1, 0.0], [0.1, 0.8, 0.1], [0.8, 0.1, 0.1]]])
labels = np.array([[1, 2, 0]])
print("Case 2:", compute_metrics((preds, labels)))

# 用例3：有mask（-100）
preds = np.array([[[0.1, 0.9, 0.0], [0.8, 0.1, 0.1], [0.0, 0.2, 0.8]]])
labels = np.array([[1, -100, 2]])
print("Case 3:", compute_metrics((preds, labels)))

# 用例4：部分正确
preds = np.array([[[0.1, 0.9, 0.0], [0.8, 0.1, 0.1], [0.8, 0.1, 0.1]]])
labels = np.array([[1, 0, 0]])
print("Case 4:", compute_metrics((preds, labels)))

Case 1: {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1': np.float64(1.0), 'accuracy': 1.0}
Case 2: {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'accuracy': 0.3333333333333333}
Case 3: {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1': np.float64(1.0), 'accuracy': 1.0}
Case 4: {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1': np.float64(1.0), 'accuracy': 1.0}


In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    learning_rate=2e-5,
    per_device_eval_batch_size=2,
    per_gpu_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    remove_unused_columns=True,
    output_dir='./output',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    bf16=True,
    label_names=names,
    use_cpu=True
)

In [None]:
from transformers import Trainer, DataCollatorForTokenClassification

# model.to(device)
# tokenizer.to(device)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

  trainer = Trainer(


RuntimeError: MPS backend out of memory (MPS allocated: 27.10 GB, other allocations: 464.00 KB, max allowed: 27.20 GB). Tried to allocate 259.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [22]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 2,598,421 || all params: 7,073,292,842 || trainable%: 0.0367


In [None]:
from transformers import Qwen2ForTokenClassification

In [None]:
from peft import PeftModelForTokenClassification

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): Qwen2ForTokenClassification(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_

In [21]:
model(tokenized_dataset['dev'][0]['input_ids'].to_tensor())

AttributeError: 'list' object has no attribute 'to_tensor'

In [None]:
from accelerate import PartialState