In [None]:
import os
# Configure the ModelScope / Hugging Face cache directories and the Hub
# endpoint (a mirror host) through environment variables.
os.environ.update({
    'MODELSCOPE_CACHE': "/root/autodl-tmp/.cache/hub",
    'HF_HUB_CACHE': "/root/autodl-tmp/.cache/hub",
    'HF_ENDPOINT': "https://hf-mirror.com",
})

In [5]:
def convert_txt_to_json(txt_path, json_path, names):
    """Convert a ``token tag`` text file into a JSON list of NER samples.

    The input is read as UTF-8 and split into samples on blank lines
    (``'\\n\\n'``). Inside a sample, each line is split on whitespace and is
    kept only if it yields exactly two fields (token and tag); any other line
    is ignored.

    Args:
        txt_path: Path of the text file to read.
        json_path: Path of the JSON file to write.
        names: List of known tag strings. Tags not yet present are appended
            to this list in place, so the same list can be passed to
            successive calls to keep tag ids consistent across files.

    Returns:
        The same ``names`` list, possibly extended with newly seen tags.

    Each written sample is ``{'id', 'tokens', 'ner_tags'}`` where ``id`` is
    the position of the sample's block in the file and ``ner_tags`` holds
    indices into ``names``. Blocks that contain no valid ``token tag`` line
    (e.g. produced by trailing or repeated blank lines) are skipped instead
    of being written as empty samples.
    """
    import json

    # First-occurrence index of every known tag, so lookups are O(1) instead
    # of a linear ``names.index`` scan for every token.
    tag_to_id = {}
    for position, known_tag in enumerate(names):
        tag_to_id.setdefault(known_tag, position)

    with open(txt_path, 'r', encoding='utf-8') as f:
        blocks = f.read().split('\n\n')

    data = []
    for idx, text in enumerate(blocks):
        tokens = []
        ner_tags = []
        for line in text.split('\n'):
            token_tag = line.split()
            # Blank lines give zero fields; malformed lines give != 2 fields.
            if len(token_tag) != 2:
                continue
            token, tag = token_tag
            if tag not in tag_to_id:
                tag_to_id[tag] = len(names)
                names.append(tag)
            tokens.append(token)
            ner_tags.append(tag_to_id[tag])
        # An empty block carries no training signal; do not emit it.
        if not tokens:
            continue
        data.append({'id': idx, 'tokens': tokens, 'ner_tags': ner_tags})

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return names

In [6]:
# Usage example: convert every split in turn while sharing one label list,
# so tag ids stay consistent across train/dev/test. The output name is derived
# from the split name, so each JSON file lands next to its source text file
# under the same stem.
names = []
for split in ('train', 'dev', 'test'):
    names = convert_txt_to_json(f'../data/medical/{split}.txt', f'../data/medical/{split}.json', names)
    print(names)

['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']
['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']
['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗']


In [7]:
from datasets import load_dataset, Value, Features, ClassLabel, Sequence

# One JSON file per split.
data_files = {'train': '../data/medical/train.json', 'dev': '../data/medical/dev.json', 'test': '../data/medical/test.json'}
# Explicit schema for the JSON records. The class count is taken from `names`
# itself rather than a hard-coded number, so it cannot drift out of sync with
# the label vocabulary.
features = Features({
    'id': Value('int32'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=names))
})

raw_dataset = load_dataset('json', data_files=data_files, features=features)

Generating train split: 5259 examples [00:00, 68725.45 examples/s]
Generating dev split: 657 examples [00:00, 92341.59 examples/s]
Generating test split: 658 examples [00:00, 108758.36 examples/s]


In [8]:
# Display the schema of the dev split to check that the features were applied.
raw_dataset['dev'].features

{'id': Value(dtype='int32', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-临床表现', 'I-临床表现', 'B-中医治疗', 'I-中医治疗', 'B-西医诊断', 'I-西医诊断', 'B-方剂', 'I-方剂', 'B-中药', 'I-中药', 'B-中医诊断', 'I-中医诊断', 'B-西医治疗', 'I-西医治疗', 'B-中医证候', 'I-中医证候', 'B-中医治则', 'I-中医治则', 'B-其他治疗', 'I-其他治疗'], id=None), length=-1, id=None)}

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


checkpoint = 'Qwen/Qwen2.5-7B-Instruct'

# Two-way mapping between class ids and tag strings, stored in the model
# config so outputs are labelled with real tag names rather than LABEL_n.
id2label = dict(enumerate(names))
label2id = {tag: idx for idx, tag in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): weights are loaded in float16 while TrainingArguments in this
# notebook requests bf16 — confirm the intended precision.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(names),
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float16,
).to(device)

{'input_ids': [79072, 75606, 68065, 59355, 99677, 100721, 20929, 99375, 5122, 39165, 100040, 5373, 100721, 73743, 99200, 20109, 43497, 144227, 3837, 21287, 100489, 99662, 5373, 21287, 70927, 99471, 99200, 59496, 144227, 3837, 101545, 100693, 20109, 24918, 144227, 3837, 99243, 119594, 5373, 99789, 119670, 5373, 26288, 99789, 99200, 20109, 26022, 144227, 3837, 110168, 34317, 82847, 33517, 26022, 144227, 3837, 100818, 99808, 43497, 144227], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Collect the parameters that require gradients.
trainable_params = list(filter(lambda param: param.requires_grad, model.parameters()))

# Print the total number of elements across those parameters.
print("Number of trainable parameters:", sum(map(lambda param: param.numel(), trainable_params)))

In [None]:
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

# LoRA settings for token classification: rank 8, alpha 32, dropout 0.1.
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
# Wrap only if the model is not already a PEFT model, so re-running this cell
# does not stack a second adapter wrapper on top of the first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

In [12]:
def process_func(examples):
    """Tokenize a batch of pre-split samples and align NER labels to sub-tokens.

    Args:
        examples: Batch mapping with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of per-word label-id lists).

    Returns:
        The tokenizer output for the batch with an added ``'labels'`` entry:
        one list per sample, aligned with the produced sub-tokens.

    Label alignment: positions whose ``word_ids`` entry is ``None`` get -100.
    Only the first sub-token of each word carries the word's label; further
    sub-tokens of the same word also get -100. Copying the label to every
    piece would repeat ``B-`` tags inside a single word, which under the IOB2
    scheme reads as several separate entities.
    """
    tokenized_examples = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, max_length=512)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_examples.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # No source word, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_examples['labels'] = labels
    return tokenized_examples

In [13]:
tokenized_dataset = raw_dataset.map(process_func, batched=True)
tokenized_dataset

Map: 100%|██████████| 5259/5259 [00:00<00:00, 15166.86 examples/s]
Map: 100%|██████████| 657/657 [00:00<00:00, 15488.01 examples/s]
Map: 100%|██████████| 658/658 [00:00<00:00, 14581.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5259
    })
    dev: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 657
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 658
    })
})

In [None]:
import evaluate

# Load the seqeval metric; compute_metrics below calls its `compute` method.
seqeval_matric = evaluate.load('seqeval')
# (Evaluating `seqeval_matric` on its own displays the metric description.)

Downloading builder script: 6.34kB [00:00, 3.56MB/s]


EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [16]:
import numpy as np
# Alias of the tag list; compute_metrics indexes it with class ids to get tag strings.
label_list = names
def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled -100.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with argmax over its last axis; ``labels``
            holds the reference class ids, with -100 marking positions to skip.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the ``overall_*`` entries of the seqeval result (strict mode,
        IOB2 scheme).
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=-1)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real reference label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    result = seqeval_matric.compute(
        predictions=true_predictions,
        references=true_labels,
        mode='strict',
        scheme='IOB2',
    )
    return dict(
        precision=result['overall_precision'],
        recall=result['overall_recall'],
        f1=result['overall_f1'],
        accuracy=result['overall_accuracy'],
    )

In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best 'f1' (as reported by compute_metrics) at the end.
args = TrainingArguments(
    learning_rate=2e-5,
    # NOTE(review): per_device_train_batch_size is not set, so the library
    # default applies — confirm that is intended for a 7B model.
    # A single eval batch-size setting replaces the former pair of
    # per_device_eval_batch_size=2 and deprecated per_gpu_eval_batch_size=4;
    # 4 is the value the deprecated argument made effective.
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir='./output',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    bf16=True,
    # label_names lists the keys of the model inputs that hold labels (the
    # 'labels' column added by process_func), not the NER tag strings.
    label_names=['labels'],
    # NOTE(review): use_cpu=True forces CPU training although the model is
    # moved to `device` earlier in the notebook — confirm this is intended.
    use_cpu=True
)

In [None]:
from transformers import Trainer, DataCollatorForTokenClassification

# Assemble the Trainer from the model, the training arguments, the tokenized
# train/dev splits, the seqeval-based metric function and the
# token-classification data collator.
# NOTE(review): recent transformers releases may deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

  trainer = Trainer(


RuntimeError: MPS backend out of memory (MPS allocated: 27.10 GB, other allocations: 464.00 KB, max allowed: 27.20 GB). Tried to allocate 259.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [22]:
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

# LoRA settings for token classification: rank 8, alpha 32, dropout 0.1.
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
# This cell repeats the LoRA setup from earlier in the notebook. Wrap only if
# the model is not already a PEFT model, so a top-to-bottom run does not wrap
# it a second time; the parameter summary is printed either way.
if not isinstance(model, PeftModel):
    model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 2,598,421 || all params: 7,073,292,842 || trainable%: 0.0367
