In [3]:
import pandas as pd
import numpy as np
from functools import partial
import paddle
import paddle.nn as nn
from paddle.io import Dataset
import paddle.nn.functional as F
import paddlenlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import LinearDecayWithWarmup

## 创建数据集

In [8]:
class CHIPCTCDataSet(Dataset):
    def __init__(self, data_path, label_path, mode):
        # 加载标签辞典
        self.label2id = self._load_label_dict(label_path)
        # 加载数据集
        self.data = self._load_data_from_source(data_path, mode)
        # 加载标签列表
        self.label_list = list(self.label2id.keys())
        
    def _load_label_dict(self, label_path):
        with open(label_path, 'r', encoding='utf-8') as f:
            lines = [line.strip().split('\t', maxsplit=1)
                     for line in f.readlines()]
            lines = [(line[0], int(line[1])) for line in lines]
            label_dict = dict(lines)
        return label_dict
    
    def _load_data_from_source(self, data_path, mode):
        data_set = []
        with open(data_path, 'r', encoding='utf-8') as f:
            if mode == 'train':
                for line in f.readlines():
                    id, label, text = line.strip().split(',', maxsplit=2)
                    example = {'text': text, 'label':self.label2id[label]}
                    data_set.append(example)
            else:
                for line in f.readlines():
                    id, text = line.strip().split(',', maxsplit=1)
                    example = {'text': text}
                    data_set.append(example)   
        return data_set
    
    def __getitem__(self, idx):
        return self.data[idx]
    
    def __len__(self):
        return len(self.data)

In [9]:
Dataset = CHIPCTCDataSet('/Users/liruizhi/Desktop/毕设数据处理/train_clean.csv', '/Users/liruizhi/Desktop/毕设数据处理/CHIP-CTC/CHIP-CTC_label_dict.txt', mode='train')

In [4]:
#Dataset.data[0]['text']

## 将数据集转化为模型的输入格式

In [6]:
MODEL_NAME = 'ernie-1.0'
tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(MODEL_NAME)

[32m[2022-03-24 11:09:27,777] [    INFO][0m - Already cached /Users/liruizhi/.paddlenlp/models/ernie-1.0/vocab.txt[0m


In [7]:
def convert_example(example, tokenizer, max_seq_length=128, is_test=False):
    encoded_inputs = tokenizer(text=example['text'])
    input_ids = encoded_inputs['input_ids']
    token_type_ids = encoded_inputs['token_type_ids']
    
    if not is_test:
        label = np.array([example['label']], dtype='int64')
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids

In [8]:
def create_dataloader(dataset, mode='train', batch_size=1, batchify_fn=None, trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)
        
    shuffle=True if mode=='train' else False
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle)
    
    return paddle.io.DataLoader(dataset=dataset, 
                                batch_sampler=batch_sampler, 
                                return_list=True, 
                                collate_fn=batchify_fn)

In [9]:
batchify_fn = lambda samples, fn=Tuple(
    Pad(pad_val=tokenizer.pad_token_id, axis=0),
    Pad(pad_val=tokenizer.pad_token_type_id, axis=0),
    Stack(dtype='int64')
): [data for data in fn(samples)]

In [10]:
paddle.device.get_device()

'cpu'

In [11]:
class ErnieForSequenceClassification(paddle.nn.Layer):
    def __init__(self, MODEL_NAME, num_class=44, dropout=None):
        super(ErnieForSequenceClassification, self).__init__()
        # 加载预训练好的ernie，只需要指定一个名字就可以
        self.ernie = paddlenlp.transformers.ErnieModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(ropout if dropout is not None else self.ernie.config["hidden_dropout_prob"])
        self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_class)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        _, pooled_output = self.ernie(
            input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            attention_mask=attention_mask)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [12]:
# 超参设置
n_epochs = 5
batch_size = 128
#max_seq_length = 128
n_classes=44
dropout_rate = None

learning_rate = 5e-5
warmup_proportion = 0.1
weight_decay = 0.01

MODEL_NAME = "ernie-tiny"

In [13]:
# 加载数据集，构造DataLoader
train_set = CHIPCTCDataSet('/Users/liruizhi/Desktop/毕设数据处理/train_clean.csv', '/Users/liruizhi/Desktop/毕设数据处理/CHIP-CTC/CHIP-CTC_label_dict.txt', mode='train')
label2id = train_set.label2id
train_set = MapDataset(train_set)

dev_set = CHIPCTCDataSet('/Users/liruizhi/Desktop/毕设数据处理/dev_clean.csv', '/Users/liruizhi/Desktop/毕设数据处理/CHIP-CTC/CHIP-CTC_label_dict.txt', mode='train')
label2id = dev_set.label2id
dev_set = MapDataset(dev_set)

In [14]:
label2id

{'Disease': 0,
 'Multiple': 1,
 'Therapy or Surgery': 2,
 'Consent': 3,
 'Diagnostic': 4,
 'Laboratory Examinations': 5,
 'Pregnancy-related Activity': 6,
 'Age': 7,
 'Pharmaceutical Substance or Drug': 8,
 'Risk Assessment': 9,
 'Allergy Intolerance': 10,
 'Enrollment in other studies': 11,
 'Researcher Decision': 12,
 'Compliance with Protocol': 13,
 'Organ or Tissue Status': 14,
 'Sign': 15,
 'Addictive Behavior': 16,
 'Capacity': 17,
 'Life Expectancy': 18,
 'Symptom': 19,
 'Neoplasm Status': 20,
 'Device': 21,
 'Special Patient Characteristic': 22,
 'Non-Neoplasm Disease Stage': 23,
 'Data Accessible': 24,
 'Encounter': 25,
 'Diet': 26,
 'Smoking Status': 27,
 'Literacy': 28,
 'Oral related': 29,
 'Healthy': 30,
 'Address': 31,
 'Blood Donation': 32,
 'Gender': 33,
 'Receptor Status': 34,
 'Nursing': 35,
 'Exercise': 36,
 'Education': 37,
 'Sexual related': 38,
 'Disabilities': 39,
 'Alcohol Consumer': 40,
 'Bedtime': 41,
 'Ethnicity': 42,
 'Ethical Audit': 43}

In [14]:
#partial是Python语言的偏函数，支持更方便的在已有函数基础上定义指定参数值的新函数
trans_func = partial(convert_example, tokenizer=tokenizer)
train_data_loader = create_dataloader(train_set, mode="train", batch_size=batch_size, batchify_fn=batchify_fn, trans_fn=trans_func)
dev_data_loader = create_dataloader(dev_set, mode="train", batch_size=batch_size, batchify_fn=batchify_fn, trans_fn=trans_func)

## 定义参数并进行训练

In [15]:
# 检测是否可以使用GPU，如果可以优先使用GPU
use_gpu = True if paddle.device.get_device().startswith("gpu") else False
if use_gpu:
    paddle.device.set_device('gpu:0')
    
# 加载预训练模型ERNIE

# 加载用于文本分类的fune-tuning网络，不同的任务有不同的对应函数，详细可以查阅ERNIE的文档
model =  ErnieForSequenceClassification(MODEL_NAME, num_class=n_classes, dropout=dropout_rate)

# 设置优化器，LinearDecayWithWarmup是一个周期性衰减的函数，并且在初始训练的时候才用热启动策略（较小学习率，逐渐上升），避免前期训练过于震荡
num_training_steps = len(train_data_loader) * n_epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ])

[32m[2022-03-24 11:09:40,441] [    INFO][0m - Already cached /Users/liruizhi/.paddlenlp/models/ernie-tiny/ernie_tiny.pdparams[0m


In [16]:
# 定义统计指标
metric = paddle.metric.Accuracy()
criterion = paddle.nn.loss.CrossEntropyLoss()

def evaluate(model, metric, data_loader):
    model.eval()
    # 每次使用测试集进行评估时，先重置掉之前的metric的累计数据，保证只是针对本次评估。
    metric.reset()
    losses = []
    for batch in data_loader:
        # 获取数据
        input_ids, segment_ids, labels = batch
        # 执行前向计算
        logits = model(input_ids, segment_ids)
        # 计算损失
        loss = F.cross_entropy(input=logits, label=labels)
        loss= paddle.mean(loss)
        losses.append(loss.numpy())
        # 统计准确率指标
        correct = metric.compute(logits, labels)
        metric.update(correct)
        accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
    metric.reset()

def train(model):
    global_step=0
    for epoch in range(1, n_epochs+1):
        model.train()
        for step, batch in enumerate(train_data_loader, start=1):
            # 获取数据 数据来源
            input_ids, segment_ids, labels = batch
            # 模型前向计算 （将数据输入模型）
            logits = model(input_ids, segment_ids)
            loss = F.cross_entropy(input=logits, label=labels)
            loss = paddle.mean(loss)

            # 统计指标
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()
            
            # 打印中间训练结果
            global_step += 1
            if global_step % 10 == 0 :
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))
            
            # 参数更新
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
        
        # 模型评估
        evaluate(model, metric, test_data_loader)
            
train(model)

KeyboardInterrupt: 

## 预测数据

In [5]:
def predict(data, id2label, batch_size=128):
    examples = []
    # 数据处理
    for text in data:
        input_ids, token_type_ids = convert_example(
            text,
            is_test=True)
        examples.append((input_ids, token_type_ids))
    
    batchify_fn = lambda samples, fn=Tuple(
        Pad(pad_val=tokenizer.pad_token_id, axis=0),
        Pad(pad_val=tokenizer.pad_token_type_id, axis=0)):fn(samples)
    
    # 将数据按照batch_size进行切分
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        batches.append(one_batch)
    
     # 使用模型预测数据，并返回结果
    results = []
    model.eval()
    for batch in batches:
        input_ids, token_type_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [id2label[i] for i in idx]
        results.extend(labels)
    return results


In [15]:
#test_set = CHIPCTCDataSet('/Users/liruizhi/Desktop/毕设数据处理/test.csv', '/Users/liruizhi/Desktop/毕设数据处理/CHIP-CTC/CHIP-CTC_label_dict.txt', mode='test')
data = [{"text":"2.年龄1-14岁；"}]

id2label = dict([(items[1], items[0]) for items in label2id.items()])
#results = predict(test_set.data, id2label)
results = predict(data, id2label)
print(results)