# 基于预训练模型完成实体关系抽取

In [None]:
# Install the latest version of paddlenlp
!pip install --upgrade paddlenlp -i https://pypi.org/simple

# Switch into the project directory; later cells use paths relative to it (e.g. data/predicate2id.json).
%cd relation_extraction/

# 一、项目背景介绍

信息抽取（information extraction），简称IE，即从自然语言文本中，抽取出特定的事件或事实信息，帮助我们将海量内容自动分类、提取和重构。这些信息通常包括实体（entity）、关系（relation）、事件（event）。信息抽取主要包括三个子任务：关系抽取、命名实体识别、事件抽取。
信息抽取旨在从非结构化自然语言文本中提取结构化知识，如实体、关系、事件等。对于给定的自然语言句子，根据预先定义的schema集合，抽取出所有满足schema约束的SPO三元组。

例如，「妻子」关系的schema定义为：
{
S_TYPE: 人物,
P: 妻子,
O_TYPE: {
@value: 人物
}
}



![](https://ai-studio-static-online.cdn.bcebos.com/0802798ce4a44c07b2e9fb781f7ef42dd3f395d0da6a4ed3891edc68612100f4)



{
"text":"王雪纯是87版《红楼梦》中晴雯的配音者，她是《正大综艺》的主持人"
}

![](https://ai-studio-static-online.cdn.bcebos.com/026d36d3f0b44b22b061076d2c5f0ad39eef811211cf4250a2c0a4b038fc5777)



## 1.1评估方法
评价方法
对测试集上参评系统输出的SPO结果和人工标注的SPO结果进行精准匹配，采用F1值作为评价指标。注意，对于复杂O值类型的SPO，必须所有槽位都精确匹配才认为该SPO抽取正确。针对部分文本中存在实体别名的问题，使用百度知识图谱的别名词典来辅助评测。F1值的计算方式如下：

F1 = (2 * P * R) / (P + R)，其中

P = 测试集所有句子中预测正确的SPO个数 / 测试集所有句子中预测出的SPO个数
R = 测试集所有句子中预测正确的SPO个数 / 测试集所有句子中人工标注的SPO个数

# 二、模型介绍
PaddleNLP内置了多种常见预训练模型，可以通过名字一键加载；与模型配套的tokenizer可以完成文本token化、转token ID、文本长度截断等操作。

本次选用的是中文RoBERTa large预训练模型（roberta-wwm-ext-large）。



In [None]:
# Import packages, then load the label map, the pretrained model and its tokenizer.
import os
import json
from paddlenlp.transformers import RobertaForTokenClassification, RobertaTokenizer

label_map_path = os.path.join('data', "predicate2id.json")

# Fail early with a clear error when the label map is missing.
# (The previous version called sys.exit() without importing sys, so this
# branch raised NameError instead of reporting the missing file.)
if not os.path.isfile(label_map_path):
    raise FileNotFoundError(
        "{} does not exist or is not a file.".format(label_map_path))
with open(label_map_path, 'r', encoding='utf8') as fp:
    label_map = json.load(fp)

# NOTE(review): the class count is derived from the label map as
# (len(label_map) - 2) * 2 + 2; presumably two labels are shared and every
# other predicate gets two tags (subject / object) -- confirm against data_loader.
model = RobertaForTokenClassification.from_pretrained(
    "roberta-wwm-ext-large",
    num_classes=(len(label_map) - 2) * 2 + 2)
tokenizer = RobertaTokenizer.from_pretrained("roberta-wwm-ext-large")


# Quick smoke test of the tokenizer on a sample sentence.
inputs = tokenizer(text="请输入测试样例", max_seq_len=20)

# 三、数据介绍
数据集包含超过43万个三元组、21万个中文句子以及48个预定义的关系类型。数据集中的句子来自百度百科、百度贴吧和百度信息流文本。

## 3.1解压数据

In [None]:
cd ..

In [None]:
 # Unzip the dev / test / train dataset archives
!unzip duie_dev.json.zip
!unzip duie_test2.json.zip
!unzip duie_train.json.zip

In [None]:
cd relation_extraction/

## 3.2展示数据

预览
文本数据集的展示，读取训练集的文件，分别展示出来。

![](https://ai-studio-static-online.cdn.bcebos.com/efb298fde83c48be93265baf9c4452615d99a0bebc5d4d978dfb77907a7b0fb7)


In [12]:
#展示数据
from data_loader import parse_label, DataCollator, convert_example_to_feature
import os
import json
# Read the training file (one JSON object per line) into a list for preview.
data_path = 'data'
train_file_path = os.path.join(data_path, 'train_data.json')
example = []
with open(train_file_path, "r", encoding="utf-8") as fp:
    example.extend(json.loads(raw_line) for raw_line in fp)


In [22]:
example[:50]

[{'text': '吴宗宪遭服务生种族歧视, 他气呛: 我买下美国都行!艺人狄莺与孙鹏18岁的独子孙安佐赴美国读高中，没想到短短不到半年竟闹出校园安全事件被捕，因为美国正处于校园枪击案频传的敏感时机，加上国外种族歧视严重，外界对于孙安佐的情况感到不乐观 吴宗宪今（30）日录影前谈到美国民情，直言国外种族歧视严重，他甚至还被一名墨西哥裔的服务生看不起，让吴宗宪气到喊：「我是吃不起是不是',
  'spo_list': [{'predicate': '父亲',
    'object_type': {'@value': '人物'},
    'subject_type': '人物',
    'object': {'@value': '孙鹏'},
    'subject': '孙安佐'},
   {'predicate': '母亲',
    'object_type': {'@value': '人物'},
    'subject_type': '人物',
    'object': {'@value': '狄莺'},
    'subject': '孙安佐'},
   {'predicate': '丈夫',
    'object_type': {'@value': '人物'},
    'subject_type': '人物',
    'object': {'@value': '孙鹏'},
    'subject': '狄莺'},
   {'predicate': '妻子',
    'object_type': {'@value': '人物'},
    'subject_type': '人物',
    'object': {'@value': '狄莺'},
    'subject': '孙鹏'}]},
 {'text': '苏州亚都环保科技有限公司于2011年11月04日在苏州市吴中区市场监督管理局登记成立',
  'spo_list': [{'predicate': '成立日期',
    'object_type': {'@value': 'Date'},
    'subject_type': '机构',
    'object': {'@value': '2011年11月04日'},
    'subject': '苏州亚都环保科技有限公司

## 3.3 加载处理数据
我们可以加载自定义数据集：通过继承`paddle.io.Dataset`，自定义实现`__getitem__`和`__len__`两个方法。
从比赛官网下载数据集，解压后存放于data/目录下，并重命名为train_data.json、dev_data.json、test_data.json。

In [14]:
#自定义两个方法
from typing import Optional, List, Union, Dict

import numpy as np
import paddle
from tqdm import tqdm

from paddlenlp.transformers import ErnieTokenizer
from paddlenlp.utils.log import logger

from data_loader import parse_label, DataCollator, convert_example_to_feature
from extract_chinese_and_punct import ChineseAndPunctuationExtractor


class DuIEDataset(paddle.io.Dataset):
    """Map-style dataset over raw DuIE JSON lines.

    Each item is parsed and converted to model features lazily, on access,
    through ``convert_example_to_feature``.
    """

    def __init__(self, data, label_map, tokenizer, max_length=512, pad_to_max_length=False):
        """Store the raw lines and everything needed to featurize them.

        Args:
            data: Sequence of JSON strings, one example per entry.
            label_map: Label mapping loaded from ``predicate2id.json``.
            tokenizer: Tokenizer forwarded to ``convert_example_to_feature``.
            max_length: Maximum sequence length forwarded to the converter.
            pad_to_max_length: Padding flag forwarded to the converter.
        """
        super().__init__()

        self.data = data
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.max_seq_length = max_length
        self.pad_to_max_length = pad_to_max_length
        self.chn_punc_extractor = ChineseAndPunctuationExtractor()

    def __len__(self):
        """Return the number of raw examples."""
        return len(self.data)

    def __getitem__(self, item):
        """Parse example ``item`` and return its features as numpy arrays."""
        record = json.loads(self.data[item])
        feature = convert_example_to_feature(
            record, self.tokenizer, self.chn_punc_extractor,
            self.label_map, self.max_seq_length, self.pad_to_max_length)

        # (output key, feature attribute) pairs; insertion order is kept
        # identical to the original dict literal.
        int64_fields = (
            ("input_ids", "input_ids"),
            ("seq_lens", "seq_len"),
            ("tok_to_orig_start_index", "tok_to_orig_start_index"),
            ("tok_to_orig_end_index", "tok_to_orig_end_index"),
        )
        sample = {
            key: np.array(getattr(feature, attr), dtype="int64")
            for key, attr in int64_fields
        }
        # If model inputs is generated in `collate_fn`, delete the data type casting.
        sample["labels"] = np.array(feature.labels, dtype="float32")
        return sample

    @classmethod
    def from_file(cls,
                  file_path,
                  tokenizer,
                  max_length=512,
                  pad_to_max_length=None):
        """Build a dataset from a JSON-lines file.

        ``predicate2id.json`` is read from the same directory as ``file_path``
        and used as the label map. Both files must exist (checked with
        ``assert``).
        """
        assert os.path.isfile(
            file_path), f"{file_path} dose not exists or is not a file."

        data_dir = os.path.dirname(file_path)
        label_map_path = os.path.join(data_dir, "predicate2id.json")
        assert os.path.isfile(
            label_map_path
        ), f"{label_map_path} dose not exists or is not a file."

        with open(label_map_path, 'r', encoding='utf8') as fp:
            label_map = json.load(fp)
        with open(file_path, "r", encoding="utf-8") as fp:
            data = fp.readlines()

        return cls(data, label_map, tokenizer, max_length, pad_to_max_length)

In [15]:
# Load the datasets and configure the batch size.
data_path = 'data'
batch_size = 8
max_seq_length = 128

# Training loader: shuffled, and the last incomplete batch is dropped.
train_file_path = os.path.join(data_path, 'train_data.json')
train_dataset = DuIEDataset.from_file(
    train_file_path, tokenizer, max_seq_length, True)
train_batch_sampler = paddle.io.BatchSampler(
    train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
collator = DataCollator()
train_data_loader = paddle.io.DataLoader(
    dataset=train_dataset,
    batch_sampler=train_batch_sampler,
    collate_fn=collator)

# Evaluation loader over the dev set. Order is preserved (shuffle=False)
# because evaluate() matches predictions to file lines by running index.
# drop_last must be False here: dropping the final partial batch would leave
# those dev examples without predictions while they are still counted in the
# golden file, which understates recall / F1.
eval_file_path = os.path.join(data_path, 'dev_data.json')
test_dataset = DuIEDataset.from_file(
    eval_file_path, tokenizer, max_seq_length, True)
test_batch_sampler = paddle.io.BatchSampler(
    test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_data_loader = paddle.io.DataLoader(
    dataset=test_dataset,
    batch_sampler=test_batch_sampler,
    collate_fn=collator)

## 3.4定义损失函数

定义损失函数和优化器，开始训练。我们选择带掩码的二元交叉熵（BCEWithLogitsLoss）作为损失函数，使用paddle.optimizer.AdamW作为优化器。

在训练过程中，模型保存在当前目录checkpoints文件夹下。

In [16]:
import paddle.nn as nn

class BCELossForDuIE(nn.Layer):
    """Masked binary cross-entropy (with logits) loss.

    The element-wise BCE loss is zeroed where ``mask`` is false, averaged over
    axis 2, summed over axis 1 and divided by the number of unmasked positions
    in each row, then averaged over the batch.
    """

    def __init__(self, ):
        super().__init__()
        # reduction='none' keeps the element-wise loss so the mask can be applied.
        self.criterion = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, logits, labels, mask):
        """Return the scalar masked loss.

        Args:
            logits: Raw model outputs (reduced over axes 2 and 1 below).
            labels: Targets with the same shape as ``logits``.
            mask: Per-position mask; cast to float32 and broadcast over the
                last axis of the loss.
        """
        elementwise = self.criterion(logits, labels)
        float_mask = paddle.cast(mask, 'float32')
        masked = elementwise * float_mask.unsqueeze(-1)
        per_position = masked.mean(axis=2)
        valid_counts = paddle.sum(float_mask, axis=1)
        per_sequence = paddle.sum(per_position, axis=1) / valid_counts
        return per_sequence.mean()

In [17]:
from utils import write_prediction_results, get_precision_recall_f1, decoding

@paddle.no_grad()
def evaluate(model, criterion, data_loader, file_path, mode,
             output_dir="/home/aistudio/relation_extraction/data"):
    """Run the model over ``data_loader`` and score or export its predictions.

    Modes:
        "eval": write predictions to ``predict_eval.json`` under
            ``output_dir``, compute precision / recall / F1 against
            ``file_path``, then delete the temporary prediction files.
        "predict": write predictions to ``predictions.json`` under
            ``output_dir`` (plus the archive produced by
            ``write_prediction_results``) and keep them for later submission
            or evaluation.

    Args:
        model: Called as ``model(input_ids=...)``; returns logits.
        criterion: Called as ``criterion(logits, labels, mask)``; returns the loss.
        data_loader: Yields batches of ``(input_ids, seq_len,
            tok_to_orig_start_index, tok_to_orig_end_index, labels)``. It must
            iterate in the same order as the lines of ``file_path`` because
            predictions are matched to examples by a running index.
        file_path: JSON-lines file with the examples; ``id2spo.json`` is read
            from the same directory.
        mode: Either "eval" or "predict".
        output_dir: Directory the prediction files are written to. Defaults to
            the previously hard-coded AI Studio path.

    Returns:
        ``(precision, recall, f1)`` in "eval" mode, ``None`` in "predict" mode.

    Raises:
        Exception: If ``mode`` is neither "eval" nor "predict". This is checked
            up front so that no inference time is wasted on an invalid call.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already bound the name `F`.
    import paddle.nn.functional as F

    if mode not in ("eval", "predict"):
        raise Exception("wrong mode for eval func")

    with open(file_path, "r", encoding="utf-8") as fp:
        example_all = [json.loads(line) for line in fp]
    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    model.eval()
    loss_all = 0
    eval_steps = 0
    formatted_outputs = []
    current_idx = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Positions whose token id is 0, 1 or 2 are excluded from the loss
        # (presumably special tokens such as padding -- confirm against the vocab).
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and((input_ids != 2))
        loss = criterion(logits, labels, mask)
        loss_all += loss.numpy().item()
        probs = F.sigmoid(logits)
        batch_len = len(logits)
        # Pair this batch with the matching slice of raw examples.
        formatted_outputs.extend(decoding(example_all[current_idx: current_idx + batch_len],
                                          id2spo,
                                          probs.numpy(),
                                          seq_len.numpy(),
                                          tok_to_orig_start_index.numpy(),
                                          tok_to_orig_end_index.numpy()))
        current_idx += batch_len
    # Guard against an empty loader instead of dividing by zero.
    loss_avg = loss_all / eval_steps if eval_steps else 0.0
    print("eval loss: %f" % (loss_avg))

    predict_file_name = 'predictions.json' if mode == "predict" else 'predict_eval.json'
    predict_file_path = os.path.join(output_dir, predict_file_name)

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        # Remove the temporary files without shelling out to `rm`.
        for tmp_path in (predict_file_path, predict_zipfile_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        return precision, recall, f1

In [18]:
from paddlenlp.transformers import LinearDecayWithWarmup

learning_rate = 2e-5
# Must equal the number of epochs the training loop actually runs (40).
# The scheduler's total step count is derived from this value; with the
# previous value of 5 the schedule covered only 5 of the 40 training epochs.
num_train_epochs = 40
warmup_ratio = 0.06

criterion = BCELossForDuIE()
# Defines learning rate strategy.
steps_by_epoch = len(train_data_loader)
num_training_steps = steps_by_epoch * num_train_epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_ratio)

# Names of the parameters that receive weight decay: everything except
# parameters whose name contains "bias" or "norm". Built once as a set so the
# membership test below is O(1) instead of re-scanning all model parameters
# on every call.
decay_params = {
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
}
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    apply_decay_param_fun=lambda x: x in decay_params)

In [None]:
!mkdir checkpoints

# 四 、 模型训练
选择带掩码的二元交叉熵（BCEWithLogitsLoss）作为损失函数，使用paddle.optimizer.AdamW作为优化器。每一万步评估并保存一次模型，训练轮数为40轮，每50步打印一次训练日志，模型保存在checkpoints目录中。

In [None]:
# Start training
import time
import paddle.nn.functional as F

# Starts training.
global_step = 0
# Print a progress line every `logging_steps` steps.
logging_steps = 50
# Evaluate on the dev set and save a checkpoint every `save_steps` steps.
save_steps = 10000
# NOTE(review): lr_scheduler was built in an earlier cell from
# num_training_steps = steps_by_epoch * num_train_epochs. Keep this value
# identical to the one used there, otherwise the loop and the learning-rate
# schedule cover a different number of steps.
num_train_epochs = 40
output_dir = 'checkpoints'
tic_train = time.time()
model.train()
for epoch in range(num_train_epochs):
    print("\n=====start training of %d epochs=====" % epoch)
    tic_epoch = time.time()
    for step, batch in enumerate(train_data_loader):
        input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Positions whose token id is 0, 1 or 2 are excluded from the loss
        # (presumably special tokens -- TODO confirm against the vocab).
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
            (input_ids != 2))
        loss = criterion(logits, labels, mask)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_gradients()
        loss_item = loss.numpy().item()

        if global_step % logging_steps == 0:
            print(
                "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                % (epoch, num_train_epochs, step, steps_by_epoch,
                    loss_item, logging_steps / (time.time() - tic_train)))
            tic_train = time.time()

        # Skip step 0 so no evaluation/checkpoint happens before any real training.
        if global_step % save_steps == 0 and global_step != 0:
            print("\n=====start evaluating ckpt of %d steps=====" %
                    global_step)
            precision, recall, f1 = evaluate(
                model, criterion, test_data_loader, eval_file_path, "eval")
            print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                    (100 * precision, 100 * recall, 100 * f1))
            print("saving checkpoing model_%d.pdparams to %s " %
                    (global_step, output_dir))
            paddle.save(model.state_dict(),
                        os.path.join(output_dir, 
                                        "model_%d.pdparams" % global_step))
            # evaluate() switches the model to eval mode; switch back before continuing.
            model.train()

        global_step += 1
    # From here on tic_epoch holds the elapsed seconds for this epoch.
    tic_epoch = time.time() - tic_epoch
    print("epoch time footprint: %d hour %d min %d sec" %
            (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

# Does final evaluation.
print("\n=====start evaluating last ckpt of %d steps=====" %
        global_step)
precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                    eval_file_path, "eval")
print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
        (100 * precision, 100 * recall, 100 * f1))
# Save the final weights, named after the total number of steps taken.
paddle.save(model.state_dict(),
            os.path.join(output_dir,
                            "model_%d.pdparams" % global_step))
print("\n=====training complete=====")

# 五、模型评估
加载训练过程中保存的模型后进行预测，即可得到模型在测试集上的抽取结果。

In [27]:
!bash predict.sh

+ export CUDA_VISIBLE_DEVICES=0
+ CUDA_VISIBLE_DEVICES=0
+ export BATCH_SIZE=32
+ BATCH_SIZE=32
+ export CKPT=./checkpoints/model_50040.pdparams
+ CKPT=./checkpoints/model_50040.pdparams
+ export DATASET_FILE=./data/test_data.json
+ DATASET_FILE=./data/test_data.json
+ python run_duie.py --do_predict --init_checkpoint ./checkpoints/model_50040.pdparams --predict_data_file ./data/test_data.json --max_seq_length 512 --batch_size 32
  from collections import Iterable
[2022-02-26 09:38:58,783] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/roberta_chn_large.pdparams
W0226 09:38:58.784677 20729 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0226 09:38:58.789849 20729 device_context.cc:465] device: 0, cuDNN Version: 7.6.
[2022-02-26 09:39:08,519] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/vocab.txt
[2022-02-26 09:39:08,536] [    INFO] -

In [33]:
!python re_official_evaluation.py --golden_file=dev_data.json  --predict_file=duie.json.zip

correct spo num = 2281.0
submitted spo num = 2307.0
golden set spo num = 2320.0
submitted recall spo num = 2281.0
{"errorCode": 0, "errorMsg": "success", "data": [{"name": "precision", "value": 0.9887}, {"name": "recall", "value": 0.9832}, {"name": "f1-score", "value": 0.986}]}


# 六、总结

在我看来，信息抽取对我们的日常生活也是非常重要的，例如在聊天机器人等场景中，可以对我们所发出的信息进行转换，让机器更好地理解。本次项目依旧有许多不足，借鉴了许多代码，也还存在一些有待改进的地方，我会继续跟进并持续修改。

# 七、个人总结
现在还是一名大二的学生，了解的不是特别多，希望以后可以多学习一些深度学习有关的知识吧。


飞桨主页：https://aistudio.baidu.com/aistudio/usercenter

github主页：https://github.com/TTxxtt

gitee主页：https://gitee.com/green-baby-milk