In [10]:
import os
# Display the absolute, symlink-resolved path of the current working directory.
# Later cells open "./config/..." relative to this directory.
os.path.realpath('./')

'D:\\未备份\\CS_Tutorial\\NLP\\Bert_Use\\OpenUE'

# 中文三元组联合抽取

## 介绍

在这个notebook中，我们将使用openue库的代码来训练我们自己的三元组联合抽取模型，使用的基础模型是`bert-base-chinese`。训练分为两步：首先训练关系分类模型，其次训练实体抽取模型，最后对两个模型进行联合验证。

## 数据集

在这个notebook中，我们使用ske数据集，具体例子如下。我们用代码读取`train.json`的第一条样本来分析一下数据格式。

In [1]:
import json

# Peek at the first training record; the file is read as one JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# locale codec, and only the first line is read instead of the whole file.
# NOTE(review): the recorded run failed with FileNotFoundError for this path while
# later log lines reference "./dataset/ske/..." — confirm the correct relative path.
with open("../dataset/ske/train.json", "r", encoding="utf-8") as file:
    first_line = file.readline()

example = json.loads(first_line)
for k, v in example.items():
    print(f"{k}: {v}")

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/ske/train.json'

# 训练

## `seq model`关系分类模型

如我们的模型图所示，我们需要先训练一个关系分类模型，识别出句子中实体的属性。也就是模型图中下方位置的关系类型识别模块，用来识别出句子中存在的关系。

我们训练和验证模型使用的都是同一份代码，区别仅为`config`的设置不同，config具体的文件目录在`./config`下。
<div  align="center">
    <img src="./imgs/architecture.png" width="600" height="400" alt="OpenUE 模型架构图" align="center" />
</div>


In [2]:
import argparse
import importlib

import numpy as np
import torch
import pytorch_lightning as pl
import openue.lit_models as lit_models
import yaml
import time
from transformers import AutoConfig
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

ModuleNotFoundError: No module named 'pytorch_lightning'

In [2]:
# 设置一些参数和动态调用包
def _import_class(module_and_class_name: str) -> type:
    module_name, class_name = module_and_class_name.rsplit(".", 1)
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)
	
    return class_


def _setup_parser():
    """Build the ArgumentParser holding basic, data, model and LitModel arguments.

    The data and model classes are chosen by ``--data_class`` / ``--model_class``
    and each contributes its own options through ``add_to_argparse``.
    pytorch-lightning Trainer flags are not registered on this parser.

    Returns:
        argparse.ArgumentParser: the fully populated parser (``--help``/``-h``
        is added last, because the parser is created with ``add_help=False``).
    """
    arg_parser = argparse.ArgumentParser(add_help=False)

    # Basic arguments.
    arg_parser.add_argument("--wandb", action="store_true", default=False)
    arg_parser.add_argument("--seed", type=int, default=42)
    string_options = (
        ("--litmodel_class", "SEQLitModel"),
        ("--data_class", "REDataset"),
        ("--model_class", "BertForRelationClassification"),
        ("--load_checkpoint", None),
    )
    for flag, default_value in string_options:
        arg_parser.add_argument(flag, type=str, default=default_value)

    # First pass: only the class names are needed here, unknown flags are tolerated.
    known_args = arg_parser.parse_known_args()[0]

    # Each provider registers its own options inside a titled argument group.
    providers = (
        ("Data Args", _import_class(f"openue.data.{known_args.data_class}")),
        ("Model Args", _import_class(f"openue.models.{known_args.model_class}")),
        ("LitModel Args", lit_models.BaseLitModel),
    )
    for group_title, provider in providers:
        provider.add_to_argparse(arg_parser.add_argument_group(group_title))

    arg_parser.add_argument("--help", "-h", action="help")
    return arg_parser

def _save_model(litmodel, tokenizer, path):
    os.system(f"mkdir -p {path}")
    litmodel.model.save_pretrained(path)
    tokenizer.save_pretrained(path)

In [3]:
# Train, test and export the relation-classification ("seq") model.
parser = _setup_parser()
# Parse an explicit empty argument list rather than sys.argv.
args = parser.parse_args(args=[])

path = "./config/run_seq.yaml"
# Load hyperparameters from the YAML config and overlay them on the parser defaults.
# yaml.safe_load with a context manager replaces the bare yaml.load(open(path)):
# that call omits the Loader argument (warning on PyYAML 5.x, TypeError on 6.x)
# and never closes the file. An empty config file yields no overrides.
with open(path, "r", encoding="utf-8") as config_file:
    opt = yaml.safe_load(config_file) or {}
args.__dict__.update(opt)

# Seed NumPy and PyTorch from the configured seed.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Resolve the data / model / LitModel classes named in the arguments.
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)

lit_model = litmodel_class(args=args, data_config=data.get_config())

# Logging is disabled for this cell; the Trainer below is built without a logger argument.
# logger = pl.loggers.TensorBoardLogger("training/logs")
# if args.wandb:
#     logger = pl.loggers.WandbLogger(project="openue demo")
#     logger.log_hyperparams(vars(args))

# Stop when "Eval/f1" has not improved for 5 checks; checkpoint on the same metric.
early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(
    monitor="Eval/f1",
    mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True,
)

callbacks = [early_callback, model_checkpoint]

trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, default_root_dir="training/logs")

trainer.fit(lit_model, datamodule=data)

trainer.test(lit_model, datamodule=data)

# Export the trained model and tokenizer to ./seq_model.
_save_model(litmodel=lit_model, tokenizer=data.tokenizer, path="seq_model")

  opt = yaml.load(open(path))
11/20/2021 13:09:07 - INFO - openue.data.data_module -   add total special tokens: 50 
 ['[relation0]', '[relation1]', '[relation2]', '[relation3]', '[relation4]', '[relation5]', '[relation6]', '[relation7]', '[relation8]', '[relation9]', '[relation10]', '[relation11]', '[relation12]', '[relation13]', '[relation14]', '[relation15]', '[relation16]', '[relation17]', '[relation18]', '[relation19]', '[relation20]', '[relation21]', '[relation22]', '[relation23]', '[relation24]', '[relation25]', '[relation26]', '[relation27]', '[relation28]', '[relation29]', '[relation30]', '[relation31]', '[relation32]', '[relation33]', '[relation34]', '[relation35]', '[relation36]', '[relation37]', '[relation38]', '[relation39]', '[relation40]', '[relation41]', '[relation42]', '[relation43]', '[relation44]', '[relation45]', '[relation46]', '[relation47]', '[relation48]', '[relation49]']
Some weights of the model checkpoint at bert-base-chinese were not used when initializing B

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

11/20/2021 13:10:49 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_train_BertTokenizerFast_seq


## 实体识别模块

在训练过程中，我们利用golden标签进行实体识别，即假设已经获得句子中存在的关系，之后利用这些关系来进行实体识别。


In [None]:
# Train, test and export the entity-recognition ("ner") model.
parser = _setup_parser()
# Parse an explicit empty argument list rather than sys.argv.
args = parser.parse_args(args=[])

path = "./config/run_ner.yaml"
# Load hyperparameters from the YAML config and overlay them on the parser defaults.
# yaml.safe_load with a context manager replaces the bare yaml.load(open(path)):
# that call omits the Loader argument (warning on PyYAML 5.x, TypeError on 6.x)
# and never closes the file. An empty config file yields no overrides.
with open(path, "r", encoding="utf-8") as config_file:
    opt = yaml.safe_load(config_file) or {}
args.__dict__.update(opt)

# Seed NumPy and PyTorch from the configured seed.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Resolve the data / model / LitModel classes named in the arguments.
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)

lit_model = litmodel_class(args=args, data_config=data.get_config())

# TensorBoard is the default logger; --wandb switches to Weights & Biases.
logger = pl.loggers.TensorBoardLogger("training/logs")
if args.wandb:
    logger = pl.loggers.WandbLogger(project="openue demo")
    logger.log_hyperparams(vars(args))

# Stop when "Eval/f1" has not improved for 5 checks; checkpoint on the same metric.
early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(
    monitor="Eval/f1",
    mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True,
)

callbacks = [early_callback, model_checkpoint]

trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, default_root_dir="training/logs")

trainer.fit(lit_model, datamodule=data)

trainer.test(lit_model, datamodule=data)

# Export the trained model and tokenizer to ./ner_model.
_save_model(litmodel=lit_model, tokenizer=data.tokenizer, path="ner_model")

  opt = yaml.load(open(path))
10/09/2021 14:28:34 - INFO - openue.data.data_module -   add total special tokens: 50 
 ['[relation0]', '[relation1]', '[relation2]', '[relation3]', '[relation4]', '[relation5]', '[relation6]', '[relation7]', '[relation8]', '[relation9]', '[relation10]', '[relation11]', '[relation12]', '[relation13]', '[relation14]', '[relation15]', '[relation16]', '[relation17]', '[relation18]', '[relation19]', '[relation20]', '[relation21]', '[relation22]', '[relation23]', '[relation24]', '[relation25]', '[relation26]', '[relation27]', '[relation28]', '[relation29]', '[relation30]', '[relation31]', '[relation32]', '[relation33]', '[relation34]', '[relation35]', '[relation36]', '[relation37]', '[relation38]', '[relation39]', '[relation40]', '[relation41]', '[relation42]', '[relation43]', '[relation44]', '[relation45]', '[relation46]', '[relation47]', '[relation48]', '[relation49]']
Some weights of the model checkpoint at bert-base-chinese were not used when initializing B

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




10/09/2021 14:29:18 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_train_BertTokenizerFast_ner
10/09/2021 14:29:34 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_dev_BertTokenizerFast_ner
10/09/2021 14:29:35 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_test_BertTokenizerFast_ner
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/f1': 0.7528725268526352}
--------------------------------------------------------------------------------


## 验证

由于我们使用pipeline模型，所以无法联合训练，需要分别训练后进行统一验证。

In [None]:
# Evaluate the pipeline using the inference config.
parser = _setup_parser()
# Parse an explicit empty argument list rather than sys.argv.
args = parser.parse_args(args=[])

path = "./config/run_infer.yaml"
# Load hyperparameters from the YAML config and overlay them on the parser defaults.
# yaml.safe_load with a context manager replaces the bare yaml.load(open(path)):
# that call omits the Loader argument (warning on PyYAML 5.x, TypeError on 6.x)
# and never closes the file. An empty config file yields no overrides.
with open(path, "r", encoding="utf-8") as config_file:
    opt = yaml.safe_load(config_file) or {}
args.__dict__.update(opt)

# Seed NumPy and PyTorch from the configured seed.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Resolve the data / model / LitModel classes named in the arguments.
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)

lit_model = litmodel_class(args=args, data_config=data.get_config())

# TensorBoard is the default logger; --wandb switches to Weights & Biases.
logger = pl.loggers.TensorBoardLogger("training/logs")
if args.wandb:
    logger = pl.loggers.WandbLogger(project="openue demo")
    logger.log_hyperparams(vars(args))

early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(
    monitor="Eval/f1",
    mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True,
)

callbacks = [early_callback, model_checkpoint]

trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, default_root_dir="training/logs")

# Training is skipped whenever the config path contains "infer"; only testing runs.
if "infer" not in path:
    trainer.fit(lit_model, datamodule=data)

# NOTE(review): the recorded run of this cell reports Test/f1 = 0.0 — presumably the
# inference config did not pick up the trained seq/ner models; verify run_infer.yaml.
trainer.test(lit_model, datamodule=data)

  opt = yaml.load(open(path))
10/09/2021 14:30:54 - INFO - openue.data.data_module -   add total special tokens: 50 
 ['[relation0]', '[relation1]', '[relation2]', '[relation3]', '[relation4]', '[relation5]', '[relation6]', '[relation7]', '[relation8]', '[relation9]', '[relation10]', '[relation11]', '[relation12]', '[relation13]', '[relation14]', '[relation15]', '[relation16]', '[relation17]', '[relation18]', '[relation19]', '[relation20]', '[relation21]', '[relation22]', '[relation23]', '[relation24]', '[relation25]', '[relation26]', '[relation27]', '[relation28]', '[relation29]', '[relation30]', '[relation31]', '[relation32]', '[relation33]', '[relation34]', '[relation35]', '[relation36]', '[relation37]', '[relation38]', '[relation39]', '[relation40]', '[relation41]', '[relation42]', '[relation43]', '[relation44]', '[relation45]', '[relation46]', '[relation47]', '[relation48]', '[relation49]']
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
10/09/2021 14:31:0

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/f1': 0.0}
--------------------------------------------------------------------------------


[{'Test/f1': 0.0}]