In [54]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import pytorch_lightning as pl
from transformers import (
    DataProcessor,
    InputExample,
    BertTokenizer,
    BertForSequenceClassification,
    glue_convert_examples_to_features,
)

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), then switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs pick the same algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_random_seed(2020)
# `torch.cuda.is_available` must be *called*: the bare function object is
# always truthy, which selected 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [10]:
# Directory that holds the tab-separated link files (train_link.csv is read here).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = Path('/home/bnu/projects/CCKS2020-Entity-Linking/data/csv')
# The file uses a '.csv' extension but is tab-separated, hence sep='\t'.
train_data = pd.read_csv(data_path/'train_link.csv', sep='\t')
# Preview the first rows as the cell output.
train_data.head()

Unnamed: 0,entity,offset,rawtext,kbtext,predict
0,小品,0,小品《战狼故事》中，吴京突破重重障碍解救爱人，深情告白太感人,摘要:小品，就是小的艺术品。 代表:喜剧小品 演艺注意事项:身心放松，自信 中文名:小品 特...,1
1,吴京,10,小品《战狼故事》中，吴京突破重重障碍解救爱人，深情告白太感人,摘要:吴京，男，汉族，1980年10月出生，现公安局科技信息化和技术科科长。 职务:公安局科...,0
2,吴京,10,小品《战狼故事》中，吴京突破重重障碍解救爱人，深情告白太感人,出生地:江苏镇江 摘要:吴京，1934年4月9日出生，祖籍江苏苏州，台湾成功大学土木系毕业，...,0
3,吴京,10,小品《战狼故事》中，吴京突破重重障碍解救爱人，深情告白太感人,出生地:北京 外文名:Jason Wu、오 경（韩语） 摘要:吴京，1974年4月3日出生于...,1
4,障碍,16,小品《战狼故事》中，吴京突破重重障碍解救爱人，深情告白太感人,外文名:The Liability 摘要:当19岁的亚当（杰克·奥康纳 饰）同意帮他妈妈的狡...,0


In [11]:
class ELProcessor(DataProcessor):
    """Turns the tab-separated entity-linking files into ``InputExample`` pairs.

    Each data row yields one sentence-pair example: ``text_a`` is column 0 and
    column 2 joined by a space, ``text_b`` is column 3 and the label is
    column 4. In the printed sample these look like the mention plus its
    source sentence, the knowledge-base description, and a '0'/'1' flag.
    """

    def get_train_examples(self, data_dir):
        """Return the examples read from ``train_link.csv`` inside ``data_dir``.

        Args:
            data_dir: Directory containing the file; a ``str`` or ``Path``.
        """
        return self._create_examples(
            self._read_tsv(Path(data_dir) / 'train_link.csv'),
            set_type='train',
        )

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``valid_link.csv`` inside ``data_dir``.

        Args:
            data_dir: Directory containing the file; a ``str`` or ``Path``.
        """
        return self._create_examples(
            self._read_tsv(Path(data_dir) / 'valid_link.csv'),
            set_type='valid',
        )

    def get_labels(self):
        """Return the label vocabulary: the strings '0' and '1'."""
        return ['0', '1']

    def _create_examples(self, lines, set_type):
        """Convert parsed rows into ``InputExample`` objects.

        The first row is skipped as the header. Rows with too few columns are
        reported (row index, then row content) and skipped rather than
        aborting the whole load.

        Args:
            lines: Sequence of rows, each a sequence of column strings.
            set_type: Prefix used in each example's ``guid``.

        Returns:
            List of ``InputExample`` built from the well-formed rows.
        """
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue
            guid = f'{set_type}-{i}'
            try:
                text_a = line[0] + ' ' + line[2]
                text_b = line[3]
                label = line[4]
            except IndexError:
                # Only a short row is expected here; a bare `except` would
                # also hide KeyboardInterrupt and genuine programming errors.
                print(i)
                print(line)
                continue
            examples.append(InputExample(
                guid=guid,
                text_a=text_a,
                text_b=text_b,
                label=label,
            ))
        return examples

In [14]:
# Sanity check: load both splits through the processor, show one training
# example and report how many examples each split contains.
data_path = Path('/home/bnu/projects/CCKS2020-Entity-Linking/data/csv')
processor = ELProcessor()

examples = processor.get_train_examples(data_path)
print(examples[10])
print(f'Train: {len(examples)}')

examples = processor.get_dev_examples(data_path)
print(f'Valid: {len(examples)}')

InputExample(guid='train-11', text_a='甄嬛传 甄嬛传：安陵容怀孕时，雍正经常摸她的肚子，原来这动作大有深意', text_b='摘要:《甄嬛传》是经小说原作者流潋紫改编的一个新话剧，首度用戏剧的形式展现一个史诗类题材。 义项描述:话剧版《甄嬛传》', label='0')
Train: 535333
Valid: 59173


In [18]:
def generate_dataloaders(tokenizer, data_path, max_length=128, batch_size=32,
                         processor=None):
    """Build the training and validation ``DataLoader`` objects.

    Examples are read through ``processor``, converted to features with
    ``glue_convert_examples_to_features`` and packed into a ``TensorDataset``
    of (input_ids, attention_mask, token_type_ids, label).

    Args:
        tokenizer: Tokenizer passed to the feature conversion; its
            ``pad_token_id`` is used for padding.
        data_path: Directory containing ``train_link.csv`` and
            ``valid_link.csv``.
        max_length: Sequence length passed to the feature conversion.
        batch_size: Batch size of both loaders.
        processor: Object providing ``get_train_examples``,
            ``get_dev_examples`` and ``get_labels``. Defaults to a fresh
            ``ELProcessor`` so the function no longer depends on a
            notebook-global ``processor`` variable.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    if processor is None:
        processor = ELProcessor()

    def generate_dataloader_inner(examples):
        """Tokenize ``examples`` and wrap them in a randomly sampled loader."""
        features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            label_list=processor.get_labels(),
            max_length=max_length,
            output_mode='classification',
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=0)

        dataset = torch.utils.data.TensorDataset(
            torch.LongTensor([f.input_ids for f in features]),
            torch.LongTensor([f.attention_mask for f in features]),
            torch.LongTensor([f.token_type_ids for f in features]),
            torch.LongTensor([f.label for f in features])
        )

        # NOTE(review): the validation loader is shuffled as well; kept as-is
        # so existing results and RNG consumption are unchanged.
        sampler = torch.utils.data.RandomSampler(dataset)
        dataloader = torch.utils.data.DataLoader(
            dataset, sampler=sampler, batch_size=batch_size
        )
        return dataloader

    # Training data
    train_examples = processor.get_train_examples(data_path)
    print('Load Example Finish')
    train_loader = generate_dataloader_inner(train_examples)
    print('Generate DataLoader Finish')

    # Validation data
    valid_examples = processor.get_dev_examples(data_path)
    print('Load Example Finish')
    valid_loader = generate_dataloader_inner(valid_examples)
    print('Generate DataLoader Finish')

    return train_loader, valid_loader

In [19]:
# Directory of the pretrained checkpoint. The tokenizer is loaded from it here;
# the model class later reads pytorch_model.bin and bert_config.json from it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
pretrained_path = '/media/bnu/data/transformers-pretrained-model/chinese_roberta_wwm_ext_pytorch'
tokenizer = BertTokenizer.from_pretrained(pretrained_path)
# Reads and tokenizes both splits up front, then wraps them in DataLoaders.
train_loader, valid_loader = generate_dataloaders(tokenizer, data_path)

Load Example Finish
Generate DataLoader Finish
Load Example Finish
Generate DataLoader Finish


In [48]:
class ELRoBERTaModel(pl.LightningModule):
    """Lightning wrapper around ``BertForSequenceClassification``.

    Holds the pretrained classifier, a cross-entropy criterion and the two
    data loaders handed in at construction time. Training and validation
    report loss and accuracy; testing reports accuracy only.
    """

    def __init__(self,
                 pretrained_path,
                 train_loader,
                 valid_loader):
        """Load the pretrained weights and remember the data loaders.

        Args:
            pretrained_path: Directory string containing ``pytorch_model.bin``
                and ``bert_config.json``.
            train_loader: Loader returned by ``train_dataloader``.
            valid_loader: Loader returned by both ``val_dataloader`` and
                ``test_dataloader``.
        """
        super().__init__()
        self.train_loader = train_loader
        self.valid_loader = valid_loader

        # Pretrained model
        weights_file = pretrained_path + '/pytorch_model.bin'
        config_file = pretrained_path + '/bert_config.json'
        self.ptm = BertForSequenceClassification.from_pretrained(
            weights_file,
            config=config_file,
        )

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the classifier and return the first element of its output."""
        ptm_outputs = self.ptm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        return ptm_outputs[0]

    def _run_batch(self, batch):
        """Forward one batch; return ``(out, label, acc)`` for the step methods."""
        input_ids, attention_mask, token_type_ids, label = batch
        out = self(input_ids, attention_mask, token_type_ids)

        # Index of the largest score along dim 1 is the predicted class.
        pred = torch.max(out, dim=1)[1]
        acc = (pred == label).float().mean()
        return out, label, acc

    def training_step(self, batch, batch_idx):
        """Compute training loss and accuracy for one batch."""
        out, label, acc = self._run_batch(batch)
        loss = self.criterion(out, label)

        metrics = {'train_loss': loss, 'train_acc': acc}
        return {'loss': loss, 'log': metrics, 'progress_bar': metrics}

    def validation_step(self, batch, batch_idx):
        """Compute validation loss and accuracy for one batch."""
        out, label, acc = self._run_batch(batch)
        loss = self.criterion(out, label)

        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation loss and accuracy."""
        val_loss = torch.stack([step['val_loss'] for step in outputs]).mean()
        val_acc = torch.stack([step['val_acc'] for step in outputs]).mean()

        metrics = {'val_loss': val_loss, 'val_acc': val_acc}
        return {'val_loss': val_loss, 'log': metrics, 'progress_bar': metrics}

    def test_step(self, batch, batch_idx):
        """Compute accuracy for one test batch (no loss is computed)."""
        _, _, acc = self._run_batch(batch)

        return {'test_acc': acc}

    def test_epoch_end(self, outputs):
        """Average the per-batch test accuracy."""
        test_acc = torch.stack([step['test_acc'] for step in outputs]).mean()

        metrics = {'test_acc': test_acc}
        return {'test_acc': test_acc, 'log': metrics, 'progress_bar': metrics}

    def configure_optimizers(self):
        """Return an Adam optimizer over the parameters that require gradients."""
        trainable = [p for p in self.parameters() if p.requires_grad]
        return torch.optim.Adam(trainable, lr=2e-5, eps=1e-8)

    def train_dataloader(self):
        """Return the training loader supplied at construction."""
        return self.train_loader

    def val_dataloader(self):
        """Return the validation loader supplied at construction."""
        return self.valid_loader

    def test_dataloader(self):
        """Return the validation loader; it doubles as the test set here."""
        return self.valid_loader


In [34]:
# Fine-tune for a single epoch on two GPUs with the 'dp' backend.
# NOTE(review): val_check_interval=0.1 presumably runs validation every 10% of
# the epoch — the ten "Validating" progress bars in the output are consistent
# with that; confirm against the installed pytorch-lightning version.
model = ELRoBERTaModel(pretrained_path, train_loader, valid_loader)
trainer = pl.Trainer(
    max_epochs=1,
    val_check_interval=0.1,
    gpus=2,
    distributed_backend='dp',
    default_save_path='/media/bnu/data/pytorch-lightning-checkpoints/'
)
trainer.fit(model)

INFO:transformers.configuration_utils:loading configuration file /media/bnu/data/transformers-pretrained-model/chinese_roberta_wwm_ext_pytorch/bert_config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,



HBox(children=(FloatProgress(value=0.0, description='Validation sanity check', layout=Layout(flex='2'), max=5.…



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …



HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=1850.0, style=P…




1

In [35]:
# Save the trainer's current state under a fixed file name; the inference cell
# below reloads this same file.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
ckpt_path = Path('/home/bnu/projects/CCKS2020-Entity-Linking/ckpt/')
trainer.save_checkpoint(ckpt_path/'EL-RoBERTa-128-0419.ckpt')

In [59]:
# Reload the fine-tuned weights. The loaders are passed as None because this
# cell feeds batches to the model directly instead of going through a trainer.
model = ELRoBERTaModel.load_from_checkpoint(
    ckpt_path/'EL-RoBERTa-128-0419.ckpt',
    pretrained_path=pretrained_path,
    train_loader=None,
    valid_loader=None,
)
model.eval()
batch = next(iter(valid_loader))
# Inference only: skip autograd bookkeeping so no graph is kept alive for the
# batch. The computed values are unchanged.
with torch.no_grad():
    outputs = model(batch[0], batch[1], batch[2])
# Softmax score of class index 1 for every item in the batch (cell output).
F.softmax(outputs, dim=-1)[:, 1]

INFO:transformers.configuration_utils:loading configuration file /media/bnu/data/transformers-pretrained-model/chinese_roberta_wwm_ext_pytorch/bert_config.json
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,

tensor([6.3753e-03, 5.0309e-04, 4.3646e-04, 9.7265e-01, 9.9619e-01, 3.3942e-02,
        9.9538e-01, 7.3173e-01, 9.9759e-01, 9.9883e-01, 2.1060e-03, 9.8735e-01,
        3.8978e-04, 2.0384e-01, 9.5879e-01, 2.9894e-03, 1.2333e-02, 7.7380e-01,
        4.1989e-04, 1.2595e-01, 4.8184e-04, 9.7496e-01, 8.5529e-01, 1.4261e-03,
        4.5359e-04, 1.0327e-01, 4.1051e-04, 2.2023e-01, 1.5070e-02, 9.6544e-01,
        4.5117e-04, 4.6139e-03])