- `is_valid == False`: run the cells between "Test Data Inference" and "Entity level Performance (Evaluate)" to generate predictions for the test set.
- `is_valid == True`: run the cells below "Entity level Performance (Evaluate)" to score the model on the labelled dev set.

## General

In [12]:
# Mode switch read by the data-loading cell: True loads the dev CSV together with
# its 'ner' column, False loads the test CSV (id and content only).
is_valid = False

In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch
from transformers import BertTokenizer, BertConfig, BertModel
import torch.optim as optim
import torch.nn as nn
from torchtext import data
from net import BiLSTMCRF, BERTCRF, BERTBiLSTMCRF, BERTransformerCRF
from forward_step import *
import pickle
import collections
from json import loads, dumps
from collections import OrderedDict

# CUDA device index used whenever CUDA is available; otherwise everything runs on CPU.
# NOTE(review): index 1 presumably requires a host with at least two GPUs - confirm.
gpu = 1
device = torch.device('cuda:%d'%(gpu) if torch.cuda.is_available() else 'cpu')

####################################################################
###################### Load data & preprocess ######################
####################################################################
# 'bert-base-chinese' 'hfl/chinese-roberta-wwm-ext'
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Tag-string -> integer-id mapping used by both the NER field and the model head.
# SECURITY: pickle.load can execute arbitrary code; only load a file you produced.
# The `with` block closes the handle on exit, so no explicit close() is needed.
with open('./token_info/ner2int.pickle', 'rb') as file:
    ner2int = pickle.load(file)


def str_split(x):
    """Split a string on single spaces and return the resulting list of pieces."""
    separator = ' '
    pieces = x.split(separator)
    return pieces

def ner_onehot(x):
    """Translate a sequence of tag strings into their integer ids via ``ner2int``.

    Despite the name, the result is a list of integer ids (one per tag), not a
    one-hot encoding. A tag missing from ``ner2int`` raises ``KeyError``.
    """
    return list(map(ner2int.__getitem__, x))

# Text field: space-split tokens are converted to ids by the BERT tokenizer and
# laid out with a fixed length of 50, using the tokenizer's own pad/unk ids.
CONTENT = data.Field(
    sequential=True,
    tokenize=str_split,
    preprocessing=tokenizer.convert_tokens_to_ids,
    pad_token=tokenizer.pad_token_id,
    unk_token=tokenizer.unk_token_id,
    fix_length=50,
    use_vocab=False,
    lower=True,
    batch_first=True,
    include_lengths=False,
)
# Label field: tags are mapped through ner2int; the pad id is len(ner2int), i.e. one
# past the largest key count of the mapping.
NER = data.Field(
    sequential=True,
    tokenize=str_split,
    preprocessing=ner_onehot,
    pad_token=len(ner2int),
    fix_length=50,
    use_vocab=False,
    lower=False,
    batch_first=True,
    include_lengths=False,
)
# Sample id, carried through batching as a float value.
ID = data.LabelField(sequential=False, tokenize=str, dtype=torch.float, lower=False, use_vocab=False)


# Both modes read 'id' and 'content'; only the dev file is read with its 'ner' column.
dataset_fields = {'id': ('id', ID),
                  'content': ('content', CONTENT)}
if is_valid:
    dataset_path = './data/cluener_dev_v2.csv'
    dataset_fields['ner'] = ('ner', NER)
else:
    dataset_path = './data/cluener_test_v2.csv'

DataSet = data.TabularDataset(path=dataset_path, format='csv', fields=dataset_fields)

BATCH_SIZE = 32
valid_iterator = data.BucketIterator(DataSet, batch_size=BATCH_SIZE, device=device, sort=False)

In [14]:
# Inverse of ner2int: integer id -> tag string.
int2ner = {label_id: tag for tag, label_id in ner2int.items()}

In [15]:
def key_padding_mask_bert(x, device):
    """Build a uint8 mask shaped like ``x`` that marks the non-padding positions.

    Positions where ``x`` differs from ``tokenizer.pad_token_id`` are set to 1;
    all other positions stay 0. The mask is created on ``device``.
    """
    is_real_token = x != tokenizer.pad_token_id
    mask = torch.zeros(x.shape, dtype=torch.uint8).to(device)
    return mask.masked_fill(is_real_token, 1)

def inference(batch, model, device, inference=False):
    """Run ``model`` on one batch in evaluation mode, without gradient tracking.

    Args:
        batch: object exposing a ``content`` tensor of token ids.
        model: callable module; invoked as ``model(batch, mask)`` or, when
            ``inference`` is true, as ``model(batch, mask, True)``.
        device: device on which the padding mask is created.
        inference: selects the three-argument call form of the model.
            NOTE(review): presumably this switches the model to returning decoded
            tag sequences - confirm against the model definition.

    Returns:
        Whatever the model call returns, unchanged.
    """
    model.eval()
    crf_mask = key_padding_mask_bert(batch.content, device)
    # No backward pass ever follows this call, so skip autograd bookkeeping to
    # save memory and time; the returned values are numerically the same.
    with torch.no_grad():
        if inference:
            return model(batch, crf_mask, True)
        return model(batch, crf_mask)

In [16]:
def ner_extract(x, label_map=None):
    """Group a sequence of tag ids into entity spans.

    Args:
        x: iterable of integer tag ids.
        label_map: optional mapping from tag id to tag string (``'O'``,
            ``'B-<type>'``, ``'I-<type>'``). Defaults to the notebook-level
            ``int2ner``. An id missing from the mapping raises ``KeyError``.

    Returns:
        dict mapping entity type to a list of ``[start, end]`` index pairs
        (both inclusive), in order of appearance.

    Span rules:
        * ``B-<type>`` opens a span, closing any span that is already open.
        * ``I-<type>`` extends the open span only when the type matches; a
          mismatching ``I-`` closes the open span and is itself discarded.
        * ``I-`` with no open span is ignored.
        * ``'O'`` closes the open span.
        * Any other tag leaves an open span untouched (neither extended nor closed).
    """
    if label_map is None:
        label_map = int2ner
    tags = [label_map[label_id] for label_id in x]

    res = {}
    entity = None        # type of the span currently open
    start = end = None   # inclusive bounds of the open span; start is None when closed

    for i, tag in enumerate(tags):
        prefix, kind = tag[0], tag[2:]

        if tag == 'O':
            if start is not None:
                res[entity].append([start, end])
                start = None
            continue

        if start is None:
            if prefix == 'B':
                res.setdefault(kind, [])
                entity, start, end = kind, i, i
            continue

        if prefix == 'I':
            if kind == entity:
                end = i
            else:
                res[entity].append([start, end])
                start = None
        elif prefix == 'B':
            res[entity].append([start, end])
            res.setdefault(kind, [])
            entity, start, end = kind, i, i

    # Flush a span that runs to the end of the sequence.
    if start is not None:
        res[entity].append([start, end])
    return res

In [17]:
def performance_entity(TP, FP, FN, include_micro=False):
    """Build a per-entity recall / precision / F1 table from span-level counts.

    Args:
        TP, FP, FN: dicts mapping entity type to true-positive, false-positive
            and false-negative counts. An entity absent from a dict counts as 0.
        include_micro: when true, append an 'overall (Micro)' row computed from
            the summed counts. Off by default so the table keeps its 11 rows.

    Returns:
        DataFrame with columns ``entity, recall, precision, f1-score``: one row
        per entity type followed by 'overall (Macro)'. Scores are percentages
        rounded to two decimals. The macro F1 is the harmonic mean of the macro
        recall and macro precision, not the mean of the per-entity F1 values.
        Any ratio with a zero denominator is reported as 0.
    """
    entities = ['name', 'organization', 'position', 'company', 'address',
                'game', 'government', 'scene', 'book', 'movie']
    score_columns = ['recall', 'precision', 'f1-score']

    def _ratio(numerator, denominator):
        # Guard against entities with no predictions / no gold spans.
        return numerator / denominator if denominator else 0.0

    def _f1(recall, precision):
        return _ratio(2 * (recall * precision), recall + precision)

    rows = []
    tp_total = fp_total = fn_total = 0
    for e in entities:
        tp, fp, fn = TP.get(e, 0), FP.get(e, 0), FN.get(e, 0)
        tp_total += tp
        fp_total += fp
        fn_total += fn
        recall_entity = _ratio(tp, tp + fn)
        precision_entity = _ratio(tp, tp + fp)
        rows.append([e, recall_entity, precision_entity, _f1(recall_entity, precision_entity)])

    df = pd.DataFrame(rows, columns=['entity'] + score_columns)

    recall_ma = df['recall'].mean()
    precision_ma = df['precision'].mean()
    df.loc[len(df)] = ['overall (Macro)', recall_ma, precision_ma, _f1(recall_ma, precision_ma)]

    if include_micro:
        recall_overall = _ratio(tp_total, tp_total + fn_total)
        precision_overall = _ratio(tp_total, tp_total + fp_total)
        df.loc[len(df)] = ['overall (Micro)', recall_overall, precision_overall,
                           _f1(recall_overall, precision_overall)]

    for column in score_columns:
        df[column] = df[column].apply(lambda x: round(x*100, 2))
    return df

In [18]:
model_path = './model/bert-ner_best_test_v2.ckpt'
bert_config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME)
# NOTE(review): pooler_fc_size is used as the encoder output width; presumably
# hidden_size is what is meant and the two only coincide for some checkpoints - confirm.
embedding_dim = bert_config.pooler_fc_size
num_class = len(ner2int)
# vocab_size = bert_config.vocab_size
# model = BiLSTMCRF(vocab_size, embedding_dim, num_class+1)
# word_emb = BertModel.from_pretrained(PRETRAINED_MODEL_NAME).embeddings.word_embeddings.weight.data.to(device)
# model.embedding.weight.data.copy_(word_emb)
# model = BERTransformerCRF(bert_config, embedding_dim, num_class+1, torch.device('cuda:%d'%(gpu) if torch.cuda.is_available() else 'cpu'))
# num_class+1 leaves room for the padding label id len(ner2int) used by the NER field.
model = BERTCRF(bert_config, embedding_dim, num_class+1)
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved on a GPU still loads when this notebook has fallen back to CPU.
# NOTE(review): strict=False ignores missing/unexpected keys silently - confirm the
# checkpoint really matches this architecture.
model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
model.to(device)

BERTCRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

## Test Data Inference

In [19]:
# Raw test sentences keyed by sample id; each value is the content split on single
# spaces, so predicted index spans can be turned back into text.
test = pd.read_csv('./data/cluener_test_v2.csv')
test_json = {
    test.id.loc[row]: test.content.loc[row].split(' ')
    for row in range(len(test))
}

In [20]:
%%time
# Collect, for every test sample, its text and the predicted entities grouped as
# {entity type: {surface string: [[start, end], ...]}}.
submit = {}
for batch in tqdm(valid_iterator):
    # The id field is batched as floats, so convert back to plain ints for dict keys.
    ids = [int(x) for x in batch.id.detach().cpu().numpy()]
    pred = inference(batch, model, device, True)
    for idx, predicted_labels in enumerate(pred):
        id_ = ids[idx]
        tokens = test_json[id_]
        predict_entity = ner_extract(predicted_labels)
        label = {}
        for atr, indexes in predict_entity.items():
            mentions = {}
            for index in indexes:
                # Spans are inclusive on both ends, hence the +1 on the slice stop.
                ent = ''.join(tokens[index[0]: index[1]+1])
                mentions.setdefault(ent, []).append(index)
            label[atr] = mentions
        submit[id_] = {'text': ''.join(tokens), 'label': label}

100%|██████████| 43/43 [00:08<00:00,  5.23it/s]

CPU times: user 5min 8s, sys: 18 s, total: 5min 26s
Wall time: 8.22 s





In [21]:
def to_dict(input_ordered_dict):
    """Round-trip a mapping through JSON so the result is built from plain dicts.

    Because of the JSON round trip, non-string keys come back as strings and
    tuples come back as lists.
    """
    serialized = dumps(input_ordered_dict)
    plain = loads(serialized)
    return plain

In [22]:
# One record per sample id 0 .. len(submit)-1, with entity types sorted by name.
# Relies on submit containing every id in that range.
cluener_predict = [
    {
        'id': sample_id,
        'label': to_dict(collections.OrderedDict(sorted(submit[sample_id]['label'].items()))),
    }
    for sample_id in range(len(submit))
]

In [29]:
# Spot-check: show the collected text and labels stored under id 100.
submit[100]

{'text': '美国导弹防御局称，两枚导弹都“发射正常”，然而“海基x波段雷达”并未如先前预计地一样正常工作。',
 'label': {'government': {'美国导弹防御局': [[0, 6]]}}}

In [30]:
# Spot-check: show the record at position 100 of the output list.
cluener_predict[100]

{'id': 100, 'label': {'government': {'美国导弹防御局': [[0, 6]]}}}

In [24]:
import json
# Write one JSON object per line; ensure_ascii=False keeps the Chinese text readable.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('./data/cluener_predict.json', 'w', encoding='utf8') as output:
    for record in cluener_predict:
        output.write(json.dumps(record, ensure_ascii=False))
        output.write('\n')

## Entity level  Performance ( Evaluate )

In [9]:
%%time
# Span-level confusion counts per entity type. A predicted span is a true positive
# only if the same [start, end] pair exists for the same type in the ground truth.
TP = {}
FP = {}
FN = {}
for batch in tqdm(valid_iterator):
    pred = inference(batch, model, device)[1]
    # Strip the padding label (len(ner2int)) from every gold sequence.
    y = []
    for gold_row in batch.ner:
        gold_labels = gold_row.detach().cpu().numpy()
        y.append(list(gold_labels[gold_labels != len(ner2int)]))
    for idx, gold_labels in enumerate(y):
        predict_entity = ner_extract(pred[idx])
        ground_truth = ner_extract(gold_labels)
        # Every predicted span is either matched (TP) or spurious (FP).
        for e, spans in predict_entity.items():
            truth_spans = ground_truth.get(e, [])
            for span in spans:
                if span in truth_spans:
                    TP[e] = TP.get(e, 0) + 1
                else:
                    FP[e] = FP.get(e, 0) + 1
        # Every gold span that was not predicted is a miss (FN).
        for e, spans in ground_truth.items():
            predicted_spans = predict_entity.get(e, [])
            for span in spans:
                if span not in predicted_spans:
                    FN[e] = FN.get(e, 0) + 1

100%|██████████| 42/42 [00:07<00:00,  5.52it/s]

CPU times: user 4min 33s, sys: 16.2 s, total: 4min 49s
Wall time: 7.61 s





In [10]:
performance = performance_entity(TP, FP, FN)
# Hard-coded comparison columns, one value per table row (ten entity types followed
# by the overall row).
# NOTE(review): presumably reference F1 scores for these two models - confirm source.
reference_scores = {
    'bert-base': [88.75, 79.43, 78.89, 81.42, 60.89, 86.42, 87.03, 65.10, 73.68, 85.82, 78.82],
    'roberta-large': [89.09, 82.34, 79.62, 83.02, 62.63, 86.80, 88.17, 70.49, 74.60, 87.46, 80.42],
}
for column_name, scores in reference_scores.items():
    performance[column_name] = scores

In [11]:
# Display the per-entity metrics table alongside the comparison columns.
performance

Unnamed: 0,entity,recall,precision,f1-score,bert-base,roberta-large
0,name,88.6,83.74,86.1,88.75,89.09
1,organization,79.29,78.44,78.86,79.43,82.34
2,position,79.91,78.46,79.18,78.89,79.62
3,company,81.48,78.77,80.1,81.42,83.02
4,address,62.47,66.95,64.63,60.89,62.63
5,game,87.46,74.57,80.5,86.42,86.8
6,government,86.23,79.48,82.72,87.03,88.17
7,scene,79.43,68.6,73.61,65.1,70.49
8,book,77.27,79.33,78.29,73.68,74.6
9,movie,78.15,87.41,82.52,85.82,87.46
