In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import numpy as np
import pandas as pd
import pickle
from keras_bert import load_trained_model_from_checkpoint, Tokenizer,get_custom_objects
import codecs
from keras.layers import Input,Dense,LSTM
#from keras_contrib.layers import CRF
from keras.layers.wrappers import TimeDistributed,Bidirectional
from keras.models import Model,load_model
from keras.optimizers import Adam

In [2]:
max_seq_len = 20
config_path = r'D:\final_design\Final_one\bert\bert_config.json'
checkpoint_path = r'D:\final_design\Final_one\bert\bert_model.ckpt'
dict_path = r'D:\final_design\Final_one\bert\vocab.txt'

In [3]:
train_corpus = pickle.load(open(r'D:\final_design\Final_one\data\corpus_train.pkl','rb'))
train_questions = [train_corpus[i]['question'] for i in range(len(train_corpus))]
train_entitys = [train_corpus[i]['gold_entitys'] for i in range(len(train_corpus))]
train_entitys = [[entity[1:-1].split('_')[0] for entity in line]for line in train_entitys]

test_corpus = pickle.load(open(r'D:\final_design\Final_one\data\corpus_test.pkl','rb'))
test_questions = [test_corpus[i]['question'] for i in range(len(test_corpus))]
test_entitys = [test_corpus[i]['gold_entitys'] for i in range(len(test_corpus))]
test_entitys = [[entity[1:-1].split('_')[0] for entity in line]for line in test_entitys]

In [4]:
print(train_questions[:10])

['莫妮卡·贝鲁奇的代表作？', '《湖上草》是谁的诗？', '龙卷风的英文名是什么？', '新加坡的水域率是多少？', '商朝在哪场战役中走向覆灭？', '叔本华信仰什么宗教？', '大兴安岭的终点是哪里？', '演员梅艳芳有多高？', '被誉为万岛之国的是哪个国家？', '北京奥运会的口号是什么？']


In [4]:
token_dict = {}
with codecs.open(dict_path,'r','utf-8') as f:
    for i in f:
        token = i.strip()
        token_dict[token] = len(token_dict)
tokenizer = Tokenizer(token_dict)

def setlabel(question,entity):
    m=np.zeros((len(question)+1,len(entity)+1),int) #生成0矩阵，为方便后续计算，比字符串长度多了一列
    mmax=0  #最长匹配的长度
    p=0 #最长匹配对应在question中的最后一位
    for i in range(len(question)):
        for j in range(len(entity)):
            if question[i]==entity[j]:
                m[i+1][j+1]=m[i][j]+1
            if m[i+1][j+1]>mmax:
                mmax=m[i+1][j+1]
                p=i+1
    #print(question,entity,p,mmax)
    return question[p-mmax:p]

def getPair(questions, entitys):
    X, X2, Y = [], [], []
    for i in range(len(questions)):
        q = questions[i]
        x, x2 = tokenizer.encode(first=q,max_len = max_seq_len)#词索引序列和分块索引序列
        y = [[0] for j in range(max_seq_len)]
        assert len(x)==len(y)
        for e in entitys[i]:
            #得到实体名和问题的最长连续公共子串
            e = setlabel(q,e)
            if e in q:
                begin = q.index(e)+1
                end = begin + len(e)
                if end < max_seq_len-1:
                    for pos in range(begin,end):
                        y[pos] = [1]
        #print (q)
        #print (x1)
        #print (y)
        X.append(x)
        X2.append(x2)
        Y.append(y)
        #print(x2)
    return np.array(X),np.array(X2),np.array(Y)


#得到训练及测试样本
trainx1,trainx2,trainy = getPair(train_questions,train_entitys)#(num_sample,max_len)
testx1,testx2,testy = getPair(test_questions,test_entitys)
print (testx1.shape)


(765, 20)


In [7]:
print(trainx1[:10])
print(trainx2[:10])

[[ 101 5811 1984 1305  185 6564 7826 1936 4638  807 6134  868 8043  102
     0    0    0    0    0    0]
 [ 101  517 3959  677 5770  518 3221 6443 4638 6408 8043  102    0    0
     0    0    0    0    0    0]
 [ 101 7987 1318 7599 4638 5739 3152 1399 3221  784  720 8043  102    0
     0    0    0    0    0    0]
 [ 101 3173 1217 1786 4638 3717 1818 4372 3221 1914 2208 8043  102    0
     0    0    0    0    0    0]
 [ 101 1555 3308 1762 1525 1767 2773 2514  704 6624 1403 6208 4127 8043
   102    0    0    0    0    0]
 [ 101 1356 3315 1290  928  814  784  720 2134 3136 8043  102    0    0
     0    0    0    0    0    0]
 [ 101 1920 1069 2128 2275 4638 5303 4157 3221 1525 7027 8043  102    0
     0    0    0    0    0    0]
 [ 101 4028 1447 3449 5683 5710 3300 1914 7770 8043  102    0    0    0
     0    0    0    0    0    0]
 [ 101 6158 6289  711  674 2270  722 1744 4638 3221 1525  702 1744 2157
  8043  102    0    0    0    0]
 [ 101 1266  776 1952 6817  833 4638 1366 1384 3221  78

In [6]:
model = load_model(r'D:\final_design\Final_one\data\model\model_ner_general.h5',custom_objects=get_custom_objects())

In [5]:
bert_model = load_trained_model_from_checkpoint(config_path,checkpoint_path, seq_len=None)
for l in bert_model.layers:
    l.trainable = True
x_in = Input(shape = (None,))
x2_in = Input(shape = (None,))
x = bert_model([x_in,x2_in])
x = Bidirectional(LSTM(512,return_sequences=True,recurrent_dropout=0.2))(x)
o = Dense(1, activation='sigmoid' )(x)
#crf = CRF(1, sparse_target=True)
#o = crf(x)
model = Model([x_in,x2_in], o)
model.compile(loss='binary_crossentropy',optimizer=Adam(1e-5),metrics=['accuracy'])
model.summary()
model.save(r'D:\final_design\Final_one\data\model\model_ner_general_crf.h5')

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
model_1 (Functional)            (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, None, 1024)   5246976     model_1[0][0]              

In [14]:
model.fit([trainx1,trainx2],trainy, epochs=5, batch_size=64)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1b365195f88>

In [17]:
predicty = model.predict([testx1,testx2],batch_size = 64)
#print(predicty[:100])
predicty = [[1 if i >0.5 else 0 for i in line] for line in predicty]

In [18]:
predict_entitys = restore_entity_from_labels_on_corpus(predicty,test_questions)
for j in range(300,302):
    print (predict_entitys[j])
    print (test_entitys[j])

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 

['神圣罗马帝国']
['神圣罗马帝国']
['神圣罗马帝国']
['神圣罗马帝国']


In [9]:
def restore_entity_from_labels_on_corpus(predicty,questions):
    def restore_entity_from_labels(labels,question):
        entitys = []
        str = ''
        labels = labels[1:-1]
        print(labels)
        for i in range(min(len(labels),len(question))):
            if labels[i]==1:
                str += question[i]
            else:
                if len(str):
                    entitys.append(str)
                    str = ''
        if len(str):
            entitys.append(str) 
        return entitys
    all_entitys = []
    for i in range(len(predicty)):
        all_entitys.append(restore_entity_from_labels(predicty[i],questions[i]))
    return all_entitys

'葬于茂陵的皇帝在位于哪段时间？'

In [11]:
maxf = 0.0
def computeF(gold_entity,pre_entity):

    #根据标注的实体位置和预测的实体位置，计算prf,完全匹配
    #输入： Python-list  3D，值为每个实体的起始位置列表[begin，end]
    #输出： float
   
    truenum = 0
    prenum = 0
    goldnum = 0
    for i in range(len(gold_entity)):
        goldnum += len(gold_entity[i])
        prenum  += len(pre_entity[i])
        truenum += len(set(gold_entity[i]).intersection(set(pre_entity[i])))
    try:
        precise = float(truenum) / float(prenum)
        recall = float(truenum) / float(goldnum)
        f = float(2 * precise * recall /( precise + recall)) 
    except:
        precise = recall = f = 0.0
    print('本轮实体的F值是 %f' %(f))
    return precise,recall,f

In [19]:
p,r,f = computeF(test_entitys,predict_entitys)
print ('%d epoch f-score is %.3f'%(1,f))

本轮实体的F值是 0.667022
1 epoch f-score is 0.667


In [None]:
0.654680

In [45]:
model.save(r'C:\Users\HashiriNio\Desktop\final_design\Final_one\data\model\model_ner_general.h5')

In [23]:
from keras_contrib import losses

In [24]:
loss=losses.crf_loss

In [28]:
from keras import backend as K

In [30]:
K.slice([0, 1], [-1, 2])

AttributeError: module 'keras.backend' has no attribute 'slice'

In [20]:
p

0.681917211328976

In [None]:
0.6641630901287554