In [1]:
import numpy as np
import pandas as pd
import pickle
from keras_bert import load_trained_model_from_checkpoint, Tokenizer,get_custom_objects
import codecs
from keras.layers import Input,Dense,LSTM
from keras.layers.wrappers import TimeDistributed,Bidirectional
from keras.models import Model,load_model
from keras.optimizers import Adam

In [2]:
# Set CUDA_VISIBLE_DEVICES to "0" so that only the first GPU is visible to CUDA.
# NOTE(review): keras is already imported in the previous cell; presumably the
# variable is still honoured because no GPU work has started yet — confirm.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# Length of every encoded sequence: passed as max_len to tokenizer.encode, and
# getPair builds exactly this many label slots per question.
max_seq_len = 20
config_path = r'D:\final_design\Final_one\bert\bert_config.json'
checkpoint_path = r'D:\final_design\Final_one\bert\bert_model.ckpt'
dict_path = r'D:\final_design\Final_one\bert\vocab.txt'


def _load_corpus(path):
    """Load one pickled corpus and extract its questions and cleaned gold entities.

    Args:
        path: Path of the pickle file holding the corpus.

    Returns:
        A tuple (corpus, questions, entitys) where `questions[i]` is
        corpus[i]['question'] and `entitys[i]` is the list of cleaned names
        derived from corpus[i]['gold_entitys'].
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; only load corpora that come from a trusted source.
    with open(path,'rb') as corpus_file:  # closes the handle deterministically
        corpus = pickle.load(corpus_file)
    # The corpus is indexed by position 0..len-1 (kept as index access because
    # its container type is not visible here).
    questions = [corpus[i]['question'] for i in range(len(corpus))]
    # Each gold entity loses its first and last character and is cut at the
    # first '_' (presumably stripping a '<...>' wrapper and a '_suffix' — TODO confirm).
    entitys = [[entity[1:-1].split('_')[0] for entity in corpus[i]['gold_entitys']]
               for i in range(len(corpus))]
    return corpus, questions, entitys


train_corpus, train_questions, train_entitys = _load_corpus(r'D:\final_design\Final_one\data\corpus_train.pkl')
test_corpus, test_questions, test_entitys = _load_corpus(r'D:\final_design\Final_one\data\corpus_test.pkl')


# Build the token -> id vocabulary: each line of the vocab file is one token and
# its id is the number of tokens registered before it.
token_dict = {}
with codecs.open(dict_path,'r','utf-8') as f:
    for i in f:
        token = i.strip()
        token_dict[token] = len(token_dict)
tokenizer = Tokenizer(token_dict)

def setlabel(question,entity):
    """Return the longest contiguous substring shared by `question` and `entity`.

    Args:
        question: String that the result is sliced from.
        entity: String to match against `question`.

    Returns:
        The slice of `question` covering the longest common run. When several
        runs tie for the longest, the one found first (scanning `question` left
        to right) wins. An empty string is returned when nothing matches.
    """
    # table[r][c] holds the length of the common run that ends at
    # question[r-1] and entity[c-1]; the extra leading row/column of zeros
    # removes the need for boundary checks.
    table = np.zeros((len(question) + 1, len(entity) + 1), int)
    best_len = 0  # length of the longest run found so far
    best_end = 0  # index in `question` one past the end of that run
    for r, q_char in enumerate(question, start=1):
        for c, e_char in enumerate(entity, start=1):
            if q_char != e_char:
                continue
            run = table[r - 1][c - 1] + 1
            table[r][c] = run
            if run > best_len:
                best_len = run
                best_end = r
    return question[best_end - best_len:best_end]

def getPair(questions, entitys):
    """Encode questions and build per-position 0/1 entity labels for them.

    Args:
        questions: Sequence of question strings.
        entitys: Sequence aligned with `questions`; entitys[i] is the list of
            entity names for questions[i].

    Returns:
        Three numpy arrays (X, X2, Y): the token-index sequences and segment-index
        sequences returned by tokenizer.encode, and the label sequences, where
        each position holds [1] if it belongs to a matched entity and [0] otherwise.
    """
    token_rows, segment_rows, label_rows = [], [], []
    for idx, q in enumerate(questions):
        # Token indices and segment indices, both of length max_seq_len.
        x, x2 = tokenizer.encode(first=q,max_len = max_seq_len)
        y = [[0] for _ in range(max_seq_len)]
        assert len(x)==len(y)
        for name in entitys[idx]:
            # Longest contiguous substring shared by the entity name and the question.
            span = setlabel(q, name)
            offset = q.find(span)
            if offset < 0:
                continue
            # Character offset shifted by one (presumably to skip a leading
            # [CLS] token; this also assumes one token per character — TODO confirm).
            begin = offset + 1
            end = begin + len(span)
            # Spans reaching the last two positions of the sequence are left unlabelled.
            if end >= max_seq_len - 1:
                continue
            y[begin:end] = [[1] for _ in range(begin, end)]
        token_rows.append(x)
        segment_rows.append(x2)
        label_rows.append(y)
    return np.array(token_rows),np.array(segment_rows),np.array(label_rows)


# Build the training and test samples.
trainx1,trainx2,trainy = getPair(train_questions,train_entitys)  # trainx1 is (num_sample, max_len)
testx1,testx2,testy = getPair(test_questions,test_entitys)
print (trainx1.shape)

(2297, 20)


In [4]:
# Load the previously saved NER model; custom_objects comes from keras_bert's
# get_custom_objects() (presumably the custom layers needed to deserialize it).
model = load_model(r'D:\final_design\Final_one\data\model\model_ner_general.h5',custom_objects=get_custom_objects())



In [5]:
maxf = 0.0  # best F value seen so far; the training loop compares against and updates it
def computeF(gold_entity,pre_entity):
    """Compute exact-match precision, recall and F value over a corpus.

    Args:
        gold_entity: Sequence whose i-th element is the collection of gold
            entities for sample i. Elements must be hashable (they are put
            into a set).
        pre_entity: Sequence of predicted entity collections, indexed with the
            same positions as `gold_entity` (it must be at least as long).

    Returns:
        A tuple (precise, recall, f) of floats. All three are 0.0 when there is
        no exact match at all, which also covers empty gold or empty predictions.

    Side effects:
        Prints the F value.
    """
    truenum = 0   # entities present in both gold and prediction (per-sample set intersection)
    prenum = 0    # total predicted entities (list lengths, duplicates included)
    goldnum = 0   # total gold entities (list lengths, duplicates included)
    for i in range(len(gold_entity)):
        goldnum += len(gold_entity[i])
        prenum  += len(pre_entity[i])
        truenum += len(set(gold_entity[i]).intersection(set(pre_entity[i])))
    if truenum == 0:
        # No match means at least one of the divisions below would be x/0 or
        # 0/0; report zeros explicitly instead of hiding errors behind a bare except.
        precise = recall = f = 0.0
    else:
        precise = float(truenum) / float(prenum)
        recall = float(truenum) / float(goldnum)
        f = float(2 * precise * recall /( precise + recall))
    print('本轮实体的F值是 %f' %(f))
    return precise,recall,f

def restore_entity_from_labels_on_corpus(predicty,questions):
    """Turn per-position 0/1 labels back into entity strings for every question.

    Args:
        predicty: Sequence of label sequences, one per question.
        questions: Sequence of question strings aligned with `predicty`.

    Returns:
        A list with one entry per label sequence; each entry is the list of
        entity strings recovered from the corresponding question.
    """
    def restore_entity_from_labels(labels,question):
        """Collect each maximal run of characters whose label equals 1."""
        found = []
        run = []  # characters of the entity currently being assembled
        # The first and last labels are dropped so that the remaining labels
        # line up with the question's characters; pairing stops at the shorter
        # of the two sequences.
        for flag, char in zip(labels[1:-1], question):
            if flag == 1:
                run.append(char)
            elif run:
                found.append(''.join(run))
                run = []
        if run:
            found.append(''.join(run))
        return found

    return [restore_entity_from_labels(predicty[idx], questions[idx])
            for idx in range(len(predicty))]

In [None]:
# Fine-tune for three single-epoch rounds. After each round, predict on the test
# set, compute the entity-level scores and overwrite the saved model whenever
# the F value beats the best one seen so far (`maxf`).
for i in range(3):
    model.fit([trainx1,trainx2],trainy, epochs=1,batch_size = 32)
    predictys = model.predict([testx1,testx2],batch_size = 32)
    # Threshold each per-position score at 0.5 to obtain 0/1 labels.
    predictys = [[1 if i >0.5 else 0 for i in line]for line in predictys]
    k = restore_entity_from_labels_on_corpus(predictys,test_questions)
    # Spot-check a fixed window of test samples (indices 200-219).
    # NOTE(review): this raises IndexError if the test set has fewer than 220 samples.
    for m in range(200,220):
        print('predict:',k[m])
        print('true:',test_entitys[m])
    p,r,f = computeF(test_entitys,k)
    # Only F and precision are logged; recall `r` is computed but not printed.
    print('%d epoch,F is %.3f,precise is %.3f'%(i,f,p))
    if f>maxf:
        maxf = f
        # NOTE(review): this is the same path the model was loaded from, so the
        # original checkpoint is overwritten in place.
        model.save(r'D:\final_design\Final_one\data\model\model_ner_general.h5')
        print('model updated')

In [48]:
# Reload the model from the checkpoint path that the training loop saves to
# (presumably to continue from the best-scoring weights — confirm intent).
model = load_model(r'D:\final_design\Final_one\data\model\model_ner_general.h5',custom_objects=get_custom_objects())


In [98]:
# Run the tagger on one hand-written question and display the entity strings
# recovered from its predicted labels.
q = '覆盖网络是指创建在其他网络之上的网络'
i1,i2 = tokenizer.encode(first = q,max_len = max_seq_len)  # token indices, segment indices
i1 = np.array(i1)
i2 = np.array(i2)
print(q)
# Feed a batch containing exactly this one sample: token indices as the first
# model input and segment indices as the second. (Stacking i2 under i1 in the
# token input would run the model on a second, meaningless sample whose
# "tokens" are the segment indices.)
pre = model.predict([np.array([i1]),np.array([i2])])
# Threshold the per-position scores of the single sample at 0.5.
pre = [1 if i >0.5 else 0 for i in pre[0]]
finl = restore_entity_from_labels_on_corpus([pre],[q])
finl

覆盖网络是指创建在其他网络之上的网络


[['覆盖网络', '创建在其他网络']]

In [None]:
# NOTE(review): this cell repeats the `maxf` / `computeF` definitions from an
# earlier cell; running it resets `maxf` to 0.0. Consider keeping a single copy.
maxf = 0.0  # best F value seen so far; the training loop compares against and updates it
def computeF(gold_entity,pre_entity):
    """Compute exact-match precision, recall and F value over a corpus.

    Args:
        gold_entity: Sequence whose i-th element is the collection of gold
            entities for sample i. Elements must be hashable (they are put
            into a set).
        pre_entity: Sequence of predicted entity collections, indexed with the
            same positions as `gold_entity` (it must be at least as long).

    Returns:
        A tuple (precise, recall, f) of floats. All three are 0.0 when there is
        no exact match at all, which also covers empty gold or empty predictions.

    Side effects:
        Prints the F value.
    """
    truenum = 0   # entities present in both gold and prediction (per-sample set intersection)
    prenum = 0    # total predicted entities (list lengths, duplicates included)
    goldnum = 0   # total gold entities (list lengths, duplicates included)
    for i in range(len(gold_entity)):
        goldnum += len(gold_entity[i])
        prenum  += len(pre_entity[i])
        truenum += len(set(gold_entity[i]).intersection(set(pre_entity[i])))
    if truenum == 0:
        # No match means at least one of the divisions below would be x/0 or
        # 0/0; report zeros explicitly instead of hiding errors behind a bare except.
        precise = recall = f = 0.0
    else:
        precise = float(truenum) / float(prenum)
        recall = float(truenum) / float(goldnum)
        f = float(2 * precise * recall /( precise + recall))
    print('本轮实体的F值是 %f' %(f))
    return precise,recall,f

In [None]:
# Score predicted entities against the gold test entities.
# NOTE(review): `predict_entitys` is not assigned in any visible cell (the
# training loop stores its predictions in `k`); on a fresh kernel this line
# would raise NameError unless it is defined elsewhere — confirm.
p,r,f = computeF(test_entitys,predict_entitys)