# CRF+LSTM

requirement:

keras 2.2.4

tensorflow 1.13

pip install git+https://www.github.com/keras-team/keras-contrib.git

In [1]:
import re
import os
import pandas as pd

In [2]:
char_vocab_path = "CRF/data/char_vocabs.txt" # 字典文件
special_words = ['<PAD>', '<UNK>'] # 特殊词表示
label2idx = {'O': 0,
             'B-DISEASE': 1, 'B-DISEASE_GROUP': 2,
             'B-DRUG_DOSAGE': 3, 'B-DRUG_EFFICACY': 4,
             'B-DRUG_INGREDIENT': 5, 'B-DRUG_TASTE': 6,
             'B-FOOD_GROUP':7, 'B-PERSON_GROUP':8,
             'B-SYMPTOM':9, 'B-SYNDROME':10,
             'I-DISEASE': 11, 'I-DISEASE_GROUP': 12,
             'I-DRUG_DOSAGE': 13, 'I-DRUG_EFFICACY': 14,
             'I-DRUG_INGREDIENT': 15, 'I-DRUG_TASTE': 16,
             'I-FOOD_GROUP':17, 'I-PERSON_GROUP':18,
             'I-SYMPTOM':19, 'I-SYNDROME':20
            }

# 索引和BIO标签对应
idx2label = {idx: label for label, idx in label2idx.items()}

# 读取字符词典文件
with open(char_vocab_path, "r", encoding="utf8") as fo:
    char_vocabs = [line.strip() for line in fo]
char_vocabs = special_words + char_vocabs

# 字符和索引编号对应
idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
vocab2idx = {char: idx for idx, char in idx2vocab.items()}

In [3]:
# 读取训练语料
def read_corpus(corpus_path, vocab2idx, label2idx):
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()
    sent_, tag_ = [], []
    for letter in lines:
        [char,label,_] = re.split('\t|\n',letter)
        char = re.sub(' |\*|<|>','_',char)
        sent_.append(char)
        tag_.append(label)
    sent_ids = [vocab2idx[char] if char in vocab2idx else vocab2idx['<UNK>'] for char in sent_]
    tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
    return sent_ids, tag_ids

In [4]:
train_datas = []
train_labels = []
files = os.listdir('data/train_data')
for file in files:
    train_data_path_i = 'data/train_data/'+file
    train_datas_i, train_labels_i = read_corpus(train_data_path_i, vocab2idx, label2idx)
    train_datas.append(train_datas_i)
    train_labels.append(train_labels_i)

In [5]:
valid_datas = []
valid_labels = []
files = os.listdir('data/valid_data')
for file in files:
    valid_data_path_i = 'data/valid_data/'+file
    valid_datas_i, valid_labels_i = read_corpus(valid_data_path_i, vocab2idx, label2idx)
    valid_datas.append(valid_datas_i)
    valid_labels.append(valid_labels_i)

In [6]:
# 提取训练集已有规则
train_files = os.listdir('data/train')
train_ann_files = [x for x in train_files if x.endswith('.ann')]
tag_dic = {}
del_list = []
for file in train_ann_files:
    with open('data/train/%s'%file) as file_obj:
        labeled_data = file_obj.read()
    labeled_data_list = labeled_data.split('\n')
    for labeled in labeled_data_list:
        res = re.split(' |\t',labeled)
        tag = res[1]
        word = res[4]
        if word not in tag_dic and word not in del_list:
            tag_dic[word] = tag
        else:
            try:
                del tag_dic[word]
            except:
                pass
            del_list.append(word)

In [7]:
print(train_datas[50])
print([idx2vocab[idx] for idx in train_datas[50]])
print(train_labels[50])
print([idx2label[idx] for idx in train_labels[50]])

[58, 61, 77, 1, 58, 17, 181, 3093, 3817, 2654, 6214, 1959, 2177, 286, 6802, 5965, 519, 1408, 2644, 2102, 2732, 1842, 889, 2545, 3093, 3817]
['_', 'b', 'r', '<UNK>', '_', '3', '、', '治', '疗', '期', '间', '忌', '房', '事', '，', '配', '偶', '如', '有', '感', '染', '应', '同', '时', '治', '疗']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### CRF+BiLSTM

In [8]:
import numpy as np
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras import backend as K
K.clear_session()

EPOCHS = 40
BATCH_SIZE = 128
EMBED_DIM = 48
HIDDEN_SIZE = 16
MAX_LEN = 100
VOCAB_SIZE = len(vocab2idx)
CLASS_NUMS = len(label2idx)
print(VOCAB_SIZE, CLASS_NUMS)

print('padding sequences')
train_datas = sequence.pad_sequences(train_datas, maxlen=MAX_LEN)
train_labels = sequence.pad_sequences(train_labels, maxlen=MAX_LEN)
valid_datas = sequence.pad_sequences(valid_datas, maxlen=MAX_LEN)
valid_labels = sequence.pad_sequences(valid_labels, maxlen=MAX_LEN)
print('x_train shape:', train_datas.shape)
print('x_test shape:', valid_datas.shape)

train_labels = keras.utils.to_categorical(train_labels, CLASS_NUMS)
valid_labels = keras.utils.to_categorical(valid_labels, CLASS_NUMS)
print('trainlabels shape:', train_labels.shape)
print('testlabels shape:', valid_labels.shape)

## BiLSTM+CRF模型构建
inputs = Input(shape=(MAX_LEN,), dtype='int32')
x = Masking(mask_value=0)(inputs)
x = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(x)
x = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True))(x)
x = TimeDistributed(Dense(CLASS_NUMS))(x)#TimeDistributed层的作用就是把Dense层应用到这10个具体的向量上，对每一个向量进行了一个Dense操作
outputs = CRF(CLASS_NUMS)(x)
model = Model(inputs=inputs, outputs=outputs)
model.summary()

model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
model.fit(train_datas, train_labels, epochs=EPOCHS, verbose=1, validation_split=0.1)

score = model.evaluate(valid_datas, valid_labels, batch_size=BATCH_SIZE)
print(model.metrics_names)
print(score)

Using TensorFlow backend.





6874 21
padding sequences
x_train shape: (6899, 100)
x_test shape: (3974, 100)
trainlabels shape: (6899, 100, 21)
testlabels shape: (3974, 100, 21)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
masking_1 (Masking)          (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 32)           219968    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 24)           4320      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 21)           525       
___________________________

Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
['loss', 'crf_viterbi_accuracy']
[7.449537039163069, 0.889434632519471]


In [9]:
def get_valid_nertag(input_data, result_tags):
    result_words = []
    start, end =0, 1 # 实体开始结束位置标识
    tag_label = "O" # 实体类型标识
    number = 0
    for i, tag in enumerate(result_tags):
        if tag.startswith("B"):
            number += 1
            if tag_label != "O": # 当前实体tag之前有其他实体     
                result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))# 获取实体 
            tag_label = tag.split("-")[1] # 获取当前实体类型
            start, end = i, i+1 # 开始和结束位置变更
        elif tag.startswith("I"):
            temp_label = tag.split("-")[1]
            if temp_label == tag_label: # 当前实体tag是之前实体的一部分
                end += 1 # 结束位置end扩展
        elif tag == "O":
            if tag_label != "O": # 当前位置非实体 但是之前有实体
                result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))
                tag_label = "O"  # 实体类型置"O"
            start, end = i, i+1 # 开始和结束位置变更
    if tag_label != "O": # 最后结尾还有实体
        number += 1
        result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))# 获取结尾的实体
    return result_words

In [10]:
# 线下验证数据集
maxlen = 100
valid_result = {}
valid_data_path = 'data/valid/text/'
#test_data_path = 'data/valid/'
for i in range(len(os.listdir(valid_data_path))):
    valid_file = valid_data_path+str(i+800)+'.txt'
    with open(valid_file, "r", encoding="utf8") as valid:
        sentence = valid.read()
    sentences = sentence.split('。')
    y_ner = []
    for sent in sentences:
        sent = sent.replace(' ','_')
        sent_chars = list(sent+'。')
        sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sent_chars]

        sent2id_new = np.array([[0] * (maxlen-len(sent2id)) + sent2id[:maxlen]])
        y_pred = model.predict(sent2id_new)
        y_label = np.argmax(y_pred, axis=2)
        y_label = y_label.reshape(1, -1)[0]
        y_ner_ = [idx2label[i] for i in y_label][-len(sent_chars):]
        y_ner.extend(y_ner_)
    result_words = get_valid_nertag(sentence, y_ner)
    ans = []
    for res in result_words:
        number = res[0]
        #tag = res[1]
        start = res[2]
        end = res[3]
        word = res[4]
        word = re.sub(' |，|：|；|、|;|。|（|）','_',res[4]) # 有特殊字符的实体不予加入结果
        try:
            tag = tag_dic[word]
        except:
            tag = res[1]
        if word.startswith('_'):
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start+1, end, word[1:]))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start+1, end, word[1:]))
        if word.endswith('_'):
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end-1, word[:-1]))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start+1, end-1, word[:-1]))
        if '_' not in word:
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end, word))
        
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start, end, word))
    #print('='*100)
    
    valid_result[i+800] = ans

T1	DRUG_DOSAGE 4 8	薄膜衣片
T2	DRUG_TASTE 19 21	味涩
T3	DRUG_TASTE 22 24	微苦
T4	DISEASE 44 47	糖尿病
T5	SYMPTOM 127 129	尿频
T6	SYMPTOM 130 132	尿急
T7	SYMPTOM 133 135	尿痛
T8	DRUG_EFFICACY 483 485	清湿
T9	DRUG_EFFICACY 487 489	止带
T10	DISEASE_GROUP 499 502	的带下
T11	SYMPTOM 505 509	见白带量
T12	DRUG_DOSAGE 509 511	多生
T13	FOOD_GROUP 511 513	冷辛
T14	FOOD_GROUP 515 516	生
T15	SYMPTOM 547 551	见赤白带
T1	DRUG_DOSAGE 15 18	胶囊剂
T2	DRUG_DOSAGE 30 32	粉未
T3	DRUG_TASTE 33 35	气香
T4	DRUG_TASTE 36 39	味微苦
T5	DRUG_EFFICACY 42 46	活血化瘀
T6	DRUG_EFFICACY 47 51	软坚散结
T7	DISEASE 54 58	子宫肌瘤
T8	DISEASE 59 62	盆腔炎
T9	FOOD_GROUP 62 64	生冷
T10	PERSON_GROUP 83 89	抑制苯甲酸雌
T11	DRUG_EFFICACY 154 158	所致的家
T12	DRUG_EFFICACY 159 163	血液粘滞
T13	DISEASE 166 170	制正常大
T14	SYMPTOM 171 177	离体子宫平滑
T15	SYMPTOM 178 182	收缩频率
T1	DISEASE 170 173	宫颈炎
T2	SYMPTOM 174 178	宫颈糜烂
T3	SYMPTOM 193 196	小腹痛
T4	DRUG_TASTE 379 381	血球
T5	SYMPTOM 400 401	烯醇
T6	DISEASE 490 493	平均达
T7	DISEASE 573 576	或栓剂
T8	SYMPTOM 586 589	阴癌有
T9	DISEASE 644 647	咨询医
T10	SYMPTOM 649 651	或药师
T11	DISEA

T1	DRUG_EFFICACY 29 36	抑制血小板聚集
T2	DRUG_EFFICACY 57 61	活血化瘀
T3	DRUG_EFFICACY 122 126	活血破瘀
T4	SYMPTOM 144 148	肌肤甲错
T5	SYMPTOM 149 153	目眶黯黑
T6	SYMPTOM 154 158	潮热羸瘦
T7	SYMPTOM 159 167	经闭不行腹部肿块
T8	SYMPTOM 168 172	肌肤甲错
T1	SYNDROME 38 40	气滞
T2	SYMPTOM 73 81	月经不调赤白带下
T3	SYMPTOM 82 86	小腹冷痛
T4	SYMPTOM 86 90	赤白带下
T5	SYMPTOM 91 95	小腹冷痛
T6	SYNDROME 96 100	气血衰弱
T7	SYMPTOM 101 105	久不受孕
T8	SYNDROME 111 113	血亏
T9	SYMPTOM 119 127	月经不调赤白带下
T10	SYMPTOM 128 132	小腹冷痛
T11	SYNDROME 133 137	气血衰弱
T12	DRUG_DOSAGE 171 174	的小蜜
T13	DRUG_TASTE 176 178	气微
T14	DRUG_TASTE 180 181	味
T15	DRUG_EFFICACY 236 238	补气
T16	SYNDROME 247 249	女血
T17	SYMPTOM 256 263	月经不调赤白带
T18	SYNDROME 270 273	气血衰
T19	SYMPTOM 275 278	久不受
T20	DRUG_EFFICACY 278 282	孕抑郁气
T1	DRUG_EFFICACY 201 205	滋阴清热
T2	DRUG_EFFICACY 206 210	安神除烦
T3	SYNDROME 221 225	阴虚火旺
T4	SYMPTOM 229 233	潮热面红
T5	SYMPTOM 234 238	自汗盗汗
T6	SYMPTOM 244 248	失眠多梦
T7	SYMPTOM 249 253	头晕耳鸣
T8	SYMPTOM 254 258	腰膝酸软
T9	SYMPTOM 259 263	手足心热
T10	SYMPTOM 264 272	妇女卵巢功能衰退
T11	DRUG_EFFICACY 272 276	

T1	DRUG_DOSAGE 76 80	薄膜衣片
T2	DRUG_TASTE 99 101	味涩
T3	DRUG_TASTE 102 104	微苦
T4	SYNDROME 107 111	肾阳不足
T5	SYNDROME 112 116	气滞血瘀
T6	DRUG_EFFICACY 158 160	益肾
T7	DRUG_EFFICACY 161 163	活血
T8	DRUG_EFFICACY 164 168	软坚散结
T9	SYNDROME 172 176	肾阳不足
T10	SYNDROME 177 181	气滞血瘀
T1	SYMPTOM 88 93	甲状腺结节
T2	DRUG_DOSAGE 130 133	胶囊剂
T3	DRUG_DOSAGE 142 144	粉未
T4	DRUG_TASTE 145 147	气香
T5	DRUG_TASTE 148 151	味微苦
T6	DRUG_EFFICACY 186 190	化瘀止痛
T7	SYMPTOM 217 219	瘰疬
T1	PERSON_GROUP 1 3	孕妇
T2	SYMPTOM 47 49	恶心
T3	DRUG_EFFICACY 126 130	疏肝解郁
T4	DRUG_EFFICACY 131 135	理气止痛
T5	DRUG_EFFICACY 136 140	活血破瘀
T6	DRUG_EFFICACY 146 150	软坚散结
T7	DRUG_EFFICACY 151 155	补气健脾
T8	DRUG_EFFICACY 243 249	改善血液循环
T9	DRUG_EFFICACY 250 252	抗炎
T10	DRUG_EFFICACY 253 258	调节内分泌
T1	DISEASE 79 85	乳腺小叶增生
T2	DISEASE 86 90	子宫肌瘤
T3	DRUG_DOSAGE 91 95	卵巢囊肿
T4	DRUG_EFFICACY 118 122	舒肝理气
T5	DRUG_EFFICACY 123 127	软坚散结
T6	DRUG_EFFICACY 128 132	活血化瘀
T7	DRUG_EFFICACY 133 137	消肿止痛
T8	SYNDROME 140 144	气滞血瘀
T9	DISEASE 146 152	乳腺小叶增生
T10	DISEASE 153 157	子宫肌瘤
T11	DI

T1	DRUG_EFFICACY 0 4	理气养血
T2	DRUG_EFFICACY 5 9	暖宫调经
T3	SYNDROME 12 16	血虚气滞
T4	SYNDROME 17 21	下焦虚寒
T5	SYMPTOM 29 31	痛经
T6	SYMPTOM 34 38	行经后错
T7	SYMPTOM 39 42	经量少
T8	SYMPTOM 43 46	有血块
T9	SYMPTOM 47 51	小腹疼痛
T10	SYMPTOM 52 60	经行小腹冷痛喜热
T11	PERSON_GROUP 65 67	儿童
T12	DRUG_INGREDIENT 121 123	方中
T13	DRUG_EFFICACY 125 129	分药物具
T14	DRUG_EFFICACY 130 132	镇痛
T15	DRUG_EFFICACY 152 152	能
T16	DRUG_EFFICACY 153 155	<b
T17	SYMPTOM 158 165	2.方中主药艾
T18	DRUG_INGREDIENT 187 189	附能
T19	DRUG_EFFICACY 196 198	镇痛
T20	DRUG_EFFICACY 218 218	能
T21	DRUG_EFFICACY 228 234	主药艾叶具有
T22	SYMPTOM 287 291	金黄色葡
T23	SYMPTOM 292 294	球菌
T24	SYMPTOM 295 299	某些真菌
T25	SYMPTOM 301 301	长
T26	SYMPTOM 303 305	具有
T27	SYNDROME 309 311	作用
T28	DRUG_EFFICACY 322 329	主要用于治疗妇
T29	SYMPTOM 345 345	漏
T30	DISEASE 347 351	下等疾病
T31	SYMPTOM 352 356	虚寒证型
T32	SYMPTOM 362 364	丸还
T33	DRUG_DOSAGE 400 401	慢
T34	DRUG_TASTE 402 404	肠炎
T35	DRUG_TASTE 408 410	病证
T36	SYNDROME 418 421	_内蒙
T37	SYNDROME 422 426	天奇中蒙
T38	SYMPTOM 429 433	份有限公
T39	SYNDROME 442 444	

T1	DRUG_EFFICACY 3 5	止血
T2	DRUG_EFFICACY 6 10	滋阴清热
T3	SYMPTOM 24 28	月经过多
T4	SYMPTOM 29 33	经期延长
T5	SYMPTOM 37 41	月经量多
T6	SYMPTOM 42 46	经期延长
T7	SYMPTOM 57 60	小血块
T8	SYMPTOM 61 65	腰膝酸软
T9	SYMPTOM 66 70	咽干口燥
T10	SYMPTOM 71 75	潮热心烦
T11	SYMPTOM 76 80	舌红少津
T12	SYMPTOM 81 83	苔少
T13	SYMPTOM 87 90	脉细数
T14	SYMPTOM 136 140	舌红少津
T15	SYMPTOM 141 145	苔少或无
T16	SYMPTOM 223 226	__子
T17	DRUG_DOSAGE 257 257	2
T18	DRUG_TASTE 260 261	用
T19	DRUG_TASTE 262 264	月经
T20	DRUG_EFFICACY 331 333	一次
T21	SYMPTOM 352 356	4天为一
T22	SYMPTOM 364 368	用2个月
T23	SYMPTOM 370 372	周期_
T24	SYMPTOM 384 387	用于冲
T25	SYMPTOM 393 397	血热所致
T26	SYMPTOM 399 401	经过多
T27	SYMPTOM 404 406	期延长
T28	SYMPTOM 408 410	见月
T29	SYMPTOM 411 413	量多
T30	SYMPTOM 414 417	经期延
T1	DRUG_EFFICACY 118 122	清热除湿
T2	DRUG_EFFICACY 123 127	杀虫止痒
T3	PERSON_GROUP 130 132	妇女
T4	SYNDROME 132 136	湿热下注
T5	SYMPTOM 142 146	阴痒灼痛
T6	SYMPTOM 147 151	带下量多
T7	DISEASE 188 195	非特异性阴道炎
T8	PERSON_GROUP 195 197	孕妇
T1	DRUG_EFFICACY 3 7	滋阴清热
T2	DRUG_EFFICACY 8 12	健脾养血
T3	SYMPTOM 24 28	月经

T1	PERSON_GROUP 13 15	孕妇
T2	DRUG_DOSAGE 35 38	胶囊剂
T3	DRUG_TASTE 51 53	气香
T4	DRUG_TASTE 54 57	味微苦
T5	DRUG_TASTE 58 60	微甘
T6	DRUG_EFFICACY 76 80	清热凉血
T7	DRUG_EFFICACY 81 85	化瘀止痛
T8	SYNDROME 88 92	瘀热蕴结
T9	SYMPTOM 102 106	带下量多
T10	SYMPTOM 110 114	少腹疼痛
T11	DISEASE 115 120	慢性盆腔炎
T12	DISEASE 147 152	慢性盆腔炎
T13	SYMPTOM 153 157	带下量多
T1	DRUG_EFFICACY 1 5	补气养血
T2	DRUG_EFFICACY 6 10	调经止带
T3	SYMPTOM 20 24	月经量少
T4	SYMPTOM 28 32	行经腹痛
T5	SYMPTOM 33 35	带下
T6	SYMPTOM 36 40	少腹冷痛
T7	SYMPTOM 41 45	体弱乏力
T8	SYMPTOM 46 50	腰酸腿软
T9	SYMPTOM 60 64	月经过少
T10	SYMPTOM 66 70	经期错后
T11	SYMPTOM 80 85	带下伴阴痒
T12	DRUG_DOSAGE 265 267	液体
T13	DRUG_TASTE 280 282	味甜
T14	DRUG_TASTE 283 287	微苦味甜
T15	DRUG_TASTE 288 290	微苦
T16	DISEASE 358 360	感冒
T17	DISEASE 366 369	糖尿病
T18	SYMPTOM 400 402	腹痛
T19	SYMPTOM 405 409	胸胁胀痛
T20	SYMPTOM 433 437	月经过少
T21	SYMPTOM 439 443	经期错后
T22	SYMPTOM 453 458	带下伴阴痒
T23	SYMPTOM 756 760	行经腹痛
T24	SYMPTOM 761 763	带下
T25	SYMPTOM 764 768	少腹冷痛
T26	SYMPTOM 769 773	腰酸腿软
T1	DISEASE 0 3	糖尿病
T2	SYMPTOM 50 54	小儿疳积
T3	SYM

T1	DRUG_EFFICACY 233 236	清热解
T2	DISEASE 239 242	肿散结
T3	SYMPTOM 244 248	用于附件炎
T4	DRUG_DOSAGE 249 252	附件炎
T5	DISEASE_GROUP 285 288	热下注
T6	DISEASE 289 294	致的带下病
T7	DISEASE 295 298	慢性盆
T8	DRUG_TASTE 327 329	显浅
T9	DRUG_TASTE 331 332	褐色
T1	DRUG_EFFICACY 1 5	益气化瘀
T2	DRUG_EFFICACY 6 10	祛风通络
T3	DRUG_EFFICACY 11 15	舒筋止痛
T4	DRUG_EFFICACY 22 26	中度神经
T5	DISEASE 28 31	颈椎病
T6	SYNDROME 31 35	气虚血瘀
T7	SYMPTOM 56 60	上肢麻木
T8	SYMPTOM 61 65	神疲乏力
T9	DRUG_DOSAGE 65 69	薄膜衣丸
T10	DRUG_DOSAGE 73 77	薄膜衣丸
T11	DRUG_TASTE 93 95	气微
T12	DRUG_TASTE 96 98	味苦
T13	PERSON_GROUP 117 120	运动员
T14	SYMPTOM 197 199	胃胀
T15	SYMPTOM 203 205	腹胀
T16	SYMPTOM 211 213	腹泻
T17	SYMPTOM 246 251	性心律不齐
T18	PERSON_GROUP 335 337	孕妇
T1	FOOD_GROUP 47 50	刺激性
T2	DISEASE 63 69	乳腺小叶增生
T3	DISEASE 70 74	子宫肌瘤
T4	DRUG_DOSAGE 84 88	薄膜衣片
T5	DRUG_TASTE 103 106	气微香
T6	DRUG_TASTE 107 110	味微苦
T7	DRUG_EFFICACY 113 117	舒肝理气
T8	DRUG_EFFICACY 118 122	软坚散结
T9	DRUG_EFFICACY 123 127	活血化瘀
T10	DRUG_EFFICACY 128 132	消肿止痛
T11	SYNDROME 135 139	气滞血瘀
T12	DISEASE 141 145	乳腺小叶

T1	SYMPTOM 51 53	瘙痒
T2	DISEASE_GROUP 117 120	宫颈炎
T3	DISEASE 123 130	念珠菌性阴道炎
T4	DRUG_DOSAGE 136 138	栓剂
T1	DRUG_EFFICACY 3 7	促进造血
T2	DRUG_EFFICACY 8 10	止血
T3	DRUG_EFFICACY 352 356	补气养血
T4	SYNDROME 364 368	气血两虚
T5	SYMPTOM 369 373	身体瘦弱
T6	SYMPTOM 374 378	腰膝酸软
T7	SYMPTOM 379 383	月经不调
T8	DISEASE 404 406	感冒
T9	DRUG_INGREDIENT 427 429	藜芦
T10	DRUG_INGREDIENT 430 433	五灵脂
T11	DRUG_INGREDIENT 434 436	皂荚
T12	SYMPTOM 471 475	月经量少
T13	SYMPTOM 477 481	月经错后
T14	DRUG_DOSAGE 709 713	薄膜衣片
T15	DRUG_TASTE 723 725	味甜
T16	DRUG_TASTE 726 728	微苦
T17	SYMPTOM 792 796	身体瘦弱
T18	SYMPTOM 797 801	腰膝酸软
T19	SYMPTOM 802 806	月经不调
T1	DRUG_EFFICACY 64 66	消炎
T2	DRUG_EFFICACY 67 69	生肌
T3	DRUG_EFFICACY 70 72	止痛
T4	DISEASE 75 81	霉菌性阴道炎
T5	DISEASE 145 147	感冒
T6	SYMPTOM 147 149	发热
T7	DISEASE 159 162	高血压
T8	DISEASE_GROUP 163 166	心脏病
T9	DISEASE_GROUP 167 169	肝病
T10	DISEASE 170 173	糖尿病
T11	DISEASE_GROUP 174 176	肾病
T12	DISEASE_GROUP 177 180	慢性病
T13	PERSON_GROUP 195 200	青春期少女
T14	PERSON_GROUP 201 206	更年期妇女
T15	SYMPTOM 229 233	月经过少
T16

T1	DRUG_EFFICACY 1 5	补气养血
T2	DRUG_EFFICACY 6 10	调经止带
T3	SYNDROME 13 17	气血两虚
T4	SYMPTOM 18 22	身体瘦弱
T5	SYMPTOM 23 27	腰膝酸软
T6	SYMPTOM 28 32	月经不调
T7	PERSON_GROUP 38 43	更年期妇女
T8	SYMPTOM 66 70	月经过少
T9	SYMPTOM 72 76	经期错后
T10	PERSON_GROUP 172 174	儿童
T11	FOOD_GROUP 174 176	辛辣
T12	FOOD_GROUP 177 179	生冷
T13	DISEASE 184 186	感冒
T14	SYMPTOM 186 188	发热
T15	DISEASE 198 201	高血压
T16	DISEASE_GROUP 202 205	心脏病
T17	DISEASE_GROUP 206 208	肝病
T18	DISEASE 209 212	糖尿病
T19	DISEASE_GROUP 213 215	肾病
T20	DRUG_INGREDIENT 625 627	人参
T21	DRUG_INGREDIENT 628 630	白芍
T22	DRUG_INGREDIENT 631 634	反藜芦
T23	DRUG_INGREDIENT 638 640	藜芦
T24	DRUG_INGREDIENT 652 654	甘草
T25	DRUG_INGREDIENT 655 658	反甘遂
T26	DRUG_INGREDIENT 659 661	大戟
T27	DRUG_INGREDIENT 662 664	海藻
T28	DRUG_INGREDIENT 665 667	芫花
T29	DRUG_INGREDIENT 671 673	甘遂
T30	DRUG_INGREDIENT 674 676	大戟
T31	DRUG_INGREDIENT 677 679	海藻
T32	DRUG_INGREDIENT 680 682	芫花
T33	FOOD_GROUP 695 697	生冷
T34	FOOD_GROUP 698 700	辛辣
T35	FOOD_GROUP 701 703	荤腥
T36	FOOD_GROUP 703 705	油腻
T37	DRUG_INGRED

In [11]:
# 预测验证集
valid_pred = []
for i in range(800,1000):
    valid_pred.extend(list(map(lambda x:str(i)+' '+re.sub(r'T[\d+]{1,2}\t','',x),valid_result[i])))
    

In [12]:
# 验证集的真实label
valid_true = []
for i in range(800,1000):
    with open('data/valid/ann/%d.ann'%i) as file_obj:
        valid_label = file_obj.read().split('\n')
        valid_true.extend(list(map(lambda x:str(i)+' '+re.sub(r'T[\d+]{1,2}\t','',x),valid_label)))


In [13]:
# strict F1
def eval(valid_pred,valid_true):
    beta = 1
    P = len(set(valid_pred) & set(valid_true))/len(set(valid_true))
    R = len(set(valid_pred) & set(valid_true))/len(set(valid_pred))
    F = (1+beta**2)*P*R/(P+beta**2*R)
    print('P = %.4f'%P)
    print('R = %.4f'%R)
    print('F = %.4f'%F)
    return P,R,F

In [14]:
results = eval(valid_pred,valid_true)
results

P = 0.3874
R = 0.4865
F = 0.4313


(0.38737446197991393, 0.4864864864864865, 0.4313099041533547)

In [17]:
para = {}
para['EPOCHS'] = EPOCHS
para['BATCH_SIZE'] = BATCH_SIZE
para['EMBED_DIM'] = EMBED_DIM
para['HIDDEN_SIZE'] = HIDDEN_SIZE
para['MAX_LEN'] = MAX_LEN
para['VOCAB_SIZE'] = VOCAB_SIZE
para['CLASS_NUMS'] = CLASS_NUMS
para['VALID_PRED'] = len(set(valid_pred))
para['VALID_TRUE'] = len(set(valid_true))
para['P'] = results[0]
para['R'] = results[1]
para['F'] = results[2]

log = []
for p in para:
    log.append(p+' = '+str(para[p]))
    

import datetime
time = datetime.datetime.now().strftime('%F_%T')

with open('model/log/log_%s.txt'%time,'w', encoding='utf-8') as wr:
    wr.write('\n'.join(log))
# save model
model.save("model/model/model_%s.h5"%time)   

In [None]:
#(0.46909962706446456, 0.5053084648493543, 0.4865312888520513)

In [None]:
# output
result = {}
test_data_path = 'data/chusai_xuanshou/'
#test_data_path = 'data/valid/'
for i in range(len(os.listdir(test_data_path))):
#for i in range(7):
    test_file = test_data_path+str(i+1000)+'.txt'
    with open(test_file, "r", encoding="utf8") as test:
        sentence = test.read()
    sentences = sentence.split('。')
    y_ner = []

    for sent in sentences:
        sent = sent.replace(' ','_')
        sent_chars = list(sent+'。')
        sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sent_chars]

        sent2id_new = np.array([[0] * (maxlen-len(sent2id)) + sent2id[:maxlen]])
        y_pred = model.predict(sent2id_new)
        y_label = np.argmax(y_pred, axis=2)
        y_label = y_label.reshape(1, -1)[0]
        y_ner_ = [idx2label[i] for i in y_label][-len(sent_chars):]
        y_ner.extend(y_ner_)
    result_words = get_valid_nertag(sentence, y_ner)
    ans = []
    for res in result_words:
        number = res[0]
        #tag = res[1]
        start = res[2]
        end = res[3]
        word = re.sub(' |，|：|；|、|;|。|（|）','_',res[4]) # 有特殊字符的实体不予加入结果
        try:
            tag = tag_dic[word]
        except:
            tag = res[1]
        if word.startswith('_'):
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start+1, end, word[1:]))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start+1, end, word[1:]))
        if word.endswith('_'):
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end-1, word[:-1]))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start+1, end-1, word[:-1]))
        if '_' not in word:
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end, word))
        
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start, end, word))
    
    result[i+1000] = ans

In [None]:
#“实体类别”、“起始位置”、“结束位置”以空格分隔

In [None]:
for i in range(1000,1500):
    with open('data/submit/%d.ann'%i,'w', encoding='utf-8') as wr:
        wr.write('\n'.join(result[i]))