# CRF+LSTM

requirement:

keras 2.2.4

tensorflow 1.13

pip install git+https://www.github.com/keras-team/keras-contrib.git

In [108]:
import re
import os
import pandas as pd

In [109]:
char_vocab_path = "CRF/data/char_vocabs.txt" # 字典文件
special_words = ['<PAD>', '<UNK>'] # 特殊词表示
label2idx = {'O': 0,
             'B-DISEASE': 1, 'B-DISEASE_GROUP': 2,
             'B-DRUG_DOSAGE': 3, 'B-DRUG_EFFICACY': 4,
             'B-DRUG_INGREDIENT': 5, 'B-DRUG_TASTE': 6,
             'B-FOOD_GROUP':7, 'B-PERSON_GROUP':8,
             'B-SYMPTOM':9, 'B-SYNDROME':10,
             'I-DISEASE': 11, 'I-DISEASE_GROUP': 12,
             'I-DRUG_DOSAGE': 13, 'I-DRUG_EFFICACY': 14,
             'I-DRUG_INGREDIENT': 15, 'I-DRUG_TASTE': 16,
             'I-FOOD_GROUP':17, 'I-PERSON_GROUP':18,
             'I-SYMPTOM':19, 'I-SYNDROME':20
            }

# 索引和BIO标签对应
idx2label = {idx: label for label, idx in label2idx.items()}

# 读取字符词典文件
with open(char_vocab_path, "r", encoding="utf8") as fo:
    char_vocabs = [line.strip() for line in fo]
char_vocabs = special_words + char_vocabs

# 字符和索引编号对应
idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
vocab2idx = {char: idx for idx, char in idx2vocab.items()}

In [110]:
# 读取训练语料
def read_corpus(corpus_path, vocab2idx, label2idx):
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()
    sent_, tag_ = [], []
    for letter in lines:
        [char,label,_] = re.split('\t|\n',letter)
        char = re.sub(' |\*|<|>','_',char)
        sent_.append(char)
        tag_.append(label)
    sent_ids = [vocab2idx[char] if char in vocab2idx else vocab2idx['<UNK>'] for char in sent_]
    tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
    return sent_ids, tag_ids

In [111]:
train_datas = []
train_labels = []
files = os.listdir('data/train_data')
for file in files:
    train_data_path_i = 'data/train_data/'+file
    train_datas_i, train_labels_i = read_corpus(train_data_path_i, vocab2idx, label2idx)
    train_datas.append(train_datas_i)
    train_labels.append(train_labels_i)

In [112]:
valid_datas = []
valid_labels = []
files = os.listdir('data/valid_data')
for file in files:
    valid_data_path_i = 'data/valid_data/'+file
    valid_datas_i, valid_labels_i = read_corpus(valid_data_path_i, vocab2idx, label2idx)
    valid_datas.append(valid_datas_i)
    valid_labels.append(valid_labels_i)

In [133]:
train_files = os.listdir('data/train')
train_ann_files = [x for x in train_files if x.endswith('.ann')]
tag_dic = {}
del_list = []
for file in train_ann_files:
    with open('data/train/%s'%file) as file_obj:
        labeled_data = file_obj.read()
    labeled_data_list = labeled_data.split('\n')
    for labeled in labeled_data_list:
        res = re.split(' |\t',labeled)
        tag = res[1]
        word = res[4]
        if word not in tag_dic and word not in del_list:
            tag_dic[word] = tag
        else:
            try:
                del tag_dic[word]
            except:
                pass
            del_list.append(word)

In [157]:
tag_dic

{'轻度胃脘不适': 'SYMPTOM',
 '热毒证': 'SYNDROME',
 '苦味': 'DRUG_TASTE',
 '淀粉': 'DRUG_INGREDIENT',
 '腰酸': 'SYMPTOM',
 '素有癥瘕': 'SYMPTOM',
 '阴道出血量多': 'SYMPTOM',
 '肥腻': 'FOOD_GROUP',
 '消徵破积': 'DRUG_EFFICACY',
 '气短多汗': 'SYMPTOM',
 '大便难': 'SYMPTOM',
 '益肾养血': 'DRUG_EFFICACY',
 '补气滋阴': 'DRUG_EFFICACY',
 '失血过多': 'SYMPTOM',
 '痔漏下血': 'SYMPTOM',
 '止血安胎': 'DRUG_EFFICACY',
 '头痛身酸': 'SYMPTOM',
 '盆腔炎症': 'PERSON_GROUP',
 '鱼类': 'FOOD_GROUP',
 '黄藤素': 'DRUG_INGREDIENT',
 '妇科炎': 'SYMPTOM',
 '呼吸道': 'SYMPTOM',
 '除湿健脾': 'DRUG_EFFICACY',
 '流产后': 'SYMPTOM',
 '筋骨无力': 'SYMPTOM',
 '眼目昏花': 'SYMPTOM',
 '消渴尿多': 'SYMPTOM',
 '肢冷畏寒': 'SYMPTOM',
 '舌红苔少': 'SYMPTOM',
 '听力减退': 'SYMPTOM',
 '脑动脉硬化': 'DISEASE',
 '风湿阻络': 'SYNDROME',
 '宜痹止痛': 'DRUG_EFFICACY',
 '补肾健骨': 'DRUG_EFFICACY',
 '为甘苦': 'DRUG_TASTE',
 '强壮健脑': 'DRUG_EFFICACY',
 '补肝补肾': 'DRUG_EFFICACY',
 '益精活血': 'DRUG_EFFICACY',
 '肾虚白发': 'SYMPTOM',
 '牙齿摇动': 'SYMPTOM',
 '筋骨软弱': 'SYMPTOM',
 '小便淋漓': 'SYMPTOM',
 '急慢性肾炎': 'DISEASE',
 '骨髓炎': 'DISEASE',
 '关节炎': 'DISEASE',
 '筋膜炎': 'DISEASE',

In [113]:
print(train_datas[50])
print([idx2vocab[idx] for idx in train_datas[50]])
print(train_labels[50])
print([idx2label[idx] for idx in train_labels[50]])

[58, 61, 77, 1, 58, 17, 181, 3093, 3817, 2654, 6214, 1959, 2177, 286, 6802, 5965, 519, 1408, 2644, 2102, 2732, 1842, 889, 2545, 3093, 3817]
['_', 'b', 'r', '<UNK>', '_', '3', '、', '治', '疗', '期', '间', '忌', '房', '事', '，', '配', '偶', '如', '有', '感', '染', '应', '同', '时', '治', '疗']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### CRF+BiLSTM

In [114]:
import numpy as np
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras import backend as K
K.clear_session()

EPOCHS = 30
BATCH_SIZE = 64
EMBED_DIM = 48
HIDDEN_SIZE = 12
MAX_LEN = 100
VOCAB_SIZE = len(vocab2idx)
CLASS_NUMS = len(label2idx)
print(VOCAB_SIZE, CLASS_NUMS)

print('padding sequences')
train_datas = sequence.pad_sequences(train_datas, maxlen=MAX_LEN)
train_labels = sequence.pad_sequences(train_labels, maxlen=MAX_LEN)
valid_datas = sequence.pad_sequences(valid_datas, maxlen=MAX_LEN)
valid_labels = sequence.pad_sequences(valid_labels, maxlen=MAX_LEN)
print('x_train shape:', train_datas.shape)
print('x_test shape:', valid_datas.shape)

train_labels = keras.utils.to_categorical(train_labels, CLASS_NUMS)
valid_labels = keras.utils.to_categorical(valid_labels, CLASS_NUMS)
print('trainlabels shape:', train_labels.shape)
print('testlabels shape:', valid_labels.shape)

## BiLSTM+CRF模型构建
inputs = Input(shape=(MAX_LEN,), dtype='int32')
x = Masking(mask_value=0)(inputs)
x = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(x)
x = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True))(x)
x = TimeDistributed(Dense(CLASS_NUMS))(x)#TimeDistributed层的作用就是把Dense层应用到这10个具体的向量上，对每一个向量进行了一个Dense操作
outputs = CRF(CLASS_NUMS)(x)
model = Model(inputs=inputs, outputs=outputs)
model.summary()

model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
model.fit(train_datas, train_labels, epochs=EPOCHS, verbose=1, validation_split=0.1)

score = model.evaluate(valid_datas, valid_labels, batch_size=BATCH_SIZE)
print(model.metrics_names)
print(score)

6874 21
padding sequences
x_train shape: (6899, 100)
x_test shape: (3974, 100)
trainlabels shape: (6899, 100, 21)
testlabels shape: (3974, 100, 21)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
masking_1 (Masking)          (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 48)           329952    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 24)           5856      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 21)           525       
_________________________________________________________________
crf_1 (CRF)                  (None, 100, 21)           945  

In [115]:
# save model
model.save("model/ch_ner_model.h5")

In [116]:
def get_valid_nertag(input_data, result_tags):
    result_words = []
    start, end =0, 1 # 实体开始结束位置标识
    tag_label = "O" # 实体类型标识
    number = 0
    for i, tag in enumerate(result_tags):
        if tag.startswith("B"):
            number += 1
            if tag_label != "O": # 当前实体tag之前有其他实体     
                result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))# 获取实体 
            tag_label = tag.split("-")[1] # 获取当前实体类型
            start, end = i, i+1 # 开始和结束位置变更
        elif tag.startswith("I"):
            temp_label = tag.split("-")[1]
            if temp_label == tag_label: # 当前实体tag是之前实体的一部分
                end += 1 # 结束位置end扩展
        elif tag == "O":
            if tag_label != "O": # 当前位置非实体 但是之前有实体
                result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))
                tag_label = "O"  # 实体类型置"O"
            start, end = i, i+1 # 开始和结束位置变更
    if tag_label != "O": # 最后结尾还有实体
        number += 1
        result_words.append(('T'+str(number), tag_label, start, end,input_data[start: end]))# 获取结尾的实体
    return result_words

In [160]:
# 线下验证数据集
maxlen = 100
valid_result = {}
valid_data_path = 'data/valid/text/'
#test_data_path = 'data/valid/'
for i in range(len(os.listdir(valid_data_path))):
#for i in range(7):
    valid_file = valid_data_path+str(i+800)+'.txt'
    with open(valid_file, "r", encoding="utf8") as valid:
        sentence = valid.read()
    sentences = sentence.split('。')
    y_ner = []

    for sent in sentences:
        sent = sent.replace(' ','_')
        sent_chars = list(sent+'。')
        sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sent_chars]

        sent2id_new = np.array([[0] * (maxlen-len(sent2id)) + sent2id[:maxlen]])
        y_pred = model.predict(sent2id_new)
        y_label = np.argmax(y_pred, axis=2)
        y_label = y_label.reshape(1, -1)[0]
        y_ner_ = [idx2label[i] for i in y_label][-len(sent_chars):]
        y_ner.extend(y_ner_)
    result_words = get_valid_nertag(sentence, y_ner)
    ans = []
    for res in result_words:
        number = res[0]
        start = res[2]
        end = res[3]
        word = re.sub(' |，|：|；|、|;|。|（|）','_',res[4]) # 有特殊字符的实体不予加入结果
        try:
            tag = tag_dic[word]
        except:
            tag = res[1]
        if '_' not in word:
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end, word))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start, end, word))
    #print('='*100)
    
    valid_result[i+800] = ans

T1	DRUG_DOSAGE 4 8	薄膜衣片
T2	DRUG_TASTE 19 21	味涩
T3	DRUG_TASTE 22 24	微苦
T4	FOOD_GROUP 31 33	辛辣
T5	FOOD_GROUP 34 36	生冷
T6	FOOD_GROUP 37 39	油腻
T7	DISEASE 44 47	糖尿病
T8	SYMPTOM 81 83	阴痒
T9	PERSON_GROUP 105 110	脾胃虚寒者
T10	SYMPTOM 127 129	尿频
T11	SYMPTOM 130 132	尿急
T12	SYMPTOM 133 135	尿痛
T13	PERSON_GROUP 226 229	过敏者
T14	PERSON_GROUP 232 237	过敏体质者
T15	PERSON_GROUP 266 268	儿童
T16	SYMPTOM 285 287	恶心
T17	DRUG_DOSAGE 316 318	片剂
T18	DRUG_DOSAGE 319 322	薄膜衣
T19	PERSON_GROUP 325 327	孕妇
T20	DRUG_EFFICACY 411 413	镇痛
T21	SYNDROME 492 496	于湿热下
T22	DISEASE_GROUP 499 502	的带下
T23	SYMPTOM 505 509	见白带量
T24	FOOD_GROUP 509 511	多生
T25	FOOD_GROUP 511 513	冷辛
T26	SYMPTOM 547 551	见赤白带
T1	DRUG_DOSAGE 15 18	胶囊剂
T2	DRUG_DOSAGE 30 32	粉未
T3	DRUG_TASTE 33 35	气香
T4	DRUG_TASTE 36 39	味微苦
T5	DRUG_EFFICACY 42 46	活血化瘀
T6	DRUG_EFFICACY 47 51	软坚散结
T7	DISEASE 54 58	子宫肌瘤
T8	DISEASE 59 62	盆腔炎
T9	DRUG_INGREDIENT 62 65	生冷粉
T10	DRUG_EFFICACY 154 158	所致的家
T11	DRUG_EFFICACY 159 163	血液粘滞
T12	DISEASE 166 170	制正常大
T13	DISEASE 171 174	离体子
T14	S

T1	DRUG_DOSAGE 11 13	颗粒
T2	DRUG_TASTE 14 17	味微苦
T3	DISEASE 25 30	糖尿病肾病
T4	PERSON_GROUP 32 36	肾衰竭者
T5	DRUG_EFFICACY 48 52	健脾利湿
T6	DRUG_EFFICACY 53 57	活血化瘀
T7	DISEASE 60 67	慢性肾功能衰竭
T8	SYNDROME 70 72	血症
T9	DRUG_INGREDIENT 192 194	党参
T10	DRUG_INGREDIENT 195 197	白芍
T11	PERSON_GROUP 452 454	孕妇
T12	PERSON_GROUP 459 464	过敏体质者
T13	DISEASE_GROUP 480 485	肾小球肾炎
T14	DISEASE 486 490	高血压病
T15	DISEASE 491 496	糖尿病肾病
T16	DISEASE 583 589	高血压者水肿
T1	PERSON_GROUP 1 3	孕妇
T2	DRUG_DOSAGE 90 93	水蜜丸
T3	DRUG_EFFICACY 96 100	补气养血
T4	DRUG_DOSAGE 118 121	水蜜丸
T5	DRUG_TASTE 122 124	味甜
T6	DRUG_TASTE 125 127	微苦
T7	DRUG_EFFICACY 142 146	补气养血
T8	SYNDROME 154 158	气血两亏
T9	SYMPTOM 161 165	月经不调
T10	SYMPTOM 166 170	行经腹痛
T11	SYMPTOM 171 175	少腹冷痛
T12	SYMPTOM 176 180	体弱乏力
T13	SYMPTOM 181 185	腰酸腿软
T14	FOOD_GROUP 189 191	辛辣
T15	FOOD_GROUP 192 194	生冷
T16	DISEASE 199 201	感冒
T17	SYMPTOM 201 203	发热
T18	DISEASE 213 216	高血压
T19	DISEASE_GROUP 217 220	心脏病
T20	DISEASE_GROUP 221 223	肝病
T21	DISEASE 224 227	糖尿病
T22	DISEASE_GROUP 228 230	肾病
T23

T1	FOOD_GROUP 54 56	辛辣
T2	FOOD_GROUP 57 59	生冷
T3	DRUG_DOSAGE 83 85	丸剂
T4	DRUG_DOSAGE 86 89	水蜜丸
T5	DRUG_DOSAGE 156 158	丸剂
T6	DRUG_DOSAGE 159 162	水蜜丸
T7	PERSON_GROUP 165 167	孕妇
T8	DRUG_EFFICACY 171 175	理气补血
T9	DRUG_EFFICACY 176 180	暖宫调经
T10	SYMPTOM 188 192	月经量少
T11	SYMPTOM 196 200	经期腹痛
T12	SYMPTOM 201 214	腰酸带下赤带阴道不规则出血
T13	PERSON_GROUP 214 218	感冒艾附
T14	PERSON_GROUP 222 224	孕妇
T15	DRUG_DOSAGE 239 242	水蜜丸
T16	DRUG_TASTE 243 245	气微
T17	DRUG_TASTE 246 251	味甘而后苦
T18	FOOD_GROUP 286 288	辛辣
T19	FOOD_GROUP 289 291	生冷
T20	DISEASE 302 304	感冒
T21	SYMPTOM 331 335	经行有块
T22	SYMPTOM 336 338	腹痛
T23	SYMPTOM 341 345	胸胁胀痛
T24	SYMPTOM 365 369	月经过少
T25	SYMPTOM 371 375	经期错后
T26	SYMPTOM 377 384	阴道不规则出血
T27	SYMPTOM 385 390	带下伴阴痒
T28	SYMPTOM 392 394	赤带
T29	SYMPTOM 407 409	痛经
T30	SYMPTOM 457 461	重度痛经
T31	PERSON_GROUP 496 499	过敏者
T32	PERSON_GROUP 502 507	过敏体质者
T33	PERSON_GROUP 537 539	儿童
T34	SYNDROME 579 583	血虚气滞
T35	SYNDROME 584 588	下焦虚寒
T36	SYMPTOM 591 595	月经不调
T37	SYMPTOM 596 598	痛经
T38	SYNDROME 651 655	下焦虚寒
T39

T1	PERSON_GROUP 84 89	哺乳期妇女
T2	DRUG_EFFICACY 264 266	止痛
T3	DISEASE 283 289	霉菌性阴道炎
T4	PERSON_GROUP 304 306	孕妇
T1	DRUG_EFFICACY 50 54	凉血活血
T2	SYNDROME 68 75	血热伤阴挟瘀证
T3	SYMPTOM 78 82	皮肤紫癜
T4	SYMPTOM 83 85	齿衄
T5	SYMPTOM 86 88	鼻衄
T6	PERSON_GROUP 89 91	妇女
T7	SYMPTOM 91 95	月经过多
T8	SYMPTOM 96 98	口渴
T9	SYMPTOM 102 104	盗汗
T10	DISEASE 110 112	肿瘤
T11	SYMPTOM 135 139	轻度腹胀
T12	SYMPTOM 140 142	呕吐
T13	SYMPTOM 143 146	大便稀
T1	DISEASE 3 7	乳腺增生
T2	SYMPTOM 8 12	乳房胀痛
T3	DRUG_EFFICACY 106 110	疏肝理气
T4	DRUG_EFFICACY 111 115	活血化瘀
T5	SYNDROME 123 127	肝气郁结
T6	DRUG_EFFICACY 166 170	活血化瘀
T7	DRUG_EFFICACY 171 175	消散乳块
T8	SYNDROME 178 182	肝气郁结
T9	SYMPTOM 193 197	乳房胀痛
T10	PERSON_GROUP 256 258	儿童
T11	PERSON_GROUP 268 270	孕妇
T1	PERSON_GROUP 16 21	脾虚湿盛者
T2	SYMPTOM 87 91	眩晕耳鸣
T3	DRUG_EFFICACY 159 163	调节心律
T4	DRUG_EFFICACY 164 168	降低血压
T5	DRUG_EFFICACY 177 181	安神养心
T6	DRUG_EFFICACY 182 186	改善睡眠
T7	DRUG_EFFICACY 196 200	延缓衰老
T8	DRUG_EFFICACY 201 206	滋阴补肝肾
T9	SYMPTOM 231 235	肌肉酸胀
T10	DRUG_EFFICACY 238 242	强体增精
T11	DRUG_EFFIC

T1	FOOD_GROUP 61 63	辛辣
T2	DISEASE 71 73	感冒
T3	SYMPTOM 83 87	月经紊乱
T4	DRUG_EFFICACY 87 92	滋阴补肝肾
T5	SYMPTOM 117 121	肌肉酸胀
T6	DRUG_EFFICACY 160 164	增进食欲
T7	DRUG_DOSAGE 170 173	胶囊剂
T8	DRUG_TASTE 185 188	气微香
T9	DRUG_TASTE 189 192	味微甜
T10	SYMPTOM 208 212	眩晕耳鸣
T11	SYMPTOM 213 217	烦躁失眠
T12	FOOD_GROUP 258 260	辛辣
T13	FOOD_GROUP 263 265	油腻
T14	DISEASE 268 270	感冒
T15	SYMPTOM 280 284	月经紊乱
T16	DISEASE 291 294	高血压
T17	DISEASE_GROUP 295 298	心脏病
T18	DISEASE 299 302	糖尿病
T19	PERSON_GROUP 380 383	过敏者
T20	PERSON_GROUP 386 391	过敏体质者
T21	PERSON_GROUP 418 420	儿童
T22	DRUG_EFFICACY 482 486	滋阴潜阳
T23	DRUG_EFFICACY 487 491	除烦安神
T24	SYMPTOM 502 506	眩晕耳鸣
T25	DRUG_EFFICACY 526 549	补肝肾延缓衰老改善睡眠安神养心降低血压调节心律
T26	DRUG_EFFICACY 550 554	降低血压
T27	DRUG_EFFICACY 563 567	安神养心
T28	DRUG_EFFICACY 568 572	改善睡眠
T29	DRUG_EFFICACY 587 592	滋阴补肝肾
T30	SYMPTOM 617 621	肌肉酸胀
T31	DRUG_EFFICACY 660 664	增进食欲
T32	DRUG_EFFICACY 713 717	调节心律
T33	DRUG_EFFICACY 718 722	降低血压
T34	DRUG_EFFICACY 731 735	安神养心
T35	DRUG_EFFICACY 736 740	改善睡眠
T36	DRUG_EFFICA

T1	PERSON_GROUP 1 3	孕妇
T2	DRUG_EFFICACY 99 103	活血调经
T3	DRUG_EFFICACY 104 108	杀菌消炎
T4	DRUG_EFFICACY 109 113	止痛化症
T5	DRUG_EFFICACY 114 118	软坚散结
T6	DISEASE 123 128	慢性盆腔炎
T7	DISEASE 129 132	阴道炎
T8	SYMPTOM 133 137	月经不调
T9	SYMPTOM 138 142	痛经闭经
T10	SYMPTOM 143 147	白带量多
T11	SYMPTOM 148 152	子宫糜烂
T12	SYMPTOM 156 160	肿块色斑
T13	SYMPTOM 166 168	杀菌
T14	DRUG_EFFICACY 169 171	消炎
T15	DRUG_EFFICACY 172 174	止痛
T16	DRUG_DOSAGE 192 194	颗粒
T17	DRUG_TASTE 195 198	气微香
T18	DRUG_TASTE 199 201	味甜
T1	FOOD_GROUP 5 7	寒凉
T2	FOOD_GROUP 8 10	生冷
T3	DRUG_INGREDIENT 44 46	藜芦
T4	DRUG_INGREDIENT 47 50	五灵脂
T5	DRUG_INGREDIENT 51 53	皂荚
T6	DISEASE 63 65	感冒
T7	SYMPTOM 120 124	月经量少
T8	SYMPTOM 126 130	月经错后
T9	SYMPTOM 132 139	阴道不规则出血
T10	PERSON_GROUP 201 204	过敏者
T11	PERSON_GROUP 246 248	儿童
T12	DRUG_EFFICACY 321 325	补气养血
T13	SYNDROME 333 337	气血两虚
T14	SYMPTOM 338 342	身体瘦弱
T15	SYMPTOM 343 347	腰膝酸软
T16	SYMPTOM 348 352	月经不调
T17	SYMPTOM 353 357	崩漏带下
T18	SYMPTOM 363 367	经期腹痛
T19	SYMPTOM 368 372	肢体浮肿
T20	SYMPTOM 373 377	产后体弱
T21	SYMPTOM 37

T1	DRUG_INGREDIENT 49 51	乌鸡
T2	DRUG_INGREDIENT 60 63	鹿角胶
T3	DRUG_INGREDIENT 64 66	当归
T4	DRUG_INGREDIENT 67 69	白芍
T5	DRUG_INGREDIENT 70 73	熟地黄
T6	DRUG_INGREDIENT 74 76	人参
T7	DRUG_INGREDIENT 77 79	黄芪
T8	DRUG_INGREDIENT 80 82	香附
T9	DRUG_INGREDIENT 94 97	鹿角霜
T10	DRUG_INGREDIENT 98 100	牡蛎
T11	DRUG_EFFICACY 131 135	补气养血
T12	SYNDROME 143 147	气血两虚
T13	SYMPTOM 148 152	身体瘦弱
T14	SYMPTOM 153 157	腰膝酸软
T15	SYMPTOM 158 162	月经不调
T16	SYMPTOM 163 167	崩漏带下
T17	PERSON_GROUP 167 169	儿童
T18	SYMPTOM 169 173	经期错后
T19	SYMPTOM 173 177	月经过少
T20	FOOD_GROUP 197 199	辛辣
T21	FOOD_GROUP 200 202	生冷
T22	DISEASE 207 209	感冒
T23	SYMPTOM 209 211	发热
T24	DISEASE 221 224	高血压
T25	DISEASE_GROUP 225 228	心脏病
T26	DISEASE_GROUP 229 231	肝病
T27	DISEASE 232 235	糖尿病
T28	DISEASE_GROUP 236 238	肾病
T29	PERSON_GROUP 257 262	青春期少女
T30	PERSON_GROUP 263 268	更年期妇女
T31	SYMPTOM 291 295	月经过少
T32	SYMPTOM 297 301	经期错后
T33	SYMPTOM 303 310	阴道不规则出血
T34	SYMPTOM 322 324	赤带
T35	PERSON_GROUP 358 361	过敏者
T36	PERSON_GROUP 397 399	儿童
T37	PERSON_GROUP 472 474	孕

T1	SYMPTOM 42 47	残留粪便后
T2	PERSON_GROUP 145 150	马兜铃酸有
T3	PERSON_GROUP 179 181	儿童
T4	PERSON_GROUP 212 214	孕妇
T5	DRUG_EFFICACY 224 228	活血化瘀
T6	DRUG_EFFICACY 229 233	软坚散结
T7	DRUG_EFFICACY 234 238	清热解毒
T8	DISEASE 241 246	慢性盆腔炎
T9	DRUG_EFFICACY 246 250	软坚散结
T1	SYMPTOM 65 68	耳肿胀
T2	SYMPTOM 166 169	甲苯致
T3	DISEASE 239 242	肿散结
T4	DISEASE 243 246	用于附
T5	DISEASE_GROUP 285 288	热下注
T6	DISEASE 289 294	致的带下病
T7	DISEASE 295 298	慢性盆
T8	DRUG_TASTE 327 329	显浅
T9	PERSON_GROUP 366 368	一疗
T1	DRUG_EFFICACY 1 5	益气化瘀
T2	DRUG_EFFICACY 6 10	祛风通络
T3	DRUG_EFFICACY 11 15	舒筋止痛
T4	SYNDROME 31 36	气虚血瘀证
T5	SYMPTOM 61 65	神疲乏力
T6	DRUG_DOSAGE 65 69	薄膜衣丸
T7	DRUG_DOSAGE 73 77	薄膜衣丸
T8	DRUG_TASTE 93 95	气微
T9	DRUG_TASTE 96 98	味苦
T10	PERSON_GROUP 117 120	运动员
T11	SYMPTOM 194 196	胃痛
T12	SYMPTOM 200 202	腹痛
T13	SYMPTOM 211 213	腹泻
T14	PERSON_GROUP 335 337	孕妇
T1	SYNDROME 57 61	气滞血瘀
T2	DISEASE 65 69	小叶增生
T3	SYMPTOM 70 74	子宫肌瘤
T4	SYMPTOM 75 79	卵巢囊肿
T5	DRUG_DOSAGE 84 88	薄膜衣片
T6	DRUG_TASTE 103 106	气微香
T7	DRUG_TASTE 107 110	味微苦
T8	DRUG_EFF

T1	DRUG_EFFICACY 8 10	活血
T2	DRUG_EFFICACY 11 13	祛瘀
T3	DRUG_EFFICACY 14 16	止痛
T4	DRUG_EFFICACY 20 22	止血
T5	DRUG_EFFICACY 25 29	温经活血
T6	DRUG_EFFICACY 30 34	化淤生新
T7	DRUG_EFFICACY 43 45	祛瘀
T8	DRUG_EFFICACY 46 48	镇痛
T9	DRUG_EFFICACY 73 75	活血
T10	DRUG_EFFICACY 76 78	祛瘀
T11	DRUG_EFFICACY 79 81	止痛
T12	SYMPTOM 86 90	恶露不行
T13	SYMPTOM 108 112	阴道流血
T14	SYMPTOM 113 117	月经过多
T15	SYMPTOM 122 126	恶露不行
T16	SYMPTOM 127 131	少腹疼痛
T17	SYMPTOM 144 148	阴道流血
T18	SYMPTOM 149 153	月经过多
T19	DRUG_DOSAGE 205 207	颗粒
T20	DRUG_TASTE 208 210	味甘
T21	DRUG_TASTE 211 213	微苦
T22	PERSON_GROUP 259 261	儿童
T23	DISEASE 271 274	糖尿病
T1	DRUG_DOSAGE 18 22	薄膜衣片
T2	DRUG_TASTE 33 35	味甘
T3	DRUG_TASTE 36 38	微苦
T4	DRUG_EFFICACY 95 98	补气血
T5	SYMPTOM 105 109	头晕心慌
T6	SYMPTOM 110 114	疲乏无力
T7	SYMPTOM 115 119	月经量少
T8	SYMPTOM 120 126	色淡经期后错
T9	DRUG_EFFICACY 126 129	补气血
T10	DRUG_EFFICACY 130 133	调月经
T11	SYMPTOM 136 140	月经不调
T12	PERSON_GROUP 184 186	孕妇
T13	DISEASE 212 218	高血压心脏病
T14	DISEASE_GROUP 219 221	肾病
T15	PERSON_GROUP 248 253	青春期少女
T16	PERSO

T1	DISEASE 1 4	糖尿病
T2	FOOD_GROUP 14 16	辛辣
T3	FOOD_GROUP 17 19	生冷
T4	FOOD_GROUP 20 22	油腻
T5	DISEASE 27 29	感冒
T6	SYMPTOM 29 31	发热
T7	DISEASE 50 53	高血压
T8	DISEASE_GROUP 54 57	心脏病
T9	DISEASE_GROUP 58 60	肝病
T10	DISEASE_GROUP 61 63	肾病
T11	PERSON_GROUP 81 83	儿童
T12	PERSON_GROUP 84 86	孕妇
T13	DRUG_EFFICACY 144 148	补气养血
T14	SYNDROME 151 155	气血两虚
T15	SYMPTOM 158 162	倦怠乏力
T16	SYMPTOM 163 167	面色无华
T17	SYMPTOM 173 177	失眠多梦
T18	SYMPTOM 178 182	心悸气短
T19	DRUG_DOSAGE 216 218	颗粒
T20	DRUG_TASTE 219 221	味甜
T1	DRUG_EFFICACY 102 106	活血化瘀
T2	DRUG_EFFICACY 118 120	理气
T3	DRUG_EFFICACY 121 123	活血
T4	DRUG_EFFICACY 124 126	止痛
T5	SYNDROME 129 133	气滞血瘀
T6	SYMPTOM 136 142	胸胁胀满疼痛
T7	SYMPTOM 143 149	痛经胃痛胃热
T8	DRUG_EFFICACY 149 151	镇痛
T9	DRUG_EFFICACY 238 242	活血化瘀
T10	SYMPTOM 280 282	胃痛
T11	SYMPTOM 298 302	有损胎气
T12	FOOD_GROUP 317 319	清淡
T13	FOOD_GROUP 322 324	生冷
T14	FOOD_GROUP 325 327	辛辣
T15	FOOD_GROUP 328 330	油腻
T16	DRUG_DOSAGE 388 390	水丸
T17	DRUG_TASTE 394 396	味苦
T18	DRUG_TASTE 397 398	辣
T19	PERSON_GROUP 409 411	孕妇
T2

In [161]:
# 预测验证集
valid_pred = []
for i in range(800,1000):
    valid_pred.extend(list(map(lambda x:str(i)+' '+re.sub(r'T[\d+]{1,2}\t','',x),valid_result[i])))
    

In [162]:
# 验证集的真实label
valid_true = []
for i in range(800,1000):
    with open('data/valid/ann/%d.ann'%i) as file_obj:
        valid_label = file_obj.read().split('\n')
        valid_true.extend(list(map(lambda x:str(i)+' '+re.sub(r'T[\d+]{1,2}\t','',x),valid_label)))


In [163]:
# strict F1
def eval(valid_pred,valid_true):
    beta = 1
    P = len(set(valid_pred) & set(valid_true))/len(set(valid_true))
    R = len(set(valid_pred) & set(valid_true))/len(set(valid_pred))
    F = (1+beta**2)*P*R/(P+beta**2*R)
    print('P = %.4f'%P)
    print('R = %.4f'%R)
    print('F = %.4f'%F)
    return P,R,F

In [164]:
eval(valid_pred,valid_true)

P = 0.4763
R = 0.4975
F = 0.4867


(0.4763271162123386, 0.4974528019178903, 0.4866608032834946)

In [None]:
#(0.46909962706446456, 0.5053084648493543, 0.4865312888520513)

In [None]:
# output
result = {}
test_data_path = 'data/chusai_xuanshou/'
#test_data_path = 'data/valid/'
for i in range(len(os.listdir(test_data_path))):
#for i in range(7):
    test_file = test_data_path+str(i+1000)+'.txt'
    with open(test_file, "r", encoding="utf8") as test:
        sentence = test.read()
    sentences = sentence.split('。')
    y_ner = []

    for sent in sentences:
        sent = sent.replace(' ','_')
        sent_chars = list(sent+'。')
        sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sent_chars]

        sent2id_new = np.array([[0] * (maxlen-len(sent2id)) + sent2id[:maxlen]])
        y_pred = model.predict(sent2id_new)
        y_label = np.argmax(y_pred, axis=2)
        y_label = y_label.reshape(1, -1)[0]
        y_ner_ = [idx2label[i] for i in y_label][-len(sent_chars):]
        y_ner.extend(y_ner_)
    result_words = get_valid_nertag(sentence, y_ner)
    ans = []
    for res in result_words:
        number = res[0]
        #tag = res[1]
        start = res[2]
        end = res[3]
        word = re.sub(' |，|：|；|、|;|。|（|）','_',res[4]) # 有特殊字符的实体不予加入结果
        try:
            tag = tag_dic[word]
        except:
            tag = res[1]
        if '_' not in word:
            ans.append('{}\t{} {} {}\t{}'.format('T'+str(len(ans)+1), tag,start, end, word))
            print('{}\t{} {} {}\t{}'.format('T'+str(len(ans)), tag,start, end, word))
    
    result[i+1000] = ans

T1	DRUG_EFFICACY 138 142	清热解毒
T2	DRUG_EFFICACY 143 147	化湿除带
T3	DRUG_EFFICACY 148 152	祛瘀止痛
T4	DRUG_EFFICACY 153 156	散结消
T5	SYMPTOM 160 165	慢性盆腔炎
T6	SYMPTOM 167 171	小腹疼痛
T7	SYMPTOM 172 176	腰骶酸痛
T8	SYMPTOM 177 181	带下量多
T1	PERSON_GROUP 31 33	孕妇
T2	DISEASE 36 39	糖尿病
T3	DRUG_EFFICACY 79 81	活血
T4	SYMPTOM 87 91	月经量少
T5	SYMPTOM 95 101	血虚萎黄后错
T6	SYMPTOM 102 106	血虚萎黄
T7	SYMPTOM 107 111	风湿痹痛
T1	FOOD_GROUP 25 27	辛辣
T2	DISEASE 36 38	感冒
T3	SYMPTOM 38 40	发热
T4	DISEASE 51 54	高血压
T5	DISEASE_GROUP 55 58	心脏病
T6	DISEASE_GROUP 59 61	肝病
T7	DISEASE 62 65	糖尿病
T8	DISEASE_GROUP 66 68	肾病
T9	SYMPTOM 90 94	月经紊乱
T10	PERSON_GROUP 192 194	儿童
T11	DRUG_DOSAGE 242 244	颗粒
T12	DRUG_TASTE 245 248	气微香
T13	DRUG_TASTE 249 252	味微苦
T14	DRUG_EFFICACY 255 259	滋养肝肾
T15	SYNDROME 274 276	阴虚
T16	DISEASE_GROUP 276 279	肝旺症
T17	SYMPTOM 282 286	烘热汗出
T18	SYMPTOM 287 291	头晕耳鸣
T19	SYMPTOM 292 296	失眠多梦
T20	SYMPTOM 297 301	五心烦热
T21	SYMPTOM 302 306	腰背酸痛
T22	SYMPTOM 307 311	大便干燥
T23	SYMPTOM 312 316	心烦易怒
T24	SYMPTOM 317 321	舌红少苔
T25	SYMPTOM 322 3

In [None]:
#“实体类别”、“起始位置”、“结束位置”以空格分隔

In [19]:
for i in range(1000,1500):
    with open('data/submit/%d.ann'%i,'w', encoding='utf-8') as wr:
        wr.write('\n'.join(result[i]))