keras 2.2.4

tensorflow 1.13

pip install git+https://www.github.com/keras-team/keras-contrib.git

In [1]:
import re

In [2]:
char_vocab_path = "data/char_vocabs.txt" # 字典文件
train_data_path = 'train_data/train_data_000' # 训练数据
#train_data_path = './data/train_data' # 训练数据
test_data_path = 'train_data/train_data_000' # 测试数据

special_words = ['<PAD>', '<UNK>'] # 特殊词表示

# "BIO"标记的标签
#label2idx = {"O": 0,
#             "B-PER": 1, "I-PER": 2,
#             "B-LOC": 3, "I-LOC": 4,
#             "B-ORG": 5, "I-ORG": 6
#            }
label2idx = {'O': 0,
             'DISEASE': 1, 'DISEASE_GROUP': 2,
             'DRUG_DOSAGE': 3, 'DRUG_EFFICACY': 4,
             'DRUG_INGREDIENT': 5, 'DRUG_TASTE': 6,
             'FOOD_GROUP':7, 'PERSON_GROUP':8,
             'SYMPTOM':9, 'SYNDROME':10
            }

# 索引和BIO标签对应
idx2label = {idx: label for label, idx in label2idx.items()}

# 读取字符词典文件
with open(char_vocab_path, "r", encoding="utf8") as fo:
    char_vocabs = [line.strip() for line in fo]
char_vocabs = special_words + char_vocabs

# 字符和索引编号对应
idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
vocab2idx = {char: idx for idx, char in idx2vocab.items()}

In [3]:
# 读取训练语料
def read_corpus(corpus_path, vocab2idx, label2idx):
    with open(corpus_path, encoding='utf-8') as fr:
        lines = fr.readlines()

    sent_, tag_ = [], []
    for letter in lines:
        [char,label,_] = re.split('\t|\n',letter)
        sent_.append(char)
        tag_.append(label)

    sent_ids = [vocab2idx[char] if char in vocab2idx else vocab2idx['<UNK>'] for char in sent_]
    tag_ids = [label2idx[label] if label in label2idx else 0 for label in tag_]
    return sent_ids, tag_ids

# 加载训练集
train_datas, train_labels = read_corpus(train_data_path, vocab2idx, label2idx)
# 加载测试集
test_datas, test_labels = read_corpus(test_data_path, vocab2idx, label2idx)


In [4]:
train_datas = []
train_labels = []
for i in range(1000):
    train_data_path_i = './train_data/train_data_%03d'%i
    train_datas_i, train_labels_i = read_corpus(train_data_path_i, vocab2idx, label2idx)
    train_datas.append(train_datas_i)
    train_labels.append(train_labels_i)
    #if i%10==0:
    #    print(i)

In [5]:
test_datas = train_datas
test_labels = train_labels

In [6]:
print(train_datas[50])
print([idx2vocab[idx] for idx in train_datas[50]])
print(train_labels[50])
print([idx2label[idx] for idx in train_labels[50]])

[0, 589, 2644, 451, 5801, 5844, 5253, 966, 2953, 5253, 406, 3769, 182, 2659, 998, 4719, 451, 5801, 3647, 4101, 5980, 4717, 2178, 4793, 3900, 4463, 4702, 2008, 2494, 218, 6267, 3903, 2022, 1357, 6802, 2377, 6573, 1374, 5253, 1635, 6756, 3903, 5253, 4435, 5169, 3900, 911, 6009, 6802, 4539, 4020, 1635, 6756, 651, 5253, 2545, 6214, 966, 5253, 3160, 1357, 6040, 2545, 6214, 182, 0, 589, 2644, 6315, 3383, 4392, 2770, 406, 3769, 182, 2659, 998, 868, 1338, 719, 6315, 6756, 1534, 1588, 6007, 6009, 6802, 1338, 6573, 1367, 6756, 1534, 1588, 2288, 2494, 966, 6315, 287, 5988, 911, 6009, 6802, 723, 3556, 651, 3648, 723, 2065, 2654, 3903, 2994, 3630, 1338, 1363, 182, 0, 589, 2644, 463, 4669, 406, 3769, 182, 2659, 998, 868, 2276, 2223, 32, 12, 3020, 1297, 785, 280, 4377, 2178, 4793, 3903, 2000, 2002, 4669, 2327, 376, 1367, 6756, 5528, 228, 5738, 3020, 5983, 966, 5528, 4934, 5738, 3020, 5983, 504, 3903, 782, 6573, 6817, 1338, 719, 1192, 3022, 765, 4084, 2178, 4793, 2118, 2002, 4669, 2327, 1591, 1367, 67

In [7]:
import numpy as np
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras import backend as K
K.clear_session()

EPOCHS = 20
BATCH_SIZE = 64
EMBED_DIM = 32
HIDDEN_SIZE = 16
MAX_LEN = 500
VOCAB_SIZE = len(vocab2idx)
CLASS_NUMS = len(label2idx)
print(VOCAB_SIZE, CLASS_NUMS)

print('padding sequences')
train_datas = sequence.pad_sequences(train_datas, maxlen=MAX_LEN)
train_labels = sequence.pad_sequences(train_labels, maxlen=MAX_LEN)
test_datas = sequence.pad_sequences(test_datas, maxlen=MAX_LEN)
test_labels = sequence.pad_sequences(test_labels, maxlen=MAX_LEN)
print('x_train shape:', train_datas.shape)
print('x_test shape:', test_datas.shape)

train_labels = keras.utils.to_categorical(train_labels, CLASS_NUMS)
test_labels = keras.utils.to_categorical(test_labels, CLASS_NUMS)
print('trainlabels shape:', train_labels.shape)
print('testlabels shape:', test_labels.shape)

## BiLSTM+CRF模型构建
inputs = Input(shape=(MAX_LEN,), dtype='int32')
x = Masking(mask_value=0)(inputs)
x = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(x)
x = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True))(x)
x = TimeDistributed(Dense(CLASS_NUMS))(x)
outputs = CRF(CLASS_NUMS)(x)
model = Model(inputs=inputs, outputs=outputs)
model.summary()

model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
model.fit(train_datas, train_labels, epochs=EPOCHS, verbose=1, validation_split=0.1)

score = model.evaluate(test_datas, test_labels, batch_size=BATCH_SIZE)
print(model.metrics_names)
print(score)

# save model
model.save("./model/ch_ner_model.h5")

Using TensorFlow backend.





6874 11
padding sequences
x_train shape: (1000, 500)
x_test shape: (1000, 500)
trainlabels shape: (1000, 500, 11)
testlabels shape: (1000, 500, 11)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
masking_1 (Masking)          (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 32)           219968    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 500, 32)           6272      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 500, 11)           363       
___________________________

In [9]:
from keras.models import load_model
import numpy as np

maxlen = 500
sentence = " 灌肠用。取本品50ml，将药液加温至38~39°C，臀部抬高10cm插管，肛管插入深度10~15cm。肛管插入后，讲管端套的熟料瓶颈部，加压挤入即可。灌入后膝胸卧位30分钟。每日一次，两周为一个疗程。月经干净后3~5天开始用药。  红虎灌肠液（50毫升装）-安徽天洋药业  清热解毒，化湿除带，祛瘀止痛，散结消癥，用于慢性盆腔炎所致小腹疼痛，腰骶酸痛，带下量多，或有发热 安徽天洋药业有限公司。"

sent_chars = list(sentence)
sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sent_chars]

sent2id_new = np.array([[0] * (maxlen-len(sent2id)) + sent2id[:maxlen]])
y_pred = model.predict(sent2id_new)
y_label = np.argmax(y_pred, axis=2)
y_label = y_label.reshape(1, -1)[0]
y_ner = [idx2label[i] for i in y_label][-len(sent_chars):]

print(idx2label)
print(sent_chars)
print(sent2id)
print(y_ner)

{0: 'O', 1: 'DISEASE', 2: 'DISEASE_GROUP', 3: 'DRUG_DOSAGE', 4: 'DRUG_EFFICACY', 5: 'DRUG_INGREDIENT', 6: 'DRUG_TASTE', 7: 'FOOD_GROUP', 8: 'PERSON_GROUP', 9: 'SYMPTOM', 10: 'SYNDROME'}
[' ', '灌', '肠', '用', '。', '取', '本', '品', '5', '0', 'm', 'l', '，', '将', '药', '液', '加', '温', '至', '3', '8', '~', '3', '9', '°', 'C', '，', '臀', '部', '抬', '高', '1', '0', 'c', 'm', '插', '管', '，', '肛', '管', '插', '入', '深', '度', '1', '0', '~', '1', '5', 'c', 'm', '。', '肛', '管', '插', '入', '后', '，', '讲', '管', '端', '套', '的', '熟', '料', '瓶', '颈', '部', '，', '加', '压', '挤', '入', '即', '可', '。', '灌', '入', '后', '膝', '胸', '卧', '位', '3', '0', '分', '钟', '。', '每', '日', '一', '次', '，', '两', '周', '为', '一', '个', '疗', '程', '。', '月', '经', '干', '净', '后', '3', '~', '5', '天', '开', '始', '用', '药', '。', ' ', ' ', '红', '虎', '灌', '肠', '液', '（', '5', '0', '毫', '升', '装', '）', '-', '安', '徽', '天', '洋', '药', '业', ' ', ' ', '清', '热', '解', '毒', '，', '化', '湿', '除', '带', '，', '祛', '瘀', '止', '痛', '，', '散', '结', '消', '癥', '，', '用', '于', '慢', '性', '盆'

In [12]:
list(zip(sent_chars,y_ner))

[(' ', 'O'),
 ('灌', 'O'),
 ('肠', 'O'),
 ('用', 'O'),
 ('。', 'O'),
 ('取', 'O'),
 ('本', 'O'),
 ('品', 'O'),
 ('5', 'O'),
 ('0', 'O'),
 ('m', 'O'),
 ('l', 'O'),
 ('，', 'O'),
 ('将', 'O'),
 ('药', 'O'),
 ('液', 'O'),
 ('加', 'O'),
 ('温', 'O'),
 ('至', 'O'),
 ('3', 'O'),
 ('8', 'O'),
 ('~', 'O'),
 ('3', 'O'),
 ('9', 'O'),
 ('°', 'O'),
 ('C', 'O'),
 ('，', 'O'),
 ('臀', 'O'),
 ('部', 'O'),
 ('抬', 'O'),
 ('高', 'O'),
 ('1', 'O'),
 ('0', 'O'),
 ('c', 'O'),
 ('m', 'O'),
 ('插', 'O'),
 ('管', 'O'),
 ('，', 'O'),
 ('肛', 'O'),
 ('管', 'O'),
 ('插', 'O'),
 ('入', 'O'),
 ('深', 'O'),
 ('度', 'O'),
 ('1', 'O'),
 ('0', 'O'),
 ('~', 'O'),
 ('1', 'O'),
 ('5', 'O'),
 ('c', 'O'),
 ('m', 'O'),
 ('。', 'O'),
 ('肛', 'O'),
 ('管', 'O'),
 ('插', 'O'),
 ('入', 'O'),
 ('后', 'O'),
 ('，', 'O'),
 ('讲', 'O'),
 ('管', 'O'),
 ('端', 'O'),
 ('套', 'O'),
 ('的', 'O'),
 ('熟', 'O'),
 ('料', 'O'),
 ('瓶', 'O'),
 ('颈', 'O'),
 ('部', 'O'),
 ('，', 'O'),
 ('加', 'O'),
 ('压', 'O'),
 ('挤', 'O'),
 ('入', 'O'),
 ('即', 'O'),
 ('可', 'O'),
 ('。', 'O'),
 ('灌', 'O'),

In [None]:
# 对预测结果进行命名实体解析和提取
def get_valid_nertag(input_data, result_tags):
    result_words = []
    start, end =0, 1 # 实体开始结束位置标识
    tag_label = "O" # 实体类型标识
    for i, tag in enumerate(result_tags):
        if tag.startswith("B"):
            if tag_label != "O": # 当前实体tag之前有其他实体
                result_words.append((input_data[start: end], tag_label)) # 获取实体
            tag_label = tag.split("-")[1] # 获取当前实体类型
            start, end = i, i+1 # 开始和结束位置变更
        elif tag.startswith("I"):
            temp_label = tag.split("-")[1]
            if temp_label == tag_label: # 当前实体tag是之前实体的一部分
                end += 1 # 结束位置end扩展
        elif tag == "O":
            if tag_label != "O": # 当前位置非实体 但是之前有实体
                result_words.append((input_data[start: end], tag_label)) # 获取实体
                tag_label = "O"  # 实体类型置"O"
            start, end = i, i+1 # 开始和结束位置变更
    if tag_label != "O": # 最后结尾还有实体
        result_words.append((input_data[start: end], tag_label)) # 获取结尾的实体
    return result_words

result_words = get_valid_nertag(sent_chars, y_ner)
for (word, tag) in result_words:
    print("".join(word), tag)

In [None]:
char_vocab_path = "./data/char_vocabs.txt" # 字典文件
model_path = "./model/ch_ner_model.h5" # 模型文件

ner_labels = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
special_words = ['<PAD>', '<UNK>']
MAX_LEN = 100

with open(char_vocab_path, "r", encoding="utf8") as fo:
    char_vocabs = [line.strip() for line in fo]
char_vocabs = special_words + char_vocabs

idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
vocab2idx = {char: idx for idx, char in idx2vocab.items()}

idx2label = {idx: label for label, idx in ner_labels.items()}

sentence = "中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下，连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚"

sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in sentence]

sent2input = np.array([[0] * (MAX_LEN-len(sent2id)) + sent2id[:MAX_LEN]])

model = load_model(model_path, custom_objects={'CRF': CRF}, compile=False)
y_pred = model.predict(sent2input)

y_label = np.argmax(y_pred, axis=2)
y_label = y_label.reshape(1, -1)[0]
y_ner = [idx2label[i] for i in y_label][-len(sentence):]

print(idx2label)
print(sent_chars)
print(sent2id)
print(y_ner)