In [6]:
import jieba
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.utils.data as tdata
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

In [1]:
from htorch.contrib.data.reader import CHNPOSNERReader

In [3]:
reader = CHNPOSNERReader()

In [7]:
def read_data(category="train"):
    datas = []
    sentence_id = 0
    for i in range(1, 6):
        path = "../data/CHN-POS-NER/{category}_data{i}".format(category=category, i=i)
        data = reader.read(path)
        data['sentence_id'] = data['sentence_id'] + sentence_id
        sentence_id += data.shape[0]
        datas.append(data)
    return pd.concat(datas, axis=0)

train_data = read_data(category="train")
test_data = read_data(category="test")
print("train:", train_data.shape)
print("test:", test_data.shape)
train_data.head(2)

train: (1043124, 4)
test: (506203, 4)


Unnamed: 0,ner_tag,pos_tag,sentence_id,token
0,N,n,0,女
1,N,n,0,性


In [10]:
from htorch.train.trainer import ModelTrainer

In [11]:
def replace_ner_tag(tag):
    if "S-" in tag:
        return "B" + tag[1:]
    if "E-" in tag:
        return "I" + tag[1:]
    if tag == "N":
        return "O"
    return tag

In [13]:
def convert_data_to_sentences(data):
    tokens = []
    labels = []
    for _, sentence in tqdm(data.groupby("sentence_id")):
        tokens.append(sentence.token.values)
        labels.append(sentence.ner_tag.values)
    return tokens, labels

In [14]:
train_data['ner_tag'] = train_data.ner_tag.apply(replace_ner_tag)
test_data['ner_tag'] = test_data.ner_tag.apply(replace_ner_tag)

In [77]:
train_tokens, train_labels = convert_data_to_sentences(train_data)
# train_tokens, val_tokens, train_labels, val_labels = train_test_split(train_tokens, 
#                                                                       train_labels, 
#                                                                       test_size=0.1, 
#                                                                       random_state=42)
test_tokens, test_labels = convert_data_to_sentences(test_data)
# print("train:", len(train_tokens))
# print("val:", len(val_tokens))
# print("test:", len(test_tokens))

100%|██████████| 31274/31274 [00:06<00:00, 4972.61it/s]
100%|██████████| 15273/15273 [00:02<00:00, 5142.02it/s]


In [78]:
train_sen = ["|".join(i) for i in train_tokens]
print(len(train_sen), len(set(train_sen)))
test_sen = ["|".join(i) for i in test_tokens]
print(len(test_sen), len(set(test_sen)))
print(len(train_sen + test_sen), len(set(train_sen + test_sen)))
print(len(set(train_sen) & set(test_sen)))

31274 4785
15273 4783
46547 4785
4783


In [79]:
import hashlib

df = pd.DataFrame({'token': train_sen, 'label': ["|".join(i) for i in train_labels]})
df = df.drop_duplicates(['token', 'label']).sort_values('token')
df['finger'] = df['token'].apply(lambda x: hashlib.md5(x.encode('utf8')).hexdigest())
df[df.finger == "482ef1c3afe469a9759763f8185f2dc3"]
df = df.drop_duplicates('finger')
tokens = df.token.apply(lambda x: x.split('|')).values.tolist()
labels = df.label.apply(lambda x: x.split('|')).values.tolist()
train_tokens, val_tokens, train_labels, val_labels = train_test_split(tokens, labels, 
                                                                     test_size=0.1, random_state=42)
train_tokens, test_tokens, train_labels, test_lables = train_test_split(train_tokens, train_labels, 
                                                                       test_size=0.2, random_state=42)
print("train:", len(train_tokens))
print("val:", len(val_tokens))
print("test:", len(test_tokens))

train: 3444
val: 479
test: 862


In [80]:
# add words
# ...
train_words = [jieba.lcut("".join(i)) for i in train_tokens]
val_words = [jieba.lcut("".join(i)) for i in val_tokens]
test_words = [jieba.lcut("".join(i)) for i in test_tokens]

In [81]:
from itertools import chain

train_vocab = list(set(chain.from_iterable(train_tokens)))
val_vocab = list(set(chain.from_iterable(val_tokens)))
test_vocab = list(set(chain.from_iterable(test_tokens)))

print("train vocab size:", len(train_vocab))
print("val vocab size:", len(val_vocab))
print("only val vocab size:", len(set(val_vocab) - set(train_vocab)))

train vocab size: 1660
val vocab size: 1110
only val vocab size: 40


In [82]:
train_vocab_words = list(set(chain.from_iterable(train_words)))
val_vocab_words = list(set(chain.from_iterable(val_words)))
test_vocab_words = list(set(chain.from_iterable(test_words)))
print("train vocab size:", len(train_vocab_words))
print("val vocab size:", len(val_vocab_words))
print("only val vocab size:", len(set(val_vocab_words) - set(train_vocab_words)))

train vocab size: 4958
val vocab size: 1976
only val vocab size: 303


In [83]:
token2id = {token: i for i, token in enumerate(train_vocab, 1)}
word_token2id = {token: i for i, token in enumerate(train_vocab_words, 1)}

In [84]:
labels = set(chain.from_iterable(train_labels))
print(labels)
# labels = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']
# labels

{'I-DISEASE', 'I-TREATMENT', 'B-CHECK', 'I-CHECK', 'B-SIGNS', 'B-DISEASE', 'I-BODY', 'B-TREATMENT', 'I-SIGNS', 'O', 'B-BODY'}


In [85]:
label2id = {label: i for i, label in enumerate(labels, 1)}
id2label = dict(zip(label2id.values(), label2id.keys()))
label2id

{'I-DISEASE': 1,
 'I-TREATMENT': 2,
 'B-CHECK': 3,
 'I-CHECK': 4,
 'B-SIGNS': 5,
 'B-DISEASE': 6,
 'I-BODY': 7,
 'B-TREATMENT': 8,
 'I-SIGNS': 9,
 'O': 10,
 'B-BODY': 11}

In [86]:
def convert_token_to_id(data, mapping=token2id):
    for s in data:
        yield [mapping.get(i, 0) for i in s]

def convert_label_to_id(labels):
    for s in labels:
        yield [label2id[i] for i in s]

In [87]:
x_train = list(convert_token_to_id(train_tokens))
x_val = list(convert_token_to_id(val_tokens))
x_test = list(convert_token_to_id(test_tokens))
y_train = list(convert_label_to_id(train_labels))
y_val = list(convert_label_to_id(val_labels))
y_test = list(convert_label_to_id(test_labels))
x_train_word = list(convert_token_to_id(train_words, word_token2id))
x_val_word = list(convert_token_to_id(val_words, word_token2id))
x_test_word = list(convert_token_to_id(test_words, word_token2id))

In [88]:
for p in (25, 50, 75, 90, 95, 99, 99.5):
    print(p, np.percentile([len(i) for i in x_train_word], p))
print("min:", min([len(i) for i in x_train_word]))
print("max:", max([len(i) for i in x_train_word]))

25 11.0
50 19.0
75 33.0
90 50.0
95 60.0
99 98.57000000000016
99.5 109.0
min: 1
max: 186


In [89]:
for p in (25, 50, 75, 90, 95, 99, 99.5):
    print(p, np.percentile([len(i) for i in x_train], p))
print("min:", min([len(i) for i in x_train]))
print("max:", max([len(i) for i in x_train]))

25 18.0
50 31.0
75 55.0
90 81.0
95 106.0
99 164.0
99.5 185.0
min: 1
max: 309


In [90]:
maxlen = 144

In [91]:
x_train = pad_sequences(x_train, maxlen=maxlen, padding="pre")
x_val = pad_sequences(x_val, maxlen=maxlen, padding="pre")
x_test = pad_sequences(x_test, maxlen=maxlen, padding="pre")
y_train = pad_sequences(y_train, maxlen=maxlen, padding="pre")
y_val = pad_sequences(y_val, maxlen=maxlen, padding="pre")
y_test = pad_sequences(y_test, maxlen=maxlen, padding="pre")

x_train_word = pad_sequences(x_train_word, maxlen=60, padding="pre")
x_val_word = pad_sequences(x_val_word, maxlen=60, padding="pre")
x_test_word = pad_sequences(x_test_word, maxlen=60, padding="pre")

In [92]:
from htorch.nn.layers import CRF
crf = CRF(3, batch_first=True)

In [93]:
# crf(torch.tensor([[[1, 2, 3], [1, 2, 4]]]).float())
# crf.log_likelihood_score(torch.tensor([[[1, 2, 3]]]).float(), torch.tensor([[1]]))

In [144]:
from htorch.nn.layers import CRF, CRFOrg
from htorch.nn.layers import SpatialDropout

class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size=len(train_vocab) + 1, char_dim=100, 
                 word_vocab_size=len(train_vocab_words) + 1, word_dim=300,
                 num_tags=len(label2id), lstm_hidden_size=40, word_lstm_size=40, 
                 embedding_matrix=None, word_embedding_matrix=None):
        super(BiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, char_dim, padding_idx=0)
        if embedding_matrix is None:
            self.embedding.weight.data.normal_(-1, 1)
            self.embedding.weight.requires_grad = True
        else:
            self.embedding.weight = nn.Parameter(embedding_matrix)
            self.embedding.weight.requires_grad = False
            
        self.word_embedding = nn.Embedding(word_vocab_size, word_dim, padding_idx=0)
        if word_embedding_matrix is None:
            self.word_embedding.weight.data.normal_(-1, 1)
            self.word_embedding.weight.requires_grad = True
        else:
            self.word_embedding.weight = nn.Parameter(word_embedding_matrix)
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(char_dim, lstm_hidden_size, batch_first=True, bidirectional=True)
        self.word_lstm = nn.LSTM(word_dim, word_lstm_size, batch_first=True, bidirectional=True)
        self.embedding_dropout = SpatialDropout(0.5)
        self.word_embedding_dropout = SpatialDropout(0.3)
        self.dropout = nn.Dropout(0.7)
        self.lstm_encoder = nn.Linear(lstm_hidden_size * 2 + word_lstm_size * 2, num_tags)
        self.crf = CRFOrg(num_tags)  # , batch_first=True
    
    def _compute_feature(self, x):
        char_x = self.embedding(x)
        char_x = self.embedding_dropout(char_x)
        char_x, _ = self.lstm(char_x)
        w_x = self.word_embedding(x)
#         w_x = self.word_embedding_dropout(w_x)
        w_x, _ = self.word_lstm(w_x)
        x = torch.cat([char_x, w_x], dim=-1)
        x = self.dropout(x)
        x = self.lstm_encoder(x)
        return x
    
    def forward(self, x, x_words):
        x = self._compute_feature(x)
        x = self.crf(x)
        return x
    
    def loss(self, x, x_words, y_true):
        feature = self._compute_feature(x)
#         return self.crf.log_likelihood_score(feature, y_true, reduction="mean")
        return self.crf.loss(feature, y_true)

In [95]:
device = "cuda"

In [96]:
def build_dataloader(*x, y=None, batch_size=32, shuffle=False, device=device):
    tensors = []
    for i in x:
        tensors.append(torch.tensor(i, dtype=torch.long, device=device))
    if y is not None:
        tensors.append(torch.tensor(y, dtype=torch.long, device=device))
    dataset = tdata.TensorDataset(*tensors)
    dataloader = tdata.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [97]:
import gensim

def load_embedding(filename, vocab, dim=100):
    model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True, 
                                                            unicode_errors="ignore")
    embedding = np.zeros((len(vocab) + 1, dim))
    for word, index in vocab.items():
        if word not in model:
            continue
        embedding[index, :] = model[word]
    return embedding

In [98]:
# embedding_matrix = load_embedding("../data/new_chn_fasttext.bin", token2id, dim=100)

In [99]:
train_dataloader = build_dataloader(x_train, x_train_word, y_train, batch_size=128, shuffle=True, device=device)
val_dataloader = build_dataloader(x_val, x_val_word, y_val, batch_size=128, shuffle=False, device=device)

In [100]:
import os
import random

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [159]:
# embedding = torch.tensor(embedding_matrix, dtype=torch.float32, device=device)
seed_everything(1234)
# model = BiLSTMCRF(lstm_hidden_size=100, num_tags=len(label2id) + 1, 
#                   embedding_matrix=None)
loss_fn = model.loss
optimizer = torch.optim.Adam(model.parameters())

model_trainer = ModelTrainer(model, loss_fn, optimizer, device=device)
model = model_trainer.train(train_dataloader, val_dataloader, epochs=10, verbose=1, 
                            y_true="input")

Epoch: 1, train loss: 2.2298, val loss: 3.6609, time: 2.6
Epoch: 2, train loss: 2.0910, val loss: 3.8225, time: 2.7
Epoch: 3, train loss: 2.0112, val loss: 3.7463, time: 2.7
Epoch: 4, train loss: 1.9018, val loss: 3.8499, time: 2.7
Epoch: 5, train loss: 1.8761, val loss: 3.7836, time: 2.7
Epoch: 6, train loss: 1.8367, val loss: 3.6960, time: 2.7
Epoch: 7, train loss: 1.8899, val loss: 3.7646, time: 2.6
Epoch: 8, train loss: 1.8470, val loss: 3.5966, time: 2.6
Epoch: 9, train loss: 1.7948, val loss: 3.7214, time: 2.7
Epoch: 10, train loss: 1.7995, val loss: 3.7419, time: 2.6


In [137]:
# model_trainer.predict(train_dataloader, has_label=True)

In [160]:
val_pred = model_trainer.predict(val_dataloader, has_label=True)
train_dataloader_pred = build_dataloader(x_train, x_train_word, y_train, 
                                         batch_size=2048, shuffle=False, device=device)
train_pred = model_trainer.predict(train_dataloader_pred, has_label=True)

In [161]:
val_pred_label = [[id2label[i] for i in s if i] for s in val_pred]
train_pred_label = [[id2label[i] for i in s if i] for s in train_pred]

In [162]:
i = 22
print('|'.join(val_pred_label[i]))
print('|'.join(val_labels[i]))
print('|'.join(val_tokens[i]))

O|O|O|O|O|O|O|O|O|O|B-SIGNS|I-SIGNS|O|O|B-SIGNS|I-SIGNS|B-SIGNS|I-SIGNS|O|O|O|O|B-BODY|I-BODY|O|O|O
O|O|O|O|O|O|O|O|O|O|B-SIGNS|I-SIGNS|O|O|B-SIGNS|I-SIGNS|B-SIGNS|I-SIGNS|O|O|O|O|B-BODY|I-BODY|O|O|O
患|者|神|清|，|精|神|可|，|无|发|热|，|无|咳|嗽|咳|痰|，|饮|食|及|大|便|正|常|。


In [163]:
from htorch.metrics.sequence import sequence_f1

In [164]:
sequence_f1(y_val, val_pred, id2label)

(0.8834894613583139, 0.8860833822665883, 0.8847845206684258)

In [165]:
sequence_f1(y_train, train_pred, id2label)

(0.9527477763659467, 0.9501821637890068, 0.9514632405424698)

In [98]:
i = 25
print('|'.join(train_pred_label[i]))
print('|'.join(train_labels[i]))
print('|'.join(train_tokens[i]))

B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
周|恩|来|总|理|说|，|那|就|送|一|株|万|古|常|青|的|友|谊|红|杉|吧|！


In [34]:
class MMM(nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout = nn.Dropout()
    
    def forward(self, x):
        return self.dropout(x)

In [35]:
a = torch.rand((5, 10, 10))

In [36]:
mm = MMM()
mm(a)

tensor([[[0.0000, 1.9541, 0.0000, 0.5803, 1.2009, 1.4002, 1.9079, 0.9438,
          0.4426, 0.0000],
         [0.6091, 0.0000, 0.0000, 0.0000, 0.0000, 1.3762, 0.0000, 1.9177,
          1.6712, 0.5589],
         [0.0000, 0.0000, 0.0000, 1.5544, 0.1537, 0.2537, 0.0000, 0.0000,
          0.6286, 0.0000],
         [0.0000, 1.4669, 0.0000, 0.0000, 1.7357, 0.0000, 1.9292, 0.0000,
          0.0000, 0.4287],
         [1.5792, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6595, 0.1251,
          0.0000, 0.7667],
         [0.1108, 1.5880, 0.2712, 0.0000, 0.0000, 0.0000, 0.9227, 0.0000,
          1.6739, 1.8188],
         [0.0000, 0.0000, 0.0809, 1.7883, 1.3815, 1.9409, 0.0000, 0.6050,
          1.3982, 0.0000],
         [0.0000, 0.0000, 0.0610, 0.0000, 0.3187, 1.6865, 0.6932, 0.0000,
          1.8176, 1.1273],
         [0.0000, 1.4328, 0.0000, 1.7913, 0.0000, 0.0000, 0.0000, 0.0000,
          1.3483, 0.1882],
         [0.0000, 1.7394, 1.1920, 1.4148, 1.7558, 0.0000, 0.0000, 0.0000,
          0.0000,