In [6]:
import jieba
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.utils.data as tdata
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

In [2]:
from htorch.contrib.data.reader import CHNReader, CHNDetReader

In [3]:
# Reader used by the next cell (reader.read(path)) to load each CHN-NER split.
reader = CHNReader()
# Alternative reader for the CHN-NER-det data set (disabled).
# reader = CHNDetReader()

In [7]:
# Load the three CHN-NER splits. Judging by the preview below, each result is a
# token-level table with columns ner_tag, sentence_id and token.
train_data = reader.read("../data/CHN-NER/example.train")
val_data = reader.read("../data/CHN-NER/example.dev")
test_data = reader.read("../data/CHN-NER/example.test")

# Alternative CHN-NER-det data set (disabled; it has no separate dev split here).
# train_data = reader.read("../data/CHN-NER-det/train_data")
# test_data = reader.read("../data/CHN-NER-det/test_data")
# Preview the first two token rows.
train_data.head(2)

Unnamed: 0,ner_tag,sentence_id,token
0,O,0,海
1,O,0,钓


In [8]:
from htorch.train.trainer import ModelTrainer

In [9]:
def convert_data_to_sentences(data):
    """Regroup a token-level table into per-sentence token and tag arrays.

    Args:
        data: table exposing ``groupby("sentence_id")`` whose groups have
            ``token`` and ``ner_tag`` columns (one row per token).

    Returns:
        ``(tokens, labels)``: two parallel lists with one entry per sentence,
        each entry being the ``.values`` array of that sentence's tokens / tags.
        Sentences come out in the order ``groupby`` yields them.
    """
    # One (tokens, tags) pair per sentence; tqdm shows progress over the groups.
    pairs = [
        (group.token.values, group.ner_tag.values)
        for _, group in tqdm(data.groupby("sentence_id"))
    ]
    tokens = [sentence_tokens for sentence_tokens, _ in pairs]
    labels = [sentence_tags for _, sentence_tags in pairs]
    return tokens, labels

In [10]:
# Turn each token-level split into parallel per-sentence token / tag lists.
# Validation uses the dedicated dev file loaded above.
train_tokens, train_labels = convert_data_to_sentences(train_data)
val_tokens, val_labels = convert_data_to_sentences(val_data)
# Disabled alternative: carve a 10% validation split out of the training set.
# train_tokens, val_tokens, train_labels, val_labels = train_test_split(train_tokens, 
#                                                                       train_labels, 
#                                                                       test_size=0.1, 
#                                                                       random_state=42)
test_tokens, test_labels = convert_data_to_sentences(test_data)
# Number of sentences per split.
print("train:", len(train_tokens))
print("val:", len(val_tokens))
print("test:", len(test_tokens))

100%|██████████| 20864/20864 [00:03<00:00, 5400.49it/s]
100%|██████████| 2318/2318 [00:00<00:00, 5393.38it/s]
100%|██████████| 4636/4636 [00:00<00:00, 5433.94it/s]

train: 20864
val: 2318
test: 4636





In [23]:
# Word-level view of every sentence: join its characters back into one string
# and segment that string into words with jieba.
train_words = [jieba.lcut("".join(i)) for i in train_tokens]
val_words = [jieba.lcut("".join(i)) for i in val_tokens]
test_words = [jieba.lcut("".join(i)) for i in test_tokens]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.536 seconds.
Prefix dict has been built succesfully.


In [11]:
from itertools import chain

# Character vocabulary of each split.
# sorted() pins the order: iterating a set of strings depends on the
# per-process hash seed, so list(set(...)) would hand out different token ids
# after every kernel restart (token2id is built by enumerating train_vocab).
train_vocab = sorted(set(chain.from_iterable(train_tokens)))
val_vocab = sorted(set(chain.from_iterable(val_tokens)))
test_vocab = sorted(set(chain.from_iterable(test_tokens)))

print("train vocab size:", len(train_vocab))
print("val vocab size:", len(val_vocab))
# Characters that occur in validation but never in training (mapped to id 0 later).
print("only val vocab size:", len(set(val_vocab) - set(train_vocab)))

train vocab size: 4312
val vocab size: 2931
only val vocab size: 64


In [24]:
# Word vocabulary of each split (jieba segments).
# sorted() pins the order: iterating a set of strings depends on the
# per-process hash seed, so list(set(...)) would hand out different word ids
# after every kernel restart (word_token2id enumerates train_vocab_words).
train_vocab_words = sorted(set(chain.from_iterable(train_words)))
val_vocab_words = sorted(set(chain.from_iterable(val_words)))
test_vocab_words = sorted(set(chain.from_iterable(test_words)))
print("train vocab size:", len(train_vocab_words))
print("val vocab size:", len(val_vocab_words))
# Words that occur in validation but never in training (mapped to id 0 later).
print("only val vocab size:", len(set(val_vocab_words) - set(train_vocab_words)))

train vocab size: 52804
val vocab size: 14321
only val vocab size: 2857


In [25]:
# Id lookups built from the training vocabularies only. Ids start at 1, which
# leaves 0 free: convert_token_to_id returns 0 for unseen tokens and the model's
# embeddings use padding_idx=0.
token2id = {token: i for i, token in enumerate(train_vocab, 1)}
word_token2id = {token: i for i, token in enumerate(train_vocab_words, 1)}

In [26]:
# Show which tags actually occur in the training data ...
labels = set(chain.from_iterable(train_labels))
print(labels)
# ... then replace the set with a hand-ordered list so that label ids are stable.
# The list must contain every tag printed above, otherwise convert_label_to_id
# raises KeyError.
labels = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']
labels

{'B-LOC', 'O', 'B-PER', 'B-ORG', 'I-ORG', 'I-PER', 'I-LOC'}


['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']

In [27]:
# Tag <-> id lookups. Ids are 1-based in the order of `labels`; 0 is not
# assigned to any tag.
label2id = dict(zip(labels, range(1, len(labels) + 1)))
id2label = {index: name for name, index in label2id.items()}
label2id

{'B-LOC': 1,
 'I-LOC': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-PER': 5,
 'I-PER': 6,
 'O': 7}

In [28]:
def convert_token_to_id(data, mapping=token2id):
    """Lazily translate token sequences into id sequences.

    Args:
        data: iterable of sentences, each an iterable of tokens.
        mapping: token -> id lookup supporting ``.get``; defaults to the
            ``token2id`` dict that exists when this function is defined.

    Yields:
        One list of ids per sentence; tokens missing from ``mapping`` become 0.
    """
    for sentence in data:
        ids = []
        for token in sentence:
            ids.append(mapping.get(token, 0))
        yield ids

def convert_label_to_id(labels):
    """Lazily translate tag sequences into id sequences via the global ``label2id``.

    Args:
        labels: iterable of sentences, each an iterable of tag strings.

    Yields:
        One list of tag ids per sentence.

    Raises:
        KeyError: when a tag is not a key of ``label2id``.
    """
    for tags in labels:
        yield list(map(label2id.__getitem__, tags))

In [29]:
# Character ids per sentence (default mapping: token2id).
x_train = list(convert_token_to_id(train_tokens))
x_val = list(convert_token_to_id(val_tokens))
x_test = list(convert_token_to_id(test_tokens))
# Tag ids per sentence, aligned position-by-position with the character ids.
y_train = list(convert_label_to_id(train_labels))
y_val = list(convert_label_to_id(val_labels))
y_test = list(convert_label_to_id(test_labels))
# Word ids per sentence (jieba segments looked up in word_token2id).
x_train_word = list(convert_token_to_id(train_words, word_token2id))
x_val_word = list(convert_token_to_id(val_words, word_token2id))
x_test_word = list(convert_token_to_id(test_words, word_token2id))

In [32]:
# Distribution of sentence lengths measured in words (x_train_word), used to
# pick the word-level padding length. The lengths are computed once instead of
# being rebuilt for every percentile and for min/max.
word_lengths = [len(i) for i in x_train_word]
for p in (25, 50, 75, 90, 95, 99, 99.5):
    print(p, np.percentile(word_lengths, p))
print("min:", min(word_lengths))
print("max:", max(word_lengths))

25 16.0
50 23.0
75 33.0
90 45.0
95 54.0
99 82.36999999999898
99.5 99.0
min: 3
max: 348


In [18]:
# Fixed character-level sequence length; the padding cell applies it to the
# x_* character ids and the y_* tag ids.
maxlen = 144

In [33]:
# Bring character ids and tag ids to the same fixed length `maxlen`, padding at
# the front. Padded positions hold 0, which is not a valid tag id (tags start at 1).
# NOTE(review): sentences longer than maxlen are cut; with Keras' default
# truncating="pre" the beginning of the sentence is what gets dropped — confirm.
x_train = pad_sequences(x_train, maxlen=maxlen, padding="pre")
x_val = pad_sequences(x_val, maxlen=maxlen, padding="pre")
x_test = pad_sequences(x_test, maxlen=maxlen, padding="pre")
y_train = pad_sequences(y_train, maxlen=maxlen, padding="pre")
y_val = pad_sequences(y_val, maxlen=maxlen, padding="pre")
y_test = pad_sequences(y_test, maxlen=maxlen, padding="pre")

# Word ids use their own length of 60 (the 95th percentile printed above is 54),
# so word sequences are NOT position-aligned with the character sequences.
x_train_word = pad_sequences(x_train_word, maxlen=60, padding="pre")
x_val_word = pad_sequences(x_val_word, maxlen=60, padding="pre")
x_test_word = pad_sequences(x_test_word, maxlen=60, padding="pre")

In [20]:
from htorch.nn.layers import CRF
# Scratch CRF instance (first argument 3) for the commented-out experiments in
# the next cell; nothing else in the visible notebook reads `crf`.
crf = CRF(3, batch_first=True)

In [21]:
# crf(torch.tensor([[[1, 2, 3], [1, 2, 4]]]).float())
# crf.log_likelihood_score(torch.tensor([[[1, 2, 3]]]).float(), torch.tensor([[1]]))

In [87]:
from htorch.nn.layers import CRF, CRFOrg
from htorch.nn.layers import SpatialDropout

class BiLSTMCRF(nn.Module):
    """Two-branch BiLSTM encoder followed by a linear projection and a CRF.

    One branch embeds the input ids with ``embedding`` and runs ``lstm``; the
    other embeds ids with ``word_embedding`` and runs ``word_lstm``. Their
    outputs are concatenated on the feature axis, passed through dropout and
    projected to ``num_tags`` scores per position, which feed the CRF.

    The default sizes are read from the notebook globals ``train_vocab``,
    ``train_vocab_words`` and ``label2id`` when the class is defined.

    Args:
        vocab_size: rows of the character embedding (index 0 is the padding index).
        char_dim: character embedding width.
        word_vocab_size: rows of the word embedding (index 0 is the padding index).
        word_dim: word embedding width.
        num_tags: number of scores emitted per position / CRF states.
        lstm_hidden_size: hidden size per direction of the character LSTM.
        word_lstm_size: hidden size per direction of the word LSTM.
        embedding_matrix: optional tensor of pretrained character vectors; when
            given it replaces the embedding weights and is frozen.
        word_embedding_matrix: optional tensor of pretrained word vectors; when
            given it replaces the word embedding weights and is frozen.
    """

    def __init__(self, vocab_size=len(train_vocab) + 1, char_dim=100, 
                 word_vocab_size=len(train_vocab_words) + 1, word_dim=300,
                 num_tags=len(label2id), lstm_hidden_size=40, word_lstm_size=40, 
                 embedding_matrix=None, word_embedding_matrix=None):
        super(BiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, char_dim, padding_idx=0)
        if embedding_matrix is None:
            # Random init with mean -1 and std 1. This overwrites every row of
            # the weight, including the padding row at index 0.
            self.embedding.weight.data.normal_(-1, 1)
            self.embedding.weight.requires_grad = True
        else:
            self.embedding.weight = nn.Parameter(embedding_matrix)
            self.embedding.weight.requires_grad = False

        self.word_embedding = nn.Embedding(word_vocab_size, word_dim, padding_idx=0)
        if word_embedding_matrix is None:
            self.word_embedding.weight.data.normal_(-1, 1)
            self.word_embedding.weight.requires_grad = True
        else:
            self.word_embedding.weight = nn.Parameter(word_embedding_matrix)
            # Freeze the pretrained *word* vectors. The original toggled
            # self.embedding here, which froze the character embedding and left
            # the pretrained word vectors trainable.
            self.word_embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(char_dim, lstm_hidden_size, batch_first=True, bidirectional=True)
        self.word_lstm = nn.LSTM(word_dim, word_lstm_size, batch_first=True, bidirectional=True)
        self.embedding_dropout = SpatialDropout(0.5)
        # Registered but currently unused: its call in _compute_feature is commented out.
        self.word_embedding_dropout = SpatialDropout(0.3)
        self.dropout = nn.Dropout(0.7)
        # Both LSTMs are bidirectional, hence twice each hidden size as input width.
        self.lstm_encoder = nn.Linear(lstm_hidden_size * 2 + word_lstm_size * 2, num_tags)
        self.crf = CRFOrg(num_tags)  # , batch_first=True

    def _compute_feature(self, x):
        """Return per-position tag scores for a batch of id sequences ``x``."""
        char_x = self.embedding(x)
        char_x = self.embedding_dropout(char_x)
        char_x, _ = self.lstm(char_x)
        # NOTE(review): the word branch is fed the same ids as the character
        # branch, not the x_words tensor that forward()/loss() receive. Passing
        # x_words here would only work if its sequence length matched x, because
        # the two branches are concatenated on the last axis below.
        w_x = self.word_embedding(x)
#         w_x = self.word_embedding_dropout(w_x)
        w_x, _ = self.word_lstm(w_x)
        x = torch.cat([char_x, w_x], dim=-1)
        x = self.dropout(x)
        x = self.lstm_encoder(x)
        return x

    def forward(self, x, x_words):
        """Run the encoder and hand the scores to the CRF layer.

        ``x_words`` is accepted for interface compatibility but is not used.
        Returns whatever ``self.crf(features)`` returns (presumably the decoded
        tag ids — confirm against CRFOrg).
        """
        x = self._compute_feature(x)
        x = self.crf(x)
        return x

    def loss(self, x, x_words, y_true):
        """Return ``self.crf.loss`` of the encoder scores against ``y_true``.

        ``x_words`` is accepted for interface compatibility but is not used.
        """
        feature = self._compute_feature(x)
#         return self.crf.log_likelihood_score(feature, y_true, reduction="mean")
        return self.crf.loss(feature, y_true)

In [40]:
# Device used for every tensor and for the trainer. Prefer the GPU, but fall
# back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [41]:
def build_dataloader(*x, y=None, batch_size=32, shuffle=False, device=device):
    """Wrap array-likes into a ``DataLoader`` over a ``TensorDataset``.

    Args:
        *x: array-likes, each converted to a ``torch.long`` tensor on ``device``.
        y: optional array-like appended as the last tensor of every batch.
        batch_size: batch size handed to the ``DataLoader``.
        shuffle: whether the ``DataLoader`` shuffles.
        device: target device; defaults to the notebook-level ``device`` that
            exists when this function is defined.

    Returns:
        A ``DataLoader`` whose batches hold one tensor per input, in argument
        order, followed by the ``y`` tensor when ``y`` is given.
    """
    arrays = list(x)
    if y is not None:
        arrays.append(y)
    # The whole data set is materialised on `device` up front.
    tensors = [torch.tensor(arr, dtype=torch.long, device=device) for arr in arrays]
    return tdata.DataLoader(
        tdata.TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle
    )

In [22]:
import gensim

def load_embedding(filename, vocab, dim=100):
    """Build an embedding matrix for ``vocab`` from a binary word2vec file.

    Args:
        filename: path of a word2vec-format file, read in binary mode with
            undecodable bytes ignored.
        vocab: mapping token -> row index.
        dim: number of columns of the matrix; must equal the vector width.

    Returns:
        ``numpy`` array of shape ``(len(vocab) + 1, dim)``. Rows of tokens that
        are absent from the file stay all-zero.
    """
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        filename, binary=True, unicode_errors="ignore"
    )
    matrix = np.zeros((len(vocab) + 1, dim))
    for token, row in vocab.items():
        if token in vectors:
            matrix[row, :] = vectors[token]
    return matrix

In [38]:
# embedding_matrix = load_embedding("../data/new_chn_fasttext.bin", token2id, dim=100)

In [42]:
# Every batch holds (character ids, word ids, tag ids); y_train / y_val are
# passed positionally, so they simply end up as the last tensor.
# Only the training loader shuffles.
train_dataloader = build_dataloader(x_train, x_train_word, y_train, batch_size=128, shuffle=True, device=device)
val_dataloader = build_dataloader(x_val, x_val_word, y_val, batch_size=128, shuffle=False, device=device)

In [79]:
import os
import random

def seed_everything(seed=1234):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and the global cuDNN flags
        ``deterministic=True`` / ``benchmark=False``.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # string hashing of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick different kernels from run to run, so switch it
    # off in case an earlier cell enabled it.
    torch.backends.cudnn.benchmark = False

In [99]:
# Disabled: pretrained character vectors (see the commented-out load_embedding call).
# embedding = torch.tensor(embedding_matrix, dtype=torch.float32, device=device)
seed_everything(1234)
# Tag ids run from 1 to len(label2id) and padded positions carry 0, hence one
# extra output class. Character embeddings are randomly initialised.
model = BiLSTMCRF(lstm_hidden_size=100, num_tags=len(label2id) + 1, 
                  embedding_matrix=None)
# The CRF loss lives on the model itself.
loss_fn = model.loss
optimizer = torch.optim.Adam(model.parameters())

# NOTE(review): y_true="input" presumably tells ModelTrainer to pass the labels
# into loss_fn together with the inputs — confirm against htorch.
model_trainer = ModelTrainer(model, loss_fn, optimizer, device=device)
model_trainer.train(train_dataloader, val_dataloader, epochs=30, verbose=1, y_true="input")

Epoch: 1, train loss: 27.5186, val loss: 17.2080, time: 16.0
Epoch: 2, train loss: 15.2048, val loss: 11.2890, time: 16.0
Epoch: 3, train loss: 11.0771, val loss: 8.3022, time: 15.9
Epoch: 4, train loss: 8.6078, val loss: 6.4483, time: 16.3
Epoch: 5, train loss: 6.9656, val loss: 5.3474, time: 16.7
Epoch: 6, train loss: 5.8960, val loss: 4.5947, time: 16.5
Epoch: 7, train loss: 5.1358, val loss: 4.1045, time: 16.4
Epoch: 8, train loss: 4.5575, val loss: 3.8734, time: 16.3
Epoch: 9, train loss: 4.1117, val loss: 3.5297, time: 16.0
Epoch: 10, train loss: 3.7585, val loss: 3.2888, time: 16.3
Epoch: 11, train loss: 3.4344, val loss: 3.1087, time: 16.1
Epoch: 12, train loss: 3.1888, val loss: 2.9730, time: 16.1
Epoch: 13, train loss: 2.9783, val loss: 2.9099, time: 16.1
Epoch: 14, train loss: 2.7985, val loss: 2.7095, time: 16.1
Epoch: 15, train loss: 2.6026, val loss: 2.6656, time: 16.4
Epoch: 16, train loss: 2.4547, val loss: 2.4899, time: 16.2
Epoch: 17, train loss: 2.3277, val loss: 2.5

BiLSTMCRF(
  (embedding): Embedding(4313, 100, padding_idx=0)
  (word_embedding): Embedding(52805, 300, padding_idx=0)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (word_lstm): LSTM(300, 40, batch_first=True, bidirectional=True)
  (embedding_dropout): SpatialDropout(p=0.5)
  (word_embedding_dropout): SpatialDropout(p=0.3)
  (dropout): Dropout(p=0.7)
  (lstm_encoder): Linear(in_features=280, out_features=8, bias=True)
  (crf): CRFOrg()
)

In [100]:
# model_trainer.predict(train_dataloader, has_label=True)

In [101]:
# Predictions for the validation set.
# NOTE(review): has_label=True presumably marks the last tensor of each batch
# as the label so it is not fed to the model — confirm against htorch.
val_pred = model_trainer.predict(val_dataloader, has_label=True)
# Separate, unshuffled training loader so predictions line up with
# train_labels / train_tokens; a larger batch size is used for inference.
train_dataloader_pred = build_dataloader(x_train, x_train_word, y_train, 
                                         batch_size=2048, shuffle=False, device=device)
train_pred = model_trainer.predict(train_dataloader_pred, has_label=True)

In [102]:
# Map predicted ids back to tag strings. `if i` skips id 0, which has no entry
# in id2label (tag ids start at 1; 0 is the padding value).
val_pred_label = [[id2label[i] for i in s if i] for s in val_pred]
train_pred_label = [[id2label[i] for i in s if i] for s in train_pred]

In [103]:
# Eyeball one validation sentence: predicted tags, gold tags, then the characters.
i = 22
print('|'.join(val_pred_label[i]))
print('|'.join(val_labels[i]))
print('|'.join(val_tokens[i]))

O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
这|个|出|版|社|与|总|社|新|闻|研|究|所|、|中|国|新|闻|学|院|合|作|编|撰|的|国|家|“|九|五|”|重|点|图|书|《|毛|泽|东|、|邓|小|平|、|江|泽|民|新|闻|宣|传|思|想|研|究|》|已|取|得|实|质|性|进|展|。


In [104]:
from htorch.metrics.sequence import sequence_f1

In [105]:
# Sequence-level score on the validation set. The call returns a 3-tuple whose
# last value is the harmonic mean of the first two (presumably precision,
# recall, F1 — confirm the order in htorch.metrics.sequence).
sequence_f1(y_val, val_pred, id2label)

(0.7875108412836079, 0.8219674109837055, 0.804370293813672)

In [106]:
# Same metric on the training set, for comparison with the validation score.
sequence_f1(y_train, train_pred, id2label)

(0.8950492508101517, 0.9157939660549556, 0.9053027844486272)

In [98]:
# Eyeball one training sentence: predicted tags, gold tags, then the characters.
i = 25
print('|'.join(train_pred_label[i]))
print('|'.join(train_labels[i]))
print('|'.join(train_tokens[i]))

B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
周|恩|来|总|理|说|，|那|就|送|一|株|万|古|常|青|的|友|谊|红|杉|吧|！


In [34]:
class MMM(nn.Module):
    """Minimal module that applies element-wise dropout (p=0.5) to its input."""

    def __init__(self):
        super(MMM, self).__init__()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return ``x`` after dropout; the identity when the module is in eval mode."""
        dropped = self.dropout(x)
        return dropped

In [35]:
# Random 5 x 10 x 10 tensor with values in [0, 1), used to try out dropout below.
a = torch.rand((5, 10, 10))

In [36]:
# A fresh module is in training mode, so dropout is active: in the output below
# some entries are zeroed and the surviving ones exceed the [0, 1) input range.
mm = MMM()
mm(a)

tensor([[[0.0000, 1.9541, 0.0000, 0.5803, 1.2009, 1.4002, 1.9079, 0.9438,
          0.4426, 0.0000],
         [0.6091, 0.0000, 0.0000, 0.0000, 0.0000, 1.3762, 0.0000, 1.9177,
          1.6712, 0.5589],
         [0.0000, 0.0000, 0.0000, 1.5544, 0.1537, 0.2537, 0.0000, 0.0000,
          0.6286, 0.0000],
         [0.0000, 1.4669, 0.0000, 0.0000, 1.7357, 0.0000, 1.9292, 0.0000,
          0.0000, 0.4287],
         [1.5792, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6595, 0.1251,
          0.0000, 0.7667],
         [0.1108, 1.5880, 0.2712, 0.0000, 0.0000, 0.0000, 0.9227, 0.0000,
          1.6739, 1.8188],
         [0.0000, 0.0000, 0.0809, 1.7883, 1.3815, 1.9409, 0.0000, 0.6050,
          1.3982, 0.0000],
         [0.0000, 0.0000, 0.0610, 0.0000, 0.3187, 1.6865, 0.6932, 0.0000,
          1.8176, 1.1273],
         [0.0000, 1.4328, 0.0000, 1.7913, 0.0000, 0.0000, 0.0000, 0.0000,
          1.3483, 0.1882],
         [0.0000, 1.7394, 1.1920, 1.4148, 1.7558, 0.0000, 0.0000, 0.0000,
          0.0000,