In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.utils.data as tdata
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
from htorch.contrib.data.reader import CHNReader, CHNDetReader

In [3]:
reader = CHNReader()
# reader = CHNDetReader()

In [4]:
train_data = reader.read("../data/CHN-NER/example.train")
val_data = reader.read("../data/CHN-NER/example.dev")
test_data = reader.read("../data/CHN-NER/example.test")

# train_data = reader.read("../data/CHN-NER-det/train_data")
# test_data = reader.read("../data/CHN-NER-det/test_data")
train_data.head(2)

Unnamed: 0,ner_tag,sentence_id,token
0,O,0,海
1,O,0,钓


In [5]:
from htorch.train.trainer import ModelTrainer

In [6]:
def convert_data_to_sentences(data):
    tokens = []
    labels = []
    for _, sentence in tqdm(data.groupby("sentence_id")):
        tokens.append(sentence.token.values)
        labels.append(sentence.ner_tag.values)
    return tokens, labels

In [7]:
train_tokens, train_labels = convert_data_to_sentences(train_data)
val_tokens, val_labels = convert_data_to_sentences(val_data)
# train_tokens, val_tokens, train_labels, val_labels = train_test_split(train_tokens, 
#                                                                       train_labels, 
#                                                                       test_size=0.1, 
#                                                                       random_state=42)
test_tokens, test_labels = convert_data_to_sentences(test_data)
print("train:", len(train_tokens))
print("val:", len(val_tokens))
print("test:", len(test_tokens))

100%|██████████| 20864/20864 [00:03<00:00, 5549.21it/s]
100%|██████████| 2318/2318 [00:00<00:00, 5509.50it/s]
100%|██████████| 4636/4636 [00:00<00:00, 5482.11it/s]

train: 20864
val: 2318
test: 4636





In [8]:
from itertools import chain

train_vocab = list(set(chain.from_iterable(train_tokens)))
val_vocab = list(set(chain.from_iterable(val_tokens)))
test_vocab = list(set(chain.from_iterable(test_tokens)))

print("train vocab size:", len(train_vocab))
print("val vocab size:", len(val_vocab))
print("only val vocab size:", len(set(val_vocab) - set(train_vocab)))

train vocab size: 4312
val vocab size: 2931
only val vocab size: 64


In [9]:
token2id = {token: i for i, token in enumerate(train_vocab, 1)}

In [10]:
labels = set(chain.from_iterable(train_labels))
print(labels)
labels = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']
labels

{'I-ORG', 'I-LOC', 'B-ORG', 'I-PER', 'B-PER', 'B-LOC', 'O'}


['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']

In [11]:
label2id = {label: i for i, label in enumerate(labels, 1)}
id2label = dict(zip(label2id.values(), label2id.keys()))
label2id

{'B-LOC': 1,
 'I-LOC': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-PER': 5,
 'I-PER': 6,
 'O': 7}

In [12]:
def convert_token_to_id(data):
    for s in data:
        yield [token2id.get(i, 0) for i in s]

def convert_label_to_id(labels):
    for s in labels:
        yield [label2id[i] for i in s]

In [13]:
x_train = list(convert_token_to_id(train_tokens))
x_val = list(convert_token_to_id(val_tokens))
x_test = list(convert_token_to_id(test_tokens))
y_train = list(convert_label_to_id(train_labels))
y_val = list(convert_label_to_id(val_labels))
y_test = list(convert_label_to_id(test_labels))

In [14]:
for p in (25, 50, 75, 90, 95, 99, 99.5):
    print(p, np.percentile([len(i) for i in x_train], p))
print("min:", min([len(i) for i in x_train]))
print("max:", max([len(i) for i in x_train]))

25 28.0
50 40.0
75 58.0
90 80.0
95 97.0
99 144.0
99.5 172.0
min: 6
max: 574


In [15]:
maxlen = 144

In [16]:
x_train = pad_sequences(x_train, maxlen=maxlen, padding="pre")
x_val = pad_sequences(x_val, maxlen=maxlen, padding="pre")
x_test = pad_sequences(x_test, maxlen=maxlen, padding="pre")
y_train = pad_sequences(y_train, maxlen=maxlen, padding="pre")
y_val = pad_sequences(y_val, maxlen=maxlen, padding="pre")
y_test = pad_sequences(y_test, maxlen=maxlen, padding="pre")

In [17]:
from htorch.nn.layers import CRF
crf = CRF(3, batch_first=True)

In [18]:
# crf(torch.tensor([[[1, 2, 3], [1, 2, 4]]]).float())
# crf.log_likelihood_score(torch.tensor([[[1, 2, 3]]]).float(), torch.tensor([[1]]))

In [113]:
from htorch.nn.layers import CRF

class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size=len(train_vocab) + 1, dim=100, num_tags=len(label2id), 
                 lstm_hidden_size=40, embedding_matrix=None):
        super(BiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, dim, padding_idx=0)
        if embedding_matrix is None:
            self.embedding.weight.data.normal_(-1, 1)
        else:
            self.embedding.weight = nn.Parameter(embedding_matrix)
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(dim, lstm_hidden_size, batch_first=True, bidirectional=True)
        self.lstm_encoder = nn.Linear(lstm_hidden_size * 2, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
    
    def _compute_feature(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.lstm_encoder(x)
        return x
    
    def forward(self, x):
        x = self._compute_feature(x)
        x = self.crf(x)
        return x
    
    def loss(self, x, y_true):
        feature = self._compute_feature(x)
        return self.crf.log_likelihood_score(feature, y_true, reduction="mean")

In [66]:
device = "cuda"

In [67]:
def build_dataloader(x, y=None, batch_size=32, shuffle=False, device=device):
    x = torch.tensor(x, dtype=torch.long, device=device)
    if y is not None:
        y = torch.tensor(y, dtype=torch.long, device=device)
        dataset = tdata.TensorDataset(x, y)
    else:
        dataset = tdata.TensorDataset(x)
    dataloader = tdata.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [109]:
import gensim

def load_embedding(filename, vocab, dim=100):
    model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True, 
                                                            unicode_errors="ignore")
    embedding = np.zeros((len(vocab) + 1, dim))
    for word, index in vocab.items():
        if word not in model:
            continue
        embedding[index, :] = model[word]
    return embedding

In [110]:
embedding_matrix = load_embedding("../data/new_chn_fasttext.bin", token2id, dim=100)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [132]:
train_dataloader = build_dataloader(x_train, y_train, batch_size=128, shuffle=True, device=device)
val_dataloader = build_dataloader(x_val, y_val, batch_size=128, shuffle=False, device=device)

In [133]:
# embedding = torch.tensor(embedding_matrix, dtype=torch.float32, device=device)
# model = BiLSTMCRF(lstm_hidden_size=100, num_tags=len(label2id) + 1, 
#                   embedding_matrix=embedding)
loss_fn = model.loss
optimizer = torch.optim.Adam(model.parameters())

model_trainer = ModelTrainer(model, loss_fn, optimizer, device=device)
model_trainer.train(train_dataloader, val_dataloader, epochs=5, verbose=1, y_true="input")

Epoch: 1, train loss: 3.5140, val loss: 30.9955, time: 79.3
Epoch: 2, train loss: 2.8375, val loss: 31.6800, time: 78.9
Epoch: 3, train loss: 2.3744, val loss: 31.6873, time: 79.5
Epoch: 4, train loss: 2.0209, val loss: 30.4260, time: 79.4
Epoch: 5, train loss: 1.7400, val loss: 31.3082, time: 79.0


BiLSTMCRF(
  (embedding): Embedding(4313, 100, padding_idx=0)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (lstm_encoder): Linear(in_features=200, out_features=8, bias=True)
  (crf): CRF(num_tags=8)
)

In [24]:
# model_trainer.predict(train_dataloader, has_label=True)

In [125]:
val_pred = model_trainer.predict(val_dataloader, has_label=True)
train_dataloader_pred = build_dataloader(x_train, y_train, batch_size=64, shuffle=False, 
                                         device=device)
train_pred = model_trainer.predict(train_dataloader_pred, has_label=True)

In [127]:
val_pred_label = [[id2label[i] for i in s if i] for s in val_pred]
train_pred_label = [[id2label[i] for i in s if i] for s in train_pred]

In [128]:
i = 22
print('|'.join(val_pred_label[i]))
print('|'.join(val_labels[i]))
print('|'.join(val_tokens[i]))

O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
这|个|出|版|社|与|总|社|新|闻|研|究|所|、|中|国|新|闻|学|院|合|作|编|撰|的|国|家|“|九|五|”|重|点|图|书|《|毛|泽|东|、|邓|小|平|、|江|泽|民|新|闻|宣|传|思|想|研|究|》|已|取|得|实|质|性|进|展|。


In [118]:
from htorch.metrics.sequence import sequence_f1

In [129]:
sequence_f1(y_val, val_pred, id2label)

(0.6143394044521538, 0.7088058705803869, 0.6582004026637758)

In [130]:
sequence_f1(y_train, train_pred, id2label)

(0.6322713126062823, 0.7159049625808327, 0.6714940452864908)

In [131]:
i = 22
print('|'.join(val_pred_label[i]))
print('|'.join(train_labels[i]))
print('|'.join(train_tokens[i]))

O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
O|O|B-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
熟|悉|世|行|运|作|的|人|士|说|，|贷|款|一|旦|被|推|迟|，|将|在|几|个|月|、|甚|至|可|能|更|长|的|时|间|后|才|能|被|重|新|考|虑|。
