In [1]:
import numpy as np
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.utils.data as tdata
from tqdm import tqdm

In [2]:
from htorch.contrib.data.reader import CHNReader, CHNDetReader

In [3]:
# Reader object used below to load the CHN-NER split files.
reader = CHNReader()
# Disabled alternative reader (presumably meant for the CHN-NER-det files — confirm).
# reader = CHNDetReader()

In [4]:
# Load the train / dev / test files of the CHN-NER example data set.
train_data = reader.read("../data/CHN-NER/example.train")
val_data = reader.read("../data/CHN-NER/example.dev")
test_data = reader.read("../data/CHN-NER/example.test")

# Disabled alternative: read the CHN-NER-det train/test files instead.
# train_data = reader.read("../data/CHN-NER-det/train_data")
# test_data = reader.read("../data/CHN-NER-det/test_data")
# Preview the first two rows of the training table.
train_data.head(2)

Unnamed: 0,ner_tag,sentence_id,token
0,O,0,海
1,O,0,钓


In [5]:
# NOTE(review): `bert_path` is an absolute, machine-specific path and is not used anywhere
# in the visible notebook; both objects below are loaded by model name instead.
bert_path = "/home/heyao/Downloads/bert-pretrain"
# Tokenizer and encoder are loaded under the same "bert-base-chinese" name so that
# token ids and embedding rows correspond.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
bert_model = BertModel.from_pretrained("bert-base-chinese")

In [6]:
from htorch.train.trainer import ModelTrainer

In [7]:
def convert_data_to_sentences(data):
    """Split a token-level table into per-sentence token and label sequences.

    ``data`` is grouped by its ``sentence_id`` column; for every group (in the
    order ``groupby`` yields them) the ``token`` and ``ner_tag`` columns are
    collected as arrays. A tqdm progress bar tracks the groups.

    Returns:
        A ``(tokens, labels)`` pair of parallel lists, one array per sentence.
    """
    per_sentence = [
        (group.token.values, group.ner_tag.values)
        for _, group in tqdm(data.groupby("sentence_id"))
    ]
    tokens = [sentence_tokens for sentence_tokens, _ in per_sentence]
    labels = [sentence_tags for _, sentence_tags in per_sentence]
    return tokens, labels

In [8]:
# Group every split into per-sentence token / label sequences (train, then val, then test).
(train_tokens, train_labels), (val_tokens, val_labels), (test_tokens, test_labels) = (
    convert_data_to_sentences(split) for split in (train_data, val_data, test_data)
)
# The validation set comes from the dedicated dev file. A disabled alternative was to hold
# out 10% of the training sentences with train_test_split(test_size=0.1, random_state=42).
print("train:", len(train_tokens))
print("val:", len(val_tokens))
print("test:", len(test_tokens))

100%|██████████| 20864/20864 [00:03<00:00, 5456.07it/s]
100%|██████████| 2318/2318 [00:00<00:00, 5479.07it/s]
100%|██████████| 4636/4636 [00:00<00:00, 5547.81it/s]

train: 20864
val: 2318
test: 4636





In [9]:
# Tokenize character by character instead of tokenizing the joined sentence.
# Running the tokenizer on the joined string lets WordPiece merge neighbouring
# digits / Latin letters into one token and drop whitespace characters, so the
# token sequence can end up shorter than the label sequence and every following
# label is shifted. One output token per input character keeps tokens and NER
# labels aligned 1:1.
_wordpiece_cache = {}  # character -> token; characters repeat heavily across the corpus


def _tokenize_aligned(chars):
    """Return exactly one BERT token per element of ``chars``.

    Each element is tokenized on its own. The first piece is kept, and elements
    for which the tokenizer returns nothing (e.g. whitespace) become "[UNK]".
    """
    aligned = []
    for ch in chars:
        if ch not in _wordpiece_cache:
            pieces = bert_tokenizer.tokenize(ch)
            _wordpiece_cache[ch] = pieces[0] if pieces else "[UNK]"
        aligned.append(_wordpiece_cache[ch])
    return aligned


train_tokens = [_tokenize_aligned(i) for i in train_tokens]
val_tokens = [_tokenize_aligned(i) for i in val_tokens]
test_tokens = [_tokenize_aligned(i) for i in test_tokens]

# Guard the invariant the rest of the notebook relies on.
assert all(len(t) == len(l) for t, l in zip(train_tokens, train_labels))
assert all(len(t) == len(l) for t, l in zip(val_tokens, val_labels))
assert all(len(t) == len(l) for t, l in zip(test_tokens, test_labels))

In [10]:
from itertools import chain

# Show which tags actually occur in the training labels.
print(set(chain.from_iterable(train_labels)))
# Fixed, hand-ordered tag inventory used for the rest of the notebook.
labels = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']
labels

{'B-ORG', 'O', 'B-PER', 'I-ORG', 'I-PER', 'I-LOC', 'B-LOC'}


['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']

In [11]:
# Tag ids start at 1, so 0 is never assigned to a real tag.
label2id = dict(zip(labels, range(1, len(labels) + 1)))
# Inverse lookup: id -> tag string.
id2label = {idx: label for label, idx in label2id.items()}
label2id

{'B-LOC': 1,
 'I-LOC': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-PER': 5,
 'I-PER': 6,
 'O': 7}

In [12]:
def convert_token_to_id(data, vocab=None):
    """Lazily map tokenized sentences to lists of integer ids.

    Args:
        data: iterable of sentences, each an iterable of tokens.
        vocab: optional token -> id mapping. When omitted, the notebook-level
            ``token2id`` mapping is used (the previous behaviour). Note that
            ``token2id`` is not defined in the visible notebook, so callers
            should normally pass ``vocab`` explicitly.

    Yields:
        One list of ids per sentence; tokens missing from the mapping become 0.
    """
    for sentence in data:
        # Resolved lazily so the global is only touched when no mapping was supplied.
        mapping = token2id if vocab is None else vocab
        yield [mapping.get(token, 0) for token in sentence]

def convert_label_to_id(labels, mapping=None):
    """Lazily map per-sentence tag sequences to lists of integer tag ids.

    Args:
        labels: iterable of sentences, each an iterable of tag strings.
        mapping: optional tag -> id mapping. When omitted, the notebook-level
            ``label2id`` is used (the previous behaviour).

    Yields:
        One list of tag ids per sentence.

    Raises:
        KeyError: if a tag is not present in the mapping.
    """
    for sentence in labels:
        # Resolved lazily so the global is only touched when no mapping was supplied.
        tag_ids = label2id if mapping is None else mapping
        yield [tag_ids[tag] for tag in sentence]

In [13]:
# Token strings -> vocabulary ids, one list per sentence.
x_train, x_val, x_test = (
    [bert_tokenizer.convert_tokens_to_ids(sentence) for sentence in split]
    for split in (train_tokens, val_tokens, test_tokens)
)

# Tag strings -> tag ids, one list per sentence.
y_train, y_val, y_test = (
    list(convert_label_to_id(split))
    for split in (train_labels, val_labels, test_labels)
)

# Attention masks: a 1 for every real token of every sentence.
mask_train, mask_val, mask_test = (
    [[1] * len(ids) for ids in split]
    for split in (x_train, x_val, x_test)
)

In [14]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 128
# Bring inputs, labels and masks to the same fixed length with identical settings
# (padding added at the front), so the three stay position-aligned.
(x_train, x_val, x_test,
 y_train, y_val, y_test,
 mask_train, mask_val, mask_test) = (
    pad_sequences(sequences, maxlen=maxlen, padding="pre")
    for sequences in (x_train, x_val, x_test,
                      y_train, y_val, y_test,
                      mask_train, mask_val, mask_test)
)

Using TensorFlow backend.


In [15]:
# from htorch.nn.layers import CRF
# crf = CRF(3, batch_first=True)
# crf(torch.tensor([[[1, 2, 3], [1, 2, 4]]]).float())
# crf.log_likelihood_score(torch.tensor([[[1, 2, 3]]]).float(), torch.tensor([[1]]))

In [55]:
from htorch.nn.layers import CRF, CRFOrg
from htorch.nn.layers import SpatialDropout

class BiLSTMCRF(nn.Module):
    """Sequence tagger: BERT hidden states -> linear tag scores -> CRF.

    Despite the class and attribute names, no LSTM is part of the active code
    path: the notebook-level ``bert_model`` produces the per-token features,
    ``lstm_encoder`` (a linear layer) turns them into tag scores, and a
    ``CRFOrg`` layer consumes those scores.
    """

    def __init__(self, num_tags=len(label2id)):
        """Create the layers.

        Args:
            num_tags: size of the tag space used by the linear layer and the CRF.
                The default is computed once, when the class is defined.
        """
        super().__init__()
        self.bert = bert_model  # the shared notebook-level instance, not a copy
        self.dropout = nn.Dropout(0.5)  # registered, but not applied in the current forward path
        self.lstm_encoder = nn.Linear(768, num_tags)  # 768 input features -> one score per tag
        self.crf = CRFOrg(num_tags)

    def _compute_feature(self, x, mask):
        """Return the linear layer's tag scores for ids ``x`` under attention mask ``mask``."""
        hidden, _ = self.bert(x, attention_mask=mask, output_all_encoded_layers=False)
        return self.lstm_encoder(hidden)

    def forward(self, x, mask):
        """Feed the tag scores to the CRF layer and return whatever it produces."""
        return self.crf(self._compute_feature(x, mask))

    def loss(self, x, mask, y_true):
        """Return ``self.crf.loss`` of the tag scores against the gold tags ``y_true``."""
        scores = self._compute_feature(x, mask)
        return self.crf.loss(scores, y_true)

In [17]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
def build_dataloader(*x, y=None, batch_size=32, shuffle=False, device=device):
    """Wrap array-likes in a ``DataLoader`` of ``torch.long`` tensors created on ``device``.

    Args:
        *x: input arrays; each becomes one tensor of the dataset, in order.
        y: optional target array, appended as the last tensor when given.
        batch_size: forwarded to ``DataLoader``.
        shuffle: forwarded to ``DataLoader``.
        device: where the tensors are created; the default is the notebook-level
            ``device`` as it was when this function was defined.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of all the tensors.
    """
    arrays = list(x)
    if y is not None:
        arrays.append(y)
    tensors = [torch.tensor(array, dtype=torch.long, device=device) for array in arrays]
    return tdata.DataLoader(
        tdata.TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle
    )

In [19]:
import gensim

def load_embedding(filename, vocab, dim=100):
    """Build an embedding matrix for ``vocab`` from a binary word2vec-format file.

    Args:
        filename: path passed to ``KeyedVectors.load_word2vec_format`` (binary
            mode, undecodable bytes ignored).
        vocab: mapping of word -> row index in the returned matrix.
        dim: width of each embedding row.

    Returns:
        An array of shape ``(len(vocab) + 1, dim)``; rows of words that are not
        in the loaded vectors stay all-zero.
    """
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        filename, binary=True, unicode_errors="ignore"
    )
    matrix = np.zeros((len(vocab) + 1, dim))
    for word, row in vocab.items():
        if word in vectors:
            matrix[row, :] = vectors[word]
    return matrix

In [20]:
# embedding_matrix = load_embedding("../data/new_chn_fasttext.bin", token2id, dim=100)

In [56]:
# Batches of 16 (ids, mask, labels); only the training loader is shuffled.
train_dataloader = build_dataloader(
    x_train, mask_train, y=y_train, batch_size=16, shuffle=True, device=device
)
val_dataloader = build_dataloader(
    x_val, mask_val, y=y_val, batch_size=16, shuffle=False, device=device
)

In [71]:
from pytorch_pretrained_bert import BertAdam

# Create the model in this cell so it works on a fresh kernel; previously `model`
# had to already exist in the session because its construction was commented out.
# One extra tag is reserved for id 0 (padding positions); real tags use 1..len(label2id).
# `.to(device)` is harmless if the trainer also moves the model itself.
model = BiLSTMCRF(num_tags=len(label2id) + 1).to(device)
loss_fn = model.loss
# BertAdam with a small learning rate for fine-tuning; plain torch.optim.Adam was the
# disabled alternative.
optimizer = BertAdam(model.parameters(), lr=3e-5)

model_trainer = ModelTrainer(model, loss_fn, optimizer, device=device)
# NOTE(review): y_true="input" presumably makes the trainer pass the labels into
# `loss_fn` together with the inputs — confirm against ModelTrainer.
model = model_trainer.train(train_dataloader, val_dataloader=None, epochs=1, verbose=1,
                            y_true="input", verbose_step=True)

t_total value of -1 results in schedule not being applied
100%|██████████| 1304/1304 [11:10<00:00,  1.96it/s]

Epoch: 1, train loss: 1.1926, time: 670.9





In [23]:
# model_trainer.predict(train_dataloader, has_label=True)

In [72]:
# Predict on the validation loader. NOTE(review): has_label=True presumably tells the
# trainer that every batch also carries the label tensor — confirm.
val_pred = model_trainer.predict(val_dataloader, has_label=True)
# Rebuild the training loader without shuffling so predictions come back in the same
# order as y_train / train_labels.
train_dataloader_pred = build_dataloader(x_train, mask_train, y_train, batch_size=16, 
                                         shuffle=False, device=device)
train_pred = model_trainer.predict(train_dataloader_pred, has_label=True)

In [73]:
# Decode predicted tag ids back to tag strings, dropping falsy ids (0) along the way.
val_pred_label, train_pred_label = (
    [[id2label[tag_id] for tag_id in filter(None, sequence)] for sequence in predictions]
    for predictions in (val_pred, train_pred)
)

In [74]:
i = 22
# Predicted tags, gold tags and tokens of one validation sentence, one sequence per line.
print(*('|'.join(row) for row in (val_pred_label[i], val_labels[i], val_tokens[i])), sep="\n")

O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
O|O|O|O|O|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|B-ORG|I-ORG|I-ORG|I-ORG|I-ORG|I-ORG|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|B-PER|I-PER|I-PER|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O|O
这|个|出|版|社|与|总|社|新|闻|研|究|所|、|中|国|新|闻|学|院|合|作|编|撰|的|国|家|[UNK]|九|五|[UNK]|重|点|图|书|《|毛|泽|东|、|邓|小|平|、|江|泽|民|新|闻|宣|传|思|想|研|究|》|已|取|得|实|质|性|进|展|。


In [75]:
from htorch.metrics.sequence import sequence_f1

In [76]:
# Score the validation predictions against the padded gold tag ids.
# NOTE(review): the returned triple is presumably (precision, recall, F1) — confirm.
sequence_f1(y_val, val_pred, id2label)

(0.8428862379982542, 0.8760205624433021, 0.8591340450771056)

In [77]:
# Same metric on the training split, for comparison with the validation scores.
# NOTE(review): the returned triple is presumably (precision, recall, F1) — confirm.
sequence_f1(y_train, train_pred, id2label)

(0.9314683960894395, 0.9451301358651171, 0.9382495368715266)

In [None]:
i = 23
# Inspect one *training* sentence. The prediction must come from the training
# predictions too; the previous version printed a validation prediction next to
# training labels and tokens, which describe a different sentence.
print('|'.join(train_pred_label[i]))
print('|'.join(train_labels[i]))
print('|'.join(train_tokens[i]))

In [None]:
class MMM(nn.Module):
    """Minimal module whose forward pass applies only a dropout layer."""

    def __init__(self):
        """Register a single ``nn.Dropout`` with its default settings."""
        super(MMM, self).__init__()
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Return ``x`` after passing it through the dropout layer."""
        dropped = self.dropout(x)
        return dropped

In [None]:
# Random tensor of shape (5, 10, 10), used as input for the dropout experiment.
a = torch.rand(5, 10, 10)

In [None]:
# A freshly constructed module is in training mode, so the dropout layer is active here.
mm = MMM()
mm(a)