In [None]:
import numpy as np
import pandas as pd
import re
import os
import codecs
from sklearn.preprocessing import LabelEncoder
from collections import namedtuple, deque
import copy
import torch.nn as nn
import torch
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score
import seqeval.metrics
from transformers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torchcrf
import matplotlib.pyplot as plt
import jieba
import random
import math

In [None]:
# Read the raw annotation dump; the context manager closes the handle even if reading fails.
with codecs.open('d:/ds/project007/ds5/NER_all_cl_2.txt', encoding='utf8') as f:
    train_ds = f.read()
# Extract every "originalText" value. A raw string is used so that `\{` is a regex escape
# rather than an invalid Python string escape (same pattern as before).
ori_text = re.findall(r'(?<=\{"originalText": ")(.*?)(?=", "entities":)', train_ds)

In [None]:
# Lower-case every document (rebinding ori_text).
ori_text = list(map(str.lower, ori_text))

In [None]:
# Cumulative, normalised histogram of document lengths in characters (bin edges 0, 64, ..., 1216).
plt.hist(list(map(len, ori_text)),
         bins=list(range(0, 64 * 20, 64)),
         density=True,
         cumulative=True)

# CUT

In [None]:
# Segment the first 512 characters of each document into words with jieba.
cut_str = list(map(lambda doc: jieba.lcut(doc[:512]), ori_text))

In [None]:
# Cumulative, normalised histogram of document lengths in words (bin edges 0, 64, ..., 576).
plt.hist(list(map(len, cut_str)),
         bins=list(range(0, 64 * 10, 64)),
         density=True,
         cumulative=True)

# Normal Order Input Process

## Normal Order Input Labeling

In [None]:
def marking(str_ls, return_tuple=True):
    """Label every character of every word with its position inside the word.

    Characters of a multi-character word get 'I-0', 'I-1', ...; the single
    character of a one-character word gets 'O'. `None` entries in `str_ls`
    are skipped.

    Args:
        str_ls: iterable of words (strings); may contain `None`.
        return_tuple: if truthy, return (char, label) pairs per word;
            otherwise return only the labels per word.

    Returns:
        A list with one inner list per non-None word.
    """
    # Drop None once so that words and labels stay aligned; previously only the
    # label list was filtered and the later zip paired labels with the wrong word.
    words = [word for word in str_ls if word is not None]
    lbl_ls = [['I-' + str(pos) if len(word) > 1 else 'O' for pos in range(len(word))]
              for word in words]
    if return_tuple:
        return [list(zip(word, word_lbls)) for word, word_lbls in zip(words, lbl_ls)]
    return lbl_ls

In [None]:
# Per document: list of words, each word a list of (char, position-label) pairs.
lbl_str = list(map(marking, cut_str))

In [None]:
# Flatten each document to its character sequence (original order).
nrm_ord_char_ls = [[pair[0] for word_pairs in doc for pair in word_pairs] for doc in lbl_str]

In [None]:
# Flatten each document to its per-character label sequence (original order).
nrm_ord_labl_ls = [[pair[1] for word_pairs in doc for pair in word_pairs] for doc in lbl_str]

In [None]:
# Sanity check: every document has exactly one label per character.
all(len(doc_chars) == len(doc_labls)
    for doc_chars, doc_labls in zip(nrm_ord_char_ls, nrm_ord_labl_ls))

In [None]:
# Independent copy of the segmented words (the strings themselves are immutable).
nrm_ord_word_ls = [list(doc_words) for doc_words in cut_str]

## Normal Order Input Tokenizing

In [None]:
# Load the tokenizer vocabulary from the local roberta-wwm-ext checkpoint directory.
# NOTE(review): `return_tensors` is normally passed when calling the tokenizer, not to
# `from_pretrained` — confirm it has any effect here.
tokenizer = BertTokenizer.from_pretrained("D:/NLP/roberta-wwm-ext",
                                          return_tensors='pt')

In [None]:
# Map each document's characters to vocabulary ids, one id per character.
nrm_ord_tokn_ls = list(map(tokenizer.convert_tokens_to_ids, nrm_ord_char_ls))

In [None]:
# Distinct position labels over the whole corpus. Sorted so the label -> index mapping
# built from this list is the same on every kernel run (set iteration order of strings
# is not stable across processes).
all_labl_types = sorted({labl for doc_labls in nrm_ord_labl_ls for labl in doc_labls})

In [None]:
# Add the padding class exactly once, so re-running this cell does not grow the label
# list (and with it the classifier size) with duplicate 'PAD' entries.
if 'PAD' not in all_labl_types:
    all_labl_types.append('PAD')
# Label string -> integer class id.
lbl2idx = {l: i for i, l in enumerate(all_labl_types)}

In [None]:
# Pad (with id 0) or truncate every token-id sequence to exactly 512 positions;
# both padding and truncation happen at the end of the sequence.
nrm_input_ids = pad_sequences(nrm_ord_tokn_ls,
                              maxlen=512,
                              dtype='long',
                              value=0.0,
                              truncating='post',
                              padding='post')

In [None]:
# Attention mask: 1.0 where the token id is non-zero (real token), 0.0 on padding.
nrm_attn_masks = [[float(tok_id != 0.0) for tok_id in id_row] for id_row in nrm_input_ids]

In [None]:
# Word vocabulary for the normal-order text, with '_PAD_' placed first (index 0).
nrm_ord_word_set = ['_PAD_'] + list(
    set([word for doc_words in nrm_ord_word_ls for word in doc_words]))

In [None]:
# Word -> vocabulary index for the normal-order text.
nrm_word2idx = dict(zip(nrm_ord_word_set, range(len(nrm_ord_word_set))))

In [None]:
# Word-level ids per document, padded/truncated at the end to 256 words with the '_PAD_' id.
nrm_word_lbl = pad_sequences(
    [[nrm_word2idx.get(word) for word in doc_words] for doc_words in nrm_ord_word_ls],
    maxlen=256,
    dtype='long',
    padding='post',
    truncating='post',
    value=nrm_word2idx['_PAD_'])

In [None]:
# Word-level mask: 1.0 for real words, 0.0 where the id equals the '_PAD_' id.
nrm_word_msk = [[float(word_id != nrm_word2idx['_PAD_']) for word_id in id_row]
                for id_row in nrm_word_lbl]

# INVERSE Order Input Processing

In [None]:
class inverse:
    """Shuffle the characters inside a random subset of the words of each sentence.

    `lbl_pair` is a list of sentences; each sentence is a list of words; each
    word is a list of (char, label) pairs. A word is eligible for shuffling when
    it is longer than one character and consists only of characters in the
    range U+4E00..U+9FA5. For every sentence, floor(ratio * n_eligible) eligible
    word positions are drawn without replacement at construction time.
    """

    def __init__(self, lbl_pair, ratio):
        super(inverse, self).__init__()
        self.lbl_pair_ = lbl_pair                 # reference to the caller's data (never modified here)
        self.lbl_pair = copy.deepcopy(lbl_pair)   # working copy that gets shuffled in place
        self.ratio = ratio

        self.rnd_idx = []
        for sent in lbl_pair:
            eligible = []
            for pos, word_pairs in enumerate(sent):
                word = ''.join([pair[0] for pair in word_pairs])
                has_other_chars = len(re.findall('[^\u4e00-\u9fa5]+', word)) != 0
                if not has_other_chars and len(word) > 1:
                    eligible.append(pos)
            chosen = np.random.choice(eligible,
                                      size=math.floor(ratio * len(eligible)),
                                      replace=False)
            self.rnd_idx.append(chosen.tolist())

    def inverse_pair(self):
        """Shuffle the selected words in place and return (chars, labels) per sentence.

        Each (char, label) pair moves as a unit, so every character keeps the
        label it had before shuffling.
        """
        for sent, chosen in zip(self.lbl_pair, self.rnd_idx):
            for pos in chosen:
                random.shuffle(sent[pos])

        flat_chars, flat_labls = [], []
        for sent in self.lbl_pair:
            flat_chars.append([pair[0] for word_pairs in sent for pair in word_pairs])
            flat_labls.append([pair[1] for word_pairs in sent for pair in word_pairs])
        self.inv_ord_char_ls = flat_chars
        self.inv_ord_labl_ls = flat_labls

        assert all(
            len(char) == len(labl)
            for char, labl in zip(self.inv_ord_char_ls, self.inv_ord_labl_ls))
        return self.inv_ord_char_ls, self.inv_ord_labl_ls

    def shattered(self):
        """Return, per sentence, the words re-joined from their current character order."""
        self.inv_ord_word_ls = [
            [''.join(pair[0] for pair in word_pairs) for word_pairs in sent]
            for sent in self.lbl_pair
        ]
        return self.inv_ord_word_ls

In [None]:
# One-off run with half of the eligible words shuffled. The same three result names
# are reassigned by the per-epoch loop further down.
inv = inverse(lbl_str, ratio=0.5)
inv_ord_char_ls, inv_ord_labl_ls = inv.inverse_pair()
inv_ord_word_ls = inv.shattered()

## SHUF INIT

In [None]:
epochs = 10
shuf_ratio_init = 0.6
shuf_ratio_min = 0.1
# One shuffle ratio per epoch, decreasing linearly from shuf_ratio_init towards
# (but not reaching) shuf_ratio_min. linspace guarantees exactly `epochs` values,
# whereas the length of np.arange with a fractional step depends on rounding.
shuf_ratio = list(np.linspace(shuf_ratio_init, shuf_ratio_min, num=epochs, endpoint=False))

In [None]:
# Build one independently shuffled copy of the corpus per epoch, using that epoch's ratio.
inv_ord_char_all, inv_ord_labl_all, inv_ord_word_all = [], [], []
for i in range(epochs):
    inv = inverse(lbl_str, ratio=shuf_ratio[i])
    # inverse_pair() must run before shattered() so the words reflect the shuffle.
    inv_ord_char_ls, inv_ord_labl_ls = inv.inverse_pair()
    inv_ord_word_ls = inv.shattered()
    inv_ord_char_all += [inv_ord_char_ls]
    inv_ord_labl_all += [inv_ord_labl_ls]
    inv_ord_word_all += [inv_ord_word_ls]

In [None]:
# Pool the sentences of all epochs, then collect the distinct (possibly shuffled) words.
temp = [sent for epoch_sents in inv_ord_word_all for sent in epoch_sents]
inv_ord_word_all_set = deque(set([word for sent in temp for word in sent]))

In [None]:
# Put '_PAD_' at index 0 and materialise the vocabulary as a list. Written so the cell
# can be re-run: the input may be the deque from the previous cell or the list this
# cell produced earlier (a list has no appendleft, so the old form crashed on re-run).
inv_ord_word_all_set = ['_PAD_'] + [w for w in inv_ord_word_all_set if w != '_PAD_']

In [None]:
# Size of the shuffled-word vocabulary, including '_PAD_'.
len(inv_ord_word_all_set)

## Inverse Order Input Tokenizing

In [None]:
# Per epoch, per document: vocabulary ids of the shuffled character sequence.
inv_ord_tokn_all = [list(map(tokenizer.convert_tokens_to_ids, epoch_chars))
                    for epoch_chars in inv_ord_char_all]

In [None]:
# Per epoch: pad (with id 0) or truncate each shuffled token-id sequence to 512 positions,
# padding and truncating at the end.
inv_input_ids_all = [pad_sequences(x,
                              maxlen=512,
                              dtype='long',
                              value=0.0,
                              truncating='post',
                              padding='post') for x in inv_ord_tokn_all]

In [None]:
# Per epoch attention masks: 1.0 where the token id is non-zero, 0.0 on padding.
inv_attn_masks_all = [[[float(tok_id != 0.0) for tok_id in id_row] for id_row in epoch_ids]
                      for epoch_ids in inv_input_ids_all]

In [None]:
# Per epoch: label ids for the shuffled characters, padded/truncated at the end to 512
# positions with the 'PAD' class id.
inv_lbl_all = [
    pad_sequences([[lbl2idx.get(labl) for labl in doc_labls] for doc_labls in epoch_labls],
                  maxlen=512,
                  dtype='long',
                  padding='post',
                  truncating='post',
                  value=lbl2idx['PAD'])
    for epoch_labls in inv_ord_labl_all
]

In [None]:
# Word -> vocabulary index for the shuffled text.
inv_word2idx = dict(zip(inv_ord_word_all_set, range(len(inv_ord_word_all_set))))

In [None]:
# Per epoch: word-level ids of the shuffled words, padded/truncated at the end to 256
# words with the '_PAD_' id.
inv_word_lbl = [
    pad_sequences([[inv_word2idx.get(word) for word in doc_words] for doc_words in epoch_words],
                  maxlen=256,
                  dtype='long',
                  padding='post',
                  truncating='post',
                  value=inv_word2idx['_PAD_'])
    for epoch_words in inv_ord_word_all
]

In [None]:
# Per epoch word-level masks: 1.0 for real words, 0.0 where the id equals the '_PAD_' id.
inv_word_msk = [[[float(word_id != inv_word2idx['_PAD_']) for word_id in id_row]
                 for id_row in epoch_ids]
                for epoch_ids in inv_word_lbl]

# Model Constructing

In [None]:
class WordEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(emb_size)."""

    def __init__(self, vocab_size, emb_size=768, **kwargs):
        super().__init__(**kwargs)
        self.emb_size = emb_size
        self.lut = nn.Embedding(vocab_size, emb_size)

    def forward(self, word):
        """Return the scaled embeddings for the integer ids in `word`."""
        scale = math.sqrt(self.emb_size)
        return scale * self.lut(word)

In [None]:
class WordPositionEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an embedding tensor, then apply dropout.

    Args:
        dropout: dropout probability applied after the addition.
        emb_size: size of the last (feature) axis; even indices hold sines,
            odd indices hold cosines.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, dropout, emb_size=768, max_len=256, **kwargs):
        super(WordPositionEncoding, self).__init__(**kwargs)
        self.dropout = nn.Dropout(p=dropout)

        # Build everything in float explicitly instead of relying on int -> float promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, emb_size, 2, dtype=torch.float) * -(math.log(10000.0) / emb_size))

        pe = torch.zeros(max_len, emb_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Registered as a buffer: follows .cuda()/.to() and is saved in the state dict,
        # but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, emb):
        """Add the encodings of the first emb.size(1) positions to `emb` and apply dropout."""
        # A buffer never requires grad, so the deprecated autograd.Variable wrapper is not needed.
        return self.dropout(emb + self.pe[:, :emb.size(1)])

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention with an optional per-query mask.

    Args:
        query, key, value: tensors whose last two axes are (length, features);
            any leading axes (batch, heads, ...) must broadcast in matmul.
        mask: optional tensor. When it is 2-D, (batch, query_len), every query
            position where it equals 0 has its whole score row set to -1e9
            before the softmax. Other ranks are handled as before
            (`mask.unsqueeze(2)` broadcast against the scores).
        dropout: optional callable applied to the attention weights.

    Returns:
        (attended values, attention weights).
    """
    d_k = key.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        if mask.dim() == 2:
            # (batch, q_len) -> (batch, 1, ..., q_len, 1) so it lines up with
            # scores of shape (batch, ..., q_len, k_len). The previous
            # mask.unsqueeze(2) only broadcast against multi-head scores when
            # the batch size was 1; for that case the result is unchanged.
            query_mask = mask.unsqueeze(-1)
            while query_mask.dim() < scores.dim():
                query_mask = query_mask.unsqueeze(1)
        else:
            query_mask = mask.unsqueeze(2)
        scores = scores.masked_fill(query_mask == 0, -1e9)

    p_attn = torch.nn.functional.softmax(scores, dim=-1)

    if dropout is not None:
        p_attn = dropout(p_attn)

    return torch.matmul(p_attn, value), p_attn

In [None]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with a residual connection back to the raw query.

    `emb_size` must be divisible by `head`. The projected per-head tensors, the
    attended values and the attention weights of the latest call are kept on
    the instance as `query`, `key`, `value`, `x` and `attn`.
    """

    def __init__(self, head, emb_size, dropout=0.1):
        super().__init__()
        assert emb_size % head == 0

        self.head = head
        self.d_head = emb_size // head

        # Input projections followed by the output projection applied after the heads are merged.
        self.q_lin = nn.Linear(emb_size, emb_size)
        self.k_lin = nn.Linear(emb_size, emb_size)
        self.v_lin = nn.Linear(emb_size, emb_size)
        self.concat_lin = nn.Linear(emb_size, emb_size)

        self.dropout = nn.Dropout(p=dropout)
        self.attn = None

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, length, emb) into (batch, head, length, d_head)."""
        return projected.view(batch_size, -1, self.head, self.d_head).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Return `query + concat_lin(multi-head attention output)`."""
        batch_size = query.size(0)

        self.query = self._split_heads(self.q_lin(query), batch_size)
        self.key = self._split_heads(self.k_lin(key), batch_size)
        self.value = self._split_heads(self.v_lin(value), batch_size)

        self.x, self.attn = attention(self.query,
                                      self.key,
                                      self.value,
                                      mask=mask,
                                      dropout=self.dropout)

        # (batch, head, length, d_head) -> (batch, length, head * d_head)
        merged = self.x.transpose(1, 2).contiguous().view(
            batch_size, -1, self.head * self.d_head)

        return query + self.concat_lin(merged)

In [None]:
class Encoder(nn.Module):
    """Two stacked cross-attention blocks over BERT outputs and word embeddings.

    Holds separate word-embedding tables for the normal-order and shuffled
    vocabularies, a shared BERT model, two 12-head attention blocks of width
    768, one LayerNorm (applied after both attention blocks), a dropout and a
    linear layer projecting 768 features to `cls` classes.
    """

    def __init__(self,
                 nrm_vocab_size,
                 inv_vocab_size,
                 bert_model,
                 cls,
                 dropout=0.1):
        super(Encoder, self).__init__()
        self.nrm_wd_emb = WordEmbedding(vocab_size=nrm_vocab_size)
        self.inv_wd_emb = WordEmbedding(vocab_size=inv_vocab_size)
        self.bert_model = bert_model

        self.attn1 = MultiHeadedAttention(head=12,
                                          emb_size=768,
                                          dropout=dropout)
        self.attn2 = MultiHeadedAttention(head=12,
                                          emb_size=768,
                                          dropout=dropout)

        self.layernorm = nn.LayerNorm(768, eps=1e-6)
        self.lastdrop = nn.Dropout(p=dropout)
        self.pooler = nn.Linear(in_features=768, out_features=cls)

    def forward(self, _nrm_word_lbl, _nrm_word_msk, _nrm_input_ids,
                _nrm_attn_masks, _inv_word_lbl, _inv_word_msk,
                _inv_input_ids, _inv_attn_masks, _inv_lbls):
        """Return per-position class scores from `pooler`.

        `_inv_word_msk` and `_inv_lbls` are accepted but not used in this method.
        """
        
        # Block 1: normal-order word embeddings query the BERT output of the shuffled
        # characters (keys) and of the normal-order characters (values).
        # presumably element [0] of the BERT output is the last hidden state — TODO confirm
        q1 = self.nrm_wd_emb(_nrm_word_lbl)
        k1 = self.bert_model(_inv_input_ids, attention_mask=_inv_attn_masks)[0]
        v1 = self.bert_model(_nrm_input_ids, attention_mask=_nrm_attn_masks)[0]
        attn1_res = self.layernorm(self.attn1(q1, k1, v1, mask=_nrm_word_msk))
        # Block 2: the shuffled-character BERT output queries the shuffled word
        # embeddings (keys) and the result of block 1 (values).
        q2 = k1
        k2 = self.inv_wd_emb(_inv_word_lbl)
        v2 = attn1_res
        attn2_res = self.layernorm(self.attn2(q2, k2, v2, mask=_inv_attn_masks))
        
        output = self.pooler(self.lastdrop(attn2_res))
        
        return output

In [None]:
# Load the BERT weights from the local roberta-wwm-ext checkpoint directory.
bert_mdl = BertModel.from_pretrained('D:/NLP/roberta-wwm-ext')

In [None]:
# Convert the normal-order inputs to int64 tensors. Note that this rebinds the same
# names, so after this cell they are tensors rather than arrays / nested lists.
nrm_word_lbl = torch.LongTensor(nrm_word_lbl)
nrm_word_msk = torch.LongTensor(nrm_word_msk)
nrm_input_ids = torch.LongTensor(nrm_input_ids)
nrm_attn_masks = torch.LongTensor(nrm_attn_masks)

In [None]:
# Convert every per-epoch shuffled input to an int64 tensor (one tensor per epoch).
inv_word_lbl = list(map(torch.LongTensor, inv_word_lbl))
inv_word_msk = list(map(torch.LongTensor, inv_word_msk))
inv_input_ids_all = list(map(torch.LongTensor, inv_input_ids_all))
inv_attn_masks_all = list(map(torch.LongTensor, inv_attn_masks_all))
inv_lbl_all = list(map(torch.LongTensor, inv_lbl_all))

In [None]:
# Embedding-table sizes: number of distinct words (including '_PAD_') in each vocabulary.
nrm_vocab_size = len(nrm_word2idx)
inv_vocab_size = len(inv_word2idx)

In [None]:
# Build the encoder with one output class per label (including 'PAD') and move it to the GPU.
ENC = Encoder(nrm_vocab_size=nrm_vocab_size,
              inv_vocab_size=inv_vocab_size,
              bert_model=bert_mdl,
              cls=len(lbl2idx)).cuda()

In [None]:
# One optimizer step per document per epoch (the training DataLoader uses batch_size=1).
total_steps = nrm_word_lbl.shape[0] * epochs
# Upper bound on the gradient norm used for clipping in the training loop.
max_grad_norm = 1.0

In [None]:
# Optimizer over all encoder parameters (BERT included), with a linear schedule:
# 20 warm-up steps, then the schedule runs over total_steps.
ENC_optimizer = AdamW(ENC.parameters(), lr=1e-4)
ENC_scheduler = get_linear_schedule_with_warmup(
    ENC_optimizer,
    num_warmup_steps=20,
    num_training_steps=total_steps
)

In [None]:
# Plain cross-entropy with no ignore_index: positions labelled with the 'PAD' class
# contribute to the loss like any other class.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Pre-training loop: each epoch pairs the fixed normal-order inputs with that epoch's
# shuffled inputs and trains the encoder to predict the position label of every
# shuffled character.
loss_values = []
for i in range(epochs):
    # Dataset for epoch i: normal-order tensors plus the i-th shuffled tensors and labels.
    train_data = torch.utils.data.TensorDataset(
        nrm_word_lbl, nrm_word_msk, nrm_input_ids, nrm_attn_masks,
        inv_word_lbl[i], inv_word_msk[i], inv_input_ids_all[i],
        inv_attn_masks_all[i], inv_lbl_all[i])
    train_sampler = torch.utils.data.RandomSampler(train_data)
    train_dataloader = torch.utils.data.DataLoader(train_data,
                                                   sampler=train_sampler,
                                                   batch_size=1)

    ENC.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.cuda() for t in batch)
        b_nrm_word_lbl, b_nrm_word_msk, b_nrm_input_ids, b_nrm_attn_masks, b_inv_word_lbl, b_inv_word_msk, b_inv_input_ids, b_inv_attn_masks, b_inv_lbls = batch
        
        ENC.zero_grad()
        
        outputs = ENC(b_nrm_word_lbl, b_nrm_word_msk, b_nrm_input_ids, b_nrm_attn_masks, b_inv_word_lbl, b_inv_word_msk, b_inv_input_ids, b_inv_attn_masks, b_inv_lbls)
        # Progress trace, printed once per step.
        print(i, 'epoch', step, 'step', outputs.shape)
        
        # Move the class axis to position 1, where CrossEntropyLoss expects it.
        loss = loss_fn(outputs.permute(0, 2, 1), b_inv_lbls)
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=ENC.parameters(), max_norm=max_grad_norm)
        
        ENC_optimizer.step()
        ENC_scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)

In [None]:
# Persist only the BERT sub-module of the encoder; the word embeddings, attention
# blocks and output layer are not saved.
# NOTE(review): later cells load the tokenizer and model from 'D:/NLP/00inv_ner/K_INV',
# not from this directory — confirm how K_INV gets populated.
ENC.bert_model.save_pretrained('d:/NLP/00inv_ner/')

# DOWNSTREAM TASK

In [None]:
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score
import seqeval.metrics
import torchcrf

In [None]:
# Read the annotated NER dump; the context manager closes the handle even if reading fails.
with codecs.open('d:/ds/project007/ds5/NER_all_cl.txt', encoding='utf8') as f:
    train_ds = f.read()

# Raw strings keep `\{`, `\[`, `\]`, `\}` as regex escapes instead of invalid Python
# string escapes; the compiled patterns are the same as before.
# Text of every record.
ori_text = re.findall(r'(?<=\{"originalText": ")(.*?)(?=", "entities":)', train_ds)
# Raw text of the entity list of every record (greedy up to the last `]}` on the line).
ents = re.findall(r'(?<="entities": \[)(.*)(?=\]\})', train_ds)
# Per record: the label_type value of each entity, in order of appearance.
lbl_names = [re.findall(r'(?<="label_type": ")(.*?)(?=")', x) for x in ents]

In [None]:
# Per record: start / end offsets of each entity, extracted as digit strings.
start_pos = [re.findall('(?<=\"start_pos\": )([0-9]*)', x) for x in ents]
end_pos = [re.findall('(?<=\"end_pos\": )([0-9]*)', x) for x in ents]

# Convert to integer arrays. The `np.int` alias was deprecated in NumPy 1.20 and
# removed in 1.24; the builtin `int` selects the same default integer dtype.
start_pos = [np.array(x, dtype=int) for x in start_pos]
end_pos = [np.array(x, dtype=int) for x in end_pos]

In [None]:
# All entity label names of the corpus in one flat list.
lbl_names_flatten = [name for record_names in lbl_names for name in record_names]

In [None]:
# Fit an encoder on the distinct entity label names and encode each record's labels.
lbl_enc = LabelEncoder()
lbl_enc.fit(list(set(lbl_names_flatten)))
# Bare expression in the middle of the cell: evaluated but not displayed.
lbl_enc.classes_
lbl_codes = [lbl_enc.transform(x) for x in lbl_names]

In [None]:
# Turn the integer label codes into strings so they can be concatenated into tag names.
lbl_codes = [record_codes.astype('str_') for record_codes in lbl_codes]

In [None]:
# Per record: (start offsets, end offsets, label codes) of its entities.
lbl_marks = list(zip(start_pos, end_pos, lbl_codes))

In [None]:
# Per record: an object array with one 'O' tag per character of the text.
y_init = [np.full(len(text), 'O', dtype=object) for text in ori_text]

In [None]:
def cat_sub(marks, init_seq_):
    """Write B-/I- tags for every entity span into a copy of `init_seq_`.

    Args:
        marks: triple (start offsets, end offsets, category strings) of equal length.
        init_seq_: tag sequence to start from; it is copied, not modified.

    Returns:
        The copy, where for each entity positions [start, end) hold 'I-<cat>'
        and position `start` is then overwritten with 'B-<cat>'.
    """
    st_pos, ed_pos, cats = marks[0], marks[1], marks[2]
    assert len(st_pos) == len(ed_pos) == len(cats)

    init_seq = copy.deepcopy(init_seq_)
    for start, end, cat in zip(st_pos, ed_pos, cats):
        init_seq[start:end] = 'I-' + cat
        init_seq[start] = 'B-' + cat

    return init_seq

In [None]:
# Per record: the character-level B-/I-/O tag sequence.
y = [cat_sub(lbl_marks[idx], init_seq) for idx, init_seq in enumerate(y_init)]

In [None]:
def cleantext(text_, lbl_seq_):
    """Remove backslashes from `text_` and drop the labels at the same positions.

    Args:
        text_: the raw text.
        lbl_seq_: one label per character of `text_`; not modified.

    Returns:
        (text without backslashes, labels without the entries that belonged to
        the removed backslashes). Both have the same length.
    """
    backslash_idx = [match.start() for match in re.finditer(r'\\', text_)]
    # np.delete returns a new array, so the caller's sequence is left untouched.
    lbl_seq = np.delete(lbl_seq_, backslash_idx)

    text = re.sub(r'\\', '', text_)

    assert len(text) == len(lbl_seq)
    # Return the cleaned labels; the original sequence was returned here before,
    # which no longer lined up with the cleaned text.
    return text, lbl_seq

In [None]:
# Set of all distinct tags that occur in any record.
all_types = set([tag for record_tags in y for tag in np.unique(record_tags).tolist()])

In [None]:
# Rebind `tokenizer` to the one stored alongside the K_INV checkpoint.
tokenizer = BertTokenizer.from_pretrained("D:/NLP/00inv_ner/K_INV")

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize item by item and repeat each item's label for every produced sub-token.

    `sentence` and `text_labels` are walked in parallel; each item of `sentence`
    is passed to the module-level `tokenizer`. An item that yields no tokens
    contributes neither a token nor a label.

    Returns:
        (list of tokens, list of labels) of equal length.
    """
    tokenized_sentence, labels = [], []

    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        tokenized_sentence += pieces
        # One copy of the label per sub-token keeps tokens and labels aligned.
        labels += [label] * len(pieces)

    return tokenized_sentence, labels

In [None]:
# Per record: (tokens, aligned labels), tokenizing the text character by character.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(text, tag_seq) for text, tag_seq in zip(ori_text, y)
]

In [None]:
# Separate the (tokens, labels) pairs into two parallel lists.
tokenized_texts = [tokens for tokens, _ in tokenized_texts_and_labels]
labels = [token_labels for _, token_labels in tokenized_texts_and_labels]

In [None]:
# Tag vocabulary with 'PAD' as the last entry. Sorted so that the tag <-> index mapping
# (and therefore the meaning of each classifier output) is identical on every kernel
# run; iterating a set of strings gives a different order from process to process.
tag_values = sorted(all_types)
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
# Token ids per record, padded (with id 0) or truncated at the end to 512 positions.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=512, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
# Tag ids per record, padded (with the 'PAD' id) or truncated at the end to 512 positions.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=512, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
# Attention mask: 1.0 where the token id is non-zero, 0.0 on padding.
attention_masks = [[float(tok_id != 0.0) for tok_id in id_row] for id_row in input_ids]

In [None]:
# Positional split without shuffling: the first 1000 records are used for training,
# everything after them for validation.
tr_inputs = input_ids[:1000]
val_inputs = input_ids[1000:]

tr_tags = tags[:1000]
val_tags = tags[1000:]

tr_masks = attention_masks[:1000]
val_masks = attention_masks[1000:]

In [None]:
# Convert both splits to int64 tensors (the float masks become 0/1 integers).
tr_inputs = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
tr_tags = torch.tensor(tr_tags, dtype=torch.long)
val_tags = torch.tensor(val_tags, dtype=torch.long)
tr_masks = torch.tensor(tr_masks, dtype=torch.long)
val_masks = torch.tensor(val_masks, dtype=torch.long)

In [None]:
# Training data is visited in random order, validation data sequentially;
# both loaders yield one record per batch.
train_data = torch.utils.data.TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=1)

valid_data = torch.utils.data.TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = torch.utils.data.SequentialSampler(valid_data)
valid_dataloader = torch.utils.data.DataLoader(valid_data, sampler=valid_sampler, batch_size=1)

In [None]:
# Token-classification model initialised from the K_INV checkpoint, with one output
# class per tag (including 'PAD'), moved to the GPU.
model = BertForTokenClassification.from_pretrained(
    'D:/NLP/00inv_ner/K_INV/',
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False).cuda()

In [None]:
# True: train every parameter of the model. False: train only the classification head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings are excluded from weight
    # decay. 'LayerNorm.weight' is added because current BERT checkpoints name the
    # LayerNorm scale that way; 'gamma'/'beta' are kept for older naming.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group option read by AdamW is 'weight_decay'. The former key
    # 'weight_decay_rate' was not recognised, so the optimizer's default decay was
    # applied to both groups.
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{
        "params": [p for n, p in param_optimizer]
    }]

bert_optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

In [None]:
# Fine-tuning hyper-parameters. Note that `epochs` is rebound here (it held the
# number of pre-training epochs earlier in the notebook).
epochs = 40
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    bert_optimizer,
    num_warmup_steps=200,
    num_training_steps=total_steps
)

In [None]:
# CRF layer over the tag set (one state per tag, including 'PAD'); inputs are batch-first.
crf_model = torchcrf.CRF(len(tag2idx), batch_first=True).cuda()

In [None]:
# Separate optimizer and linear warm-up schedule for the CRF parameters.
crf_optimizer = AdamW(crf_model.parameters(), lr=8e-5)
crf_scheduler = get_linear_schedule_with_warmup(
    crf_optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

In [None]:
# Fine-tuning loop: BERT token classifier followed by a CRF. Only the negated CRF
# log-likelihood is back-propagated; the classifier's own loss is tracked for
# reporting. After every epoch both the per-token argmax of the logits and the CRF
# decoding are scored against the validation tags.
## Store the average loss after each epoch so we can plot them.
bert_loss_values, loss_values, validation_loss_values = [], [], []

for _ in range(epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    crf_model.train()
    # Reset the total loss for this epoch.
    total_loss = 0
    bert_total_loss = 0
    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        crf_model.zero_grad()
        # forward pass
        # Because `labels` is provided, element 0 of the output is the classifier
        # loss and element 1 holds the logits.
        bert_outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        bert_loss = bert_outputs[0]
        # The CRF scores the logits against the tags, looking only at unmasked positions.
        loss = crf_model(bert_outputs[1], b_labels, mask=b_input_mask.bool())
        # The CRF returns a log-likelihood; negate it to obtain a loss to minimise.
        loss = -loss
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        bert_total_loss += bert_loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        # Only the BERT model's parameters are clipped; the CRF's are not.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                       max_norm=max_grad_norm)
        # update parameters
        bert_optimizer.step()
        crf_optimizer.step()
        # Update the learning rate.
        scheduler.step()
#         print('now processing ', step, ' step')
        crf_scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    avg_bert_loss = bert_total_loss / len(train_dataloader)
    
    print('Average Bert loss: {}'.format(avg_bert_loss))
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    bert_loss_values.append(avg_bert_loss)
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    crf_paths = []
    # Put the model into evaluation mode
    model.eval()
    crf_model.eval()
    # Reset the validation accumulators for this epoch.
    # (eval_loss, eval_accuracy, nb_eval_steps and nb_eval_examples are not updated below.)
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            bert_outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            # CRF log-likelihood on the validation batch (computed but not accumulated).
            loss_ = crf_model(bert_outputs[0], b_labels, mask=b_input_mask.bool())
            # Move logits and labels to CPU
        bert_logits = bert_outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Best tag path according to the CRF, restricted to unmasked positions.
        crf_path = crf_model.decode(emissions=bert_outputs[0], mask=b_input_mask.bool())
        # Collect the CRF paths, the per-token argmax predictions and the gold tag ids.
#         eval_loss += -loss_
        crf_paths.extend(crf_path)
        predictions.extend([list(p) for p in np.argmax(bert_logits, axis=2)])
        true_labels.extend(label_ids)
        
    # Map ids back to tag names, keeping only positions whose gold tag is not 'PAD'.
    crf_tags = [[
        tag_values[c_i] for c_i, l_i in zip(c, l) if tag_values[l_i] != 'PAD'
    ] for c, l in zip(crf_paths, true_labels)]

    pred_tags = [[
        tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD'
    ] for p, l in zip(predictions, true_labels)]

    valid_tags = [[tag_values[l_i] for l_i in l if tag_values[l_i] != 'PAD']
                  for l in true_labels]

    # First report: classifier argmax; second report: CRF decoding.
    print(seqeval.metrics.classification_report(valid_tags, pred_tags, digits=3))
    print('\n')
    print(seqeval.metrics.classification_report(valid_tags, crf_tags, digits=3))

In [None]:
# Recompute the tag-name sequences from the last epoch's validation results
# (same expressions as at the end of the training loop): ids are mapped back to tag
# names and positions whose gold tag is 'PAD' are dropped.
crf_tags = [[
    tag_values[c_i] for c_i, l_i in zip(c, l) if tag_values[l_i] != 'PAD'
] for c, l in zip(crf_paths, true_labels)]

pred_tags = [[
    tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD'
] for p, l in zip(predictions, true_labels)]

valid_tags = [[tag_values[l_i] for l_i in l if tag_values[l_i] != 'PAD']
              for l in true_labels]


In [None]:
# Final reports: first the classifier's argmax predictions, then the CRF decoding.
print(seqeval.metrics.classification_report(valid_tags, pred_tags, digits=3))
print('\n')
print(seqeval.metrics.classification_report(valid_tags, crf_tags, digits=3))