In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow_addons as tfa
from seqeval.metrics import accuracy_score
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import recall_score
from seqeval.metrics import classification_report as seqeval_cs
from sklearn.metrics import classification_report as sklearn_cs
from utils import *
import models.bilstm_crf as bilstm_crf
import nltk
from nltk.tokenize import word_tokenize
from datasets import load_dataset
nltk.download('punkt')


@tf.function(input_signature=[({'input_ids': tf.TensorSpec(shape=(None,None,15)),
                    'attention_masks': tf.TensorSpec(shape=(None,None),dtype=tf.bool),
                    'pos_tags': tf.TensorSpec(shape=(None,None))},tf.TensorSpec(shape=(None,None)))])
def train_step(gen_data):
    """Run one optimisation step on a single batch.

    Relies on the notebook-level globals ``net``, ``optimizer`` and
    ``crf_params``; all three must exist before the first call.

    Args:
        gen_data: ``(features, tags)`` pair. ``features`` is a dict holding
            ``input_ids``, ``attention_masks`` (bool) and ``pos_tags``;
            ``tags`` holds the gold tag ids.

    Returns:
        ``(loss, seq)``: the mean negative CRF log-likelihood of the batch
        and the first output of ``net`` (used by the caller as the
        predicted tag ids).
    """
    x = gen_data[0]
    y = gen_data[1]
    # Number of real (unmasked) positions in each row.
    seq_len = tf.reduce_sum(tf.cast(x['attention_masks'],dtype=tf.int32),axis=1)

    with tf.GradientTape() as tape:
        # training=True puts the model in training mode; without it a Keras
        # model that is called directly runs in inference mode, so dropout
        # would never be applied while fitting.
        seq,logits = net(x, training=True)
        loss = tfa.text.crf_log_likelihood(logits,y,seq_len,crf_params)[0]
        # crf_log_likelihood is a quantity to maximise, so minimise its negative mean.
        loss = -tf.reduce_mean(loss)

    # Compute gradients and update the weights.
    trainable_vars = net.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(gradients, trainable_vars))
    return(loss,seq)

@tf.function(input_signature=[(
    {
        'input_ids': tf.TensorSpec(shape=(None, None, 15)),
        'attention_masks': tf.TensorSpec(shape=(None, None), dtype=tf.bool),
        'pos_tags': tf.TensorSpec(shape=(None, None)),
    },
    tf.TensorSpec(shape=(None, None)),
)])
def test_step(gen_data):
    """Evaluate a single batch without updating any weights.

    Uses the notebook-level globals ``net`` and ``crf_params``.

    Args:
        gen_data: ``(features, tags)`` pair; ``features`` is a dict with
            ``input_ids``, ``attention_masks`` (bool) and ``pos_tags``.

    Returns:
        ``(loss, seq)``: mean negative CRF log-likelihood of the batch and
        the first output of ``net``.
    """
    features, gold_tags = gen_data

    # Count the unmasked positions of every row.
    mask_as_int = tf.cast(features['attention_masks'], dtype=tf.int32)
    lengths = tf.reduce_sum(mask_as_int, axis=1)

    seq, logits = net(features)
    log_likelihood, _ = tfa.text.crf_log_likelihood(logits, gold_tags, lengths, crf_params)
    loss = tf.negative(tf.reduce_mean(log_likelihood))

    return loss, seq

[nltk_data] Downloading package punkt to /home/sumeet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/sumeet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
dataset = load_dataset("conll2003")

# Tokens, NER tag ids and POS tag ids for each split.
train_sents = dataset['train']['tokens']
test_sents = dataset['test']['tokens']
val_sents = dataset['validation']['tokens']
train_tags = dataset['train']['ner_tags']
test_tags = dataset['test']['ner_tags']
val_tags = dataset['validation']['ner_tags']
train_pos = dataset['train']['pos_tags']
test_pos = dataset['test']['pos_tags']
val_pos = dataset['validation']['pos_tags']

# Read the id -> label mapping from the dataset itself instead of hard-coding it.
# The Hugging Face conll2003 order is
# O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC;
# a hand-written list in any other order silently mislabels every entity type.
label_array = np.array(dataset['train'].features['ner_tags'].feature.names)

force_lower=False
sents = train_sents
if force_lower:
    # Lower-case every token of every split in place.
    for sents in (train_sents, test_sents, val_sents):
        for i in range(len(sents)):
            for j in range(len(sents[i])):
                sents[i][j] = sents[i][j].lower()

Reusing dataset conll2003 (/home/sumeet/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63ba56944e35c1943434322a07ceefd79864672041b7834583709af4a5de4664)


In [3]:
# One output class per entry of label_array.
num_classes = len(label_array)

# A single pseudo-sentence containing every character the character
# vocabulary should cover; index 0 is reserved for padding, 1 for unknowns.
sample = [["ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]"]]
cti, itc = build_char_vocab(
    sample,
    pad_index=0,
    pad_token="<PAD>",
    unk_index=1,
    unk_token="<UNK>",
)

# ---- feature switches ------------------------------------------------------
use_char_level_model = True
use_pos_features = False

# Pick the batch generator and vocabulary that match the chosen input level.
# NOTE(review): `wti` is not defined anywhere in the visible notebook, so the
# word-level branch would raise NameError unless `utils` provides it - confirm.
datagen, vocab = (datagen_char, cti) if use_char_level_model else (datagen_word, wti)

# ---- batching --------------------------------------------------------------
vocab_dim = len(vocab)
batch_size = 64
# Whole batches per split; a trailing partial batch is not counted.
train_steps = len(train_sents) // batch_size
test_steps = len(test_sents) // batch_size
val_steps = len(val_sents) // batch_size

# NOTE(review): the meaning of the positional `0` is defined by the datagen
# helpers in `utils` - presumably a padding value; confirm.
gen_train = datagen(train_sents, train_tags, train_pos, vocab, 0, batch_size)
gen_test = datagen(test_sents, test_tags, test_pos, vocab, 0, batch_size)
gen_val = datagen(val_sents, val_tags, val_pos, vocab, 0, batch_size)

# ---- hyper-parameters ------------------------------------------------------
epochs = 50
character_embedding_size = 256
word_embedding_size = 256
lstm_hidden_size = 128
optimizer = tf.keras.optimizers.Adam()

# ---- model -----------------------------------------------------------------
# NOTE(review): num_pos=5 is hard-coded; verify it matches the number of POS
# classes in the data before enabling use_pos_features.
net = bilstm_crf.model(
    vocab_dim,
    character_embedding_size,
    word_embedding_size,
    lstm_hidden=lstm_hidden_size,
    num_classes=num_classes,
    use_char_level_embeddings=use_char_level_model,
    dropout=0.3,
    use_pos_features=use_pos_features,
    num_pos=5,
)

In [7]:
# CRF transition matrix; train_step/test_step read this global when they
# compute the CRF log-likelihood, so it must be bound before the first step.
crf_params = net.get_layer('crf').chain_kernel


def _batch_scores(seq, y, mask):
    """Return ``(accuracy, f1)`` of one batch, ignoring masked-out positions.

    All unmasked positions of the batch are flattened into one label
    sequence before being handed to seqeval.
    """
    masked_pred = tf.boolean_mask(seq, mask)
    masked_y = tf.boolean_mask(y, mask)
    label_true = label_array[masked_y.numpy().astype(int)]
    label_pred = label_array[masked_pred.numpy().astype(int)]
    return (accuracy_score([label_true.tolist()], [label_pred.tolist()]),
            f1_score([label_true.tolist()], [label_pred.tolist()]))


acc = []
f1 = []

print('Training.....')
for iteration,data in enumerate(gen_train):
    # `iteration` batches have been trained so far; stop once exactly `epochs`
    # full epochs are done. (The previous `epoch > epochs` test let almost one
    # additional, never-reported epoch run.)
    if iteration//train_steps >= epochs:
        break
    i = iteration+1
    x = data[0]
    y = data[1]
    loss,seq = train_step(data)

    batch_acc, batch_f1 = _batch_scores(seq, y, x['attention_masks'])
    acc.append(batch_acc)
    f1.append(batch_f1)

    if i%train_steps == 0:
        epoch = i//train_steps
        print('\nEpoch -' + str(epoch))
        print('Train')
        # `loss` is the loss of the last batch of the epoch, not an average.
        print('Loss -' + str(loss))
        print('Accuracy -' + str(sum(acc)/train_steps))
        print('F1 -' + str(sum(f1)/train_steps))
        acc = []
        f1 = []

        # Validation pass. Dedicated names keep the outer loop's variables
        # (iteration, data, i, x, y, loss, seq, acc, f1) from being clobbered.
        val_acc = []
        val_f1 = []
        for val_iteration,val_data in enumerate(gen_val):
            val_x = val_data[0]
            val_y = val_data[1]
            val_loss,val_seq = test_step(val_data)

            batch_acc, batch_f1 = _batch_scores(val_seq, val_y, val_x['attention_masks'])
            val_acc.append(batch_acc)
            val_f1.append(batch_f1)
            if (val_iteration+1)%val_steps == 0:
                print('\nValidation')
                print('Loss -' + str(val_loss))
                print('Accuracy -' + str(sum(val_acc)/val_steps))
                print('F1 -' + str(sum(val_f1)/val_steps))
                print('\n*************************')
                break
print('Finished training !!')                
#         for iteration,data in enumerate(gen_test):  
#             i = iteration+1      
#             x = data[0]
#             y = data[1]
#             loss,seq = test_step(data)

#             masked_pred = tf.boolean_mask(seq,x['attention_masks'])
#             masked_y = tf.boolean_mask(y,x['attention_masks'])
#             label_true = label_array[masked_y.numpy().astype(int)]
#             label_pred = label_array[masked_pred.numpy().astype(int)]
#             acc.append(accuracy_score([label_true.tolist()],[label_pred.tolist()]))
#             f1.append(f1_score([label_true.tolist()],[label_pred.tolist()]))
#             if i%val_steps == 0:
#                 print('\nTest')
#                 print('Loss -' + str(loss))         
#                 print('Accuracy -' + str(sum(acc)/test_steps))
#                 print('F1 -' + str(sum(f1)/test_steps)) 
#                 print('\n*************************')
#                 acc = []
#                 f1 = []
#                 break

Training.....


KeyboardInterrupt: 

In [8]:
# Evaluate the trained model on the test split: one pass of `test_steps` batches.
print('Testing......')
acc = []
f1 = []
for iteration,data in enumerate(gen_test):  
    i = iteration+1      
    x = data[0]
    y = data[1]
    loss,seq = test_step(data)

    # Keep only the unmasked positions, then map tag ids to label strings.
    masked_pred = tf.boolean_mask(seq,x['attention_masks'])
    masked_y = tf.boolean_mask(y,x['attention_masks'])
    label_true = label_array[masked_y.numpy().astype(int)]
    label_pred = label_array[masked_pred.numpy().astype(int)]
    acc.append(accuracy_score([label_true.tolist()],[label_pred.tolist()]))
    f1.append(f1_score([label_true.tolist()],[label_pred.tolist()]))
    # Stop after test_steps batches. This previously compared against
    # val_steps while dividing by test_steps, so the loop ended early and the
    # averages below were computed over the wrong number of batches.
    if i%test_steps == 0:
        print('\nTest')
        # `loss` is the loss of the last test batch, not an average.
        print('Loss -' + str(loss))         
        print('Accuracy -' + str(sum(acc)/test_steps))
        print('F1 -' + str(sum(f1)/test_steps)) 
        print('\n*************************')
        acc = []
        f1 = []
        break

KeyboardInterrupt: 