### Preparing Data

Let's prepare data for our model. The dataset contains sentence and tags (for the respective sentence) columns. 

In [2]:

from evaluation import *
import pandas as pd
import random
df2 = pd.read_csv('ner/ner_dataset.csv', encoding= 'unicode_escape')
#df2=df2[["Word","Tag"]]
df_test=df2[["Sentence #"]]
df2=df2.fillna(method='ffill')

we will create dictonaries from the dataset we prepared. 

In [96]:
Sentence2word = df2.groupby('Sentence #')['Word'].apply(lambda x:x.tolist()).to_dict()
Sentence2tag = df2.groupby('Sentence #')['Tag'].apply(lambda x:x.tolist()).to_dict()

Now we will create function which will preprocess our dataset. It will be helpful in generating the dictionaries of tokens and tags. 

In [3]:
from collections import defaultdict

def preprocess(data,special_t):  
    tok2idx = defaultdict(lambda: 0)
    tok2idx[special_t] = 0 
    #idx2tok = []
    vocab=list(set(data))
    for i,sp_tok in enumerate(vocab):
        #idx2tok.append(sp_tok)
        if sp_tok not in tok2idx.keys():
            tok2idx[sp_tok]=i+1

    return tok2idx

Let's create token2id  and tag2id dictionaries using the function(preprocess) built above.

In [134]:
token2idx = preprocess(df2.Word,'<PAD>')
tag2idx = preprocess(np.unique(df2.Tag)[:-1],'O')


Also generate the id2token and id2tag with help of dictionaries created above.

In [136]:
idx2token= [x for x in token2idx]
idx2tag= [x for x in tag2idx]

In [135]:
tag2idx

defaultdict(<function __main__.preprocess.<locals>.<lambda>()>,
            {'O': 0,
             'I-org': 1,
             'B-art': 2,
             'B-org': 3,
             'B-geo': 4,
             'I-eve': 5,
             'B-per': 6,
             'B-eve': 7,
             'I-gpe': 8,
             'I-per': 9,
             'B-gpe': 10,
             'I-geo': 11,
             'I-tim': 12,
             'B-tim': 13,
             'I-art': 14,
             'B-nat': 15,
             'I-nat': 16})

We need to define the train & test set for our model. So for that we will be using 80% record of dictionaries(Sentence2word,Sentence2tag) as train set and 20% as test set.

In [100]:
train_size=int(len(Sentence2word.keys())*0.8)
test_size=len(Sentence2word.keys()) - train_size

In [101]:
train_sentence_dict={}
for i in range(train_size):
    
    sentence=random.choice(list(Sentence2word.keys()))
    train_sentence_dict[sentence] = Sentence2word[sentence]
    
train_tags_dict={}    
for sentence in train_sentence_dict.keys():
    
    train_tags_dict[sentence] = Sentence2tag[sentence]

In [102]:
test_sentence_dict={}
test_sentences = [sentence for sentence in Sentence2word.keys() if not sentence in train_sentence_dict.keys()] 
for sentence in test_sentences:
    
    test_sentence_dict[sentence] = Sentence2word[sentence]
    
test_tags_dict={}    
for sentence in test_sentence_dict.keys():
    
    test_tags_dict[sentence] = Sentence2tag[sentence]


We will be passing the data to the model in batches.For generating batches from the data we use following function.

In [103]:
def batches_generator(batch_size, sentences, tags):
    
    n_samples = len(sentences)
    order = np.arange(n_samples)
    key_list=list(sentences.keys())

    n_batches = n_samples // batch_size
    if n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_token = 0
        for idx in order[batch_start: batch_end]:
            
            x_temp = [token2idx[token] for  i,token in enumerate(sentences[key_list[idx]])]
            x_temp = x_temp[:-1]
            x_list.append(x_temp)
            
            y_temp = [tag2idx[tag] for j,tag in enumerate(tags[key_list[idx]])]
            y_temp = y_temp[:-1]
            y_list.append(y_temp)
            
            max_len_token = max(max_len_token, len(tags[key_list[idx]]))
            
        # Fill in the data into numpy nd-arrays filled with padding indices.
        x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['<PAD>']
        y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            lengths[n] = utt_len
            x[n, :utt_len] = x_list[n]
            y[n, :utt_len] = y_list[n]
        yield x, y, lengths

### Defining the model

Let's define our model

In [104]:
import tensorflow as tf
import numpy as np
tf.compat.v1.disable_eager_execution()

class BiLSTMModel():
    pass

Various variables/parameters for the model are as follows. 

In [5]:
def declare_placeholders(self):
    self.input_batch = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') 
    self.ground_truth_tags = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None], name='label_batch')   
    self.lengths = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None], name='lengths') 
    
    self.dropout_ph = tf.compat.v1.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    
    self.learning_rate_ph = tf.compat.v1.placeholder(dtype=tf.float32, shape=[]) 

In [106]:
BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)

The architecture of model will contain two LSTM Layers.Let's define that.

In [107]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.compat.v1.Variable(initial_embedding_matrix,dtype=tf.float32,name='embeddings_matrix') 
    
    forward_cell =  tf.compat.v1.nn.rnn_cell.DropoutWrapper(tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn,use_peepholes=True,name='forward_lstm'),input_keep_prob=self.dropout_ph,output_keep_prob=self.dropout_ph)
    backward_cell =  tf.compat.v1.nn.rnn_cell.DropoutWrapper(tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_hidden_rnn,use_peepholes=True,name='back_lstm'),input_keep_prob=self.dropout_ph,output_keep_prob=self.dropout_ph)


    embeddings = tf.compat.v1.nn.embedding_lookup(embedding_matrix_variable,self.input_batch) 
    
    (rnn_output_fw, rnn_output_bw), _ = tf.compat.v1.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,inputs=embeddings,sequence_length=self.lengths,dtype=tf.float32) ######### YOUR CODE HERE #############
    rnn_output = tf.compat.v1.concat([rnn_output_fw, rnn_output_bw], axis=2)
   
    self.logits = tf.compat.v1.layers.dense(rnn_output, n_tags, activation=None)

In [108]:
BiLSTMModel.__build_layers = classmethod(build_layers)

Prediction function for our model is as follows.

In [109]:
def compute_predictions(self):
    
    softmax_output = tf.nn.softmax(self.logits,name='softmax') 
    
    self.predictions = tf.math.argmax(softmax_output,axis=-1) 

In [110]:
BiLSTMModel.__compute_predictions = classmethod(compute_predictions)

Function for computing loss is defined as follows.

In [111]:
def compute_loss(self, n_tags, PAD_index):
    
    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)
    loss_tensor = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(ground_truth_tags_one_hot,self.logits) 
    
    mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)

    self.loss = tf.reduce_mean(loss_tensor*mask) 

In [112]:
BiLSTMModel.__compute_loss = classmethod(compute_loss)

We need a function which will do optimization and train our model.

In [7]:
def perform_optimization(self):
    
    self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph) 
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
    
    clip_norm = tf.cast(1.0, tf.float32)
    self.grads_and_vars =[ (tf.clip_by_norm(gv[0], clip_norm),gv[1]) for gv in self.grads_and_vars]  
    
    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [114]:
BiLSTMModel.__perform_optimization = classmethod(perform_optimization)

Let's aggregate our model with the functions we defined

In [115]:
def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    self.__compute_loss(n_tags, PAD_index)
    self.__perform_optimization()

In [116]:
BiLSTMModel.__init__ = classmethod(init_model)

Function for training on a batch is defined as follows.

In [117]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    feed_dict = {self.input_batch: x_batch,
                 self.ground_truth_tags: y_batch,
                 self.learning_rate_ph: learning_rate,
                 self.dropout_ph: dropout_keep_probability,
                 self.lengths: lengths}
    
    session.run(self.train_op, feed_dict=feed_dict)

In [118]:
BiLSTMModel.train_on_batch = classmethod(train_on_batch)

Function for predicting on a batch is defined as follows.

In [119]:
def predict_for_batch(self, session, x_batch, lengths):
    

    feed_dict = {self.input_batch: x_batch,
                        self.lengths: lengths}
    predictions = session.run(self.predictions,feed_dict=feed_dict)
    return predictions

In [120]:
BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)

### Model training
Let's run our model and train it over the dataset we prepared

In [121]:
tf.compat.v1.reset_default_graph()

model = BiLSTMModel(vocabulary_size=len(token2idx),n_tags=len(tag2idx),embedding_dim=200,n_hidden_rnn=200,PAD_index=token2idx['<PAD>']) ######### YOUR CODE HERE #############

batch_size = 32 
n_epochs = 4  
learning_rate = 0.005 
learning_rate_decay = np.sqrt(2) 
dropout_keep_probability = 0.9 

In [143]:
sess = tf.compat.v1.Session()
sess.run(tf.compat.v1.global_variables_initializer())


for epoch in range(n_epochs):
    # For each epoch evaluate the model on train and validation data
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    print('Train data evaluation:')
    eval_conll(model, sess, dict(list(train_sentence_dict.items())[0:20000]), dict(list(train_tags_dict.items())[0:20000]), short_report=True)
    print('Validation data evaluation:')
    eval_conll(model, sess, dict(list(train_sentence_dict.items())[20000:-1]), dict(list(train_tags_dict.items())[20000:-1]), short_report=True)
    
    # Train the model
    for x_batch, y_batch, lengths in batches_generator(batch_size,dict(list(train_sentence_dict.items())[0:20000]) ,dict(list(train_tags_dict.items())[0:20000])):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)
        #tags_batch, tokens_batch = predict_tags(model, sess, x_batch, lengths)
        #print(tags_batch)
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    

-------------------- Epoch 1 of 4 --------------------
Train data evaluation:
processed 437864 tokens with 46665 phrases; found: 318335 phrases; correct: 2600.

precision:  0.82%; recall:  5.57%; FB1:  1.42


Validation data evaluation:
processed 139324 tokens with 14904 phrases; found: 101180 phrases; correct: 825.

precision:  0.82%; recall:  5.54%; FB1:  1.42


-------------------- Epoch 2 of 4 --------------------
Train data evaluation:
processed 437864 tokens with 46665 phrases; found: 49073 phrases; correct: 24781.

precision:  50.50%; recall:  53.10%; FB1:  51.77


Validation data evaluation:
processed 139324 tokens with 14904 phrases; found: 15345 phrases; correct: 7594.

precision:  49.49%; recall:  50.95%; FB1:  50.21


-------------------- Epoch 3 of 4 --------------------
Train data evaluation:
processed 437864 tokens with 46665 phrases; found: 48383 phrases; correct: 34996.

precision:  72.33%; recall:  74.99%; FB1:  73.64


Validation data evaluation:
processed 139324 tok

In [6]:
def predict_tags(model, session, token_idxs_batch, lengths):
    
    tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)
    
    tags_batch, tokens_batch = [], []
    for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):
        tags, tokens = [], []
        for tag_idx, token_idx in zip(tag_idxs, token_idxs):
            tags.append(idx2tag[tag_idx]  )
            tokens.append(idx2token[token_idx] )
        tags_batch.append(tags)
        tokens_batch.append(tokens)
    return tags_batch, tokens_batch
    
    
def eval_conll(model, session, tokens, tags, short_report=True):
    
    y_true, y_pred = [], []
    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):
        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)
        if len(x_batch[0]) != len(tags_batch[0]):
            raise Exception("Incorrect length of prediction for the input, "
                            "expected length: %i, got: %i" % (len(x_batch[0]), len(tags_batch[0])))
        predicted_tags = []
        ground_truth_tags = []
        for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): 
            if token != '<PAD>':
                ground_truth_tags.append(idx2tag[gt_tag_idx])
                predicted_tags.append(pred_tag)

        y_true.extend(ground_truth_tags + ['O'])
        y_pred.extend(predicted_tags + ['O'])
        
    
    return precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)