In [1]:
import os
import time
import config
import logging
import argparse
import tensorflow as tf

from kalm import *
from data_utils import *

from keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import accuracy_score

Using TensorFlow backend.


In [2]:

import warnings

warnings.filterwarnings("ignore")

# Data path
DATA_PATH = "./CONLL2003/"

# Standard parameters
TRAIN_DP = 16
TEST_DP = 2
BATCH_SIZE = 16
embedding_dim = 400
dec_units = 1150
WH_UNITS = WE_UNITS = 100
VOCAB_SIZE = 5000
NB_ENTITIES = 4
EPOCHS = 1

In [3]:
import os
import time
import config
import math
import numpy as np
import pandas as pd
import tensorflow as tf


from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense,Flatten,Dropout,RepeatVector,Embedding,Input,LSTM



# TypeEmbedding layer

class TypeEmbedding(tf.keras.layers.Layer):
    
    def __init__(self,nb_entities,wh_units,we_units):
        
        super(TypeEmbedding,self).__init__()
        
        assert type(nb_entities) == int , "Please provide integer number to number of entities"
        
        self.Wh = Dense(wh_units)
        self.We = [Dense(we_units) for _ in range(nb_entities)]
        
    def call(self,hidden_state):
        
        # hidden_state == hidden_states i.e (bs,hidden_size)
        # Wh is used for dimentionality reduction as mentioned in paper and is shared.
        
        reduced_dimentionality = self.Wh(hidden_state)    # (bs,wh_units) i.e. (bs,400)
        
        outputs = []
        
        for we in self.We:
            logits = we(reduced_dimentionality)
            logits = tf.expand_dims(logits,axis = 0)            # (1,bs,we_units)
            outputs.append(logits)
            
        outputs = tf.concat(outputs,axis=0)  # (nb_entities,bs,we_units)
        
        outputs = tf.transpose(outputs,perm = [1,0,2])  # (bs,nb_entities,we_units)
        outputs = tf.nn.softmax(outputs,axis= -1)      # (bs,nb_entities,we_units)
        
        # code for vt
        vt = []
        
        for i in range(len(self.We)):
            logits = self.We[i](outputs[:,i,:])
            logits = tf.expand_dims(logits,axis = 0) # (1,bs,we_units)
            vt.append(logits)
            
        result = tf.concat(vt,axis=0)                   # (nb_entities,bs,we_units)
        result = tf.transpose(result,perm= [1,2,0] )  # (bs,we_uninb_entities)
        result = tf.reduce_sum(result,axis = -1)        # (bs,we_units)
            
        return outputs,result                 # (bs,nb_entities,units)  (bs,we_units)


# Projection layer W(p,j) for j = 1,2,...K

class ProjectionLayer(tf.keras.layers.Layer):
    
    def __init__(self,nb_entities,wp_units):
        
        super(ProjectionLayer,self).__init__()
        
        assert type(nb_entities) == int , "Please provide integer number to number of entities"
        self.Wp = [Dense(wp_units) for _ in range(nb_entities)]
        
    def call(self,inputs,type_prob):
        
        # expected inputs are hidden states of lstm. i.e (batch_size,hidden_units)
        # type_prob - (bs,nb_entites,we_units)
        
        outputs = []
        for wp in self.Wp:
            logits = wp(inputs)
            logits = tf.nn.softmax(logits,axis = -1)   # (bs,units)
            logits = tf.expand_dims(logits,axis = 0)            # (1,bs,units)
            outputs.append(logits)
            
        outputs = tf.concat(outputs,axis=0)  # (nb_entities,bs,units)
        
        outputs = tf.transpose(outputs,perm = [1,2,0])  # (bsb,nb_entities,units)
        
        result = tf.matmul(outputs,type_prob)
        result = tf.reduce_sum(result,axis=-1)    # (bs,ts,vg)
        
        return result

class Decoder(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, dec_units,nb_entities,wh_units,we_units):
        
        super(Decoder, self).__init__()
        # self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        self.lstm_1 = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       recurrent_initializer='glorot_uniform')
        self.lstm_2 = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       recurrent_initializer='glorot_uniform')

        self.lstm_3 = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       recurrent_initializer='glorot_uniform')
        
        self.type_emb = TypeEmbedding(nb_entities,wh_units,we_units)
        self.projection = ProjectionLayer(nb_entities,vocab_size)


    def call(self, x, hidden_state):
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        #print("Embedding : ",x.shape)
        
        
        # Call for TypeEmbedding Layer to get vt and type_probabilities
        # type_prob (bs,nb_entities,we_units) and vt shape is (bs,we_units)
        type_prob , vt = self.type_emb(hidden_state)

        #print("After type :",type_prob.shape,vt.shape)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(vt, 1), x], axis=-1)
        
        #print("LSTM input = ",x.shape)
        hidden = self.lstm_1(x)
        hidden = self.lstm_2(hidden)
        hidden = self.lstm_3(hidden)
        
        #print("Final op : ",hidden.shape)
        # output shape == (batch_size * 1, hidden_size)
        hidden = tf.reshape(hidden, (-1, hidden.shape[2]))

        # output shape == (batch_size, vocab)
        final_result = self.projection(hidden,type_prob)
        
        
        
        return hidden, final_result,type_prob
    
    def initialize_hidden_state(self,batch=None):

        assert batch is not None , "please provide batch_size..."
        return tf.zeros((batch,self.dec_units))
        



In [4]:
import re
def decontracted(phrase):

    """
    This funtion is for preprocssing the given phrase.
    """
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase.lower()


def read_lines(file_path):

    with open(file_path,"r") as f:
        train_text = f.readlines()
    return train_text



def get_data(file_path):
    
    train_text = read_lines(file_path)
    
    train = []    
    for sent in train_text[1:]:
        if sent.split(" ")[0] != "\n":
            train.append(sent.split(" ")[0])
        else:
            train.append("</s>")

    train_sent = " ".join(train)
    
    train_data = []
    for es in train_sent.split("</s>"):
        es = es.strip()
        if len(es) > 0:
            es = "<s> " + es + " </s>"
            train_data.append(es)
    
    return [decontracted(phrase) for phrase in train_data]


In [5]:
import logging
logging.basicConfig(filename='app.log', filemode='w',level=logging.DEBUG)

logging.info("Data preprocessed...")

#get the CONLL2003 dataset
x_train = get_data(DATA_PATH + "train.txt")
x_test = get_data(DATA_PATH + "test.txt")
x_valid = get_data(DATA_PATH + "valid.txt")


general_vocab = dict()
general_vocab["_"] = 0
general_vocab["<s>"] = 1
general_vocab["</s>"] = 2

count = 2
for es in x_train:
    for ew in es.split(" "):
        if ew in general_vocab:
            pass
        else:
            count  += 1
            general_vocab[ew] = count

word_ix = general_vocab
ix_word = dict()

for k,v in general_vocab.items():
    ix_word[v] = k

logging.info("wordix and ixword dictionary created...")

train_x = [[word_ix[w] if w in word_ix else 0 for w in sent.split()] for sent in x_train]
test_x = [[word_ix[w] if w in word_ix else 0 for w in sent.split()] for sent in x_test]
val_x = [[word_ix[w] if w in word_ix else 0 for w in sent.split()] for sent in x_valid]

max_sequence_length = 0

for es in train_x:
    if len(es) >= max_sequence_length:
        max_sequence_length = len(es)

logging.info("Maximum sequence length found - {}".format(max_sequence_length))


####################pad sequences######################

train_x_padded = pad_sequences(train_x,padding="post")
test_x_padded = pad_sequences(test_x,maxlen=train_x_padded.shape[1],padding="post")
val_x_padded = pad_sequences(val_x,maxlen=train_x_padded.shape[1],padding="post")

train_y = train_x_padded
test_y = test_x_padded
val_y = val_x_padded

logging.info("Sequence padded...")

# train data
ds = tf.data.Dataset.from_tensor_slices((train_x_padded, train_x_padded))
ds = ds.take(TRAIN_DP).shuffle(TRAIN_DP).batch(BATCH_SIZE)

# test data
ds_test = tf.data.Dataset.from_tensor_slices((test_x_padded, test_x_padded))
ds_test = ds_test.take(TEST_DP).shuffle(TEST_DP).batch(1)


In [6]:
decoder = Decoder(5000,embedding_dim,dec_units,
                        NB_ENTITIES,WH_UNITS,WE_UNITS)

optimizer = tf.keras.optimizers.Adam()
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 decoder=decoder)


loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

def loss_function(real, pred):
        
    loss_ = tf.keras.losses.sparse_categorical_crossentropy(real,pred)
    return tf.reduce_mean(loss_)


def train_step(inp, targ, dec_hidden):
  loss = 0

  with tf.GradientTape() as tape:

    dec_input = tf.expand_dims([word_ix['<s>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder

      dec_hidden,predictions,_ = decoder(dec_input,dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)
    
  
  batch_loss = (loss / int(targ.shape[1]))
    

  grads = tape.gradient(loss, decoder.trainable_variables)
  optimizer.apply_gradients(zip(grads, decoder.trainable_variables))
    
  
  return batch_loss



######################training process###################

steps_per_epoch = 10

for epoch in range(EPOCHS):
  start = time.time()

  dec_hidden = decoder.initialize_hidden_state(BATCH_SIZE)
  total_loss = 0

  print("*"*60)
  print("epoch : ",epoch, " started")
  for (batch, (inp, targ)) in enumerate(ds):
    
    batch_loss = train_step(inp, targ, dec_hidden)
    total_loss += batch_loss
    
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec'.format(time.time() - start))

  print("epoch : ",epoch, " ended\n")


************************************************************
epoch :  0  started
Epoch 1 Loss 0.8443
Time taken for 1 epoch 36.05719327926636 sec
epoch :  0  ended



In [32]:
def evaluate(sentence):
    
    dec_hidden = decoder.initialize_hidden_state(1)
    print(sentence.numpy())
    dec_input = tf.expand_dims([sentence[0][1]],0)

    result = ""
    predict = []

    logging.info("Evaluation")
    
    for t in range(max_sequence_length):
        
        dec_hidden,predictions,type_prob = decoder(dec_input,dec_hidden)

        predicted_id = tf.argmax(predictions[0]).numpy()
        predict.append(predicted_id)
        print(predictions.shape)
        result += ix_word[predicted_id] + ' '

        if ix_word[predicted_id] == '</s>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    logging.debug("result - {}".format(result))

    return result, sentence,predict


In [34]:
predictions = []
for batch,(inp, targ) in enumerate(ds_test):

    print(inp[0][1])
    _,_,pred = evaluate(inp)

    predictions.append(pred)


tf.Tensor(135, shape=(), dtype=int32)
[[    1   135   344  4532   198  3813  4787  1086   231    16   500  1770
    162    16   390    72  7704     7    80  3803  2325  3776     7 17342
      0    11     2     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]]
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)


(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
tf.Tensor(1675, shape=(), dtype=int32)
[[   1 1675 1313   16 3102  162  198 5610 1768 1973   24   80 6418 1761
  2303  745  666  231   80  390 2101 2005 1770   20 1096   11    2    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]]
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 5000)
0
(1, 50

KeyboardInterrupt: 

In [None]:
dec_hidden = decoder.initialize_hidden_state(1)