In [1]:
import unicodedata
import re
import pandas as pd
from difflib import SequenceMatcher
import numpy as np
import tensorflow as tf
import os
import time

train = pd.read_csv(r'Data/train_final.csv')
dev = pd.read_csv(r'Data/dev_final.csv')

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z0-9?.!,¿|]+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

# Convert sequences to tokenizers
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
    # tensor = tf.ragged.constant(tensor)
    # print(tensor)
    return tensor, lang_tokenizer

# Load the dataset
def load_dataset(number):
    # Create dataset (targ_lan = English, inp_lang = French)
    inp_lang = (train['paragraph']+" | "+train['question']).copy()
    targ_lang = (train['answer']).copy()
    act_inp = []
    act_out = []
    ep = number
    if number == 0:
        ep = len(inp_lang)
    for i in range(ep):
        if(len(str(targ_lang[i]).split(' '))==1):
            act_inp.append(preprocess_sentence(str(inp_lang[i])))
            act_out.append(preprocess_sentence(str(targ_lang[i])))
    
    inp_lang = (dev['paragraph']+" | "+dev['question']).copy()
    targ_lang = (dev['answer']).copy()
    if number == 0:
        ep = len(inp_lang)
    for i in range(ep):
        if(len(str(targ_lang[i]).split(' '))==1):
            act_inp.append(preprocess_sentence(str(inp_lang[i])))
            act_out.append(preprocess_sentence(str(targ_lang[i])))
            
    input_tensor, inp_lang_tokenizer = tokenize(act_inp)
    target_tensor, targ_lang_tokenizer = tokenize(act_out)
    
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [2]:
number_of_instances = 0

input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(number_of_instances)
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

print(max_length_targ, max_length_inp )

from sklearn.model_selection import train_test_split

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)


embeddings_index = {}
f = open('glove.42B.300d.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

inp_embedding_matrix = np.zeros((len(inp_lang_tokenizer.word_index) + 1, 300))
targ_embedding_matrix = np.zeros((len(targ_lang_tokenizer.word_index) + 1, 300))

for word, i in inp_lang_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        inp_embedding_matrix[i] = embedding_vector
        
for word, i in targ_lang_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        targ_embedding_matrix[i] = embedding_vector

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

3 579
17116 17116 4280 4280


In [3]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 1
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 50
vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1
vocab_tar_size = len(targ_lang_tokenizer.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset_test = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)

In [4]:
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
# Size of input and target batches
example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)

# Encoder class
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,  weights=[inp_embedding_matrix], trainable=False)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
    
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))


# Attention Mechanism
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
    
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))


# Decoder class
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[targ_embedding_matrix], trainable=False)
        self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights
    
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))


# Initialize optimizer and loss functions
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction='none')

# Loss function
def loss_function(real, pred):
    # Take care of the padding. Not all sequences are of equal length.
    # If there's a '0' in the sequence, the loss is being nullified
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()
    return
    
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang_tokenizer.index_word[predicted_id] + ' '
        if targ_lang_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))
    return

def get_data(element):
    question = ''
    answer = ''
    for k in element[0].numpy():
        if k!=0:
            question = question+ inp_lang_tokenizer.index_word[k] + ' '
    for k in element[1].numpy():
        if k!=0:
            answer = answer+ targ_lang_tokenizer.index_word[k] + ' '
    return question, answer

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

checkpoint_dir = './training_checkpoints1'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss


import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys

def normalize_answer(s):
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    if not s: 
        return []
    return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        return (int(gold_toks == pred_toks),1,1)
    if num_same == 0:
        return 0,0,0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return (f1, precision, recall)

(1, 579) (1, 3)
Encoder output shape: (batch size, sequence length, units) (1, 579, 50)
Encoder Hidden state shape: (batch size, units) (1, 50)
Attention result shape: (batch size, units) (1, 50)
Attention weights shape: (batch_size, sequence_length, 1) (1, 579, 1)
Decoder output shape: (batch_size, vocab size) (1, 9720)


In [7]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
score_who = 0
score_what = 0
score_when = 0
score_where = 0
s_who = 0
s_what = 0
s_when = 0
s_where = 0
pre_who = 0
pre_what = 0
pre_when = 0
pre_where = 0
rec_who = 0
rec_what = 0
rec_when = 0
rec_where = 0
count_who = 0
count_what = 0
count_when = 0
count_where = 0

def isWordPresent(sentence, word):
    s = sentence.split(" ")
    for i in s:
        if (i.lower() == word):
            return True
    return False

for element in dataset_test:
    question, answer = get_data(element)
    question = question[7:-6]
    idx = question.index('|')
    ques = question[idx+2:]
    answer = answer[7:-6]
    pred_ans = evaluate(question)[0][:-6]
    if isWordPresent(ques, 'who'):
        f,p,r = compute_f1(answer, pred_ans)
        s_who = s_who + similar(answer, pred_ans)
        score_who = score_who + f
        rec_who = rec_who + r
        pre_who = pre_who + p
        count_who = count_who +1
    elif isWordPresent(ques, 'what'):
        f,p,r = compute_f1(answer, pred_ans)
        s_what = s_what + similar(answer, pred_ans)
        score_what = score_what + f
        rec_what = rec_what + r
        pre_what = pre_what + p
        count_what = count_what +1
    elif isWordPresent(ques, 'when'):
        f,p,r = compute_f1(answer, pred_ans)
        s_when = s_when + similar(answer, pred_ans)
        score_when = score_when + f
        rec_when = rec_when + r
        pre_when = pre_when + p
        count_when = count_when +1
    elif isWordPresent(ques, 'where'):
        f,p,r = compute_f1(answer, pred_ans)
        s_where = s_where + similar(answer, pred_ans)
        score_where = score_where + f
        rec_where = rec_where + r
        pre_where = pre_where + p
        count_where = count_where +1

print("F1 Scores of who what when where")        
print(score_who/count_who, end =',')
print(score_what/count_what, end =',')
print(score_when/count_when, end =',')
print(score_where/count_where, end =',')
print((score_who+score_what+score_when+score_where)/(count_who+count_what+count_when+count_where))

print("Similarity Scores")
print(s_who/count_who, end =',')
print(s_what/count_what, end =',')
print(s_when/count_when, end =',')
print(s_where/count_where, end =',')
print((s_who+s_what+s_when+s_where)/(count_who+count_what+count_when+count_where))

print("Precisions")
print(pre_who/count_who, end =',')
print(pre_what/count_what, end =',')
print(pre_when/count_when, end =',')
print(pre_where/count_where, end =',')
print((pre_who+pre_what+pre_when+pre_where)/(count_who+count_what+count_when+count_where))

print("Recalls")
print(rec_who/count_who, end =',')
print(rec_what/count_what, end =',')
print(rec_when/count_when, end =',')
print(rec_where/count_where, end =',')
print((rec_who+rec_what+rec_when+rec_where)/(count_who+count_what+count_when+count_where))

F1 Scores of who what when where
0.165625,0.2372775522947237,0.48633879781420764,0.22115384615384615,0.2630841121495327
Similarity Scores
0.326566963710235,0.3969360508186101,0.6775109335356773,0.36664117166365673,0.4261921648075726
Precisions
0.171875,0.2379019669060256,0.48816029143898,0.22115384615384615,0.26425233644859814
Recalls
0.171875,0.2379019669060256,0.48816029143898,0.22115384615384615,0.26425233644859814


In [None]:
print("hello")

In [None]:
evaluate("some normans joined turkish forces to aid in the destruction of the armenians vassal states of sassoun and taron in far eastern anatolia later many took up service with the armenian state further south in cilicia and the taurus mountains a norman named oursel led a force of franks into the upper euphrates valley in northern syria from 1073 to 1074 8 000 of the 20 000 troops of the armenian general philaretus brachamius were normans formerly of oursel led by raimbaud they even lent their ethnicity to the name of their castle afranji meaning franks the known trade between amalfi and antioch and between bari and tarsus may be related to the presence of italo normans in those cities while amalfi and bari were under norman rule in italy | who was the leader when the franks entered the euphrates valley")[0][:-6]