## Basics

In [None]:
# Mount Google Drive at /content/gdrive; the project paths used further down
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Mounted at /content/gdrive


In [None]:
# Fetch the NLTK stopword corpus; it is read later through
# nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import tensorflow

In [None]:
from tensorflow import keras

In [None]:
# Put the project folder on Google Drive first on the module search path.
import sys
sys.path.insert(0,'/content/gdrive/MyDrive/FYP_Project/')

## Utility functions

In [None]:
# Path of the 300-dimensional GloVe text file; used as the default glovepath
# of the GloVe class.
drive_glovepath = '/content/gdrive/MyDrive/FYP_Project/glove.6B.300d.txt'

In [None]:
from __future__ import print_function

from collections import defaultdict
import numpy as np
import json
from operator import itemgetter

from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

import re
from nltk.corpus import stopwords
from collections import namedtuple

from keras.callbacks import Callback

class Embedder(object):
    """ Common interface for word-embedding lookups.

    Subclasses must provide two attributes: g (a token -> vector mapping)
    and N (the vector dimensionality). """

    def map_tokens(self, tokens, ndim=2):
        """ Look up the embeddings of the given tokens.

        With ndim=2 the result is a matrix with one row per known token;
        with any other ndim it is the plain average of those rows
        (bag-of-words embedding).

        Tokens missing from self.g are skipped; if no token is known the
        result is all zeros, of shape (1, N) for ndim=2 and (N,) otherwise. """
        known = [self.g[tok] for tok in tokens if tok in self.g]
        if len(known) == 0:
            if ndim == 2:
                return np.zeros((1, self.N))
            return np.zeros(self.N)
        matrix = np.array(known)
        return matrix if ndim == 2 else matrix.mean(axis=0)

    def map_set(self, ss, ndim=2):
        """ Run map_tokens over every sentence of ss and return the list of results. """
        mapped = []
        for sentence in ss:
            mapped.append(self.map_tokens(sentence, ndim=ndim))
        return mapped

    def pad_set(self, ss, spad, N=None):
        """ Bring every array in ss to exactly spad entries along its first axis.

        Longer arrays are cut to their first spad entries; shorter ones get
        zeros appended.  2D arrays (per-word embeddings, e.g. from map_set())
        are padded with zero rows of width N (self.N when N is None), other
        arrays (e.g. toklabels) with scalar zeros.

        The result is np.array() over the resized items, i.e. a tensor of
        shape (len(ss), spad, N) for 2D inputs.

        To determine spad, use something like
            np.sort([np.shape(s) for s in s0], axis=0)[-1000]
        so that typically everything fits without going to absurd lengths
        to accommodate outliers.
        """
        width = self.N if N is None else N
        resized = []
        for s in ss:
            length = s.shape[0]
            if length > spad:
                s = s[:spad]
            elif length < spad:
                missing = spad - length
                if s.ndim == 2:
                    s = np.vstack((s, np.zeros((missing, width))))
                else:
                    s = np.hstack((s, np.zeros(missing)))
            resized.append(s)
        return np.array(resized)

class GloVe(Embedder):
    """ A GloVe dictionary and the associated N-dimensional vector space """
    def __init__(self, N=300, glovepath=drive_glovepath):
        """ Load the GloVe dictionary from the standard distributed text file.

        N is stored as the embedding dimensionality; it is not checked
        against the vectors found in the file.
        glovepath is the full path of the vectors file and is opened as
        given (no %d substitution is performed on it).

        Every non-blank line is split on whitespace: the first field is the
        token, the remaining fields are parsed as its float vector and
        stored in self.g. """
        self.N = N
        self.g = dict()
        self.glovepath = glovepath

        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default encoding.
        with open(self.glovepath, encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines (e.g. a trailing one)
                self.g[fields[0]] = np.array(fields[1:]).astype(float)


def hash_params(pardict):
    """ Serialize a parameter dict and derive a reproducible hash from it.

    Returns (ps, h):
      ps -- JSON text of pardict with every value passed through str() and
            the keys sorted;
      h  -- a non-negative int taken from the first 16 hex digits of the
            SHA-256 digest of ps.
    """
    import hashlib  # local import: nothing else in this file needs it

    ps = json.dumps({k: str(v) for k, v in pardict.items()}, sort_keys=True)
    # The builtin hash() of a str is salted per interpreter process, so it
    # yields a different value on every run; a digest of the serialized
    # parameters is stable across runs and machines.
    h = int(hashlib.sha256(ps.encode('utf-8')).hexdigest()[:16], 16)
    return ps, h


"""
NLP preprocessing tools for sentences.

"""

# English stopword list from NLTK; sentence_flags() consults it so that
# stopword matches do not count as unigram overlaps.
stop = stopwords.words('english')

# Number of flag features per token produced by sentence_flags():
# number, capital, unigram overlap, bigram overlap.
flagsdim = 4

def sentence_flags(s0, s1, spad):
    """ For sentence lists s0, s1, generate numpy tensors of shape
    (#sents, spad, flagsdim) that contain a sparse indicator vector of
    various token properties.  It is meant to be concatenated to the token
    embedding.

    s0 and s1 are parallel lists of token lists; only the first spad tokens
    of each sentence are flagged, positions beyond a sentence's length stay
    all-False.

    Returns [flags0, flags1] where the last axis holds, in order:
    number, capital, unigram overlap, bigram overlap. """

    # Raw-string patterns (a bare '\W' in a normal string literal is an
    # invalid escape sequence), compiled once instead of once per token.
    number_re = re.compile(r'^[0-9\W]*[0-9]+[0-9\W]*$')
    capital_re = re.compile(r'^[A-Z]')
    nonword_re = re.compile(r'^\W+$')

    def gen_iflags(s, spad):
        """ per-token [number, capital] flags for each sentence of s """
        iflags = []
        for sent in s:
            rows = [[False, False] for _ in range(spad)]
            for j, t in enumerate(sent):
                if j >= spad:
                    break
                number = number_re.match(t) is not None
                # the sentence-initial token is never flagged as capitalized
                capital = j > 0 and capital_re.match(t) is not None
                rows[j] = [number, capital]
            iflags.append(rows)
        return iflags

    def gen_mflags(s0, s1, spad):
        """ generate flags for s0 that represent overlaps with s1 """
        mflags = []
        for i in range(len(s0)):
            # lowercase each sentence once rather than inside the pair loop
            low0 = [t.lower() for t in s0[i]]
            low1 = [t.lower() for t in s1[i]]
            rows = [[False, False] for _ in range(spad)]
            for j in range(min(spad, len(low0))):
                unigram = False
                bigram = False
                for k in range(len(low1)):
                    if low0[j] != low1[k]:
                        continue
                    # do not generate trivial overlap flags, but accept them as part of bigrams
                    if low0[j] not in stop and not nonword_re.match(s0[i][j]):
                        unigram = True
                    # a bigram needs a following token on both sides
                    if (j + 1 < len(low0) and k + 1 < len(low1)
                            and low0[j + 1] == low1[k + 1]):
                        bigram = True
                rows[j] = [unigram, bigram]
            mflags.append(rows)
        return mflags

    # individual flags (for understanding)
    iflags0 = gen_iflags(s0, spad)
    iflags1 = gen_iflags(s1, spad)

    # s1-s0 match flags (for attention)
    mflags0 = gen_mflags(s0, s1, spad)
    mflags1 = gen_mflags(s1, s0, spad)

    return [np.dstack((iflags0, mflags0)),
            np.dstack((iflags1, mflags1))]



"""
Vocabulary that indexes words, can handle OOV words and integrates word
embeddings.
"""

class Vocabulary:
    """ Word-to-index mapping with helpers to vectorize token sequences and
    to build an index-aligned embedding matrix. """

    def __init__(self, sentences, count_thres=1):
        """ Index every token that occurs at least count_thres times in the
        given list of token lists.

        Kept words are numbered from 2 in sorted order; index 0 is reserved
        for '_PAD_' and index 1 for '_OOV_'. """
        counts = defaultdict(int)
        for sentence in sentences:
            for token in sentence:
                counts[token] += 1

        kept = sorted(word for word, seen in counts.items() if seen >= count_thres)
        self.word_idx = {word: position for position, word in enumerate(kept, start=2)}
        self.word_idx['_PAD_'] = 0
        self.word_idx['_OOV_'] = 1
        print('Vocabulary of %d words' % (len(self.word_idx)))

        # embedding matrices already built, keyed by str(emb)
        self.embcache = dict()

    def add_word(self, word):
        """ Append word with the next free index unless it is already known. """
        if word in self.word_idx:
            return
        self.word_idx[word] = len(self.word_idx)

    def vectorize(self, slist, pad=60):
        """ Map each token sequence of slist to word indices (unknown
        tokens become 1, the '_OOV_' index).

        With pad set, the result goes through pad_sequences() with
        post-padding and post-truncation to length pad; with pad=None the
        plain list of index lists is returned. """
        indexed = []
        for sentence in slist:
            indexed.append([self.word_idx.get(token, 1) for token in sentence])
        if pad is None:
            return indexed
        return pad_sequences(indexed, maxlen=pad, truncating='post', padding='post')

    def embmatrix(self, emb):
        """ Build the (vocabulary size, emb.N) weight matrix for embedding
        class emb (typically GloVe); pass it as the weights= argument of
        Keras' Embedding layer.

        Rows of words found in emb.g hold their vector; row 0 stays zero when
        its word is not in emb.g; every other missing word gets a uniform
        random vector in [-0.25, 0.25).  The matrix is cached per str(emb). """
        cache_key = str(emb)
        if cache_key in self.embcache:
            return self.embcache[cache_key]

        weights = np.zeros((len(self.word_idx), emb.N))
        for word, index in self.word_idx.items():
            try:
                row = emb.g[word]
            except KeyError:
                if index == 0:
                    row = np.zeros(emb.N)
                else:
                    row = np.random.uniform(-0.25, 0.25, emb.N)  # 0.25 is embedding SD
            weights[index, :] = row

        self.embcache[str(emb)] = weights
        return weights

    def size(self):
        """ Number of entries in the index, reserved ones included. """
        return len(self.word_idx)

"""
Evaluation tools, mainly non-straightforward methods.
"""

def aggregate_s0(s0, y, ypred, k=None):
    """
    Generate tuples (s0, [(y, ypred), ...]) where the list is sorted
    by the ypred score, highest first.  This is useful for a variety of
    list-based measures in the "anssel"-type tasks.

    Items of s0 are grouped by their raw bytes when they offer tobytes()
    (numpy arrays) and by their str() otherwise; that key is what is yielded
    as the first tuple element.  Groups come out in order of first
    appearance.

    If k is given, only the first k pairs of each group (in input order)
    are kept, and the truncation happens before sorting.
    """
    ybys0 = dict()
    for i in range(len(s0)):
        try:
            # tobytes() returns the same bytes as the deprecated
            # ndarray.tostring() alias.
            s0is = s0[i].tobytes()
        except AttributeError:
            s0is = str(s0[i])
        ybys0.setdefault(s0is, []).append((y[i], ypred[i]))

    for s, yl in ybys0.items():
        if k is not None:
            yl = yl[:k]
        yield (s, sorted(yl, key=lambda yy: yy[1], reverse=True))

def sorted_output(s0, sent,  y, ypred, q_index=1, target_q=None):
    """
    Print the candidate answers of selected questions, best ypred first.

    Rows are grouped by question: the key is the space-joined tokens of
    s0[i], or str(s0[i]) when the item cannot be joined as strings.  Two
    groups are printed: the one whose key equals target_q, and the
    q_index-th group (1-based, in order of first appearance).  A group that
    satisfies both conditions is printed twice.

    sent[i] is the token list of the candidate answer of row i; y is carried
    along but not printed.  Returns None.
    """
    def show(question, ranked):
        # one "Question / candidates" report, candidates already sorted
        print("Question:")
        print(question)
        print()
        print("Candidate answers sorted by ypred score:")
        for each_sent in ranked:
            print(" ".join(each_sent[2]))

    ybys0 = dict()
    for i in range(len(s0)):
        try:
            s0is = " ".join(s0[i])
        except (AttributeError, TypeError):
            # str.join raises TypeError for non-string items (e.g. index arrays)
            s0is = str(s0[i])
        ybys0.setdefault(s0is, []).append((y[i], ypred[i], sent[i]))

    for counter, (s, yl) in enumerate(ybys0.items(), start=1):
        ys = sorted(yl, key=lambda yy: yy[1], reverse=True)

        if s == target_q:
            show(s, ys)

        if counter == q_index:
            show(s, ys)
        

        
def mrr(s0, y, ypred):
    """
    Mean reciprocal rank of the y-predictions, with predictions for the
    same s0 grouped together (via aggregate_s0).  Relevant e.g. for the
    "answer sentence selection" task where we want to identify and take
    the top N most relevant sentences.

    Each ypred entry is indexed with [0] to obtain its score.  Candidates
    sharing a score are treated as one tie group: every candidate in
    higher-scored groups counts towards the rank, and inside the first group
    that contains a right answer only its wrong answers do.  Groups of s0
    without any right answer are left out of the mean.
    """
    reciprocal_ranks = []
    for _, ranked in aggregate_s0(s0, y, ypred):
        labels = [pair[0] for pair in ranked]
        if np.sum(labels) == 0:
            continue  # do not include s0 with no right answers in MRR

        # score -> labels of all candidates that received that score
        by_score = dict()
        for label, score in ranked:
            by_score.setdefault(score[0], []).append(label)

        rank = 0
        for score_value in sorted(by_score, reverse=True):
            group = by_score[score_value]
            if np.sum(group) > 0:
                wrong_share = 1 - np.sum(group) / len(group)
                rank += len(group) * wrong_share
                break
            rank += len(group)
        reciprocal_ranks.append(1 / float(1 + rank))

    return np.mean(reciprocal_ranks)

def mapcalc(s0, y, ypred):
    """
    Compute MAP (mean average precision) of y-predictions, by grouping
    y-predictions for the same s0 together.  This metric is relevant
    e.g. for the "answer sentence selection" task where we want to
    identify and take top N most relevant sentences.

    For every s0 group the candidates are taken in the order produced by
    aggregate_s0 (descending ypred).  The average precision of a group is
    the mean, over its right answers (y > 0), of
    (#right answers seen so far) / (rank of this answer), so a ranking that
    puts every right answer first scores 1.0.  Groups without any right
    answer are left out of the mean.

    Candidates with equal scores are ranked in the order aggregate_s0
    returns them; no tie correction is applied.
    """
    average_precisions = []

    for _, ys in aggregate_s0(s0, y, ypred):
        hits = 0
        precisions = []
        for rank, yy in enumerate(ys, start=1):
            if yy[0] > 0:
                hits += 1
                precisions.append(hits / float(rank))
        if not precisions:
            continue  # do not include s0 with no right answers in MAP
        average_precisions.append(np.mean(precisions))

    return np.mean(average_precisions)


# Result record of eval_QA(): the two ranking metrics of one evaluation run.
AnsSelRes = namedtuple('AnsSelRes', ['MRR', 'MAP'])


def eval_QA(pred, q, y):
    """ Compute and print MRR and MAP of predictions pred for questions q
    with gold labels y; return both as an AnsSelRes. """
    mrr_score = mrr(q, y, pred)
    print('MRR: %f' % (mrr_score,))

    map_score = mapcalc(q, y, pred)
    print('MAP: %f' % (map_score,))

    return AnsSelRes(MRR=mrr_score, MAP=map_score)


"""
Task-specific callbacks for the fit() function.
"""

class AnsSelCB(Callback):
    """ A callback that reports answer selection validation MRR and MAP
    after each epoch and stores them in the epoch logs under the keys
    'mrr' and 'map'. """
    def __init__(self, val_q, val_s, y, inputs):
        """ val_q: validation questions (grouping key of the metrics);
        val_s: validation sentences (stored, not used by this callback);
        y: gold labels; inputs: what is handed to model.predict(). """
        # let the Keras base class set up its own state
        super(AnsSelCB, self).__init__()
        self.val_q = val_q
        self.val_s = val_s
        self.val_y = y
        self.val_inputs = inputs

    def on_epoch_end(self, epoch, logs=None):
        # A shared mutable default would be written to by the assignments
        # below; use a fresh dict when the caller passes none.
        if logs is None:
            logs = {}
        pred = self.model.predict(self.val_inputs)
        mrr_ = mrr(self.val_q, self.val_y, pred)
        map_ = mapcalc(self.val_q, self.val_y, pred)
        print('val MRR %f' % (mrr_,))
        logs['mrr'] = mrr_
        print('val MAP %f' % (map_,))
        logs['map'] = map_



## Import and Config

In [None]:
from __future__ import print_function
from __future__ import division

from scipy import stats
import numpy as np
import sys,os

import csv

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import keras.activations as activations
from tensorflow.keras.layers import Layer, InputSpec
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, TimeDistributed, BatchNormalization
from keras.layers.merge import concatenate, add, multiply, dot
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.core import Activation, Dense, Dropout, Flatten, Lambda, Permute, RepeatVector
from keras.layers.recurrent import GRU, LSTM
from keras.layers import CuDNNGRU, CuDNNLSTM, Bidirectional, MultiHeadAttention

from keras import backend as K
from tensorflow.compat.v1.keras.backend import set_session

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Create a TF1-style session with the GPU option allow_growth enabled and
# register it as the Keras backend session.
# NOTE(review): the name `config` is rebound by `def config()` in a later
# cell, so this object is only reachable until that cell runs.
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [None]:
def config():
    """ Build the experiment configuration.

    Returns (c, ps, h): the configuration dict, its serialized form and
    its hash, the latter two as produced by hash_params(c). """
    c = {
        # embedding params
        'emb': 'Glove',
        'embdim': 300,    # change to 300
        'inp_e_dropout': 1/2,
        'flag': True,
        'pe': True,
        'pe_method': 'fixed',  # 'fixed' or 'learned'

        # training hyperparams
        'opt': 'adam',
        'batch_size': 320,
        'epochs': 5,  # change to 160
        'patience': 155,

        # sentences with word lengths below the 'pad' will be padded with 0.
        'pad': 60,

        # rnn model
        'rnn_dropout': 1/2,
        'l2reg': 1e-4,

        'rnnbidi': True,
        'rnn': CuDNNLSTM,
        'rnnbidi_mode': concatenate,
        'rnnact': 'tanh',
        'rnninit': 'glorot_uniform',
        'sdim': 5,

        # cnn model
        'cnn_dropout': 1/2,
        'pool_layer': MaxPooling1D,
        'cnnact': 'relu',
        'cnninit': 'glorot_uniform',
        'pact': 'tanh',

        # projection layer
        'proj': True,
        'pdim': 1/2,
        'p_layers': 1,
        'p_dropout': 1/2,
        'p_init': 'glorot_uniform',

        # QA-LSTM/CNN+attention
        'adim': 1/2,
        'cfiltlen': 3,

        # Attentive Pooling-LSTM/CNN
        'w_feat_model': 'rnn',
        'bll_dropout': 1/2,

        # self attention model
        'self_pdim': 1/2,

        # mlp scoring function
        'Ddim': 2,
    }

    ps, h = hash_params(c)

    return c, ps, h

In [None]:
# Notebook-wide state.  conf and emb are assigned by the setup cell; vocab,
# the inp_* dicts and the y_* label lists are filled in by load_data().
conf = None
emb = None
vocab = None
inp_tr = None
inp_val = None
inp_test = None
# y_train is declared global in load_data() as well, so initialise it with
# the others instead of leaving it undefined until load_data() has run.
y_train = None
y_val = None
y_test = None

In [None]:
def ranknet(y_true, y_pred):
    """ Loss: mean over the last axis of log(1 + exp(-m)), where the margin
    m is y_true * y_pred - (1 - y_true) * y_pred. """
    margin = y_true * y_pred - (1-y_true) * y_pred
    per_item = K.log(1. + K.exp(-(margin)))
    return K.mean(per_item, axis=-1)

## Data Load

Load the TrecQA dataset (Wang et al., 2007): http://cs.stanford.edu/people/mengqiu/data/qg-emnlp07-data.tgz

The format of the dataset is as follows.
- question1, label, sentence1   

In [None]:
def load_data_from_file(dsfile):
    """ Load a dataset in CSV format.

    The file needs a header row with the columns 'qtext', 'label' and
    'atext'.  Returns (q, sents, labels): per row, the question split on
    single spaces, the sentence split on single spaces, and the label as
    int.  Raises KeyError for a missing column and ValueError for a label
    that is not an integer. """
    q = []  # a set of questions
    sents = []  # a set of sentences
    labels = []  # a set of labels

    # newline='' is what the csv module asks for so that it can handle line
    # endings itself; the encoding is explicit so that reading does not
    # depend on the platform default.
    with open(dsfile, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            labels.append(int(row['label']))
            q.append(row['qtext'].split(' '))
            sents.append(row['atext'].split(' '))

    return (q, sents, labels)
    
def make_model_inputs(qi, si, f01, f10, q, sents, y):
    """ Bundle the given arrays into one dict keyed by the argument names
    ('qi', 'si', 'f01', 'f10', 'q', 'sents', 'y'). """
    return dict(qi=qi, si=si, f01=f01, f10=f10, q=q, sents=sents, y=y)

def load_set(fname, vocab=None, iseval=False):
    """ Load one dataset file and turn it into model inputs.

    With iseval=False a new Vocabulary is built from the file's questions
    and sentences (any vocab argument is replaced) and (inp, y, vocab) is
    returned.  With iseval=True the given vocab is used and (inp, y) is
    returned.  Sequences are padded to conf['pad']. """
    q, sents, y = load_data_from_file(fname)

    building_vocab = not iseval
    if building_vocab:
        vocab = Vocabulary(q + sents)

    pad = conf['pad']

    question_ids = vocab.vectorize(q, pad=pad)
    sentence_ids = vocab.vectorize(sents, pad=pad)
    flags = sentence_flags(q, sents, pad)
    f01, f10 = flags

    inp = make_model_inputs(question_ids, sentence_ids, f01, f10, q, sents, y)
    return (inp, y) if iseval else (inp, y, vocab)
    
def load_data(trainf, valf, testf):
    """ Load the train, validation and test files into the module-level
    vocab, inp_tr/inp_val/inp_test and y_train/y_val/y_test.

    The vocabulary is built from the training file and reused for the
    other two. """
    global vocab, inp_tr, inp_val, inp_test, y_train, y_val, y_test

    train_result = load_set(trainf, iseval=False)
    inp_tr, y_train, vocab = train_result

    val_result = load_set(valf, vocab=vocab, iseval=True)
    inp_val, y_val = val_result

    test_result = load_set(testf, vocab=vocab, iseval=True)
    inp_test, y_test = test_result

## Individual Encoding:

In [None]:
def embedding():
    '''
    Declare all inputs (vectorized sentences and NLP flags) and build the
    embedded question/sentence sequences, with optional positional
    encoding and with dropout applied.

    Reads the module-level conf, emb and vocab.

    Returns (N, input_nodes, emb_outputs): the per-token vector
    dimensionality (emb.N, plus flagsdim when conf['flag'] is set), the list
    of Keras inputs, and [emb_qi, emb_si].
    '''
    pad = conf['pad']
    dropout = conf['inp_e_dropout']

    # story selection
    input_qi = Input(name='qi', shape=(pad,), dtype='int32')
    input_si = Input(name='si', shape=(pad,), dtype='int32')
    input_f01 = Input(name='f01', shape=(pad, flagsdim))
    input_f10 = Input(name='f10', shape=(pad, flagsdim))

    if conf['flag']:
        input_nodes = [input_qi, input_si, input_f01, input_f10]
        N = emb.N + flagsdim
    else:
        input_nodes = [input_qi, input_si]
        N = emb.N

    # one embedding layer shared by questions and sentences
    shared_embedding = Embedding(name='emb', input_dim=vocab.size(), input_length=pad,
                                output_dim=emb.N, mask_zero=False,
                                weights=[vocab.embmatrix(emb)], trainable=True)
    # nlp flag
    if conf['flag']:
        emb_qi = concatenate([shared_embedding(input_qi), input_f01])
        emb_si = concatenate([shared_embedding(input_si), input_f10])
    else:
        emb_qi = shared_embedding(input_qi)
        emb_si = shared_embedding(input_si)

    # positional encoding
    if conf['pe']:
        if conf['pe_method'] == 'fixed':
            encoding = position_encoding_fixed(pad, N)
            emb_qi = batch_multiply(emb_qi, encoding)
            emb_si = batch_multiply(emb_si, encoding)
        elif conf['pe_method'] == 'learned':
            # The position vectors are added to the token vectors below, so
            # their width has to be N (a fixed 304 only fits a 300-d
            # embedding with flags enabled).
            encoder = Embedding(name='pe_learnable_layer', input_dim=conf['pad'], input_length=conf['pad'],
                                            output_dim=N, mask_zero=False, trainable=True)
            pos_val = K.constant(value=np.arange(conf['pad'])) # shape=(pad,)
            pos_val = K.expand_dims(pos_val, axis=0) # shape=(1, pad)
            pos_val = K.tile(pos_val, (K.shape(input_qi)[0], 1)) # shape=(batch_size_of_x, pad)
            pos_input = Input(name='pos_input', tensor=pos_val)
            input_nodes.append(pos_input)
            encoding = encoder(pos_input)

            emb_qi = add([emb_qi, encoding])
            emb_si = add([emb_si, encoding])

    emb_qi = Dropout(dropout, noise_shape=(None, pad, N))(emb_qi)
    emb_si = Dropout(dropout, noise_shape=(None, pad, N))(emb_si) # shape=(None, pad, N)

    emb_outputs = [emb_qi, emb_si]

    return N, input_nodes, emb_outputs

def batch_multiply(x, y):
    """ Multiply x element-wise by y after giving y a leading axis and
    tiling it along that axis to the first dimension of x. """
    with_batch_axis = K.expand_dims(y, axis=0)
    batch_size = K.shape(x)[0]
    tiled = K.tile(with_batch_axis, (batch_size, 1, 1))
    return multiply([x, tiled])

def position_encoding_fixed(sentence_size, embedding_size):
    """
    Position Encoding described in https://arxiv.org/pdf/1503.08895.pdf

    Returns a float32 tensor of shape (sentence_size, embedding_size).
    """
    # centred 1-based indices along the embedding and the sentence axis
    emb_offsets = np.arange(1, embedding_size + 1) - (embedding_size+1)/2
    pos_offsets = np.arange(1, sentence_size + 1) - (sentence_size+1)/2

    # entry [i-1, j-1] is the product of the two centred indices
    table = (emb_offsets[:, np.newaxis] * pos_offsets[np.newaxis, :]).astype(np.float32)
    table = 1 + 4 * table / embedding_size / sentence_size

    # Make position encoding of time words identity to avoid modifying them
    table[:, -1] = 1.0

    transposed = np.transpose(table)  # shape=(sentence_size, embedding_size)
    return tf.convert_to_tensor(transposed, dtype=tf.float32)

## Combined Encoding (with Tests)

In [None]:
def combined_embedding():
    '''
    Declare all inputs (vectorized sentences and NLP flags) and build the
    embedded question/sentence sequences.

    When conf['pe'] is set, the fixed (multiplicative) and the learned
    (additive) positional encodings are both applied and summed, followed
    by dropout and a bidirectional LSTM run across the embedding axis.
    When it is not set, the plain embeddings are returned without dropout.

    Reads the module-level conf, emb and vocab.

    Returns (N, input_nodes, emb_outputs): the per-token vector
    dimensionality (emb.N, plus flagsdim when conf['flag'] is set), the list
    of Keras inputs, and [emb_qi, emb_si].
    '''
    pad = conf['pad']
    dropout = conf['inp_e_dropout']

    # story selection
    input_qi = Input(name='qi', shape=(pad,), dtype='int32')
    input_si = Input(name='si', shape=(pad,), dtype='int32')
    input_f01 = Input(name='f01', shape=(pad, flagsdim))
    input_f10 = Input(name='f10', shape=(pad, flagsdim))

    if conf['flag']:
        input_nodes = [input_qi, input_si, input_f01, input_f10]
        N = emb.N + flagsdim
    else:
        input_nodes = [input_qi, input_si]
        N = emb.N

    # one embedding layer shared by questions and sentences
    shared_embedding = Embedding(name='emb', input_dim=vocab.size(), input_length=pad,
                                output_dim=emb.N, mask_zero=False,
                                weights=[vocab.embmatrix(emb)], trainable=True)
    # nlp flag
    if conf['flag']:
        emb_qi = concatenate([shared_embedding(input_qi), input_f01])
        emb_si = concatenate([shared_embedding(input_si), input_f10])
    else:
        emb_qi = shared_embedding(input_qi)
        emb_si = shared_embedding(input_si)

    # positional encoding
    if conf['pe']:
        # Fixed PE
        encoding_fixed = position_encoding_fixed(pad, N)
        emb_qi_fixed = batch_multiply(emb_qi, encoding_fixed)
        # the sentence branch must be built from emb_si, not from emb_qi
        emb_si_fixed = batch_multiply(emb_si, encoding_fixed)

        # Trainable PE; its width has to be N because it is added to the
        # token vectors below.
        encoder_learn = Embedding(name='pe_learnable_layer', input_dim=conf['pad'], input_length=conf['pad'],
                                        output_dim=N, mask_zero=False, trainable=True)
        pos_val = K.constant(value=np.arange(conf['pad'])) # shape=(pad,)
        pos_val = K.expand_dims(pos_val, axis=0) # shape=(1, pad)
        pos_val = K.tile(pos_val, (K.shape(input_qi)[0], 1)) # shape=(batch_size_of_x, pad)
        pos_input = Input(name='pos_input', tensor=pos_val)
        input_nodes.append(pos_input)
        encoding_learn = encoder_learn(pos_input)
        emb_qi_learn = add([emb_qi, encoding_learn])
        emb_si_learn = add([emb_si, encoding_learn])

        # debug output: shapes of the four intermediate tensors
        print(emb_qi_fixed.shape)
        print(emb_si_fixed.shape)
        print(emb_qi_learn.shape)
        print(emb_si_learn.shape)

        # combine both encodings
        emb_qi = add([emb_qi_fixed, emb_qi_learn])
        emb_si = add([emb_si_fixed, emb_si_learn])

        emb_qi = Dropout(dropout, noise_shape=(None, pad, N))(emb_qi)
        emb_si = Dropout(dropout, noise_shape=(None, pad, N))(emb_si) # shape=(None, pad, N)

        # The LSTM runs over the permuted (N, pad) sequence.  Using pad as
        # its unit count makes the permuted-back output (pad, N) again
        # (identical to the former fixed 60 at the default pad of 60).
        emb_qi = Permute((2,1))(emb_qi)
        emb_si = Permute((2,1))(emb_si)
        emb_qi = Activation('tanh')(Bidirectional(LSTM(pad, return_sequences=True), merge_mode='ave')(emb_qi))
        emb_si = Activation('tanh')(Bidirectional(LSTM(pad, return_sequences=True), merge_mode='ave')(emb_si))
        emb_qi = Permute((2,1))(emb_qi)
        emb_si = Permute((2,1))(emb_si)

    emb_outputs = [emb_qi, emb_si]

    return N, input_nodes, emb_outputs



### Embedding Test:

In [None]:
# Dataset locations on the mounted Google Drive.
path = '/content/gdrive/MyDrive/FYP_Project/data'
trainf = path + '/train-all.csv' 
valf = path + '/dev.csv'
testf = path + '/test.csv'
params = []

# Build the configuration; ps and h are its serialized form and hash.
conf, ps, h = config()

# Load the GloVe vectors (file: http://nlp.stanford.edu/data/glove.6B.zip).
if conf['emb'] == 'Glove': # Please download the GloVe in here http://nlp.stanford.edu/data/glove.6B.zip
    print('GloVe')
    emb = GloVe(N=conf['embdim'])

# Load the three dataset splits into the module-level variables, then build
# the combined-encoding graph.
print('Dataset')
load_data(trainf,valf,testf)
N, input_nodes_emb, output_nodes_emb = combined_embedding()

GloVe


KeyboardInterrupt: ignored

In [None]:
# Report the vector width, then the number and shapes of the first five
# input nodes and of the two output nodes.
print(N)
print(len(input_nodes_emb))
for node_index in range(5):
    print(input_nodes_emb[node_index].shape)
print(len(output_nodes_emb))
for node_index in range(2):
    print(output_nodes_emb[node_index].shape)

## Model Utils

In [None]:
def projection_layer(inputs, input_size):
    """ Project the question and sentence vectors through conf['p_layers']
    dense layers, then apply dropout.

    Each layer uses one Dense (shared by both branches) that scales the
    width by conf['pdim'], followed by a per-branch BatchNormalization and
    the conf['pact'] activation.  With conf['p_layers'] == 0 only the
    dropout is applied to the inputs.

    inputs is [question_vector, sentence_vector]; input_size is their width.
    Returns (qi_proj, si_proj).

    NOTE(review): the dropout noise_shape has no batch axis, which looks
    like it shares one mask across the batch - confirm that is intended.
    """
    # start from the inputs so that zero projection layers is well defined
    qi_proj = inputs[0]
    si_proj = inputs[1]
    for p_i in range(conf['p_layers']):
        out_size = int(input_size*conf['pdim'])
        shared_dense = Dense(name='pdeep%d'%(p_i), units=out_size,
                activation='linear', kernel_initializer=conf['p_init'], kernel_regularizer=l2(conf['l2reg']))
        qi_proj = Activation(conf['pact'])(BatchNormalization()(shared_dense(qi_proj)))
        si_proj = Activation(conf['pact'])(BatchNormalization()(shared_dense(si_proj)))
        input_size = out_size

    dropout = conf['p_dropout']
    qi_proj = Dropout(dropout, noise_shape=(input_size,))(qi_proj)
    si_proj = Dropout(dropout, noise_shape=(input_size,))(si_proj)

    return qi_proj, si_proj

## Average Model

In [None]:
def avg_model(input_nodes, N, pfx=''):
    """ Bag-of-words encoder: a per-token dense projection (shared by both
    branches), batch normalization and tanh, then the mean over the token
    axis, optionally followed by projection_layer().

    input_nodes is [question_sequence, sentence_sequence]; N is the token
    vector width; pfx is appended to the layer names.
    Returns [qi_avg, si_avg]. """
    shared_dense = Dense(int(N), activation='linear', name='wproj'+pfx)

    # every stage is applied to the question first, then to the sentence
    branches = [input_nodes[0], input_nodes[1]]
    branches = [TimeDistributed(shared_dense)(seq) for seq in branches]
    branches = [TimeDistributed(BatchNormalization())(seq) for seq in branches]
    branches = [TimeDistributed(Activation('tanh'))(seq) for seq in branches]

    avg_layer = Lambda(name='bow'+pfx, function=lambda t: K.mean(t, axis=1),
                       output_shape=lambda in_shape: (in_shape[0],) + in_shape[2:])
    qi_avg = avg_layer(branches[0])
    si_avg = avg_layer(branches[1])

    if conf['proj']:
        qi_avg, si_avg = projection_layer([qi_avg, si_avg], int(N))

    return [qi_avg, si_avg]

## RNN model (with Tests)

In [None]:
def rnn_model(input_nodes, N, pfx=''):
    """ Encode question and sentence with the RNN configured in conf (last
    state only), optionally followed by projection_layer().

    Prints the shapes of both outputs and returns [qi_rnn, si_rnn]. """
    rnn_kwargs = dict(pfx=pfx, dropout=conf['rnn_dropout'], sdim=conf['sdim'],
                      rnnbidi_mode=conf['rnnbidi_mode'], rnn=conf['rnn'],
                      rnnact=conf['rnnact'], rnninit=conf['rnninit'],
                      inputs=input_nodes, return_sequence=False)
    qi_rnn, si_rnn, nc = rnn_input(N, **rnn_kwargs)

    if conf['proj']:
        qi_rnn, si_rnn = projection_layer([qi_rnn, si_rnn], nc)

    for encoded in (qi_rnn, si_rnn):
        print(encoded.shape)
    return [qi_rnn, si_rnn]

def rnn_input(N, dropout=3/4, sdim=2, rnn=GRU, rnnact='tanh', rnninit='glorot_uniform', rnnbidi_mode=add, 
              inputs=None, return_sequence=True, pfx=''):
    """ Run a forward and a backward RNN (each shared between question and
    sentence) over inputs[0] and inputs[1], merge the two directions with
    rnnbidi_mode, then apply batch normalization, the rnnact activation and
    dropout.

    N is the input vector width and sdim the output width as a multiple of
    N; when the directions are concatenated each of them gets half of it.

    Returns (qi_rnn, si_rnn, out_dim) where out_dim is the width of the
    merged output.

    NOTE(review): with return_sequence=True the backward layer's output is
    presumably in reversed time order before merging - confirm before
    relying on per-timestep outputs.
    """
    concatenating = rnnbidi_mode == concatenate
    if concatenating:
        sdim /= 2
    units = int(N*sdim)
    # Derive the merged width from the real unit count so that it always
    # matches the tensors (int(N*sdim) of the full sdim can be off by one
    # when the halved size is not an integer).
    out_dim = 2 * units if concatenating else units

    shared_rnn_f = rnn(units, kernel_initializer=rnninit, input_shape=(None, conf['pad'], N), 
                       return_sequences=return_sequence, name='rnnf'+pfx)
    shared_rnn_b = rnn(units, kernel_initializer=rnninit, input_shape=(None, conf['pad'], N),
                       return_sequences=return_sequence, go_backwards=True, name='rnnb'+pfx)
    qi_rnn_f = shared_rnn_f(inputs[0])
    si_rnn_f = shared_rnn_f(inputs[1])

    qi_rnn_b = shared_rnn_b(inputs[0])
    si_rnn_b = shared_rnn_b(inputs[1])

    qi_rnn = Activation(rnnact)(BatchNormalization()(rnnbidi_mode([qi_rnn_f, qi_rnn_b])))
    si_rnn = Activation(rnnact)(BatchNormalization()(rnnbidi_mode([si_rnn_f, si_rnn_b])))

    qi_rnn = Dropout(dropout, noise_shape=(out_dim,))(qi_rnn)
    si_rnn = Dropout(dropout, noise_shape=(out_dim,))(si_rnn)

    return (qi_rnn, si_rnn, out_dim)

### Test RNN Model

In [None]:
# Smoke test: build the embedding graph, put the RNN encoder on top of it
# and print the summary of the resulting model.
N, input_nodes_emb, output_nodes_emb = embedding() 
iam = rnn_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()

## CNN model

In [None]:
def cnn_model(input_nodes, N, pfx=''):
    """ Encode question and sentence with the multi-width CNN of
    cnnsum_input(), optionally followed by projection_layer().

    input_nodes is [question_sequence, sentence_sequence]; N is the token
    vector width; pfx is the prefix handed to cnnsum_input() for its layer
    names.  Returns [qi_cnn, si_cnn]. """
    # forward pfx so that several CNN encoders in one model get distinct
    # layer names (the default '' keeps the previous names)
    qi_cnn, si_cnn, nc = cnnsum_input(conf['pad'], dropout=conf['cnn_dropout'],
                                l2reg=conf['l2reg'], cnninit=conf['cnninit'], cnnact=conf['cnnact'],
                                input_dim=N, inputs=input_nodes, pfx=pfx)
    if conf['proj']:
        qi_cnn, si_cnn = projection_layer([qi_cnn, si_cnn], nc)

    return [qi_cnn, si_cnn]

def cnnsum_input(pad, dropout=3/4, l2reg=1e-4, cnninit='glorot_uniform', cnnact='relu',
        cdim=None, inputs=None, input_dim=304, pfx=''):
    """ Apply a bank of 1D convolutions of several filter widths to
    inputs[0] and inputs[1] (weights shared between the two), max-pool each
    over the whole sequence, and concatenate the results.

    pad is the sequence length; cdim maps filter width -> number of filters
    as a fraction of input_dim (default: widths 1..7 at 1/2 each); pfx
    prefixes the layer names.

    Returns (qi_cnn, si_cnn, tot_len) where tot_len is the total number of
    filters, i.e. the width of the two output vectors.
    """
    if cdim is None:
        # built per call instead of being a shared mutable default
        cdim = {1: 1/2, 2: 1/2, 3: 1/2, 4: 1/2, 5: 1/2, 6: 1/2, 7: 1/2}

    qi_cnn_res_list = []
    si_cnn_res_list = []
    tot_len = 0
    for fl, cd in cdim.items():
        nb_filter = int(input_dim*cd)
        # the sequence length comes from the pad argument, not from the
        # global configuration
        shared_conv = Convolution1D(name=pfx+'conv%d'%(fl), input_shape=(None, pad, input_dim),
                    kernel_size=fl, filters=nb_filter, activation='linear',
                    kernel_regularizer=l2(l2reg), kernel_initializer=cnninit)
        qi_cnn_one = Activation(cnnact)(BatchNormalization()(shared_conv(inputs[0])))
        si_cnn_one = Activation(cnnact)(BatchNormalization()(shared_conv(inputs[1])))

        # a width-fl convolution leaves pad-fl+1 positions; pool over all of them
        pool = MaxPooling1D(pool_size=int(pad-fl+1), name=pfx+'pool%d'%(fl))
        qi_pool_one = pool(qi_cnn_one)
        si_pool_one = pool(si_cnn_one)

        flatten = Flatten(name=pfx+'flatten%d'%(fl))
        qi_out_one = flatten(qi_pool_one)
        si_out_one = flatten(si_pool_one)

        qi_cnn_res_list.append(qi_out_one)
        si_cnn_res_list.append(si_out_one)

        tot_len += nb_filter

    qi_cnn = Dropout(dropout, noise_shape=(tot_len,))(concatenate(qi_cnn_res_list))
    si_cnn = Dropout(dropout, noise_shape=(tot_len,))(concatenate(si_cnn_res_list))

    return (qi_cnn, si_cnn, tot_len)

## Self-Attention Model (with Tests)

In [None]:
def self_attention_model(input_nodes, N, pfx=''):
    """Self-attention encoder: score every time step, then pool the inputs.

    Each side is scored independently by two shared point-wise dense layers;
    the softmax of those scores re-weights the original inputs, which are then
    reduced with a mean over axis 1 and optionally projected.

    Args:
        input_nodes: pair of embedded sequences (index 0 question, index 1
            sentence).
        N: feature width of the inputs.
        pfx: suffix appended to the names of the named layers.

    Returns:
        ``[qi_val, si_val]``, one pooled tensor per side.
    """
    sides = (input_nodes[0], input_nodes[1])

    # Shared point-wise FC (a 1x1 convolution over time), then per-side
    # batch norm and ReLU.
    key_fc = Dense(int(conf['self_pdim']*N), activation='linear', name='pont_wise_fc'+pfx)
    keys = [TimeDistributed(key_fc)(side) for side in sides]
    keys = [TimeDistributed(BatchNormalization())(key) for key in keys]
    keys = [TimeDistributed(Activation('relu'))(key) for key in keys]

    # A second shared point-wise FC reduces every time step to one raw score.
    score_fc = Dense(1, activation='linear', name='point_wise_fc_attn'+pfx)
    scores = [TimeDistributed(score_fc)(key) for key in keys]

    # Softmax over the flattened scores, repeated N times and permuted so the
    # weights can be multiplied element-wise with the inputs.
    flatten = Flatten(name='attn_flatten'+pfx)
    weighted = []
    for side, score in zip(sides, scores):
        attn = Activation('softmax')(flatten(score))
        attn = RepeatVector(int(N))(attn)
        weighted.append(Permute((2,1))(attn))
    weighted = [multiply([attn, side]) for attn, side in zip(weighted, sides)]

    # Reduce over axis 1 with a single shared Lambda layer.
    bow = Lambda(name='bow'+pfx, function=lambda x: K.mean(x, axis=1), output_shape=lambda shape:(shape[0],) + shape[2:])
    qi_val, si_val = [bow(seq) for seq in weighted]

    if conf['proj']:
        qi_val, si_val = projection_layer([qi_val, si_val], int(N))

    print(qi_val.shape)
    print(si_val.shape)
    return [qi_val, si_val]

### Test Self Attention Module

In [None]:
# Smoke test for self_attention_model: build the encoder on top of the
# embedding inputs, wrap it in a Model and print the layer table.
# `embedding` and `conf` must already be defined by earlier cells.
N, input_nodes_emb, output_nodes_emb = embedding() 
iam = self_attention_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()

(?, 304)
(?, 304)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 qi (InputLayer)                [(None, 60)]         0           []                               
                                                                                                  
 si (InputLayer)                [(None, 60)]         0           []                               
                                                                                                  
 emb (Embedding)                (None, 60, 300)      15508200    ['qi[0][0]',                     
                                                                  'si[0][0]']                     
                                                                                                  
 f01 (InputLayer)               [(None, 60, 4)]      0           []       

## Interactive Attention (with Tests)

In [None]:
def interactive_attention_model(input_nodes, N, pfx=''):
    """Encode a question/sentence pair with interactive (co-)attention.

    Stages, in the order they are built below: shared word projection,
    alignment matrix with its two softmaxes, co-attention summaries, BiLSTM
    compression, shared multi-head self-attention with a residual dense block,
    BiLSTM aggregation, mean over axis 1, and an optional projection layer.

    Shape comments use d for the feature width N and n / m for the question /
    sentence length. Concrete shapes quoted as "recorded run" are the ones
    printed by the test cell output (pad=60, N=304).

    Args:
        input_nodes: pair of embedded sequences; index 0 is the question side,
            index 1 the sentence side. Assumed to be (batch, pad, N) --
            confirm against embedding().
        N: feature width; sizes the shared projection and the optional
            projection layer.
        pfx: suffix appended to the names of the explicitly named layers.

    Returns:
        ``[qi_val, si_val]``; (?, 304) each in the recorded run with
        ``conf['proj']`` disabled.
    """
    # d=304
    # m=n=60
    # Word-level projection shared by both sides, then per-side BN + sigmoid.
    shared_dense = Dense(int(N), activation='linear', name='wproj'+pfx)
    qi_wproj = TimeDistributed(shared_dense)(input_nodes[0])
    si_wproj = TimeDistributed(shared_dense)(input_nodes[1])
    
    qi_wproj = TimeDistributed(BatchNormalization())(qi_wproj)
    si_wproj = TimeDistributed(BatchNormalization())(si_wproj)

    qi_wproj = TimeDistributed(Activation('sigmoid'))(qi_wproj)
    si_wproj = TimeDistributed(Activation('sigmoid'))(si_wproj)

    # Swap the time and feature axes so features come first.
    qi_e = Permute((2,1))(qi_wproj) # Shape is now ?xdxn
    qi_e = BatchNormalization()(qi_e)
    si_e = Permute((2,1))(si_wproj) # Shape is now ?xdxm
    si_e = BatchNormalization()(si_e)




    # 1 Alignment: pairwise scores between question and sentence positions,
    # softmax-normalised along the last axis in both orientations.
    qi_e_trans = Permute((2,1))(qi_e) # Shape is now ?xnxd
    qi_e_trans = BatchNormalization()(qi_e_trans)
    alignment = dot(axes=(2,1), inputs=[qi_e_trans,si_e]) # Shape is now ?xnxm
    alignment = BatchNormalization()(alignment)
    alignment_trans = Permute((2,1))(alignment) # Shape is now ?xmxn
    alignment_trans = BatchNormalization()(alignment_trans)
    r_a = Activation('softmax')(alignment) # Shape is now ?xnxm
    r_q = Activation('softmax')(alignment_trans) # Shape is now ?xmxn
    print(r_a.shape)
    print(r_q.shape)

    
    # 2 Co-Attention: summarise each side through the alignment weights
    # (q_sum, a_sum), then map the summaries back again (q_coa, a_coa).
    q_sum = dot(axes=(2,1), inputs=[qi_e, r_a]) # Shape is now ?xdxm
    q_sum = BatchNormalization()(q_sum)
    a_sum = dot(axes=(2,1), inputs=[si_e, r_q]) # Shape is now ?xdxn
    a_sum = BatchNormalization()(a_sum)

    q_coa = dot(axes=(2,1), inputs=[q_sum,r_q]) # Shape is now ?xdxn
    q_coa = BatchNormalization()(q_coa)
    a_coa = dot(axes=(2,1), inputs=[a_sum,r_a]) # Shape is now ?xdxm   
    a_coa = BatchNormalization()(a_coa)
    print(q_coa.shape)
    print(a_coa.shape)


    # 3 Representation Compression: concatenate the three views of each side
    # on the last axis and compress them with a BiLSTM (2 x 30 units, concat).
    q_full = concatenate([qi_e, a_sum, q_coa]) # Shape is now ?xdx3n
    a_full = concatenate([si_e, q_sum, a_coa]) # Shape is now ?xdx3m
    print(q_full.shape)
    print(a_full.shape)
    q_com = Activation('sigmoid')(Bidirectional(LSTM(30, return_sequences=True), merge_mode='concat')(q_full)) # (?, 304, 60) in the recorded run
    q_com = BatchNormalization()(q_com) 

    a_com = Activation('sigmoid')(Bidirectional(LSTM(30, return_sequences=True), merge_mode='concat')(a_full)) # (?, 304, 60) in the recorded run
    a_com = BatchNormalization()(a_com) 

    print(q_com.shape)
    print(a_com.shape)


    # 4 TF Self-Attention: one MultiHeadAttention instance is applied to both
    # sides, so its weights are shared. The returned attention scores
    # (q_attn, a_attn) are not used afterwards.
    mha = MultiHeadAttention(key_dim=100, value_dim=100, num_heads=4) 

    q_mha, q_attn = mha(q_com, q_com, return_attention_scores=True) # (?, 304, 60) in the recorded run
    a_mha, a_attn = mha(a_com, a_com, return_attention_scores=True) # (?, 304, 60) in the recorded run
    print(q_mha.shape)
    print(a_mha.shape)

    # Residual FFN: a dense block added back onto the MHA output. The width 60
    # equals the last dimension of the MHA output in the recorded run, which is
    # what lets the add() below line up.
    q_res = Dense(60, activation='relu')(q_mha)
    q_res = BatchNormalization()(q_res) 

    a_res = Dense(60, activation='relu')(a_mha)
    a_res = BatchNormalization()(a_res) 

    q_self = add([q_res, q_mha]) # (?, 304, 60) in the recorded run
    a_self = add([a_res, a_mha]) # (?, 304, 60) in the recorded run
    print(q_self.shape)
    print(a_self.shape)

    # 5 Aggregation: BiLSTM whose two directions are averaged, then tanh.
    q_final = Activation('tanh')(Bidirectional(LSTM(60, return_sequences=True), merge_mode='ave')(q_self))
    q_final = BatchNormalization()(q_final) 
    a_final = Activation('tanh')(Bidirectional(LSTM(60, return_sequences=True), merge_mode='ave')(a_self))
    a_final = BatchNormalization()(a_final) 

    print(q_final.shape)
    print(a_final.shape)

    # Swap the last two axes back before pooling.
    qi_val = Permute((2,1))(q_final) # (?, 60, 304) given the (?, 304, 60) printed above
    qi_val = BatchNormalization()(qi_val) 

    si_val = Permute((2,1))(a_final) # (?, 60, 304) given the (?, 304, 60) printed above
    si_val = BatchNormalization()(si_val) 

    # 6 Averaging: mean over axis 1 with one shared Lambda layer;
    # (?, 304) per side in the recorded run.
    avg_layer = Lambda(name='bow'+pfx, function=lambda x: K.mean(x, axis=1), output_shape=lambda shape:(shape[0],) + shape[2:])
    qi_val = avg_layer(qi_val)
    si_val = avg_layer(si_val)

    if conf['proj']:
        qi_val, si_val = projection_layer([qi_val, si_val], int(N))

    print("qi_val.shape: ",qi_val.shape)
    print("si_val.shape: ",si_val.shape)


    return [qi_val, si_val]

### Test Interactive Attention Module
Run the embedding and data-loading cells in the 'Run Code' section before executing this test.

In [None]:
# Smoke test for interactive_attention_model: build the encoder on the
# embedding inputs and print the layer table.
N, input_nodes_emb, output_nodes_emb = embedding() 
# NOTE: this mutates the shared conf dict, so the projection layer stays
# disabled for every cell run afterwards.
conf['proj']= False
iam = interactive_attention_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()

(?, 60, 60)
(?, 60, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 180)
(?, 304, 180)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
qi_val.shape:  (?, 304)
si_val.shape:  (?, 304)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 qi (InputLayer)                [(None, 60)]         0           []                               
                                                                                                  
 emb (Embedding)                (None, 60, 300)      15508200    ['qi[0][0]',                     
                                                                  'si[0][0]']                     
                                                                                                  
 f01 (InputLayer)               [(None, 60, 4)]      0      

## Pooling with Self Attention (with Tests)

### ap_self_attention_model

In [None]:
def ap_self_attention_model(input_nodes, N, pfx=''):
    """Self-attention features for the pooling models, without the final mean.

    Identical in construction to the attention part of
    ``self_attention_model`` except that the attention-weighted sequences are
    returned without being reduced over axis 1; the pooling step is left to
    the caller.

    Args:
        input_nodes: pair of embedded sequences (index 0 question, index 1
            sentence).
        N: feature width of the inputs.
        pfx: suffix appended to the names of the named layers.

    Returns:
        ``[qi_val, si_val]``, the attention-weighted sequence of each side
        (projected when ``conf['proj']`` is truthy).
    """
    sides = (input_nodes[0], input_nodes[1])

    # Shared point-wise FC (a 1x1 convolution over time), then per-side
    # batch norm and ReLU.
    key_fc = Dense(int(conf['self_pdim']*N), activation='linear', name='pont_wise_fc'+pfx)
    keys = [TimeDistributed(key_fc)(side) for side in sides]
    keys = [TimeDistributed(BatchNormalization())(key) for key in keys]
    keys = [TimeDistributed(Activation('relu'))(key) for key in keys]

    # A second shared point-wise FC reduces every time step to one raw score.
    score_fc = Dense(1, activation='linear', name='point_wise_fc_attn'+pfx)
    scores = [TimeDistributed(score_fc)(key) for key in keys]

    # Softmax over the flattened scores, repeated N times and permuted so the
    # weights can be multiplied element-wise with the inputs.
    flatten = Flatten(name='attn_flatten'+pfx)
    attn_maps = []
    for score in scores:
        attn = Activation('softmax')(flatten(score))
        attn = RepeatVector(int(N))(attn)
        attn_maps.append(Permute((2,1))(attn))

    qi_val, si_val = [multiply([attn, side]) for attn, side in zip(attn_maps, sides)]

    # Shapes before the optional projection.
    print(qi_val.shape)
    print(si_val.shape)

    if conf['proj']:
        qi_val, si_val = projection_layer([qi_val, si_val], int(N))

    # Shapes actually returned.
    print(qi_val.shape)
    print(si_val.shape)

    return [qi_val, si_val]

### Attentive Pooling (with Tests)

In [None]:
def ap_model(input_nodes, N, pfx=''):
    """Attentive pooling over the self-attention features of a Q/A pair.

    Builds the soft alignment ``G = tanh(Q U A^T)``, max-pools it along each
    of its two sequence axes, and uses the softmax of those maxima as weights
    for summing the time steps of each side. ``Q`` plays the role of the
    transposed question matrix of the original formulation.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_self_attention_model``.
        N: feature width, forwarded to ``ap_self_attention_model``.
        pfx: suffix for the names of the layers created here.

    Returns:
        ``[r_q, r_a]`` -- pooled question and answer representations, each of
        shape (b, c).
    """
    qi_feat, si_feat = ap_self_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)

    Q = qi_feat  # (b, m, c)
    A = si_feat  # (b, n, c)
    print("Q.shape: ", Q.shape)
    print("A.shape: ", A.shape)

    # Vector size taken from the tensor itself instead of a hard-coded 304.
    c = K.int_shape(Q)[-1]

    # Q.U with a *trainable* (c, c) matrix U: a bias-free Dense multiplies its
    # input by the kernel along the last axis. A constant all-zero U would make
    # G identically zero and the attention weights uniform.
    QU = Dense(c, use_bias=False, name='ap_U'+pfx)(Q)  # (b, m, c)
    G = tf.tanh(tf.matmul(QU, A, transpose_b=True))
    print("G.shape:",G.shape) # G b*m*n

    # Row-wise and column-wise max-pooling: g_q (b*m*1), g_a (b*1*n).
    g_q = tf.reduce_max(G, axis=2, keepdims=True)
    g_a = tf.reduce_max(G, axis=1, keepdims=True)
    print("g_q.shape:", g_q.shape)
    print("g_a.shape:",g_a.shape)

    # Attention weights over the sequence positions. The axis must be given
    # explicitly: g_q's last axis has size 1, so a softmax over the default
    # last axis would return all ones.
    sigma_q = tf.nn.softmax(g_q, axis=1)  # normalised over m
    sigma_a = tf.nn.softmax(g_a, axis=2)  # normalised over n

    # Weighted sums of the time steps: r_q, r_a (b*c).
    r_q = tf.squeeze(tf.matmul(tf.transpose(Q, [0, 2, 1]), sigma_q), axis=2)
    r_a = tf.squeeze(tf.matmul(sigma_a, A), axis=1)
    print("r_q.shape:", r_q.shape)
    print("r_a.shape:",r_a.shape)

    return [r_q, r_a]  # (b, c)


In [None]:
# Smoke test for ap_model: build, summarise and compile.
# NOTE: the recorded run of this cell ended in a TypeError (see the cell output).
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = ap_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

TypeError: ignored

### Max Pooling (with Tests)

In [None]:
def maxp_model(input_nodes, N, pfx=''):
    """Max-pool the self-attention features of both sides over axis 1.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_self_attention_model``.
        N: feature width, forwarded to ``ap_self_attention_model``.
        pfx: suffix for the layer names of the underlying attention model.

    Returns:
        ``[question_pool, answer_pool]``.
    """
    features = ap_self_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)
    qi_feat, si_feat = features

    # One shared Lambda layer takes the maximum along axis 1, dropping that
    # axis from the output shape.
    maxpool = Lambda(
        lambda x: K.max(x, axis=1, keepdims=False),
        output_shape=lambda x: (x[0], x[2]),
    )
    maxpool.supports_masking = True

    pooled = [maxpool(feat) for feat in (qi_feat, si_feat)]
    print("question_pool.shape", pooled[0].shape)
    print("answer_pool.shape", pooled[1].shape)

    return pooled

In [None]:
# Smoke test for maxp_model: build, summarise and compile.
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = maxp_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

### Avg Pooling (with Tests)

In [None]:
def avgp_model(input_nodes, N, pfx=''):
    """Mean-pool the self-attention features of both sides over axis 1.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_self_attention_model``.
        N: feature width, forwarded to ``ap_self_attention_model``.
        pfx: suffix for the layer names of the underlying attention model.

    Returns:
        ``[question_pool, answer_pool]``.
    """
    features = ap_self_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)
    qi_feat, si_feat = features

    # One shared Lambda layer takes the mean along axis 1, dropping that axis
    # from the output shape.
    meanpool = Lambda(
        lambda x: K.mean(x, axis=1, keepdims=False),
        output_shape=lambda x: (x[0], x[2]),
    )
    meanpool.supports_masking = True

    pooled = [meanpool(feat) for feat in (qi_feat, si_feat)]
    print("question_pool.shape", pooled[0].shape)
    print("answer_pool.shape", pooled[1].shape)

    return pooled

In [None]:
# Smoke test for avgp_model: build, summarise and compile.
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = avgp_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

## Pooling with Interactive Attention (with Tests)

### ap_interactive_attention_model

In [None]:
def ap_interactive_attention_model(input_nodes, N, pfx=''):
    """Interactive (co-)attention features for the pooling models.

    Same stages as ``interactive_attention_model`` -- shared word projection,
    alignment, co-attention, BiLSTM compression, shared multi-head
    self-attention with a residual dense block, BiLSTM aggregation -- but with
    a sigmoid after the aggregation BiLSTM and without the final mean over
    axis 1: the sequences are returned un-pooled so the caller can pool them.

    Shape comments use d for the feature width N and n / m for the question /
    sentence length. Concrete shapes quoted as "recorded run" are the ones
    printed by the ap_ia_model test cell output (pad=60, N=304).

    Args:
        input_nodes: pair of embedded sequences; index 0 is the question side,
            index 1 the sentence side. Assumed to be (batch, pad, N) --
            confirm against embedding().
        N: feature width; sizes the shared projection and the optional
            projection layer.
        pfx: suffix appended to the names of the explicitly named layers.

    Returns:
        ``[qi_val, si_val]``; (?, 60, 304) each in the recorded run with
        ``conf['proj']`` disabled.
    """
    # d=304
    # m=n=60
    # Word-level projection shared by both sides, then per-side BN + sigmoid.
    shared_dense = Dense(int(N), activation='linear', name='wproj'+pfx)
    qi_wproj = TimeDistributed(shared_dense)(input_nodes[0])
    si_wproj = TimeDistributed(shared_dense)(input_nodes[1])
    
    qi_wproj = TimeDistributed(BatchNormalization())(qi_wproj)
    si_wproj = TimeDistributed(BatchNormalization())(si_wproj)

    qi_wproj = TimeDistributed(Activation('sigmoid'))(qi_wproj)
    si_wproj = TimeDistributed(Activation('sigmoid'))(si_wproj)

    # Swap the time and feature axes so features come first.
    qi_e = Permute((2,1))(qi_wproj) # Shape is now ?xdxn
    qi_e = BatchNormalization()(qi_e)
    si_e = Permute((2,1))(si_wproj) # Shape is now ?xdxm
    si_e = BatchNormalization()(si_e)




    # 1 Alignment: pairwise scores between question and sentence positions,
    # softmax-normalised along the last axis in both orientations.
    qi_e_trans = Permute((2,1))(qi_e) # Shape is now ?xnxd
    qi_e_trans = BatchNormalization()(qi_e_trans)
    alignment = dot(axes=(2,1), inputs=[qi_e_trans,si_e]) # Shape is now ?xnxm
    alignment = BatchNormalization()(alignment)
    alignment_trans = Permute((2,1))(alignment) # Shape is now ?xmxn
    alignment_trans = BatchNormalization()(alignment_trans)
    r_a = Activation('softmax')(alignment) # Shape is now ?xnxm
    r_q = Activation('softmax')(alignment_trans) # Shape is now ?xmxn
    print(r_a.shape)
    print(r_q.shape)

    
    # 2 Co-Attention: summarise each side through the alignment weights
    # (q_sum, a_sum), then map the summaries back again (q_coa, a_coa).
    q_sum = dot(axes=(2,1), inputs=[qi_e, r_a]) # Shape is now ?xdxm
    q_sum = BatchNormalization()(q_sum)
    a_sum = dot(axes=(2,1), inputs=[si_e, r_q]) # Shape is now ?xdxn
    a_sum = BatchNormalization()(a_sum)

    q_coa = dot(axes=(2,1), inputs=[q_sum,r_q]) # Shape is now ?xdxn
    q_coa = BatchNormalization()(q_coa)
    a_coa = dot(axes=(2,1), inputs=[a_sum,r_a]) # Shape is now ?xdxm   
    a_coa = BatchNormalization()(a_coa)
    print(q_coa.shape)
    print(a_coa.shape)


    # 3 Representation Compression: concatenate the three views of each side
    # on the last axis and compress them with a BiLSTM (2 x 30 units, concat).
    q_full = concatenate([qi_e, a_sum, q_coa]) # Shape is now ?xdx3n
    a_full = concatenate([si_e, q_sum, a_coa]) # Shape is now ?xdx3m
    print(q_full.shape)
    print(a_full.shape)
    q_com = Activation('sigmoid')(Bidirectional(LSTM(30, return_sequences=True), merge_mode='concat')(q_full)) # (?, 304, 60) in the recorded run
    q_com = BatchNormalization()(q_com) 

    a_com = Activation('sigmoid')(Bidirectional(LSTM(30, return_sequences=True), merge_mode='concat')(a_full)) # (?, 304, 60) in the recorded run
    a_com = BatchNormalization()(a_com) 

    print(q_com.shape)
    print(a_com.shape)


    # 4 TF Self-Attention: one MultiHeadAttention instance is applied to both
    # sides, so its weights are shared. The returned attention scores
    # (q_attn, a_attn) are not used afterwards.
    mha = MultiHeadAttention(key_dim=100, value_dim=100, num_heads=4) 

    q_mha, q_attn = mha(q_com, q_com, return_attention_scores=True) # (?, 304, 60) in the recorded run
    a_mha, a_attn = mha(a_com, a_com, return_attention_scores=True) # (?, 304, 60) in the recorded run
    print(q_mha.shape)
    print(a_mha.shape)

    # Residual FFN: a dense block added back onto the MHA output. The width 60
    # equals the last dimension of the MHA output in the recorded run, which is
    # what lets the add() below line up.
    q_res = Dense(60, activation='relu')(q_mha)
    q_res = BatchNormalization()(q_res) 

    a_res = Dense(60, activation='relu')(a_mha)
    a_res = BatchNormalization()(a_res) 

    q_self = add([q_res, q_mha]) # (?, 304, 60) in the recorded run
    a_self = add([a_res, a_mha]) # (?, 304, 60) in the recorded run
    print(q_self.shape)
    print(a_self.shape)

    # 5 Aggregation: BiLSTM whose two directions are averaged, then sigmoid.
    q_final = Activation('sigmoid')(Bidirectional(LSTM(60, return_sequences=True), merge_mode='ave')(q_self))
    q_final = BatchNormalization()(q_final) 
    a_final = Activation('sigmoid')(Bidirectional(LSTM(60, return_sequences=True), merge_mode='ave')(a_self))
    a_final = BatchNormalization()(a_final) 

    print(q_final.shape)
    print(a_final.shape)

    # Swap the last two axes back; no pooling follows in this variant.
    qi_val = Permute((2,1))(q_final) # (?, 60, 304) in the recorded run
    qi_val = BatchNormalization()(qi_val) 

    si_val = Permute((2,1))(a_final) # (?, 60, 304) in the recorded run
    si_val = BatchNormalization()(si_val) 

    # 6 The mean over axis 1 used by interactive_attention_model is
    # deliberately not applied here; the sequences are returned as they are.

    if conf['proj']:
        qi_val, si_val = projection_layer([qi_val, si_val], int(N))

    print("qi_val.shape: ",qi_val.shape)
    print("si_val.shape: ",si_val.shape)


    return [qi_val, si_val]

### Attentive Pooling (with Tests)

In [None]:
def ap_ia_model(input_nodes, N, pfx=''):
    """Attentive pooling over the interactive-attention features of a Q/A pair.

    Builds the soft alignment ``G = tanh(Q U A^T)``, max-pools it along each
    of its two sequence axes, and uses the softmax of those maxima as weights
    for summing the time steps of each side. ``Q`` plays the role of the
    transposed question matrix of the original formulation.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_interactive_attention_model``.
        N: feature width, forwarded to ``ap_interactive_attention_model``.
        pfx: suffix for the names of the layers created here.

    Returns:
        ``[r_q, r_a]`` -- pooled question and answer representations, each of
        shape (b, c).
    """
    qi_feat, si_feat = ap_interactive_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)

    Q = qi_feat  # (b, m, c)
    A = si_feat  # (b, n, c)
    print("Q.shape: ", Q.shape)
    print("A.shape: ", A.shape)

    # Vector size taken from the tensor itself instead of a hard-coded 304.
    c = K.int_shape(Q)[-1]

    # Q.U with a *trainable* (c, c) matrix U: a bias-free Dense multiplies its
    # input by the kernel along the last axis. A constant all-zero U would make
    # G identically zero and the attention weights uniform.
    QU = Dense(c, use_bias=False, name='ap_ia_U'+pfx)(Q)  # (b, m, c)
    G = tf.tanh(tf.matmul(QU, A, transpose_b=True))
    print("G.shape:",G.shape) # G b*m*n

    # Row-wise and column-wise max-pooling: g_q (b*m*1), g_a (b*1*n).
    g_q = tf.reduce_max(G, axis=2, keepdims=True)
    g_a = tf.reduce_max(G, axis=1, keepdims=True)
    print("g_q.shape:", g_q.shape)
    print("g_a.shape:",g_a.shape)

    # Attention weights over the sequence positions. The axis must be given
    # explicitly: g_q's last axis has size 1, so a softmax over the default
    # last axis would return all ones.
    sigma_q = tf.nn.softmax(g_q, axis=1)  # normalised over m
    sigma_a = tf.nn.softmax(g_a, axis=2)  # normalised over n

    # Weighted sums of the time steps: r_q, r_a (b*c).
    r_q = tf.squeeze(tf.matmul(tf.transpose(Q, [0, 2, 1]), sigma_q), axis=2)
    r_a = tf.squeeze(tf.matmul(sigma_a, A), axis=1)
    print("r_q.shape:", r_q.shape)
    print("r_a.shape:",r_a.shape)

    return [r_q, r_a]  # (b, c)


In [None]:
# Smoke test for ap_ia_model: build, summarise and compile.
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = ap_ia_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

(?, 60, 60)
(?, 60, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 180)
(?, 304, 180)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
qi_val.shape:  (?, 60, 304)
si_val.shape:  (?, 60, 304)
Q.shape:  (?, 60, 304)
A.shape:  (?, 60, 304)
(?, 60, 304)
(?, 304, 304)
(?, 60, 304)
G.shape: (?, 60, 60)
g_q.shape: (?, 60, 1)
g_a.shape: (?, 1, 60)
r_q.shape: (?, 304)
r_a.shape: (?, 304)
Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 qi (InputLayer)                [(None, 60)]         0           []                               
                                                                                                  
 emb (Embedding)                (None, 60, 300)      15508200    ['qi[0][0]',                     
                                                           

### Max Pooling (with Tests)

In [None]:
def maxp_ia_model(input_nodes, N, pfx=''):
    """Max-pool the interactive-attention features of both sides over axis 1.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_interactive_attention_model``.
        N: feature width, forwarded to ``ap_interactive_attention_model``.
        pfx: suffix for the layer names of the underlying attention model.

    Returns:
        ``[question_pool, answer_pool]``.
    """
    features = ap_interactive_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)
    qi_feat, si_feat = features

    # One shared Lambda layer takes the maximum along axis 1, dropping that
    # axis from the output shape.
    maxpool = Lambda(
        lambda x: K.max(x, axis=1, keepdims=False),
        output_shape=lambda x: (x[0], x[2]),
    )
    maxpool.supports_masking = True

    pooled = [maxpool(feat) for feat in (qi_feat, si_feat)]
    print("question_pool.shape", pooled[0].shape)
    print("answer_pool.shape", pooled[1].shape)

    return pooled

In [None]:
# Smoke test for maxp_ia_model: build, summarise and compile.
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = maxp_ia_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

### Avg Pooling (with Tests)

In [None]:
def avgp_ia_model(input_nodes, N, pfx=''):
    """Mean-pool the interactive-attention features of both sides over axis 1.

    Args:
        input_nodes: embedded question/sentence pair, forwarded to
            ``ap_interactive_attention_model``.
        N: feature width, forwarded to ``ap_interactive_attention_model``.
        pfx: suffix for the layer names of the underlying attention model.

    Returns:
        ``[question_pool, answer_pool]``.
    """
    features = ap_interactive_attention_model(N=N, pfx='sa_q'+pfx, input_nodes=input_nodes)
    qi_feat, si_feat = features

    # One shared Lambda layer takes the mean along axis 1, dropping that axis
    # from the output shape.
    meanpool = Lambda(
        lambda x: K.mean(x, axis=1, keepdims=False),
        output_shape=lambda x: (x[0], x[2]),
    )
    meanpool.supports_masking = True

    pooled = [meanpool(feat) for feat in (qi_feat, si_feat)]
    print("question_pool.shape", pooled[0].shape)
    print("answer_pool.shape", pooled[1].shape)

    return pooled

In [None]:
# Smoke test for avgp_ia_model: build, summarise and compile.
N, input_nodes_emb, output_nodes_emb = embedding()
# Mutates the shared conf dict: the projection layer stays off for later cells.
conf['proj'] = False
iam = avgp_ia_model(output_nodes_emb, N, pfx= 'S')
test_model = Model(inputs=input_nodes_emb, outputs=iam)
test_model.summary()
test_model.compile(loss=ranknet, optimizer=conf['opt'])

## MLP scoring function
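To compare the two sentence vectors we use an MLP similarity function: the element-wise sum and product of the pair are concatenated and fed to a small feed-forward network that outputs a single score.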

In [None]:
def mlp_ptscorer(inputs, Ddim, N, l2reg, pfx='out', oact='sigmoid', extra_inp=None):
    """Score a pair of vectors with an MLP over element-wise features.

    The element-wise sum and product of the pair are concatenated, passed
    through zero or more tanh hidden layers, and reduced to one output unit.

    Args:
        inputs: list with the two tensors to compare.
        Ddim: hidden layer sizes as multiples of ``N``: a falsy value (``0``,
            ``None``, empty sequence) for no hidden layer, a scalar for one
            hidden layer, or a list/tuple for several.
        N: base width that the ``Ddim`` factors are multiplied with.
        l2reg: L2 regularisation factor for all dense kernels.
        pfx: prefix for the names of the dense layers.
        oact: activation of the output unit.
        extra_inp: accepted for interface compatibility; not used.

    Returns:
        The output tensor of the final single-unit dense layer.
    """
    sum_vec = add(inputs)
    mul_vec = multiply(inputs)

    mlp_input = concatenate([sum_vec, mul_vec])

    # Normalise Ddim to a list of factors. Tuples are treated like lists
    # instead of being wrapped as if they were a single scalar.
    if not Ddim:
        hidden_factors = []
    elif isinstance(Ddim, (list, tuple)):
        hidden_factors = list(Ddim)
    else:
        hidden_factors = [Ddim]

    for i, D in enumerate(hidden_factors):
        shared_dense = Dense(int(N*D), kernel_regularizer=l2(l2reg),
                             activation='linear', name=pfx+'hdn%d'%(i))
        mlp_input = Activation('tanh')(shared_dense(mlp_input))

    shared_dense = Dense(1, kernel_regularizer=l2(l2reg), activation=oact, name=pfx+'mlp')
    mlp_out = shared_dense(mlp_input)

    return mlp_out

## Model Architecture 

In [None]:
def build_model():
    """Assemble and compile the answer-sentence-selection model.

    Embeds both inputs, encodes the pair with ``interactive_attention_model``,
    scores it with ``mlp_ptscorer`` and compiles the result with the
    ``ranknet`` loss and the optimizer from ``conf['opt']``.

    Side effect: sets ``conf['proj']`` to False in the shared config.

    Returns:
        The compiled Keras model.
    """
    emb_dim, model_inputs, embedded = embedding() 

    # The projection layer is switched off for the attentive-pooling setup.
    conf['proj'] = False

    # Encoder choice. Alternatives defined in this notebook:
    #   avg_model / rnn_model / cnn_model
    #   self_attention_model / ap_model / maxp_model / avgp_model
    #   interactive_attention_model / ap_ia_model / maxp_ia_model / avgp_ia_model
    pair_repr = interactive_attention_model(embedded, emb_dim, pfx='S')
    print("Using interactive_attention_model model...")

    score = mlp_ptscorer(pair_repr, conf['Ddim'], emb_dim,
                         conf['l2reg'], pfx='outS', oact='sigmoid')

    model = Model(inputs=model_inputs, outputs=score)
    model.compile(loss=ranknet, optimizer=conf['opt'])
    return model

## Train and Evaluation

In [None]:
def train_and_eval(runid):
    """Build, train and evaluate one model run.

    The weights at the end of training are saved as
    ``weights-<runid>-final.h5py``; the weights file maintained by the
    training callbacks (``weights-<runid>-bestval.h5py``) is then loaded
    before evaluating.

    Args:
        runid: string identifying the run; embedded in the weight file names.

    Returns:
        The result of ``eval(model)`` (previously computed but discarded).
    """
    best_weights = 'weights-'+runid+'-bestval.h5py'
    final_weights = 'weights-'+runid+'-final.h5py'

    print('Model')
    model = build_model()
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()

    print('Training')
    fit_model(model, weightsf=best_weights)
    model.save_weights(final_weights, overwrite=True)
    model.load_weights(best_weights)

    print('Predict&Eval (best val epoch)')
    return eval(model)

In [None]:
def fit_model(model, **kwargs):
    """Train ``model`` on the notebook-global training split.

    Args:
        model: compiled Keras model.
        **kwargs: must contain ``weightsf``, the path handed to
            ``fit_callbacks`` for checkpointing. Any other keyword is ignored.

    Returns:
        Whatever ``model.fit`` returns.

    Raises:
        KeyError: if ``weightsf`` is not supplied.
    """
    epochs = conf['epochs']
    callbacks = fit_callbacks(kwargs.pop('weightsf'))

    # Keras documents validation_data as a tuple (x_val, y_val); a list is
    # outside that contract and is not treated uniformly across versions.
    return model.fit(inp_tr, y=y_train, validation_data=(inp_val, y_val),
                     batch_size=conf['batch_size'],
                     callbacks=callbacks, epochs=epochs)


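At the end of every epoch, the callbacks measure MRR performance and accuracy; the weights with the best MRR are checkpointed, and training stops early once MRR has not improved for `patience` epochs.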

In [None]:
def fit_callbacks(weightsf):
    """Return the list of callbacks used during training.

    Args:
        weightsf: path where the checkpoint callback stores the best weights.

    Returns:
        ``[AnsSelCB, ModelCheckpoint, EarlyStopping]`` in that order.
    """
    # Validation callback; presumably the one that provides the 'mrr' value the
    # two callbacks below monitor -- verify against AnsSelCB.
    ranking_cb = AnsSelCB(inp_val['q'], inp_val['sents'], y_val, inp_val)
    # Keep only the weights with the highest 'mrr'.
    checkpoint_cb = ModelCheckpoint(weightsf, save_best_only=True, monitor='mrr', mode='max')
    # Stop once 'mrr' has not improved for conf['patience'] epochs.
    stopping_cb = EarlyStopping(monitor='mrr', mode='max', patience=conf['patience'])
    return [ranking_cb, checkpoint_cb, stopping_cb]

In [None]:
def eval(model):
    """Evaluate ``model`` on the validation and test splits.

    For each of the notebook-global ``inp_val`` and ``inp_test`` the model's
    predictions are scored with ``eval_QA``; a split that is ``None`` yields
    ``None``. When a test split exists, the ranked candidates for a sample
    question are additionally printed through ``sorted_output``.

    NOTE: this function shadows the builtin ``eval``; the name is kept because
    other cells call it.

    Args:
        model: trained Keras model.

    Returns:
        ``(val_result, test_result)``.
    """
    res = []
    test_pred = None
    for inp in [inp_val, inp_test]:
        if inp is None:
            res.append(None)
            continue

        pred = model.predict(inp)
        res.append(eval_QA(pred, inp['q'], inp['y']))
        if inp is inp_test:
            # Keep the test predictions so the split is not predicted twice.
            test_pred = pred

    # Honour the same None guard as the loop: without a test split there is
    # nothing to rank.
    if test_pred is not None:
        string1 = "When was Florence Nightingale born ?"
        sorted_output(inp_test['q'], inp_test['sents'], inp_test['y'], test_pred, 1, string1)
    return tuple(res)

## Run Code

In [None]:
if __name__ == "__main__":
    # The three dataset splits live in the project's Drive folder.
    path = '/content/gdrive/MyDrive/FYP_Project/data'
    trainf, valf, testf = [path + suffix
                           for suffix in ('/train-all.csv', '/dev.csv', '/test.csv')]
    params = []

    conf, ps, h = config()

    # GloVe vectors are only loaded when the config asks for them
    # (download: http://nlp.stanford.edu/data/glove.6B.zip).
    if conf['emb'] == 'Glove':
        print('GloVe')
        emb = GloVe(N=conf['embdim'])

    print('Dataset')
    load_data(trainf, valf, testf)

    # The run id is derived from the value h returned by config().
    runid = 'Model-%x' % (h)
    print('RunID: %s  (%s)' % (runid, ps))

    # Full training and evaluation run.
    train_and_eval(runid)

GloVe
Dataset
Vocabulary of 51694 words
RunID: Model--2747bc947d8848f7  ({"Ddim": "2", "adim": "0.5", "batch_size": "320", "bll_dropout": "0.5", "cfiltlen": "3", "cnn_dropout": "0.5", "cnnact": "relu", "cnninit": "glorot_uniform", "emb": "Glove", "embdim": "300", "epochs": "5", "flag": "True", "inp_e_dropout": "0.5", "l2reg": "0.0001", "opt": "adam", "p_dropout": "0.5", "p_init": "glorot_uniform", "p_layers": "1", "pact": "tanh", "pad": "60", "patience": "155", "pdim": "0.5", "pe": "True", "pe_method": "fixed", "pool_layer": "<class 'keras.layers.pooling.MaxPooling1D'>", "proj": "True", "rnn": "<class 'keras.layers.cudnn_recurrent.CuDNNLSTM'>", "rnn_dropout": "0.5", "rnnact": "tanh", "rnnbidi": "True", "rnnbidi_mode": "<function concatenate at 0x7f6d585954d0>", "rnninit": "glorot_uniform", "sdim": "5", "self_pdim": "0.5", "w_feat_model": "rnn"})
Model
(?, 60, 60)
(?, 60, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 180)
(?, 304, 180)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 304, 60)
(?, 30

KeyboardInterrupt: ignored

## Predict for question using Saved Model

In [None]:
# Rebuild the model, load the best-validation weights of the current run id
# and score the test split.
path = '/content/gdrive/MyDrive/FYP_Project/data'
trainf, valf, testf = [path + suffix
                       for suffix in ('/train-all.csv', '/dev.csv', '/test.csv')]
params = []

conf, ps, h = config()

# GloVe vectors are only loaded when the config asks for them
# (download: http://nlp.stanford.edu/data/glove.6B.zip).
if conf['emb'] == 'Glove':
    print('GloVe')
    emb = GloVe(N=conf['embdim'])

print('Dataset')
load_data(trainf, valf, testf)

# The weight file name depends on the run id derived from config().
runid = 'Model-%x' % (h)
print('RunID: %s  (%s)' % (runid, ps))

print('Model')
model = build_model()
print(model.summary())
model.load_weights('weights-'+runid+'-bestval.h5py')

res = []
pred = model.predict(inp_test)
res.append(eval_QA(pred, inp_test['q'], inp_test['y']))

# Print the ranked candidate answers for one sample question.
string1 = "When was Florence Nightingale born ?"
sorted_output(inp_test['q'], inp_test['sents'], inp_test['y'], pred, 1, string1)


GloVe
Dataset
Vocabulary of 51694 words
RunID: Model--4b958568ebe118da  ({"Ddim": "2", "adim": "0.5", "batch_size": "320", "bll_dropout": "0.5", "cfiltlen": "3", "cnn_dropout": "0.5", "cnnact": "relu", "cnninit": "glorot_uniform", "emb": "Glove", "embdim": "300", "epochs": "5", "flag": "True", "inp_e_dropout": "0.5", "l2reg": "0.0001", "opt": "adam", "p_dropout": "0.5", "p_init": "glorot_uniform", "p_layers": "1", "pact": "tanh", "pad": "60", "patience": "155", "pdim": "0.5", "pe": "True", "pe_method": "learned", "pool_layer": "<class 'keras.layers.pooling.MaxPooling1D'>", "proj": "True", "rnn": "<class 'keras.layers.cudnn_recurrent.CuDNNLSTM'>", "rnn_dropout": "0.5", "rnnact": "tanh", "rnnbidi": "True", "rnnbidi_mode": "<function concatenate at 0x7fa121034290>", "rnninit": "glorot_uniform", "sdim": "5", "self_pdim": "0.5", "w_feat_model": "rnn"})
Model
Using maxp_ia_model model...
Model: "model_1"
________________________________________________________________________________________

  updates=self.state_updates,


MRR: 0.621210
MAP: 0.604001
Question:
What do practitioners of Wicca worship ?

Candidate answers sorted by ypred score:
That 's because Ms . Palmer is a witch , the high priestess of a group that practices Wicca at Fort Hood with the knowledge and approval of the U.S . Army .
Wicca members tend to be white , college-educated , middle-class women , many of whom have families , said Ms . Berger , who said the religion was becoming more institutionalized as the members aged .
Ms . Siefferly , a senior at the high school , has practiced Wicca for several years after reading a book on the subject , Schram said .
Such sentiments are rooted in fear and false notions that witches worship Satan or sacrifice animals , said David Oringderff , head of the San Antonio -based pagan group that sponsors Ms . Palmer 's Fort Hood Open Circle .
The inch- thick chaplain handbook includes a five -page primer on Wicca , described as `` a reconstruction of the Nature worship of tribal Europe . ''
The fear o

## GPU Check and Model Save

In [None]:
# List the GPUs TensorFlow can see; an empty list means no GPU is visible.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
import shutil

# Copy the weight files of the finished runs from the Colab working directory
# to Google Drive so they survive the session.
for weights_file in (
    "weights-Model--dc752ee900001d1-bestval.h5py",
    "weights-Model--5c8bdb18d3a5047f-bestval.h5py",
    "weights-Model--5c8bdb18d3a5047f-final.h5py.data-00000-of-00001",
    "weights-Model--5c8bdb18d3a5047f-final.h5py.index",
):
    shutil.copyfile("/content/" + weights_file,
                    "/content/gdrive/MyDrive/FYPModels/" + weights_file)


'/content/gdrive/MyDrive/FYPModels/weights-Model--5c8bdb18d3a5047f-final.h5py.index'