In [1]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, activations
from tensorflow.keras.activations import sigmoid
from utils import f1
import numpy as np
import pandas as pd
tf.__version__

ImportError: No module named 'utils'

## Creating a 2D-GRU Cell that takes 3 recurrent states plus the input data interaction

Detailed article: https://arxiv.org/pdf/1604.04378.pdf

#### <center>Input</center>
$$S^{M,N}$$
$$S_{ij} = Scalar\ or\ Vector$$

#### <center>Recurrent function to compute the hidden state at position i,j</center>

$$\vec{h}_{ij}=f(\vec{h}_{i-1,j},\vec{h}_{i,j-1}, \vec{h}_{i-1,j-1}, \vec{s}_{ij})$$

#### <center>Implementation of f to compute $\vec{h}_{ij}$</center>

$$\underset{(H*3+I) \times 1}{\vec{q}}=[ \underset{1 \times H}{\vec{h}^{T}_{(i-1,j)}}; \underset{1 \times H}{\vec{h}^{T}_{(i,j-1)}}; \underset{1 \times H}{\vec{h}^{T}_{(i-1,j-1)}}; \underset{1 \times I}{\vec{s}^{T}_{ij}}]^T$$
$$\underset{H \times 1}{\vec{r}_{l}}= \sigma\left (\underset{H\times(H*3+I)}{W^{(rl)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(rl)}}}\right )$$
$$\underset{H \times 1}{\vec{r}_{t}}= \sigma\left (\underset{H\times(H*3+I)}{W^{(rt)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(rt)}}}\right )$$
$$\underset{H \times 1}{\vec{r}_{d}}= \sigma\left (\underset{H\times(H*3+I)}{W^{(rd)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(rd)}}}\right )$$

$$\underset{(H*3) \times 1}{\vec{r}}=[ \underset{1 \times H}{\vec{r}^T_{l}}; \underset{1 \times H}{\vec{r}^T_{t}}; \underset{1 \times H}{\vec{r}^T_{d}} ]^T$$

$$\underset{H \times 1}{\vec{z'}_{i}}= \underset{H\times(H*3+I)}{W^{(zi)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(zi)}}}$$
$$\underset{H \times 1}{\vec{z'}_{l}}= \underset{H\times(H*3+I)}{W^{(zl)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(zl)}}}$$
$$\underset{H \times 1}{\vec{z'}_{t}}= \underset{H\times(H*3+I)}{W^{(zt)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(zt)}}}$$
$$\underset{H \times 1}{\vec{z'}_{d}}= \underset{H\times(H*3+I)}{W^{(zd)}}\cdot\underset{(H*3+I) \times 1}{\vec{q}}+\underset{H \times 1}{\vec{b^{(zd)}}}$$

$$[\underset{H \times 1}{\vec{z}_{i}}; \underset{H \times 1}{\vec{z}_{l}}; \underset{H \times 1}{\vec{z}_{t}}; \underset{H \times 1}{\vec{z}_{d}}] = SoftMaxByRow\left([\underset{H \times 1}{\vec{z'}_{i}}; \underset{H \times 1}{\vec{z'}_{l}}; \underset{H \times 1}{\vec{z'}_{t}}; \underset{H \times 1}{\vec{z'}_{d}}]\right)$$

$$\underset{H \times 1}{\vec{h'}_{ij}}=\theta\left( \underset{H\times I}{W^{(i)}}\cdot\underset{I\times 1}{\vec{s}_{ij}} + \underset{H \times H*3}{U}\cdot\left( \underset{(H*3) \times 1}{\vec{r}} \otimes \left [ \underset{1 \times H}{\vec{h}^{T}_{(i-1,j)}}; \underset{1 \times H}{\vec{h}^{T}_{(i,j-1)}}; \underset{1 \times H}{\vec{h}^{T}_{(i-1,j-1)}} \right]^T \right) + \underset{H \times 1}{\vec{b^{(i)}}} \right)$$

$$\underset{H \times 1}{\vec{h}_{ij}}=\underset{H \times 1}{\vec{z}_{l}}\otimes \underset{H \times 1}{\vec{h}_{(i,j-1)}} + \underset{H \times 1}{\vec{z}_{t}}\otimes \underset{H \times 1}{\vec{h}_{(i-1,j)}} + \underset{H \times 1}{\vec{z}_{d}}\otimes \underset{H \times 1}{\vec{h}_{(i-1,j-1)}} + \underset{H \times 1}{\vec{z}_{i}}\otimes \underset{H \times 1}{\vec{h'}_{(i,j)}}$$

In [2]:
import sys
K.clear_session()

# First, let's define a 2D RNN Cell, as a layer subclass.
# NOTE: the weights and the forward passage are done with the BATCH dimension,
# which means that the previous equations must account for this new dimension
class MultiDimensinalGRUCell(tf.keras.layers.Layer):
    def __init__(self, 
                 units,
                 activation=None,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        
        self.units = units
        self.state_size = units
        
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer) # TODO add to the weights
        self.recurrent_regularizer = regularizers.get(recurrent_regularizer) # TODO add to the weights
        self.bias_regularizer = regularizers.get(bias_regularizer) # TODO add to the weights
        
        self.activation = activations.get(activation)
        
        super().__init__(**kwargs)

    def build(self, input_shape):
        
        #input shape [BATCH, S]
        recurrent_space_plus_feature_dim = self.state_size*3 + input_shape[-1]
        
        # Trainnable weights for the reset GATES
        self.w_rl = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_left_reset_weight')
        self.b_rl = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_left_reset_bias')
        
        self.w_rt = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_top_reset_weight')
        self.b_rt = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_top_reset_bias')
        
        self.w_rd = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_diagonal_reset_weight')
        self.b_rd = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_diagonal_reset_bias')
        
        # Trainnable weights for the feature (Z) GATES
        self.w_zi = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_input_feature_weight')
        self.b_zi = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_input_feature_bias')
        
        self.w_zl = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_left_feature_weight')
        self.b_zl = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_left_feature_bias')
        
        self.w_zt = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_top_feature_weight')
        self.b_zt = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_top_feature_bias')
        
        self.w_zd = self.add_weight(shape=(recurrent_space_plus_feature_dim, self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='recurrent_diagonal_feature_weight')
        self.b_zd = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='recurrent_diagonal_feature_bias')
        
        # projection weigts U
        self.u = self.add_weight(shape=(self.state_size*3, self.state_size),
                                      initializer=self.bias_initializer,
                                      name='U_weights')
        
        self.w_i = self.add_weight(shape=(input_shape[-1], self.state_size),
                                      initializer=self.recurrent_initializer,
                                      name='input_feature_weight')
        self.b_i = self.add_weight(shape=(self.state_size,),
                                      initializer=self.bias_initializer,
                                      name='input_feature_bias')
        
        super().build(input_shape)

    def call(self, x, states):
        
        # states: [BATCH, 3 (left, top, diagonal)]
        
        left_state = states[0] #(BATCH, RECURRENT_DIM)
        top_state = states[1] #(BATCH, RECURRENT_DIM)
        diagonal_state = states[2] #(BATCH, RECURRENT_DIM)
        
        q_vec = tf.concat([left_state, top_state, diagonal_state, x], axis=1) # [BATCH, 3*RECURRENT_DIM + INPUT_DIM]
        
                     # [BATCH, RECURRENT_DIM]  [1, RECURRENT_DIM]
        reset_left = K.bias_add(K.dot(q_vec, self.w_rl), self.b_rl)
        reset_left = sigmoid(reset_left) # [BATCH, RECURRENT_DIM]
        
        reset_top = K.bias_add(K.dot(q_vec, self.w_rt), self.b_rt)
        reset_top = sigmoid(reset_top) # [BATCH, RECURRENT_DIM]
        
        reset_diagonal = K.bias_add(K.dot(q_vec, self.w_rd), self.b_rd)
        reset_diagonal = sigmoid(reset_diagonal) # [BATCH, RECURRENT_DIM]

        reset = tf.concat([reset_left, reset_top, reset_diagonal], axis=1) # [BATCH, 3*RECURRENT_DIM]
        
        _z_input = K.bias_add(K.dot(q_vec, self.w_zi), self.b_zi) # [BATCH, RECURRENT_DIM]
        _z_left = K.bias_add(K.dot(q_vec, self.w_zl), self.b_zl) # [BATCH, RECURRENT_DIM]
        _z_top = K.bias_add(K.dot(q_vec, self.w_zt), self.b_zt) # [BATCH, RECURRENT_DIM]
        _z_diagonal = K.bias_add(K.dot(q_vec, self.w_zd), self.b_zd) # [BATCH, RECURRENT_DIM]
        
        _z_input = tf.expand_dims(_z_input, axis=-1)
        _z_left = tf.expand_dims(_z_left, axis=-1)
        _z_top = tf.expand_dims(_z_top, axis=-1)
        _z_diagonal = tf.expand_dims(_z_diagonal, axis=-1)
        
        _z = tf.concat([_z_input, _z_left, _z_top, _z_diagonal], axis=-1)
        _z = K.softmax(_z, axis=-1)
        
        # each will have dims # [BATCH, RECURRENT_DIM]
        z = tf.split(_z, num_or_size_splits=4, axis=-1) 
        z_input = K.squeeze(z[0], axis=-1)
        z_left = K.squeeze(z[1], axis=-1)
        z_top = K.squeeze(z[2], axis=-1)
        z_diagonal = K.squeeze(z[3], axis=-1)
        
        # compute candite hidden space
        _states = tf.concat([left_state, top_state, diagonal_state], axis=1) # [BATCH, 3*RECURRENT_DIM]
        reset_states = reset * _states # reset the hidden states # [BATCH, 3*RECURRENT_DIM]
        _h_reset_states = K.dot(reset_states, self.u) # [BATCH, RECURRENT_DIM]
        _h = K.bias_add(K.dot(x, self.w_i), self.b_i) # [BATCH, RECURRENT_DIM]
        _h = _h + _h_reset_states
        _h = self.activation(_h)

        h = z_left * left_state + z_top * top_state + z_diagonal * diagonal_state + z_input * _h

        # OUTPUT [BATCH, Features]
        return h#tf.random.normal((K.shape(x)[0], self.state_size))
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.state_size)
 

## Multidimensional RNN (2D), implements the multidimensional recurrency in tensorflow

It's written in tensorflow 2.0, but a conversion for tensorflow 1.X should be straitforward with the tf.while_loop and tf.cond 

In [3]:
    
class MultiDimensionalRNN(tf.keras.layers.Layer):
    
    def __init__(self, 
                 cell,
                 inital_state=None,
                 **kwargs):

        # outter layer keep track of the inner layer
        self.cell = cell
        self.inital_state = inital_state

        super().__init__(**kwargs)

    def build(self, input_shape):
        self.rows = int(input_shape[1])
        self.columns = int(input_shape[2])
        if len(input_shape)==3:
            self.features = 1
        else:
            self.features = int(input_shape[3])

        super().build(input_shape)

    # Base LOOP
    @tf.function
    def dynamic_tf_loop(self, input_data):

        input_shape = K.shape(input_data)
        print("[DYNAMIC LOOP] input shape",input_shape)

        if self.inital_state is None:
            initial_state = tf.zeros((input_shape[1],self.cell.state_size)) # default state
        else:
            initial_state = self.inital_state

        input_flat = tf.TensorArray(dtype=tf.float32, size=self.rows*self.columns)

        input_flat = input_flat.unstack(input_data) # 1D data representation

        # each entry (2D matrix entry) of the TensorArray is compised by a vector [BATCH, FEATURES]

        ######
        # find the recursive states base on the one dimentional index
        ######
        def get_back_state(index, states, columns):
            if 0>=index%columns: # out of the matrix/terminal case
                return initial_state
            else:
                return states.read(index-1)

        def get_up_state(index, states, columns):
            if index<columns: # out of the matrix/terminal case
                return initial_state
            else:
                return states.read(index-columns)

        def get_diagonal_state(index, states, columns):
            if 0>=index%columns or index<columns: # out of the matrix/terminal case
                return initial_state
            else:
                return states.read(index-columns-1)
        ######
        # END
        ######

        # flat matrix with all the hidden states
        # clear_after_read must be False, since some entries (e.g. diagonal) could read up on 3 times the same state value
        states = tf.TensorArray(dtype=tf.float32, size=self.rows*self.columns, clear_after_read = False)

        # sequential loop -> recursive loop
        print("Loop Iterations",self.rows*self.columns)
        for i in tf.range(self.rows*self.columns):

            states = states.write(i,
                                  self.cell(input_flat.read(i),
                                                 [get_back_state(i, states, self.columns),
                                                 get_up_state(i, states, self.columns),
                                                 get_diagonal_state(i, states, self.columns)])
                                 )


        return states.stack()
    
    @tf.function
    def call(self, x):
        input_shape = K.shape(x)
        batch = input_shape[0]

        input_data = K.reshape(x, [batch, -1, self.features])
        input_data = tf.transpose(input_data, (1, 0, 2))

        states = self.dynamic_tf_loop(input_data)

        # return the last computed states, i.e. h_{M,N}
        return tf.reshape(states[-1], self.compute_output_shape(input_shape))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.cell.state_size)
            

In [3]:
K.clear_session()

samples = np.random.randint(0,5,size=(10,5,6,2),dtype=np.int32)
print(samples[0,:,:,0])

model = tf.keras.Sequential()
model.add(MultiDimensionalRNN(MultiDimensinalGRUCell(4, activation='tanh'), input_shape=(5,6,2)))

model.summary()

[[0 0 3 2 1 3]
 [2 1 1 4 2 1]
 [3 2 2 1 3 3]
 [4 4 0 4 3 1]
 [0 4 4 0 3 3]]
[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 30
[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 30
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
multi_dimensional_rnn (Multi (None, 4)                 480       
Total params: 480
Trainable params: 480
Non-trainable params: 0
_________________________________________________________________


In [4]:
model.predict(samples)


array([[ 0.11600827,  0.08719105, -0.9175204 ,  0.3317431 ],
       [ 0.09044574,  0.02689793, -0.89777946,  0.34646532],
       [ 0.08953112,  0.03760115, -0.8939328 ,  0.3576638 ],
       [ 0.11472344,  0.10193343, -0.8590306 ,  0.35245255],
       [ 0.1352495 ,  0.04094839, -0.83062124,  0.4225371 ],
       [ 0.12522927,  0.14861901, -0.86151654,  0.3836741 ],
       [ 0.10181195,  0.13453832, -0.87949765,  0.39124557],
       [ 0.10965315, -0.02972251, -0.9107307 ,  0.35705563],
       [ 0.12083595,  0.13359556, -0.83007824,  0.41804466],
       [ 0.12485104,  0.06513244, -0.9355093 ,  0.42319936]],
      dtype=float32)

### Small simillarity test

In [4]:

def read_stsb_csv(csv_file):
    _data = []

    with open(csv_file, "r", encoding='utf8') as f:
        for line in f:
            _l = line.split("\t")[4:7]
            #_l[1] = _l[1][1:-1]
            #_l[2] = _l[2][1:-1]
            _data.append(_l)
            

    _data = pd.DataFrame(_data, columns=["similarity", "sentenceA", "sentenceB"])
    _data["similarity"] = pd.to_numeric(_data["similarity"]) 
    
    return _data

# read STS_B dataset
# download at: http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark

train = read_stsb_csv("/backup/Semantic_Similarity/stsbenchmark/sts-train.csv")
dev = read_stsb_csv("/backup/Semantic_Similarity/stsbenchmark/sts-dev.csv")

In [5]:
# prepare tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tk = Tokenizer()

tk.fit_on_texts(train["sentenceA"])
tk.fit_on_texts(train["sentenceB"])

tk.fit_on_texts(dev["sentenceA"])
tk.fit_on_texts(dev["sentenceB"])

all_sentences = []

all_sentences.extend(tk.texts_to_sequences(train["sentenceA"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(train["sentenceB"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(dev["sentenceA"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(dev["sentenceB"].values.tolist()))

_stats = list(map(lambda x:len(x),all_sentences))

print("Max length:",max(_stats))
print("Avg length:",np.average(_stats))

Max length: 58
Avg length: 10.418126638156988


In [6]:
# load embeddings
embeddings = {}
with open("/backup/pre-trained_embeddings/glove/glove.6B.50d.txt", "r", encoding='utf8') as f:
    for line in f:
        _l = line.split(" ")
        embeddings[_l[0]] = _l[1:]
        
emb_voc = set(embeddings.keys())
emb_text = set(tk.word_index.keys())

print("coverage",len(emb_text&emb_voc)/len(emb_text))

coverage 0.9108079748163693


In [7]:
#remove tokens without embeddings

miss_tokens = emb_text - (emb_text&emb_voc)

for token in miss_tokens:
    del tk.word_index[token]

#rebuild tokenizer index (hacking the tokenizer, not the clean solution)
tk.word_index = { d[0]:i+1 for i,d in  enumerate(tk.word_index.items())}
tk.index_word = { i:w for w,i in  tk.word_index.items()}

In [8]:
emb_text = set(tk.word_index.keys())

print("coverage",len(emb_text&emb_voc)/len(emb_text))

all_sentences = []

all_sentences.extend(tk.texts_to_sequences(train["sentenceA"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(train["sentenceB"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(dev["sentenceA"].values.tolist()))
all_sentences.extend(tk.texts_to_sequences(dev["sentenceB"].values.tolist()))

_stats = list(map(lambda x:len(x),all_sentences))

print("Max length:",max(_stats))
print("Avg length:",np.average(_stats))

coverage 1.0
Max length: 55
Avg length: 10.238860532487239


In [9]:
#build embedding matrix
EMB_SIZE = 50
VOCAB_SIZE = len(emb_text)+1
emb_matrix = np.empty((VOCAB_SIZE, EMB_SIZE))
emb_matrix[0,:] = 0.04 * np.random.random((50,)) - 0.02

[0.04 * np.random.random((50,)) - 0.02] # 0 index random init between [-0.02,0.02[
for w,i in tk.word_index.items():
    emb_matrix[i,:] = np.array(embeddings[w])


In [10]:
# auxiliar input layer
class SimilarityLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        self.dot_layer = Dot(axes=-1, normalize=True)
        super().__init__(**kwargs)
            
    def build(self, input_shape):
        
        self.sentence_size = int(input_shape[0][1])
        emb_dim = int(input_shape[0][2])
        
        self.a = self.add_weight(shape=(emb_dim,1),
                                 initializer="glorot_uniform")
        
        self.b = self.add_weight(shape=(emb_dim,1),
                                 initializer="glorot_uniform")

        super().build(input_shape)
        
    @tf.function
    def call(self, x):
        inputA = x[0] # embedding format
        inputB = x[1] # embedding format
        
        dot = K.expand_dims(self.dot_layer(x), axis=-1)
        # TODO the code here will be broken if the matrix is not square!!! TODO IZ FIX
        #print(dot)
        _a = K.dot(inputA, self.a)
        _a = K.expand_dims(_a, axis=-2)
        _a = K.repeat_elements(_a, self.sentence_size, axis=-2)
        #print(_a)
        _b = K.dot(inputB, self.b)
        _b = K.expand_dims(_b, axis=-2)
        _b = K.repeat_elements(_b, self.sentence_size, axis=-2)
        #print(_b)
        
        return K.concatenate([dot, _a, _b],axis=-1)


In [28]:
K.clear_session()

from tensorflow.keras.layers import InputLayer, Embedding, Dot, Dense, Conv2D, GlobalMaxPool2D, MaxPool2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras import Input

S_LENGTH = 25

def model_cnn(categorical=True):
    inputA = Input(shape=(S_LENGTH,), name="inputA")
    inputB = Input(shape=(S_LENGTH,), name="inputB")

    emb = Embedding(VOCAB_SIZE, EMB_SIZE, embeddings_initializer=tf.keras.initializers.Constant(emb_matrix), trainable=False)

    sim = SimilarityLayer()
    cnn_1 = Conv2D(32, kernel_size=(3,3), activation = "relu")
    pool_1 = MaxPool2D()
    flat_1 = Flatten()
    if categorical:
        dense = Dense(6, activation="softmax")
    else:
        dense = Dense(1, activation="sigmoid")

    embA = emb(inputA)
    embB = emb(inputB)
    dot = sim([embA,embB])
    cnn = cnn_1(dot)
    pool = pool_1(cnn)
    flat = flat_1(pool)
    out_sim = dense(flat)

    return Model(inputs=[inputA,inputB], outputs=[out_sim])

def model_cnn_global_max(categorical=True):
    inputA = Input(shape=(S_LENGTH,), name="inputA")
    inputB = Input(shape=(S_LENGTH,), name="inputB")

    emb = Embedding(VOCAB_SIZE, EMB_SIZE, embeddings_initializer=tf.keras.initializers.Constant(emb_matrix), trainable=False)

    sim = SimilarityLayer()
    cnn_1 = Conv2D(32, kernel_size=(3,3), activation = "relu")
    pool_1 = GlobalMaxPool2D()

    if categorical:
        dense = Dense(6, activation="softmax")
    else:
        dense = Dense(1, activation="sigmoid")

    embA = emb(inputA)
    embB = emb(inputB)
    dot = sim([embA,embB])
    cnn = cnn_1(dot)
    pool = pool_1(cnn)
    out_sim = dense(pool)

    return Model(inputs=[inputA,inputB], outputs=[out_sim])

def model_small_cnn(categorical=True):
    inputA = Input(shape=(S_LENGTH,), name="inputA")
    inputB = Input(shape=(S_LENGTH,), name="inputB")

    emb = Embedding(VOCAB_SIZE, EMB_SIZE, embeddings_initializer=tf.keras.initializers.Constant(emb_matrix), trainable=False)

    sim = SimilarityLayer()
    cnn_1 = Conv2D(8, kernel_size=(3,3), activation = "relu")
    pool_1 = MaxPool2D()
    flat_1 = Flatten()
    if categorical:
        dense = Dense(6, activation="softmax")
    else:
        dense = Dense(1, activation="sigmoid")

    embA = emb(inputA)
    embB = emb(inputB)
    dot = sim([embA,embB])
    cnn = cnn_1(dot)
    pool = pool_1(cnn)
    flat = flat_1(pool)
    out_sim = dense(flat)

    return Model(inputs=[inputA,inputB], outputs=[out_sim])

def model_2D_gru(categorical=True):
    inputA = Input(shape=(S_LENGTH,), name="inputA")
    inputB = Input(shape=(S_LENGTH,), name="inputB")

    emb = Embedding(VOCAB_SIZE, EMB_SIZE, embeddings_initializer=tf.keras.initializers.Constant(emb_matrix), trainable=False)

    sim = SimilarityLayer()

    gru_2d = MultiDimensionalRNN(MultiDimensinalGRUCell(6, activation='tanh'))
    if categorical:
        dense = Dense(6, activation="softmax")
    else:
        dense = Dense(1, activation="sigmoid")

    embA = emb(inputA)
    embB = emb(inputB)
    dot = sim([embA,embB])
    h = gru_2d(dot)
    out_sim = dense(h)

    return Model(inputs=[inputA,inputB], outputs=[out_sim])

def model_2D_gru_cnn(categorical=True):
    inputA = Input(shape=(S_LENGTH,), name="inputA")
    inputB = Input(shape=(S_LENGTH,), name="inputB")

    emb = Embedding(VOCAB_SIZE, EMB_SIZE, embeddings_initializer=tf.keras.initializers.Constant(emb_matrix), trainable=False)

    sim = SimilarityLayer()
    
    cnn_1 = Conv2D(4, kernel_size=(3,3), activation = "relu")
    pool_1 = MaxPool2D()
    cnn_2 = Conv2D(8, kernel_size=(3,3), activation = "relu")
    pool_2 = MaxPool2D()
    gru_2d = MultiDimensionalRNN(MultiDimensinalGRUCell(6, activation='tanh'))
    if categorical:
        dense = Dense(6, activation="softmax")
    else:
        dense = Dense(1, activation="sigmoid")

    embA = emb(inputA)
    embB = emb(inputB)
    dot = sim([embA,embB])
    x = cnn_1(dot)
    x = pool_1(x)
    x = cnn_2(x)
    x = pool_2(x)
    h = gru_2d(x)
    out_sim = dense(h)

    return Model(inputs=[inputA,inputB], outputs=[out_sim])


In [29]:
normalize_sim = lambda x: (x - np.min(x))/(np.max(x)-np.min(x))
binary_normalize_sim = lambda x: x.apply(lambda y:1 if y>0 else 0)
categorical_normalize_sim = lambda x: tf.keras.utils.to_categorical(np.round(x))
dumb_convertion = lambda y: [ np.array(x, dtype=np.int32) for x in y]

x_train_a = dumb_convertion(tk.texts_to_sequences(train["sentenceA"].values.tolist()))
x_train_a = pad_sequences(x_train_a, maxlen=S_LENGTH, padding='post', truncating='post', value=0.0)
x_train_b = dumb_convertion(tk.texts_to_sequences(train["sentenceB"].values.tolist()))
x_train_b = pad_sequences(x_train_b, maxlen=S_LENGTH, padding='post', truncating='post', value=0.0)
y_train = categorical_normalize_sim(train["similarity"].values)

x_dev_a = dumb_convertion(tk.texts_to_sequences(dev["sentenceA"].values.tolist()))
x_dev_a = pad_sequences(x_dev_a, maxlen=S_LENGTH, padding='post', truncating='post', value=0.0)
x_dev_b = dumb_convertion(tk.texts_to_sequences(dev["sentenceB"].values.tolist()))
x_dev_b = pad_sequences(x_dev_b, maxlen=S_LENGTH, padding='post', truncating='post', value=0.0)
y_dev = categorical_normalize_sim(dev["similarity"].values)

def data_generator(x_a, x_b, y, batch_size = 32):
    assert len(x_a) == len(x_b)
    assert len(x_a) == len(y)
    
    steps = len(x_a)//batch_size
    
    while True:
        for i in range(0, len(x_a), steps):
            yield [np.array(x_a[i:i+batch_size]), np.array(x_b[i:i+batch_size])], y[i:i+batch_size]
    

In [30]:
for model in [model_cnn, model_cnn_global_max, model_small_cnn, model_2D_gru_cnn, model_2D_gru]:
    print("\n", model.__name__)
    model = model()
    model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=["accuracy", f1])
    model.summary()
    BATCH = 32

    model.fit_generator(data_generator(x_train_a, x_train_b, y_train, batch_size=BATCH),
                        steps_per_epoch= len(x_train_a)//BATCH,
                        epochs = 10,
                        validation_data=data_generator(x_dev_a, x_dev_b, y_dev, batch_size=BATCH), 
                        validation_steps= len(x_dev_a)//BATCH)
    del model



 model_cnn
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputA (InputLayer)             [(None, 25)]         0                                            
__________________________________________________________________________________________________
inputB (InputLayer)             [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 25, 50)       651050      inputA[0][0]                     
                                                                 inputB[0][0]                     
__________________________________________________________________________________________________
similarity_layer (SimilarityLay (None, 25, 25, 3)    100         embedding[0][0]  

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

 model_2D_gru_cnn
[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 16
[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 16
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputA (InputLayer)             [(None, 25)]         0                                            
__________________________________________________________________________________________________
inputB (InputLayer)             [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 25, 50)       651050      inputA[0][0]               

[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 625
 32/179 [====>.........................] - ETA: 4:07 - loss: 1.7808 - accuracy: 0.1650 - f1: 0.0000e+00[DYNAMIC LOOP] input shape Tensor("Shape:0", shape=(3,), dtype=int32)
Loop Iterations 625
Loop Iterations 625
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
