In [2]:
import os

# Select the Keras backend. This has to happen before `keras` is imported:
# the backend is fixed at import time and cannot be switched afterwards.
os.environ['KERAS_BACKEND'] = "torch"

In [5]:
import torch
import keras
from layer import MaskedAverage
import numpy as np

## Masked Average

In [3]:
x_tensor = torch.tensor([
    [100, 1, 10],
    [110, 10, 30],
    [90, 33, 99]
])

In [4]:
x_tensor.shape

torch.Size([3, 3])

In [68]:
# With an all-ones mask, the MaskedAverage result printed below is the mean of each
# column of x_tensor (i.e. an average over the first axis).
masked_impl_avg = MaskedAverage()(input_data=x_tensor, mask=[1, 1, 1])
# keras.layers.Average takes a list of tensors and averages them element-wise, so this
# computes (x_tensor + [1, 1, 1]) / 2 with broadcasting -- the second tensor is NOT a mask.
keras_avg = keras.layers.Average()(inputs=[x_tensor, torch.tensor([1, 1, 1])])

print(f"MaskedAverage: {masked_impl_avg} \nkeras.layers.Average: {keras_avg}")

MaskedAverage: tensor([100.0000,  14.6667,  46.3333]) 
keras.layers.Average: tensor([[50.5000,  1.0000,  5.5000],
        [55.5000,  5.5000, 15.5000],
        [45.5000, 17.0000, 50.0000]])


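`keras.layers.Average` is not a replacement for `MaskedAverage`: it does not reduce along an axis at all, but averages a list of tensors element-wise (here `x_tensor` with the broadcast `[1, 1, 1]`), so it works on the wrong "axis" for our purpose.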

In [71]:
import numpy as np

x = np.array([
    [1., 2., 3.],
    [4., 5., 6.],
    [7., 8., 9.],
    [7., 8., 9.],
    [11., 8., 9.]]
)

In [72]:
MaskedAverage()(x)

tensor([6.0000, 6.2000, 7.2000])

In [73]:
inputs = np.random.random([10, 3]).astype(np.float32)
# Simulate all-zero rows, as our Embedding model could yield for unknown words
# (not that any such words exist in the corpus).
# NOTE(review): no random seed is set in the cells shown, so these values change on every run.
inputs[3, :] = 0.
inputs[5, :] = 0.

In [74]:
inputs  # 0.4803591

array([[0.720102  , 0.85836864, 0.3445978 ],
       [0.29986498, 0.5420876 , 0.88653666],
       [0.8114074 , 0.52032554, 0.1960794 ],
       [0.        , 0.        , 0.        ],
       [0.3971557 , 0.19923395, 0.11562309],
       [0.        , 0.        , 0.        ],
       [0.22157855, 0.1699425 , 0.39186078],
       [0.80099577, 0.2734881 , 0.918462  ],
       [0.16538976, 0.5264178 , 0.91051924],
       [0.8423247 , 0.5678992 , 0.27526826]], dtype=float32)

In [75]:
# Masking flags every row whose values all equal mask_value (here: the two zeroed rows 3 and 5).
# NOTE(review): MaskedAverage comes from the `layer` module (not shown); presumably it reads the
# mask attached to `x` -- the result printed below is consistent with that.
x = keras.layers.Masking(mask_value=0.0)(inputs)
res = MaskedAverage()(x)

In [77]:
print(res)  # Correct: the two all-zero rows are skipped, so each value is the column mean over the 8 remaining rows

tensor([0.5324, 0.4572, 0.5049])


## WeightedSumLayer

In [50]:
# Class Definition
class WeightedSumLayer(keras.layers.Layer):
    """Collapse a sequence into a single vector via a per-timestep weighted sum.

    The layer is called on a two-element list ``[x, w]`` and returns
    ``sum(x * expand_dims(w, -1), axis=1)``.

    NOTE(review): the arithmetic suggests ``x`` is (batch, timesteps, features)
    and ``w`` holds one weight per timestep -- confirm against the callers.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Declare that inputs carrying a Keras mask are acceptable. The mask is
        # deliberately not forwarded to the output (see ``compute_mask``).
        self.supports_masking = True

    def call(self, input_data: list, mask=None):
        """Return the weighted sum of ``x`` over axis 1.

        Args:
            input_data: Two-element list ``[x, w]``; ``w`` gets a trailing axis
                appended so that it broadcasts against ``x``.
            mask: Accepted for Keras masking compatibility; not used in the
                computation.

        Returns:
            ``x * w[..., None]`` summed over axis 1.
        """
        x, w = input_data
        # Append a trailing axis so each weight scales a whole feature vector.
        w = keras.ops.expand_dims(w, axis=-1)

        return keras.ops.sum(x * w, axis=1)

    def compute_mask(self, inputs, mask=None):
        """Return ``None`` so that no mask is attached to the output.

        Axis 1 is summed away in ``call``, so a per-timestep input mask no longer
        matches the output. Without this override the default implementation
        would pass the input mask(s) through unchanged.
        """
        return None

# Embedding simulation as interaction

In [7]:
input_array = np.array([[1, 4, 7, 7, 11]])  # (One sentence)
input_array  # An example input of indices of our embedding matrix for the terms

array([[ 1,  4,  7,  7, 11]])

In [8]:
# Embedding(120, 5): 120 possible input indices, each mapped to a 5-dimensional vector.
# The model is never fitted in the cells shown, so predict() simply returns the
# initial embedding weights for the given indices.
model = keras.Sequential()
model.add(keras.layers.Embedding(120, 5))
model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step


In [9]:
print(output_array)

[[[ 0.03431283 -0.02247928 -0.04623246  0.01455034 -0.02974732]
  [ 0.0192939   0.04212562  0.01354809  0.02876313 -0.01755306]
  [-0.01599893  0.00968315 -0.0218256   0.0361133   0.00044717]
  [-0.01599893  0.00968315 -0.0218256   0.0361133   0.00044717]
  [-0.01512191  0.02084753  0.02845163 -0.04753092  0.02707013]]]


In [32]:
output_array.shape

(1, 5, 5)

In [11]:
import layer

y_s = layer.MaskedAverage()(output_array)

In [12]:
y_s

tensor([[ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039]], device='cuda:0')

In [15]:
print(y_s.shape)
print(output_array.shape)

torch.Size([1, 5])
(1, 5, 5)


In [17]:
# Add the missing but required batch dimension for the attention layer.
# NOTE(review): neither variable is referenced again in the cells shown here.
batched_w_sum = np.expand_dims(y_s.cpu(), 0)
batched_embeddings = np.expand_dims(output_array, 0)

In [35]:
y_s

tensor([[ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039]], device='cuda:0')

In [42]:
a_y = y_s[None, :, :]  # Adding batching to a_y to match dimensions required

In [43]:
# NOTE(review): Attention()([query, value]) returns the attended values, not the attention
# scores (those are only returned with return_attention_scores=True). With a single value
# row (a_y), every output row is just y_s -- see the output of att_weights[0] below -- so
# the name `att_weights` is misleading.
att_weights = keras.layers.Attention(name='att_weights')([output_array[0], a_y])

In [53]:
att_weights[0]

tensor([[ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039],
        [ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039],
        [ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039],
        [ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039],
        [ 0.0013,  0.0120, -0.0096,  0.0136, -0.0039]], device='cuda:0')

In [57]:
output_array

array([[[ 0.03431283, -0.02247928, -0.04623246,  0.01455034,
         -0.02974732],
        [ 0.0192939 ,  0.04212562,  0.01354809,  0.02876313,
         -0.01755306],
        [-0.01599893,  0.00968315, -0.0218256 ,  0.0361133 ,
          0.00044717],
        [-0.01599893,  0.00968315, -0.0218256 ,  0.0361133 ,
          0.00044717],
        [-0.01512191,  0.02084753,  0.02845163, -0.04753092,
          0.02707013]]], dtype=float32)

In [58]:
# NOTE(review): att_weights[0][0] has one entry per embedding dimension (it equals y_s[0]),
# yet WeightedSumLayer applies it as one weight per token. This only runs because the
# sequence length and the embedding size are both 5 here -- confirm this is intended.
z = WeightedSumLayer()([output_array, att_weights[0][0]])

In [60]:
z  # Looking at the paper it seems reasonable to have zt as the average of word embeddings given by att_weights. It also makes sense conceptually as I use attention to focus on the right words and keep those in the "sum value" of the embeddings to extract a combination that gives a real (higher level) meaning

tensor([[ 2.6959e-04,  4.3352e-04, -9.5660e-05,  6.9240e-04, -3.5162e-04]],
       device='cuda:0')