In [1]:
import os

os.environ['KERAS_BACKEND'] = "torch"

In [2]:
import torch
import keras
from layer import MaskedAverage
import numpy as np

  return torch._C._cuda_getDeviceCount() > 0


## Masked Average

In [3]:
x_tensor = torch.tensor([
    [100, 1, 10],
    [110, 10, 30],
    [90, 33, 99]
])

In [4]:
x_tensor.shape

torch.Size([3, 3])

In [68]:
masked_impl_avg = MaskedAverage()(input_data=x_tensor, mask=[1, 1, 1])
keras_avg = keras.layers.Average()(inputs=[x_tensor, torch.tensor([1, 1, 1])])

print(f"MaskedAverage: {masked_impl_avg} \nkeras.layers.Average: {keras_avg}")

MaskedAverage: tensor([100.0000,  14.6667,  46.3333]) 
keras.layers.Average: tensor([[50.5000,  1.0000,  5.5000],
        [55.5000,  5.5000, 15.5000],
        [45.5000, 17.0000, 50.0000]])


Average works on wrong axis.

In [71]:
import numpy as np

x = np.array([
    [1., 2., 3.],
    [4., 5., 6.],
    [7., 8., 9.],
    [7., 8., 9.],
    [11., 8., 9.]]
)

In [72]:
MaskedAverage()(x)

tensor([6.0000, 6.2000, 7.2000])

In [73]:
inputs = np.random.random([10, 3]).astype(np.float32)
# We create input and simulate empty rows as could be yielded by our Embedding model on unknown words (Not that any of those exist in corpus)
inputs[3, :] = 0.
inputs[5, :] = 0.

In [74]:
inputs  # 0.4803591

array([[0.720102  , 0.85836864, 0.3445978 ],
       [0.29986498, 0.5420876 , 0.88653666],
       [0.8114074 , 0.52032554, 0.1960794 ],
       [0.        , 0.        , 0.        ],
       [0.3971557 , 0.19923395, 0.11562309],
       [0.        , 0.        , 0.        ],
       [0.22157855, 0.1699425 , 0.39186078],
       [0.80099577, 0.2734881 , 0.918462  ],
       [0.16538976, 0.5264178 , 0.91051924],
       [0.8423247 , 0.5678992 , 0.27526826]], dtype=float32)

In [75]:
x = keras.layers.Masking(mask_value=0.0)(inputs)
res = MaskedAverage()(x)

In [77]:
print(res)  # The result is correct wrt to the zero values to skip for the average

tensor([0.5324, 0.4572, 0.5049])


## WeightedSumLayer

In [4]:
# Class Definition
class WeightedSumLayer(keras.layers.Layer):
    def __init__(self, **kwargs):
        super(WeightedSumLayer, self).__init__(**kwargs)
        self.supports_masking = True

    def call(self, input_data: list, mask=None):
        x, w = input_data
        w = keras.ops.expand_dims(w, axis=-1)

        return keras.ops.sum(x * w, axis=1)

# Embedding simulation as interaction

In [5]:
input_array = np.array([[1, 4, 7, 7, 11]])  # (One sentence)
input_array  # An example input of indices of our embedding matrix for the terms

array([[ 1,  4,  7,  7, 11]])

In [6]:
model = keras.Sequential()
model.add(keras.layers.Embedding(120, 5))
model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [7]:
print(output_array)

[[[-0.01709533  0.02543542 -0.04860786 -0.03673366 -0.01704457]
  [ 0.02166657 -0.03744847  0.04946844 -0.0041815   0.0431186 ]
  [-0.01143669  0.03215592 -0.04944665 -0.01991327 -0.01345753]
  [-0.01143669  0.03215592 -0.04944665 -0.01991327 -0.01345753]
  [ 0.03590691 -0.03487306  0.00515183 -0.04022937 -0.01694779]]]


In [8]:
output_array.shape

(1, 5, 5)

In [9]:
import layer

y_s = layer.MaskedAverage()(output_array)

In [10]:
y_s

tensor([[ 0.0035,  0.0035, -0.0186, -0.0242, -0.0036]])

In [11]:
print(y_s.shape)
print(output_array.shape)

torch.Size([1, 5])
(1, 5, 5)


In [21]:
y_s

tensor([[ 0.0035,  0.0035, -0.0186, -0.0242, -0.0036]])

In [22]:
a_y = y_s[None, :, :]  # Adding batching to a_y to match dimensions required

In [25]:
a_y

tensor([[[ 0.0035,  0.0035, -0.0186, -0.0242, -0.0036]]])

In [41]:
att_weights = keras.layers.Attention(name='att_weights')([a_y, output_array])
# att_weights = keras.layers.Attention(name='att_weights')([output_array[0], a_y]) Quale giusto?

In [51]:
# Each attention weight tells me how much weight a row has in my context. Giving more importance to relevant words (when trained)
att_weights[0][0]

tensor([ 0.0035,  0.0035, -0.0186, -0.0242, -0.0036])

In [104]:
output_array[0]

array([[-0.01709533,  0.02543542, -0.04860786, -0.03673366, -0.01704457],
       [ 0.02166657, -0.03744847,  0.04946844, -0.0041815 ,  0.0431186 ],
       [-0.01143669,  0.03215592, -0.04944665, -0.01991327, -0.01345753],
       [-0.01143669,  0.03215592, -0.04944665, -0.01991327, -0.01345753],
       [ 0.03590691, -0.03487306,  0.00515183, -0.04022937, -0.01694779]],
      dtype=float32)

In [65]:
z = WeightedSumLayer()([output_array, att_weights[0][0]])

In [66]:
z  # Looking at the paper it seems reasonable to have zt as the average of word embeddings given by att_weights. It also makes sense conceptually as I use attention to focus on the right words and keep those in the "sum value" of the embeddings to extract a combination that gives a real (higher level) meaning

tensor([[ 0.0004, -0.0013,  0.0021,  0.0009,  0.0007]])

In [123]:
# Does the method behave as expected? It does:
total_sum = []

row_attention_weight = att_weights[0][0]
print(f"Weight: {row_attention_weight}")

for c in range(output_array.shape[2]):
    column = output_array[0][:, c]
    print(f"Current column: {column}")
    total_sum.append(torch.sum(torch.tensor([column[i] * row_attention_weight[i] for i in range(len(column))])))

Weight: tensor([ 0.0035,  0.0035, -0.0186, -0.0242, -0.0036])
Current column: [-0.01709533  0.02166657 -0.01143669 -0.01143669  0.03590691]
Current column: [ 0.02543542 -0.03744847  0.03215592  0.03215592 -0.03487306]
Current column: [-0.04860786  0.04946844 -0.04944665 -0.04944665  0.00515183]
Current column: [-0.03673366 -0.0041815  -0.01991327 -0.01991327 -0.04022937]
Current column: [-0.01704457  0.0431186  -0.01345753 -0.01345753 -0.01694779]


In [127]:
total_sum

[tensor(0.0004),
 tensor(-0.0013),
 tensor(0.0021),
 tensor(0.0009),
 tensor(0.0007)]

# Max Margin Layer

In [32]:

class MaxMargin(keras.layers.Layer):
    def __init__(self, **kwargs):
        super(MaxMargin, self).__init__(**kwargs)
        self.supports_masking = True

    def call(self, input_tensor, mask=None):
        z_s = input_tensor[0]
        z_n = input_tensor[1]
        r_s = input_tensor[2]

        z_s = z_s / keras.ops.cast(keras.backend.epsilon() + keras.ops.sqrt(
            keras.ops.sum(keras.ops.square(z_s), axis=-1, keepdims=True)), keras.backend.floatx())
        z_n = z_n / keras.ops.cast(keras.backend.epsilon() + keras.ops.sqrt(
            keras.ops.sum(keras.ops.square(z_n), axis=-1, keepdims=True)), keras.backend.floatx())
        r_s = r_s / keras.ops.cast(keras.backend.epsilon() + keras.ops.sqrt(
            keras.ops.sum(keras.ops.square(r_s), axis=-1, keepdims=True)), keras.backend.floatx())

        steps = z_n.shape[1]

        pos = keras.ops.sum(z_s * r_s, axis=-1, keepdims=True)
        pos = keras.ops.repeat(pos, steps, axis=-1)
        r_s = keras.ops.expand_dims(r_s, axis=-2)
        r_s = keras.ops.repeat(r_s, steps, axis=1)
        neg = keras.ops.sum(z_n * r_s, axis=-1)

        loss = keras.ops.cast(keras.ops.sum(keras.ops.maximum(0, (1. - pos + neg)), axis=-1, keepdims=True), keras.backend.floatx())
        return loss

In [33]:
# todo: Now test this converted function

In [34]:
z_s = torch.tensor([[3, 4, 5, 6, 7]])
z_n = torch.tensor([[1, 2, 2, 4, 3]])
r_s = torch.tensor([[3, 0, 1, 1, 1]])

In [35]:
max_margin = MaxMargin()([z_s, z_n, r_s])

In [36]:
max_margin

tensor([[4.6163]])