In [2]:
import tensorflow as tf
from tensorflow.keras import layers

In [3]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    if mask is not None:
        scaled_product += (mask * -1e9)
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    return attention

In [4]:
class MultiHeadAttention1(layers.Layer):
    def __init__(self, d_model, num_heads, length):
        super().__init__()
        self.n_heads = num_heads
        self.d_model = d_model
        self.length = length
        
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.n_heads == 0
        self.d_head = self.d_model // self.n_heads
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        self.final_lin = layers.Dense(units=self.d_model)
        
    def split_proj(self, inputs, batch_size):
        shape = (batch_size, -1, self.n_heads, self.d_head)
        splitted_inputs = tf.reshape(inputs, shape=shape)
        return tf.transpose(splitted_inputs, perm=[0,2,1,3])
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)
        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        attention = tf.transpose(attention, perm=[0,2,1,3])
        concat_attention = tf.reshape(attention, shape=(batch_size, -1, self.d_model))
        outputs = self.final_lin(concat_attention)
        return outputs

In [None]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        self.wq = layers.Dense(d_model, activation='relu')
        self.wk = layers.Dense(d_model, activation='relu')
        self.wv = layers.Dense(d_model, activation='relu')
        self.dense = layers.Dense(d_model, activation='relu')
        
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def _generate_relative_position_matrix(self, length, max_relative_position, cache=False):
        if not cache:
            range_vec = tf.range(length)
            range_mat = tf.reshape(tf.tile(range_vec, [length]), [length, length])
            distance_mat = range_mat - tf.transpose(range_mat)
        else:
            distance_mat = tf.expand_dims(tf.range(-length+1, 1, 1), 0)
        distance_mat_clipped = tf.clip_by_value(distance_mat, -max_relative_position, max_relative_position)
        final_mat = distance_mat_clipped + max_relative_position
    
    def _generate_relative_positions_embeddings(self, length, depth, max_relative_position, name, cache=False):
        with tf.compat.v1.variable_scope(name):
            relative_positions_matrix = self._generate_relative_positions_matrix(length, max_relative_position, cache=cache)
            vocab_size = max_relative_position * 2 + 1
            embeddings_table = tf.compat.v1.get_variable(name, [vocab_size, depth])
            embeddings = tf.gather(embeddings_table, relative_positions_matrix)
            return embeddings
    
    def _relative_attention_inner(self, x, y, z, transpose, mask, length):
        batch_size = tf.shape(x)[0]
        heads = N_HEADS
        xy_matmul = tf.matmul(x, y, transpose_b=transpose)
        x_t = tf.transpose(x, [2, 0, 1, 3])
        x_t_r = tf.reshape(x_t, [length, heads * batch_size, -1])
        x_tz_matmul = tf.matmul(x_t_r, z, transpose_b=transpose)
        x_tz_matmul_r = tf.reshape(x_tz_matmul, [length, batch_size, heads, -1])
        x_tz_matmul_r_t = tf.transpose(x_tz_matmul_r, [1,2,0,3])
        return tf.math.add(xy_matmul, x_tz_matmul_r_t)

In [8]:
tf.reshape(tf.tile(tf.range(5), [5]), [5, 5]) - tf.transpose(tf.reshape(tf.tile(tf.range(5), [5]), [5, 5]))

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[ 0,  1,  2,  3,  4],
       [-1,  0,  1,  2,  3],
       [-2, -1,  0,  1,  2],
       [-3, -2, -1,  0,  1],
       [-4, -3, -2, -1,  0]], dtype=int32)>

In [9]:
tf.expand_dims(tf.range(-5+1, 1, 1), 0)

<tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[-4, -3, -2, -1,  0]], dtype=int32)>

In [12]:
tf.clip_by_value(tf.reshape(tf.tile(tf.range(5), [5]), [5, 5]) - tf.transpose(tf.reshape(tf.tile(tf.range(5), [5]), [5, 5])), -2, 2)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[ 0,  1,  2,  2,  2],
       [-1,  0,  1,  2,  2],
       [-2, -1,  0,  1,  2],
       [-2, -2, -1,  0,  1],
       [-2, -2, -2, -1,  0]], dtype=int32)>

In [13]:
tf.clip_by_value(tf.reshape(tf.tile(tf.range(5), [5]), [5, 5]) - tf.transpose(tf.reshape(tf.tile(tf.range(5), [5]), [5, 5])), -2, 2)+2

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[2, 3, 4, 4, 4],
       [1, 2, 3, 4, 4],
       [0, 1, 2, 3, 4],
       [0, 0, 1, 2, 3],
       [0, 0, 0, 1, 2]], dtype=int32)>

In [1]:
import torch

In [None]:
class MultiHeadAttention(layers.Layer):
    
      def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model  # typically 512
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads
        
        self.wq = tf.keras.layers.Dense(d_model, activation='relu')
        self.wk = tf.keras.layers.Dense(d_model, activation='relu')
        self.wv = tf.keras.layers.Dense(d_model, activation='relu')
        
        self.dense = tf.keras.layers.Dense(d_model, activation='relu')
       
      def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
     
        return tf.transpose(x, perm=[0, 2, 1, 3])

      def _generate_relative_positions_matrix(self, length, max_relative_position,
                                            cache=False):
        """Generates matrix of relative positions between inputs."""
        if not cache:
          range_vec = tf.range(length)
          range_mat = tf.reshape(tf.tile(range_vec, [length]), [length, length])
          distance_mat = range_mat - tf.transpose(range_mat)
        else:
          distance_mat = tf.expand_dims(tf.range(-length+1, 1, 1), 0)
        distance_mat_clipped = tf.clip_by_value(distance_mat, -max_relative_position,
                                                max_relative_position)
        # Shift values to be >= 0. Each integer still uniquely identifies a relative
        # position difference.
     
        final_mat = distance_mat_clipped + max_relative_position
        
     
        return final_mat

      def _generate_relative_positions_embeddings(self, length, depth,
                                                  max_relative_position, name,
                                                  cache=False):
        """Generates tensor of size [1 if cache else length, length, depth]."""
        with tf.compat.v1.variable_scope(name):
          
          relative_positions_matrix = self._generate_relative_positions_matrix(
              length, max_relative_position, cache=cache)
          vocab_size = max_relative_position * 2 + 1
          # Generates embedding for each relative position of dimension depth.
          embeddings_table = tf.compat.v1.get_variable(name, [vocab_size, depth])
          embeddings = tf.gather(embeddings_table, relative_positions_matrix)
       
          return embeddings 

      def _relative_attention_inner(self, x, y, z, transpose,mask,length):
        batch_size = tf.shape(x)[0]
        heads = N_HEADS
        
                                                                   
        # xy_matmul is [batch_size, heads, length or 1, length or depth]
        xy_matmul = tf.matmul(x, y, transpose_b=transpose)
       
        # x_t is [length or 1, batch_size, heads, length or depth]
        x_t = tf.transpose(x, [2, 0, 1, 3])
        
        # x_t_r is [length or 1, batch_size * heads, length or depth]
        x_t_r = tf.reshape(x_t, [length, heads * batch_size, -1])
        
        # x_tz_matmul is [length or 1, batch_size * heads, length or depth]
        x_tz_matmul = tf.matmul(x_t_r, z, transpose_b=transpose) 
       
        # x_tz_matmul_r is [length or 1, batch_size, heads, length or depth]
        x_tz_matmul_r = tf.reshape(x_tz_matmul, [length, batch_size, heads, -1])
        
        # x_tz_matmul_r_t is [batch_size, heads, length or 1, length or depth]
        x_tz_matmul_r_t = tf.transpose(x_tz_matmul_r, [1, 2, 0, 3])
     
      
        
        return tf.math.add(xy_matmul, x_tz_matmul_r_t)

      def dot_product_attention_relative(self,length, q,
                                      k,
                                      v,
                                      mask,
                                      bias,
                                      max_relative_position,
                                      dropout_rate=0.0,
                                      image_shapes=None,
                                      save_weights_to=None,
                                      name=None,
                                      make_image_summary=True,
                                      cache=False):
        if not max_relative_position:
          raise ValueError("Max relative position (%s) should be > 0 when using "
                          "relative self attention." % (max_relative_position))
        with tf.compat.v1.variable_scope(
            name, default_name="dot_product_attention_relative",
            values=[q, k, v]) as scope:

          # This calculation only works for self attention.
          # q, k and v must therefore have the same shape.
          if not cache:
            q.get_shape().assert_is_compatible_with(k.get_shape())
            q.get_shape().assert_is_compatible_with(v.get_shape())
          
          # Use separate embeddings suitable for keys and values.
          depth = self.depth #64 #128   #self.depth 512/4
         
          # print('Error...')
          relations_keys = self._generate_relative_positions_embeddings(length, depth, max_relative_position, "relative_positions_keys",cache=cache)
          
          relations_values = self._generate_relative_positions_embeddings(length, depth, max_relative_position, "relative_positions_values",cache=cache)
          
          # Compute self attention considering the relative position embeddings.
        
          logits = self._relative_attention_inner(q, k, relations_keys, True,mask,length)
          #print("logits",logits.shape,mask.shape)
          logits += (mask * -1e9)
          weights = tf.nn.softmax(logits)
          return self._relative_attention_inner(weights, v, relations_values, False,None,length), weights
    
      def call(self,length, q, k, v, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
       
        oot, attweight = self.dot_product_attention_relative(length,q, k, v, mask,bias=None, max_relative_position=16, dropout_rate=0.1,image_shapes=None,save_weights_to=None,name=None,make_image_summary=False,cache=False)
        scaled_attention = tf.transpose(oot, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,(batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        output = tf.reshape(output, (batch_size, length, self.d_model))
        
        return output#, attweight


In [None]:
def _generate_relative_positions_matrix(self, length, max_relative_position,
                                            cache=False):
        """Generates matrix of relative positions between inputs."""
        if not cache:
          range_vec = torch.arange(length) 
          range_mat = torch.tile(range_vec, [length]).view(length, length)
          distance_mat = range_mat - torch.transpose(range_mat)
        else:
          distance_mat = torch.arange(-length+1, 1, 1)[None, :]
        distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position,
                                      max_relative_position)
        # Shift values to be >= 0. Each integer still uniquely identifies a relative
        # position difference.
     
        final_mat = distance_mat_clipped + max_relative_position
        
     
        return final_mat

In [None]:
def _generate_relative_positions_embeddings(self, length, depth,
                                                  max_relative_position, name,
                                                  cache=False):
        """Generates tensor of size [1 if cache else length, length, depth]."""
          relative_positions_matrix = self._generate_relative_positions_matrix(
              length, max_relative_position, cache=cache)
          vocab_size = max_relative_position * 2 + 1
          # Generates embedding for each relative position of dimension depth.
          embeddings_table = tf.compat.v1.get_variable(name, [vocab_size, depth])
          embeddings = tf.gather(embeddings_table, relative_positions_matrix)
       
          return embeddings 

In [10]:
from torch.utils.data import DataLoader

In [None]:
class RawNumpyFeatureLoader(DataLoader):
    """
    Use DataLoader to make texts into batch
    """
    def __init__(self, 
                batch_size,
                root_dir,
                ann_path, 
                feat_dir, 
                text_dir, 
                **kwargs):
       
        self.dataset = NumpyFeatureDataset(
            root_dir, ann_path,  feat_dir, text_dir)

        self.collate_fn = self.dataset.collate_fn
        
        super(RawNumpyFeatureLoader, self).__init__(
            self.dataset,
            batch_size=batch_size,
            pin_memory=True,
            collate_fn=self.collate_fn,
            **kwargs) 

In [None]:
class NumpyFeatureLoader(DataLoader):
    """
    Use BucketIterator to make texts of same length into batch
    """
    def __init__(self, 
                batch_size,
                root_dir,
                ann_path, 
                feat_dir, 
                text_dir, 
                device,
                **kwargs):
       
        self.dataset = abc

        self.collate_fn = self.dataset.collate_fn
        
        super(NumpyFeatureLoader, self).__init__(
            self.dataset,
            batch_size=batch_size,
            device=device, 
            sort_key=lambda x: len(x['text']),
            repeat=True, # Repeat the iterator for multiple epochs.
            sort=False,  # Sort all examples in data using `sort_key`.
            shuffle=True, # Shuffle data on each epoch run.
            sort_within_batch=True,
            **kwargs) # Use `sort_key` to sort examples in each batch.

In [19]:
from torchtext.data.utils import get_tokenizer

In [21]:
help(DataLoader)

Help on class DataLoader in module torch.utils.data.dataloader:

class DataLoader(typing.Generic)
 |  DataLoader(*args, **kwds)
 |  
 |  Data loader. Combines a dataset and a sampler, and provides an iterable over
 |  the given dataset.
 |  
 |  The :class:`~torch.utils.data.DataLoader` supports both map-style and
 |  iterable-style datasets with single- or multi-process loading, customizing
 |  loading order and optional automatic batching (collation) and memory pinning.
 |  
 |  See :py:mod:`torch.utils.data` documentation page for more details.
 |  
 |  Args:
 |      dataset (Dataset): dataset from which to load the data.
 |      batch_size (int, optional): how many samples per batch to load
 |          (default: ``1``).
 |      shuffle (bool, optional): set to ``True`` to have the data reshuffled
 |          at every epoch (default: ``False``).
 |      sampler (Sampler or Iterable, optional): defines the strategy to draw
 |          samples from the dataset. Can be any ``Iterable``

In [22]:
import torch.nn as nn

In [29]:
class RelativePosition(nn.Module):

    def __init__(self, num_units, max_relative_position):
        super().__init__()
        self.num_units = num_units
        self.max_relative_position = max_relative_position
        self.embeddings_table = nn.Parameter(torch.Tensor(max_relative_position * 2 + 1, num_units))
        nn.init.xavier_uniform_(self.embeddings_table)

    def forward(self, length_q, length_k):
        range_vec_q = torch.arange(length_q)
        range_vec_k = torch.arange(length_k)
        distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
        distance_mat_clipped = torch.clamp(distance_mat, -self.max_relative_position, self.max_relative_position)
        final_mat = distance_mat_clipped + self.max_relative_position
        final_mat = torch.LongTensor(final_mat)
        print(final_mat)
        print(self.embeddings_table)
        embeddings = self.embeddings_table[final_mat]

        return embeddings

In [39]:
relative = RelativePosition(10, 5)

In [40]:
a = relative(7, 8)

tensor([[ 5,  6,  7,  8,  9, 10, 10, 10],
        [ 4,  5,  6,  7,  8,  9, 10, 10],
        [ 3,  4,  5,  6,  7,  8,  9, 10],
        [ 2,  3,  4,  5,  6,  7,  8,  9],
        [ 1,  2,  3,  4,  5,  6,  7,  8],
        [ 0,  1,  2,  3,  4,  5,  6,  7],
        [ 0,  0,  1,  2,  3,  4,  5,  6]])
Parameter containing:
tensor([[-0.1902, -0.5252,  0.5227,  0.5277,  0.1122,  0.0997,  0.3418,  0.3685,
         -0.0089, -0.0805],
        [ 0.0106, -0.3891,  0.2292, -0.3254, -0.2666, -0.1337,  0.2731,  0.3179,
         -0.4902,  0.1341],
        [-0.3047,  0.3478,  0.5268, -0.0111,  0.4231, -0.3569, -0.4623, -0.3676,
          0.0249,  0.0734],
        [-0.1396, -0.3121, -0.1920, -0.2770,  0.5302,  0.4391,  0.0854,  0.0637,
         -0.1859,  0.3874],
        [ 0.1061, -0.2006,  0.3348,  0.0188,  0.4034,  0.1066, -0.4279, -0.3755,
          0.1734,  0.1670],
        [-0.5014, -0.4008,  0.1953,  0.3884,  0.1437, -0.4882,  0.3691,  0.1262,
         -0.0872,  0.3135],
        [ 0.4184, -0.2667, -0

In [42]:
a.shape

torch.Size([7, 8, 10])