From 68d4dbf1238610ff7b73fb929ffffad7a53bf719 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 10 Mar 2020 15:50:36 +0100 Subject: [PATCH 1/3] add docstrings to tf classes --- rasa/core/policies/ted_policy.py | 3 +- rasa/nlu/classifiers/diet_classifier.py | 8 +- rasa/utils/tensorflow/layers.py | 255 +++++++++++++++++++++++- rasa/utils/tensorflow/transformer.py | 229 +++++++++++++++------ 4 files changed, 421 insertions(+), 74 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 3a78c873f393..7251dd715051 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -150,7 +150,8 @@ class TEDPolicy(Policy): # The scale of regularization REGULARIZATION_CONSTANT: 0.001, # The scale of how important is to minimize the maximum similarity - # between embeddings of different labels. + # between embeddings of different labels, + # used only if 'loss_type' is set to 'margin'. NEGATIVE_MARGIN_SCALE: 0.8, # Dropout rate for embedding layers of dialogue features. DROP_RATE_DIALOGUE: 0.1, diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c4a0574bd10a..778ac9f4b015 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -125,7 +125,7 @@ def required_components(cls) -> List[Type[Component]]: NUM_HEADS: 4, # If 'True' use key relative embeddings in attention KEY_RELATIVE_ATTENTION: False, - # If 'True' use key relative embeddings in attention + # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, # Max position for relative embeddings MAX_RELATIVE_POSITION: None, @@ -169,13 +169,15 @@ def required_components(cls) -> List[Type[Component]]: # If 'True' the algorithm only minimizes maximum similarity over # incorrect intent labels, used only if 'loss_type' is set to 'margin'. USE_MAX_NEG_SIM: True, - # Scale loss inverse proportionally to confidence of correct prediction + # If 'True' scale loss inverse proportionally to the confidence + # of the correct prediction SCALE_LOSS: True, # ## Regularization parameters # The scale of regularization REGULARIZATION_CONSTANT: 0.002, # The scale of how important is to minimize the maximum similarity - # between embeddings of different labels. + # between embeddings of different labels, + # used only if 'loss_type' is set to 'margin'. NEGATIVE_MARGIN_SCALE: 0.8, # Dropout rate for encoder DROP_RATE: 0.2, diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 55c0ddbe3a0f..b2a6a92a5a61 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -10,9 +10,32 @@ class SparseDropout(tf.keras.layers.Dropout): + """Applies Dropout to the input. + + Dropout consists in randomly setting + a fraction `rate` of input units to 0 at each update during training time, + which helps prevent overfitting. + + Arguments: + rate: Float between 0 and 1; fraction of the input units to drop. + """ + def call( - self, inputs: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None + self, inputs: tf.SparseTensor, training: Optional[Union[tf.Tensor, bool]] = None ) -> tf.Tensor: + """Apply dropout to sparse inputs. + + Arguments: + inputs: Input sparse tensor (of any rank). + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + + Returns: + Output of dropout layer. 
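# Illustrative usage sketch for SparseDropout: the layer keeps Dropout's
# `rate` argument, expects a tf.SparseTensor and drops a random fraction
# `rate` of its non-zero values during training (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.layers import SparseDropout

dense = tf.constant([[0.0, 1.0, 0.0, 2.0],
                     [3.0, 0.0, 4.0, 0.0]])
sparse_inputs = tf.sparse.from_dense(dense)  # (batch_size=2, input_dim=4)

sparse_dropout = SparseDropout(rate=0.5)
dropped = sparse_dropout(sparse_inputs, training=True)          # dropout applied
passed_through = sparse_dropout(sparse_inputs, training=False)  # unchanged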
+ """ + if not isinstance(inputs, tf.SparseTensor): + raise ValueError("Input tensor should be sparse.") + if training is None: training = K.learning_phase() @@ -34,7 +57,47 @@ def dropped_inputs() -> tf.Tensor: class DenseForSparse(tf.keras.layers.Dense): - """Dense layer for sparse input tensor.""" + """Dense layer for sparse input tensor. + + Just your regular densely-connected NN layer but for sparse tensors. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). + + Note: If the input to the layer has a rank greater than 2, then + it is flattened prior to the initial dot product with `kernel`. + + Arguments: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, units)`. + """ def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None: if reg_lambda > 0: @@ -45,6 +108,14 @@ def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None: super().__init__(kernel_regularizer=regularizer, **kwargs) def call(self, inputs: tf.SparseTensor) -> tf.Tensor: + """Apply dense layer to sparse inputs. + + Arguments: + inputs: Input sparse tensor (of any rank). + + Returns: + Output of dense layer. + """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -67,13 +138,56 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor: class DenseWithSparseWeights(tf.keras.layers.Dense): + """Just your regular densely-connected NN layer but with sparse weights. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). + It creates `kernel_mask` to set fraction of the `kernel` weights to zero. + + Note: If the input to the layer has a rank greater than 2, then + it is flattened prior to the initial dot product with `kernel`. + + Arguments: + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. 
+ If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, units)`. + """ + def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None: super().__init__(**kwargs) self.sparsity = sparsity def build(self, input_shape: tf.TensorShape) -> None: super().build(input_shape) - # create random mask to set some weights to 0 + # create random mask to set fraction of the `kernel` weights to zero kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1) kernel_mask = tf.cast( tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype @@ -83,13 +197,32 @@ def build(self, input_shape: tf.TensorShape) -> None: ) def call(self, inputs: tf.Tensor) -> tf.Tensor: - # set some weights to 0 according to precomputed mask + # set fraction of the `kernel` weights to zero according to precomputed mask self.kernel.assign(self.kernel * self.kernel_mask) return super().call(inputs) class Ffnn(tf.keras.layers.Layer): - """Create feed-forward network with hidden layers and name suffix.""" + """Feed-forward network layer. + + Arguments: + layer_sizes: List of integers with dimensionality of the layers. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + reg_lambda: Float, regularization factor. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + layer_name_suffix: Text added to the name of the layers. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., layer_sizes[-1])`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, layer_sizes[-1])`. + """ def __init__( self, @@ -125,7 +258,25 @@ def call( class Embed(tf.keras.layers.Layer): - """Create dense embedding layer with a name.""" + """Dense embedding layer. + + Arguments: + embed_dim: Positive integer, dimensionality of the output space. + reg_lambda: Float; regularization factor. + layer_name_suffix: Text added to the name of the layers. + similarity_type: Optional type of similarity measure to use, + either 'cosine' or 'inner'. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., embed_dim)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, embed_dim)`. 
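# Illustrative usage sketch for DenseWithSparseWeights and Ffnn; the Ffnn
# keyword names are assumed to match the argument names in its docstring and
# all values are examples, not verified defaults.
import tensorflow as tf
from rasa.utils.tensorflow.layers import DenseWithSparseWeights, Ffnn

inputs = tf.random.normal((2, 10))  # (batch_size, input_dim)

# roughly 80% of the kernel weights are zeroed by a fixed random mask
sparse_dense = DenseWithSparseWeights(units=16, sparsity=0.8)
hidden = sparse_dense(inputs)  # (2, 16)

# feed-forward block with two hidden layers of sizes 256 and 128
ffnn = Ffnn(
    layer_sizes=[256, 128],
    dropout_rate=0.2,
    reg_lambda=0.002,
    sparsity=0.8,
    layer_name_suffix="text",
)
outputs = ffnn(hidden)  # (2, 128)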
+ """ def __init__( self, @@ -160,6 +311,19 @@ def call(self, x: tf.Tensor) -> tf.Tensor: class InputMask(tf.keras.layers.Layer): + """The layer that masks 15% of the input. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, input_dim)`. + """ + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) @@ -225,6 +389,14 @@ def x_masked() -> tf.Tensor: class CRF(tf.keras.layers.Layer): + """CRF layer. + + Arguments: + num_tags: Positive integer, number of tags. + reg_lambda: Float; regularization factor. + name: Optional name of the layer. + """ + def __init__( self, num_tags: int, reg_lambda: float, name: Optional[Text] = None ) -> None: @@ -242,6 +414,17 @@ def build(self, input_shape: tf.TensorShape) -> None: self.built = True def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + """Decodes the highest scoring sequence of tags. + + Arguments: + logits: A [batch_size, max_seq_len, num_tags] tensor of + unary potentials. + sequence_lengths: A [batch_size] vector of true sequence lengths. + + Returns: + A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. + Contains the highest scoring tag indices. + """ pred_ids, _ = tfa.text.crf.crf_decode( logits, self.transition_params, sequence_lengths ) @@ -255,6 +438,19 @@ def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: def loss( self, logits: tf.Tensor, tag_indices: tf.Tensor, sequence_lengths: tf.Tensor ) -> tf.Tensor: + """Computes the log-likelihood of tag sequences in a CRF. + + Arguments: + logits: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + + Returns: + Negative mean log-likelihood of all examples, + given the sequence of tag indices. + """ log_likelihood, _ = tfa.text.crf.crf_log_likelihood( logits, tag_indices, sequence_lengths, self.transition_params ) @@ -262,6 +458,32 @@ def loss( class DotProductLoss(tf.keras.layers.Layer): + """Dot-product loss layer. + + Arguments: + num_neg: Positive integer, the number of incorrect labels; + the algorithm will minimize their similarity to the input. + loss_type: The type of the loss function, either 'softmax' or 'margin'. + mu_pos: Float, indicates how similar the algorithm should + try to make embedding vectors for correct labels; + should be 0.0 < ... < 1.0 for 'cosine' similarity type. + mu_neg: Float, maximum negative similarity for incorrect labels, + should be -1.0 < ... < 1.0 for 'cosine' similarity type. + use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes + maximum similarity over incorrect intent labels, + used only if 'loss_type' is set to 'margin'. + neg_lambda: Float, the scale of how important is to minimize + the maximum similarity between embeddings of different labels, + used only if 'loss_type' is set to 'margin'. + scale_loss: Boolean, if 'True' scale loss inverse proportionally to + the confidence of the correct prediction. + name: Optional name of the layer. + parallel_iterations: Positive integer, the number of iterations allowed + to run in parallel. 
+ same_sampling: Boolean, if 'True' sample same negative labels + for the whole batch. + """ + def __init__( self, num_neg: int, @@ -599,8 +821,21 @@ def call( all_labels: tf.Tensor, mask: Optional[tf.Tensor] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: - """Calculate loss and accuracy.""" - + """Calculate loss and accuracy. + + Arguments: + inputs_embed: Embedding tensor for the batch inputs. + labels_embed: Embedding tensor for the batch labels. + labels: Tensor representing batch labels. + all_labels_embed: Embedding tensor for the all labels. + all_labels: Tensor representing all labels. + mask: Optional tensor representing sequence mask, + contains `1` for inputs and `0` for padding. + + Returns: + loss: Total loss. + accuracy: Training accuracy. + """ ( pos_inputs_embed, pos_labels_embed, @@ -623,10 +858,10 @@ def call( mask, ) - acc = self._calc_accuracy(sim_pos, sim_neg_il) + accuracy = self._calc_accuracy(sim_pos, sim_neg_il) loss = self._chosen_loss( sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li, mask ) - return loss, acc + return loss, accuracy diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py index c7a309db1c38..51bfe3094508 100644 --- a/rasa/utils/tensorflow/transformer.py +++ b/rasa/utils/tensorflow/transformer.py @@ -10,6 +10,25 @@ # from https://www.tensorflow.org/tutorials/text/transformer # and https://github.com/tensorflow/tensor2tensor class MultiHeadAttention(tf.keras.layers.Layer): + """Multi-headed attention layer. + + Arguments: + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + """ + def __init__( self, units: int, @@ -44,19 +63,21 @@ def __init__( self._depth = units // self.num_heads # process queries - self._wq = DenseWithSparseWeights( + self._query_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process keys - self._wk = DenseWithSparseWeights( + self._key_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process values - self._wv = DenseWithSparseWeights( + self._value_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process attention output - self._dense = DenseWithSparseWeights(units=units, sparsity=sparsity) + self._output_dense_layer = DenseWithSparseWeights( + units=units, sparsity=sparsity + ) self._create_relative_embeddings() @@ -128,10 +149,15 @@ def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tens def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor: """Universal method to convert tensor from relative to absolute indexing. - x.shape = - (batch, num_heads, length, relative_length, depth) - or (batch, num_heads, length, relative_length) "Slides" relative embeddings by 45 degree. 
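# Simplified illustration of the 'softmax' variant of the dot-product loss in
# plain TensorFlow (the DotProductLoss layer additionally samples `num_neg`
# negative labels, supports the 'margin' loss and optional loss scaling):
# similarities between input and label embeddings serve as logits of a
# cross-entropy loss over all labels.
import tensorflow as tf

batch_size, num_labels, embed_dim = 2, 5, 20
inputs_embed = tf.math.l2_normalize(tf.random.normal((batch_size, embed_dim)), axis=-1)
labels_embed = tf.math.l2_normalize(tf.random.normal((num_labels, embed_dim)), axis=-1)
label_ids = tf.constant([3, 0])

# (batch_size, num_labels): sim[i, j] is the dot-product similarity of
# input i with label j
sim = tf.matmul(inputs_embed, labels_embed, transpose_b=True)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_ids, logits=sim)
)
accuracy = tf.reduce_mean(
    tf.cast(
        tf.equal(tf.argmax(sim, axis=-1, output_type=tf.int32), label_ids),
        tf.float32,
    )
)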
+ + Arguments: + x: A tensor of shape (batch, num_heads, length, relative_length, depth) + or (batch, num_heads, length, relative_length) + + Returns: + A tensor of shape (batch, num_heads, length, length, depth) + or (batch, num_heads, length, length) """ x_dim = len(x.shape) @@ -233,36 +259,39 @@ def droped_logits() -> tf.Tensor: def _scaled_dot_product_attention( self, - q: tf.Tensor, - k: tf.Tensor, - v: tf.Tensor, + query: tf.Tensor, + key: tf.Tensor, + value: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor, ) -> Tuple[tf.Tensor, tf.Tensor]: """Calculate the attention weights. - q, k, v must have matching leading dimensions. - k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v. - The mask has different shapes depending on its type(padding or look ahead) + + query, key, value must have matching leading dimensions. + key, value must have matching penultimate dimension, + i.e.: seq_len_k = seq_len_v. + The mask has different shapes depending on its type (padding or look ahead) but it must be broadcastable for addition. - Args: - q: query shape == (..., seq_len_q, depth) - k: key shape == (..., seq_len_k, depth) - v: value shape == (..., seq_len_v, depth_v) - pad_mask: Float tensor with shape broadcastable - to (..., seq_len_q, seq_len_k). Defaults to None. + Arguments: + query: A tensor with shape (..., length, depth). + key: A tensor with shape (..., length, depth). + value: A tensor with shape (..., length, depth). + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. Returns: - output, attention_weights + output: A tensor with shape (..., length, depth). + attention_weights: A tensor with shape (..., length, length). """ - matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k) + matmul_qk = tf.matmul(query, key, transpose_b=True) # (..., length, length) if self.use_key_relative_position: - matmul_qk += self._matmul_with_relative_keys(q) + matmul_qk += self._matmul_with_relative_keys(query) # scale matmul_qk - dk = tf.cast(tf.shape(k)[-1], tf.float32) + dk = tf.cast(tf.shape(key)[-1], tf.float32) logits = matmul_qk / tf.math.sqrt(dk) # add the mask to the scaled tensor. @@ -273,13 +302,11 @@ def _scaled_dot_product_attention( if self.attention_dropout_rate > 0: logits = self._drop_attention_logits(logits, pad_mask, training) - # softmax is normalized on the last axis (seq_len_k) so that the scores + # softmax is normalized on the last axis (length) so that the scores # add up to 1. - attention_weights = tf.nn.softmax( - logits, axis=-1 - ) # (..., seq_len_q, seq_len_k) + attention_weights = tf.nn.softmax(logits, axis=-1) # (..., length, length) - output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v) + output = tf.matmul(attention_weights, value) # (..., length, depth) if self.use_value_relative_position: output += self._matmul_with_relative_values(attention_weights) @@ -289,7 +316,7 @@ def _split_heads(self, x: tf.Tensor) -> tf.Tensor: """Split the last dimension into (num_heads, depth). Transpose the result such that the shape is - (batch_size, num_heads, seq_len, depth) + (batch_size, num_heads, length, depth) """ x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth)) @@ -299,50 +326,86 @@ def _combine_heads(self, x: tf.Tensor) -> tf.Tensor: """Inverse of split_heads. 
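# Minimal sketch of the scaled dot-product attention computed above, in plain
# TensorFlow and without the optional relative embeddings, padding mask and
# attention dropout.
import tensorflow as tf

batch_size, num_heads, length, depth = 2, 4, 7, 16
query = tf.random.normal((batch_size, num_heads, length, depth))
key = tf.random.normal((batch_size, num_heads, length, depth))
value = tf.random.normal((batch_size, num_heads, length, depth))

logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(
    tf.cast(depth, tf.float32)
)
attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., length, length)
output = tf.matmul(attention_weights, value)        # (..., length, depth)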
Args: - x: a Tensor with shape [batch, num_heads, length, channels / num_heads] + x: A Tensor with shape [batch, num_heads, length, units / num_heads] Returns: - a Tensor with shape [batch, length, channels] + A Tensor with shape [batch, length, units] """ - # (batch_size, seq_len_q, num_heads, depth) + # (batch_size, length, num_heads, depth) x = tf.transpose(x, perm=[0, 2, 1, 3]) - # (batch_size, seq_len_q, units) + # (batch_size, length, units) return tf.reshape(x, (tf.shape(x)[0], -1, self.units)) # noinspection PyMethodOverriding def call( self, - v: tf.Tensor, - k: tf.Tensor, - q: tf.Tensor, + query_input: tf.Tensor, + source_input: tf.Tensor, pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: + """Apply attention mechanism to query_input and source_input. + + Arguments: + query_input: A tensor with shape [batch_size, length, input_size]. + source_input: A tensor with shape [batch_size, length, input_size]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Attention layer output with shape [batch_size, length, units] + """ if training is None: training = K.learning_phase() - q = self._wq(q) # (batch_size, seq_len_q, units) - k = self._wk(k) # (batch_size, seq_len_k, units) - v = self._wv(v) # (batch_size, seq_len_v, units) + query = self._query_dense_layer(query_input) # (batch_size, length, units) + key = self._key_dense_layer(source_input) # (batch_size, length, units) + value = self._value_dense_layer(source_input) # (batch_size, length, units) - q = self._split_heads(q) # (batch_size, num_heads, seq_len_q, depth) - k = self._split_heads(k) # (batch_size, num_heads, seq_len_k, depth) - v = self._split_heads(v) # (batch_size, num_heads, seq_len_v, depth) + query = self._split_heads(query) # (batch_size, num_heads, length, depth) + key = self._split_heads(key) # (batch_size, num_heads, length, depth) + value = self._split_heads(value) # (batch_size, num_heads, length, depth) attention, attention_weights = self._scaled_dot_product_attention( - q, k, v, pad_mask, training + query, key, value, pad_mask, training ) - # attention.shape == (batch_size, num_heads, seq_len_q, depth) - # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k) - attention = self._combine_heads(attention) # (batch_size, seq_len_q, units) + # attention.shape == (batch_size, num_heads, length, depth) + # attention_weights.shape == (batch_size, num_heads, length, length) + attention = self._combine_heads(attention) # (batch_size, length, units) - output = self._dense(attention) # (batch_size, seq_len_q, units) + output = self._output_dense_layer(attention) # (batch_size, length, units) return output, attention_weights class TransformerEncoderLayer(tf.keras.layers.Layer): + """Transformer encoder layer. + + The layer is composed of the sublayers: + 1. Self-attention layer + 2. Feed-forward network (which is 2 fully-connected layers) + + Arguments: + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + filter_units: Positive integer, output dim of the first ffn hidden layer. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. 
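# Illustrative usage sketch for MultiHeadAttention: keyword names are taken
# from the docstring above and the remaining constructor arguments are assumed
# to have defaults; self-attention is expressed by passing the same tensor as
# query_input and source_input (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import MultiHeadAttention

x = tf.random.normal((2, 7, 32))  # (batch_size, length, input_size)

mha = MultiHeadAttention(
    units=32, num_heads=4, attention_dropout_rate=0.0, sparsity=0.8
)
output, attention_weights = mha(x, x, training=False)
# output: (2, 7, 32), attention_weights: (2, 4, 7, 7)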
+ unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + """ + def __init__( self, units: int, @@ -377,11 +440,11 @@ def __init__( tf.keras.layers.LayerNormalization(epsilon=1e-6), DenseWithSparseWeights( units=filter_units, activation=tfa.activations.gelu, sparsity=sparsity - ), # (batch_size, seq_len, filter_units) + ), # (batch_size, length, filter_units) tf.keras.layers.Dropout(dropout_rate), DenseWithSparseWeights( units=units, sparsity=sparsity - ), # (batch_size, seq_len, units) + ), # (batch_size, length, units) tf.keras.layers.Dropout(dropout_rate), ] @@ -391,25 +454,60 @@ def call( pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> tf.Tensor: + """Apply transformer encoder layer. + + Arguments: + x: A tensor with shape [batch_size, length, units]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Transformer encoder layer output with shape [batch_size, length, units] + """ if training is None: training = K.learning_phase() - x_norm = self._layer_norm(x) # (batch_size, seq_len, units) - attn_out, _ = self._mha( - x_norm, x_norm, x_norm, pad_mask=pad_mask, training=training - ) + x_norm = self._layer_norm(x) # (batch_size, length, units) + attn_out, _ = self._mha(x_norm, x_norm, pad_mask=pad_mask, training=training) attn_out = self._dropout(attn_out, training=training) x += attn_out - ffn_out = x # (batch_size, seq_len, units) + ffn_out = x # (batch_size, length, units) for layer in self._ffn_layers: ffn_out = layer(ffn_out, training=training) x += ffn_out - return x # (batch_size, seq_len, units) + return x # (batch_size, length, units) class TransformerEncoder(tf.keras.layers.Layer): + """Transformer encoder. + + Encoder stack is made up of `num_layers` identical encoder layers. + + Arguments: + num_layers: Positive integer, number of encoder layers. + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + filter_units: Positive integer, output dim of the first ffn hidden layer. + reg_lambda: Float, regularization factor. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + name: Optional name of the layer. + """ + def __init__( self, num_layers: int, @@ -494,22 +592,33 @@ def call( pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> tf.Tensor: + """Apply transformer encoder. 
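# Illustrative usage sketch for TransformerEncoderLayer: keyword names are
# taken from the docstring above and the remaining constructor arguments are
# assumed to have defaults; the input must already have `units` features
# because of the residual connections (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import TransformerEncoderLayer

x = tf.random.normal((2, 7, 32))  # (batch_size, length, units)

encoder_layer = TransformerEncoderLayer(
    units=32,
    num_heads=4,
    filter_units=64,
    dropout_rate=0.1,
    attention_dropout_rate=0.0,
)
out = encoder_layer(x, training=False)  # (2, 7, 32)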
+ + Arguments: + x: A tensor with shape [batch_size, length, input_size]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Transformer encoder output with shape [batch_size, length, units] + """ # adding embedding and position encoding. - x = self._embedding(x) # (batch_size, seq_len, units) + x = self._embedding(x) # (batch_size, length, units) x *= tf.math.sqrt(tf.cast(self.units, tf.float32)) x += self._positional_encoding(tf.shape(x)[1]) x = self._dropout(x, training=training) if pad_mask is not None: - pad_mask = tf.squeeze(pad_mask, -1) # (batch_size, seq_len) + pad_mask = tf.squeeze(pad_mask, -1) # (batch_size, length) pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :] - # pad_mask.shape = (batch_size, 1, 1, seq_len) + # pad_mask.shape = (batch_size, 1, 1, length) if self.unidirectional: # add look ahead pad mask to emulate unidirectional behavior pad_mask = tf.minimum( 1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1]) - ) # (batch_size, 1, seq_len, seq_len) + ) # (batch_size, 1, length, length) for layer in self._enc_layers: x = layer(x, pad_mask=pad_mask, training=training) @@ -517,4 +626,4 @@ def call( # if normalization is done in encoding layers, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. - return self._layer_norm(x) # (batch_size, seq_len, units) + return self._layer_norm(x) # (batch_size, length, units) From 2bb20a05a6a8e04e83c6d93e0cb8aa215473ed34 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 10 Mar 2020 15:53:28 +0100 Subject: [PATCH 2/3] update comments --- rasa/core/policies/ted_policy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 7251dd715051..bd7b8625899b 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -106,7 +106,7 @@ class TEDPolicy(Policy): NUM_HEADS: 4, # If 'True' use key relative embeddings in attention KEY_RELATIVE_ATTENTION: False, - # If 'True' use key relative embeddings in attention + # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, # Max position for relative embeddings MAX_RELATIVE_POSITION: None, @@ -144,7 +144,8 @@ class TEDPolicy(Policy): # If 'True' the algorithm only minimizes maximum similarity over # incorrect intent labels, used only if 'loss_type' is set to 'margin'. USE_MAX_NEG_SIM: True, - # Scale loss inverse proportionally to confidence of correct prediction + # If 'True' scale loss inverse proportionally to the confidence + # of the correct prediction SCALE_LOSS: True, # ## Regularization parameters # The scale of regularization From e97877f5ebf92a0c6223cf69785efc7c0a0644b6 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 11 Mar 2020 11:17:13 +0100 Subject: [PATCH 3/3] fix docstrings --- rasa/utils/tensorflow/layers.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index b2a6a92a5a61..71c42d0db3b7 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -32,6 +32,9 @@ def call( Returns: Output of dropout layer. 
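# Illustrative usage sketch for the full TransformerEncoder: keyword names are
# taken from the docstring above and the remaining constructor arguments are
# assumed to have defaults; pad_mask is omitted here, see call() above for how
# an optional padding mask is handled (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import TransformerEncoder

x = tf.random.normal((2, 7, 20))  # (batch_size, length, input_size)

encoder = TransformerEncoder(
    num_layers=2,
    units=32,
    num_heads=4,
    filter_units=64,
    reg_lambda=0.002,
    dropout_rate=0.1,
    attention_dropout_rate=0.0,
    sparsity=0.8,
    unidirectional=False,
)
encoded = encoder(x, training=False)  # (2, 7, 32)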
+ + Raises: + A ValueError if inputs is not a sparse tensor """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -79,8 +82,7 @@ class DenseForSparse(tf.keras.layers.Dense): use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. + reg_lambda: Float, regularization factor. bias_regularizer: Regularizer function applied to the bias vector. activity_regularizer: Regularizer function applied to the output of the layer (its "activation").. @@ -115,6 +117,9 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor: Returns: Output of dense layer. + + Raises: + A ValueError if inputs is not a sparse tensor """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -343,7 +348,18 @@ def call( mask: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: - """Randomly mask input sequences.""" + """Randomly mask input sequences. + + Arguments: + x: Input sequence tensor of rank 3. + mask: A tensor representing sequence mask, + contains `1` for inputs and `0` for padding. + training: Python boolean indicating whether the layer should behave in + training mode (mask inputs) or in inference mode (doing nothing). + + Returns: + A tuple of masked inputs and boolean mask. + """ if training is None: training = K.learning_phase()
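# Illustrative usage sketch for InputMask, assuming a sequence mask of shape
# (batch_size, length, 1) with 1 for real tokens and 0 for padding; during
# training roughly 15% of the positions are masked, similar to masked
# language modelling.
import tensorflow as tf
from rasa.utils.tensorflow.layers import InputMask

x = tf.random.normal((2, 7, 20))  # (batch_size, length, input_dim)
mask = tf.ones((2, 7, 1))

input_mask = InputMask()
x_masked, lm_mask_bool = input_mask(x, mask, training=True)
# x_masked keeps the shape of x; lm_mask_bool marks the masked positions.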