From 68d4dbf1238610ff7b73fb929ffffad7a53bf719 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 10 Mar 2020 15:50:36 +0100 Subject: [PATCH 1/3] add docstrings to tf classes --- rasa/core/policies/ted_policy.py | 3 +- rasa/nlu/classifiers/diet_classifier.py | 8 +- rasa/utils/tensorflow/layers.py | 255 +++++++++++++++++++++++- rasa/utils/tensorflow/transformer.py | 229 +++++++++++++++------ 4 files changed, 421 insertions(+), 74 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 3a78c873f393..7251dd715051 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -150,7 +150,8 @@ class TEDPolicy(Policy): # The scale of regularization REGULARIZATION_CONSTANT: 0.001, # The scale of how important is to minimize the maximum similarity - # between embeddings of different labels. + # between embeddings of different labels, + # used only if 'loss_type' is set to 'margin'. NEGATIVE_MARGIN_SCALE: 0.8, # Dropout rate for embedding layers of dialogue features. DROP_RATE_DIALOGUE: 0.1, diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c4a0574bd10a..778ac9f4b015 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -125,7 +125,7 @@ def required_components(cls) -> List[Type[Component]]: NUM_HEADS: 4, # If 'True' use key relative embeddings in attention KEY_RELATIVE_ATTENTION: False, - # If 'True' use key relative embeddings in attention + # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, # Max position for relative embeddings MAX_RELATIVE_POSITION: None, @@ -169,13 +169,15 @@ def required_components(cls) -> List[Type[Component]]: # If 'True' the algorithm only minimizes maximum similarity over # incorrect intent labels, used only if 'loss_type' is set to 'margin'. USE_MAX_NEG_SIM: True, - # Scale loss inverse proportionally to confidence of correct prediction + # If 'True' scale loss inverse proportionally to the confidence + # of the correct prediction SCALE_LOSS: True, # ## Regularization parameters # The scale of regularization REGULARIZATION_CONSTANT: 0.002, # The scale of how important is to minimize the maximum similarity - # between embeddings of different labels. + # between embeddings of different labels, + # used only if 'loss_type' is set to 'margin'. NEGATIVE_MARGIN_SCALE: 0.8, # Dropout rate for encoder DROP_RATE: 0.2, diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 55c0ddbe3a0f..b2a6a92a5a61 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -10,9 +10,32 @@ class SparseDropout(tf.keras.layers.Dropout): + """Applies Dropout to the input. + + Dropout consists in randomly setting + a fraction `rate` of input units to 0 at each update during training time, + which helps prevent overfitting. + + Arguments: + rate: Float between 0 and 1; fraction of the input units to drop. + """ + def call( - self, inputs: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None + self, inputs: tf.SparseTensor, training: Optional[Union[tf.Tensor, bool]] = None ) -> tf.Tensor: + """Apply dropout to sparse inputs. + + Arguments: + inputs: Input sparse tensor (of any rank). + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + + Returns: + Output of dropout layer. 
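# Illustrative usage sketch for SparseDropout: the layer keeps Dropout's
# `rate` argument, expects a tf.SparseTensor and drops a random fraction
# `rate` of its non-zero values during training (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.layers import SparseDropout

dense = tf.constant([[0.0, 1.0, 0.0, 2.0],
                     [3.0, 0.0, 4.0, 0.0]])
sparse_inputs = tf.sparse.from_dense(dense)  # (batch_size=2, input_dim=4)

sparse_dropout = SparseDropout(rate=0.5)
dropped = sparse_dropout(sparse_inputs, training=True)          # dropout applied
passed_through = sparse_dropout(sparse_inputs, training=False)  # unchanged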
+ """ + if not isinstance(inputs, tf.SparseTensor): + raise ValueError("Input tensor should be sparse.") + if training is None: training = K.learning_phase() @@ -34,7 +57,47 @@ def dropped_inputs() -> tf.Tensor: class DenseForSparse(tf.keras.layers.Dense): - """Dense layer for sparse input tensor.""" + """Dense layer for sparse input tensor. + + Just your regular densely-connected NN layer but for sparse tensors. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). + + Note: If the input to the layer has a rank greater than 2, then + it is flattened prior to the initial dot product with `kernel`. + + Arguments: + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, units)`. + """ def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None: if reg_lambda > 0: @@ -45,6 +108,14 @@ def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None: super().__init__(kernel_regularizer=regularizer, **kwargs) def call(self, inputs: tf.SparseTensor) -> tf.Tensor: + """Apply dense layer to sparse inputs. + + Arguments: + inputs: Input sparse tensor (of any rank). + + Returns: + Output of dense layer. + """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -67,13 +138,56 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor: class DenseWithSparseWeights(tf.keras.layers.Dense): + """Just your regular densely-connected NN layer but with sparse weights. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). + It creates `kernel_mask` to set fraction of the `kernel` weights to zero. + + Note: If the input to the layer has a rank greater than 2, then + it is flattened prior to the initial dot product with `kernel`. + + Arguments: + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + units: Positive integer, dimensionality of the output space. + activation: Activation function to use. 
+ If you don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation").. + kernel_constraint: Constraint function applied to + the `kernel` weights matrix. + bias_constraint: Constraint function applied to the bias vector. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, units)`. + """ + def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None: super().__init__(**kwargs) self.sparsity = sparsity def build(self, input_shape: tf.TensorShape) -> None: super().build(input_shape) - # create random mask to set some weights to 0 + # create random mask to set fraction of the `kernel` weights to zero kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1) kernel_mask = tf.cast( tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype @@ -83,13 +197,32 @@ def build(self, input_shape: tf.TensorShape) -> None: ) def call(self, inputs: tf.Tensor) -> tf.Tensor: - # set some weights to 0 according to precomputed mask + # set fraction of the `kernel` weights to zero according to precomputed mask self.kernel.assign(self.kernel * self.kernel_mask) return super().call(inputs) class Ffnn(tf.keras.layers.Layer): - """Create feed-forward network with hidden layers and name suffix.""" + """Feed-forward network layer. + + Arguments: + layer_sizes: List of integers with dimensionality of the layers. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + reg_lambda: Float, regularization factor. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + layer_name_suffix: Text added to the name of the layers. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., layer_sizes[-1])`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, layer_sizes[-1])`. + """ def __init__( self, @@ -125,7 +258,25 @@ def call( class Embed(tf.keras.layers.Layer): - """Create dense embedding layer with a name.""" + """Dense embedding layer. + + Arguments: + embed_dim: Positive integer, dimensionality of the output space. + reg_lambda: Float; regularization factor. + layer_name_suffix: Text added to the name of the layers. + similarity_type: Optional type of similarity measure to use, + either 'cosine' or 'inner'. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., embed_dim)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, embed_dim)`. 
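# Illustrative usage sketch for DenseWithSparseWeights and Ffnn; the Ffnn
# keyword names are assumed to match the argument names in its docstring and
# all values are examples, not verified defaults.
import tensorflow as tf
from rasa.utils.tensorflow.layers import DenseWithSparseWeights, Ffnn

inputs = tf.random.normal((2, 10))  # (batch_size, input_dim)

# roughly 80% of the kernel weights are zeroed by a fixed random mask
sparse_dense = DenseWithSparseWeights(units=16, sparsity=0.8)
hidden = sparse_dense(inputs)  # (2, 16)

# feed-forward block with two hidden layers of sizes 256 and 128
ffnn = Ffnn(
    layer_sizes=[256, 128],
    dropout_rate=0.2,
    reg_lambda=0.002,
    sparsity=0.8,
    layer_name_suffix="text",
)
outputs = ffnn(hidden)  # (2, 128)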
+ """ def __init__( self, @@ -160,6 +311,19 @@ def call(self, x: tf.Tensor) -> tf.Tensor: class InputMask(tf.keras.layers.Layer): + """The layer that masks 15% of the input. + + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + The most common situation would be + a 2D input with shape `(batch_size, input_dim)`. + + Output shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. + For instance, for a 2D input with shape `(batch_size, input_dim)`, + the output would have shape `(batch_size, input_dim)`. + """ + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) @@ -225,6 +389,14 @@ def x_masked() -> tf.Tensor: class CRF(tf.keras.layers.Layer): + """CRF layer. + + Arguments: + num_tags: Positive integer, number of tags. + reg_lambda: Float; regularization factor. + name: Optional name of the layer. + """ + def __init__( self, num_tags: int, reg_lambda: float, name: Optional[Text] = None ) -> None: @@ -242,6 +414,17 @@ def build(self, input_shape: tf.TensorShape) -> None: self.built = True def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + """Decodes the highest scoring sequence of tags. + + Arguments: + logits: A [batch_size, max_seq_len, num_tags] tensor of + unary potentials. + sequence_lengths: A [batch_size] vector of true sequence lengths. + + Returns: + A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. + Contains the highest scoring tag indices. + """ pred_ids, _ = tfa.text.crf.crf_decode( logits, self.transition_params, sequence_lengths ) @@ -255,6 +438,19 @@ def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: def loss( self, logits: tf.Tensor, tag_indices: tf.Tensor, sequence_lengths: tf.Tensor ) -> tf.Tensor: + """Computes the log-likelihood of tag sequences in a CRF. + + Arguments: + logits: A [batch_size, max_seq_len, num_tags] tensor of unary potentials + to use as input to the CRF layer. + tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which + we compute the log-likelihood. + sequence_lengths: A [batch_size] vector of true sequence lengths. + + Returns: + Negative mean log-likelihood of all examples, + given the sequence of tag indices. + """ log_likelihood, _ = tfa.text.crf.crf_log_likelihood( logits, tag_indices, sequence_lengths, self.transition_params ) @@ -262,6 +458,32 @@ def loss( class DotProductLoss(tf.keras.layers.Layer): + """Dot-product loss layer. + + Arguments: + num_neg: Positive integer, the number of incorrect labels; + the algorithm will minimize their similarity to the input. + loss_type: The type of the loss function, either 'softmax' or 'margin'. + mu_pos: Float, indicates how similar the algorithm should + try to make embedding vectors for correct labels; + should be 0.0 < ... < 1.0 for 'cosine' similarity type. + mu_neg: Float, maximum negative similarity for incorrect labels, + should be -1.0 < ... < 1.0 for 'cosine' similarity type. + use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes + maximum similarity over incorrect intent labels, + used only if 'loss_type' is set to 'margin'. + neg_lambda: Float, the scale of how important is to minimize + the maximum similarity between embeddings of different labels, + used only if 'loss_type' is set to 'margin'. + scale_loss: Boolean, if 'True' scale loss inverse proportionally to + the confidence of the correct prediction. + name: Optional name of the layer. + parallel_iterations: Positive integer, the number of iterations allowed + to run in parallel. 
+ same_sampling: Boolean, if 'True' sample same negative labels + for the whole batch. + """ + def __init__( self, num_neg: int, @@ -599,8 +821,21 @@ def call( all_labels: tf.Tensor, mask: Optional[tf.Tensor] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: - """Calculate loss and accuracy.""" - + """Calculate loss and accuracy. + + Arguments: + inputs_embed: Embedding tensor for the batch inputs. + labels_embed: Embedding tensor for the batch labels. + labels: Tensor representing batch labels. + all_labels_embed: Embedding tensor for the all labels. + all_labels: Tensor representing all labels. + mask: Optional tensor representing sequence mask, + contains `1` for inputs and `0` for padding. + + Returns: + loss: Total loss. + accuracy: Training accuracy. + """ ( pos_inputs_embed, pos_labels_embed, @@ -623,10 +858,10 @@ def call( mask, ) - acc = self._calc_accuracy(sim_pos, sim_neg_il) + accuracy = self._calc_accuracy(sim_pos, sim_neg_il) loss = self._chosen_loss( sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li, mask ) - return loss, acc + return loss, accuracy diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py index c7a309db1c38..51bfe3094508 100644 --- a/rasa/utils/tensorflow/transformer.py +++ b/rasa/utils/tensorflow/transformer.py @@ -10,6 +10,25 @@ # from https://www.tensorflow.org/tutorials/text/transformer # and https://github.com/tensorflow/tensor2tensor class MultiHeadAttention(tf.keras.layers.Layer): + """Multi-headed attention layer. + + Arguments: + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + """ + def __init__( self, units: int, @@ -44,19 +63,21 @@ def __init__( self._depth = units // self.num_heads # process queries - self._wq = DenseWithSparseWeights( + self._query_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process keys - self._wk = DenseWithSparseWeights( + self._key_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process values - self._wv = DenseWithSparseWeights( + self._value_dense_layer = DenseWithSparseWeights( units=units, use_bias=False, sparsity=sparsity ) # process attention output - self._dense = DenseWithSparseWeights(units=units, sparsity=sparsity) + self._output_dense_layer = DenseWithSparseWeights( + units=units, sparsity=sparsity + ) self._create_relative_embeddings() @@ -128,10 +149,15 @@ def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tens def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor: """Universal method to convert tensor from relative to absolute indexing. - x.shape = - (batch, num_heads, length, relative_length, depth) - or (batch, num_heads, length, relative_length) "Slides" relative embeddings by 45 degree. 
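# Simplified illustration of the 'softmax' variant of the dot-product loss in
# plain TensorFlow (the DotProductLoss layer additionally samples `num_neg`
# negative labels, supports the 'margin' loss and optional loss scaling):
# similarities between input and label embeddings serve as logits of a
# cross-entropy loss over all labels.
import tensorflow as tf

batch_size, num_labels, embed_dim = 2, 5, 20
inputs_embed = tf.math.l2_normalize(tf.random.normal((batch_size, embed_dim)), axis=-1)
labels_embed = tf.math.l2_normalize(tf.random.normal((num_labels, embed_dim)), axis=-1)
label_ids = tf.constant([3, 0])

# (batch_size, num_labels): sim[i, j] is the dot-product similarity of
# input i with label j
sim = tf.matmul(inputs_embed, labels_embed, transpose_b=True)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_ids, logits=sim)
)
accuracy = tf.reduce_mean(
    tf.cast(
        tf.equal(tf.argmax(sim, axis=-1, output_type=tf.int32), label_ids),
        tf.float32,
    )
)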
+ + Arguments: + x: A tensor of shape (batch, num_heads, length, relative_length, depth) + or (batch, num_heads, length, relative_length) + + Returns: + A tensor of shape (batch, num_heads, length, length, depth) + or (batch, num_heads, length, length) """ x_dim = len(x.shape) @@ -233,36 +259,39 @@ def droped_logits() -> tf.Tensor: def _scaled_dot_product_attention( self, - q: tf.Tensor, - k: tf.Tensor, - v: tf.Tensor, + query: tf.Tensor, + key: tf.Tensor, + value: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor, ) -> Tuple[tf.Tensor, tf.Tensor]: """Calculate the attention weights. - q, k, v must have matching leading dimensions. - k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v. - The mask has different shapes depending on its type(padding or look ahead) + + query, key, value must have matching leading dimensions. + key, value must have matching penultimate dimension, + i.e.: seq_len_k = seq_len_v. + The mask has different shapes depending on its type (padding or look ahead) but it must be broadcastable for addition. - Args: - q: query shape == (..., seq_len_q, depth) - k: key shape == (..., seq_len_k, depth) - v: value shape == (..., seq_len_v, depth_v) - pad_mask: Float tensor with shape broadcastable - to (..., seq_len_q, seq_len_k). Defaults to None. + Arguments: + query: A tensor with shape (..., length, depth). + key: A tensor with shape (..., length, depth). + value: A tensor with shape (..., length, depth). + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. Returns: - output, attention_weights + output: A tensor with shape (..., length, depth). + attention_weights: A tensor with shape (..., length, length). """ - matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k) + matmul_qk = tf.matmul(query, key, transpose_b=True) # (..., length, length) if self.use_key_relative_position: - matmul_qk += self._matmul_with_relative_keys(q) + matmul_qk += self._matmul_with_relative_keys(query) # scale matmul_qk - dk = tf.cast(tf.shape(k)[-1], tf.float32) + dk = tf.cast(tf.shape(key)[-1], tf.float32) logits = matmul_qk / tf.math.sqrt(dk) # add the mask to the scaled tensor. @@ -273,13 +302,11 @@ def _scaled_dot_product_attention( if self.attention_dropout_rate > 0: logits = self._drop_attention_logits(logits, pad_mask, training) - # softmax is normalized on the last axis (seq_len_k) so that the scores + # softmax is normalized on the last axis (length) so that the scores # add up to 1. - attention_weights = tf.nn.softmax( - logits, axis=-1 - ) # (..., seq_len_q, seq_len_k) + attention_weights = tf.nn.softmax(logits, axis=-1) # (..., length, length) - output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v) + output = tf.matmul(attention_weights, value) # (..., length, depth) if self.use_value_relative_position: output += self._matmul_with_relative_values(attention_weights) @@ -289,7 +316,7 @@ def _split_heads(self, x: tf.Tensor) -> tf.Tensor: """Split the last dimension into (num_heads, depth). Transpose the result such that the shape is - (batch_size, num_heads, seq_len, depth) + (batch_size, num_heads, length, depth) """ x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth)) @@ -299,50 +326,86 @@ def _combine_heads(self, x: tf.Tensor) -> tf.Tensor: """Inverse of split_heads. 
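# Minimal sketch of the scaled dot-product attention computed above, in plain
# TensorFlow and without the optional relative embeddings, padding mask and
# attention dropout.
import tensorflow as tf

batch_size, num_heads, length, depth = 2, 4, 7, 16
query = tf.random.normal((batch_size, num_heads, length, depth))
key = tf.random.normal((batch_size, num_heads, length, depth))
value = tf.random.normal((batch_size, num_heads, length, depth))

logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(
    tf.cast(depth, tf.float32)
)
attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., length, length)
output = tf.matmul(attention_weights, value)        # (..., length, depth)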
Args: - x: a Tensor with shape [batch, num_heads, length, channels / num_heads] + x: A Tensor with shape [batch, num_heads, length, units / num_heads] Returns: - a Tensor with shape [batch, length, channels] + A Tensor with shape [batch, length, units] """ - # (batch_size, seq_len_q, num_heads, depth) + # (batch_size, length, num_heads, depth) x = tf.transpose(x, perm=[0, 2, 1, 3]) - # (batch_size, seq_len_q, units) + # (batch_size, length, units) return tf.reshape(x, (tf.shape(x)[0], -1, self.units)) # noinspection PyMethodOverriding def call( self, - v: tf.Tensor, - k: tf.Tensor, - q: tf.Tensor, + query_input: tf.Tensor, + source_input: tf.Tensor, pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: + """Apply attention mechanism to query_input and source_input. + + Arguments: + query_input: A tensor with shape [batch_size, length, input_size]. + source_input: A tensor with shape [batch_size, length, input_size]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Attention layer output with shape [batch_size, length, units] + """ if training is None: training = K.learning_phase() - q = self._wq(q) # (batch_size, seq_len_q, units) - k = self._wk(k) # (batch_size, seq_len_k, units) - v = self._wv(v) # (batch_size, seq_len_v, units) + query = self._query_dense_layer(query_input) # (batch_size, length, units) + key = self._key_dense_layer(source_input) # (batch_size, length, units) + value = self._value_dense_layer(source_input) # (batch_size, length, units) - q = self._split_heads(q) # (batch_size, num_heads, seq_len_q, depth) - k = self._split_heads(k) # (batch_size, num_heads, seq_len_k, depth) - v = self._split_heads(v) # (batch_size, num_heads, seq_len_v, depth) + query = self._split_heads(query) # (batch_size, num_heads, length, depth) + key = self._split_heads(key) # (batch_size, num_heads, length, depth) + value = self._split_heads(value) # (batch_size, num_heads, length, depth) attention, attention_weights = self._scaled_dot_product_attention( - q, k, v, pad_mask, training + query, key, value, pad_mask, training ) - # attention.shape == (batch_size, num_heads, seq_len_q, depth) - # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k) - attention = self._combine_heads(attention) # (batch_size, seq_len_q, units) + # attention.shape == (batch_size, num_heads, length, depth) + # attention_weights.shape == (batch_size, num_heads, length, length) + attention = self._combine_heads(attention) # (batch_size, length, units) - output = self._dense(attention) # (batch_size, seq_len_q, units) + output = self._output_dense_layer(attention) # (batch_size, length, units) return output, attention_weights class TransformerEncoderLayer(tf.keras.layers.Layer): + """Transformer encoder layer. + + The layer is composed of the sublayers: + 1. Self-attention layer + 2. Feed-forward network (which is 2 fully-connected layers) + + Arguments: + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + filter_units: Positive integer, output dim of the first ffn hidden layer. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. 
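# Illustrative usage sketch for MultiHeadAttention: keyword names are taken
# from the docstring above and the remaining constructor arguments are assumed
# to have defaults; self-attention is expressed by passing the same tensor as
# query_input and source_input (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import MultiHeadAttention

x = tf.random.normal((2, 7, 32))  # (batch_size, length, input_size)

mha = MultiHeadAttention(
    units=32, num_heads=4, attention_dropout_rate=0.0, sparsity=0.8
)
output, attention_weights = mha(x, x, training=False)
# output: (2, 7, 32), attention_weights: (2, 4, 7, 7)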
+ unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + """ + def __init__( self, units: int, @@ -377,11 +440,11 @@ def __init__( tf.keras.layers.LayerNormalization(epsilon=1e-6), DenseWithSparseWeights( units=filter_units, activation=tfa.activations.gelu, sparsity=sparsity - ), # (batch_size, seq_len, filter_units) + ), # (batch_size, length, filter_units) tf.keras.layers.Dropout(dropout_rate), DenseWithSparseWeights( units=units, sparsity=sparsity - ), # (batch_size, seq_len, units) + ), # (batch_size, length, units) tf.keras.layers.Dropout(dropout_rate), ] @@ -391,25 +454,60 @@ def call( pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> tf.Tensor: + """Apply transformer encoder layer. + + Arguments: + x: A tensor with shape [batch_size, length, units]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Transformer encoder layer output with shape [batch_size, length, units] + """ if training is None: training = K.learning_phase() - x_norm = self._layer_norm(x) # (batch_size, seq_len, units) - attn_out, _ = self._mha( - x_norm, x_norm, x_norm, pad_mask=pad_mask, training=training - ) + x_norm = self._layer_norm(x) # (batch_size, length, units) + attn_out, _ = self._mha(x_norm, x_norm, pad_mask=pad_mask, training=training) attn_out = self._dropout(attn_out, training=training) x += attn_out - ffn_out = x # (batch_size, seq_len, units) + ffn_out = x # (batch_size, length, units) for layer in self._ffn_layers: ffn_out = layer(ffn_out, training=training) x += ffn_out - return x # (batch_size, seq_len, units) + return x # (batch_size, length, units) class TransformerEncoder(tf.keras.layers.Layer): + """Transformer encoder. + + Encoder stack is made up of `num_layers` identical encoder layers. + + Arguments: + num_layers: Positive integer, number of encoder layers. + units: Positive integer, output dim of hidden layer. + num_heads: Positive integer, number of heads + to repeat the same attention structure. + filter_units: Positive integer, output dim of the first ffn hidden layer. + reg_lambda: Float, regularization factor. + dropout_rate: Float between 0 and 1; fraction of the input units to drop. + attention_dropout_rate: Float, dropout rate inside attention for training. + sparsity: Float between 0 and 1. Fraction of the `kernel` + weights to set to zero. + unidirectional: Boolean, use a unidirectional or bidirectional encoder. + use_key_relative_position: Boolean, if 'True' use key + relative embeddings in attention. + use_value_relative_position: Boolean, if 'True' use value + relative embeddings in attention. + max_relative_position: Positive integer, max position for relative embeddings. + heads_share_relative_embedding: Boolean, if 'True' + heads will share relative embeddings. + name: Optional name of the layer. + """ + def __init__( self, num_layers: int, @@ -494,22 +592,33 @@ def call( pad_mask: Optional[tf.Tensor] = None, training: Optional[Union[tf.Tensor, bool]] = None, ) -> tf.Tensor: + """Apply transformer encoder. 
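# Illustrative usage sketch for TransformerEncoderLayer: keyword names are
# taken from the docstring above and the remaining constructor arguments are
# assumed to have defaults; the input must already have `units` features
# because of the residual connections (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import TransformerEncoderLayer

x = tf.random.normal((2, 7, 32))  # (batch_size, length, units)

encoder_layer = TransformerEncoderLayer(
    units=32,
    num_heads=4,
    filter_units=64,
    dropout_rate=0.1,
    attention_dropout_rate=0.0,
)
out = encoder_layer(x, training=False)  # (2, 7, 32)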
+ + Arguments: + x: A tensor with shape [batch_size, length, input_size]. + pad_mask: Float tensor with shape broadcastable + to (..., length, length). Defaults to None. + training: A bool, whether in training mode or not. + + Returns: + Transformer encoder output with shape [batch_size, length, units] + """ # adding embedding and position encoding. - x = self._embedding(x) # (batch_size, seq_len, units) + x = self._embedding(x) # (batch_size, length, units) x *= tf.math.sqrt(tf.cast(self.units, tf.float32)) x += self._positional_encoding(tf.shape(x)[1]) x = self._dropout(x, training=training) if pad_mask is not None: - pad_mask = tf.squeeze(pad_mask, -1) # (batch_size, seq_len) + pad_mask = tf.squeeze(pad_mask, -1) # (batch_size, length) pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :] - # pad_mask.shape = (batch_size, 1, 1, seq_len) + # pad_mask.shape = (batch_size, 1, 1, length) if self.unidirectional: # add look ahead pad mask to emulate unidirectional behavior pad_mask = tf.minimum( 1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1]) - ) # (batch_size, 1, seq_len, seq_len) + ) # (batch_size, 1, length, length) for layer in self._enc_layers: x = layer(x, pad_mask=pad_mask, training=training) @@ -517,4 +626,4 @@ def call( # if normalization is done in encoding layers, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. - return self._layer_norm(x) # (batch_size, seq_len, units) + return self._layer_norm(x) # (batch_size, length, units) From 2bb20a05a6a8e04e83c6d93e0cb8aa215473ed34 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 10 Mar 2020 15:53:28 +0100 Subject: [PATCH 2/3] update comments --- rasa/core/policies/ted_policy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 7251dd715051..bd7b8625899b 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -106,7 +106,7 @@ class TEDPolicy(Policy): NUM_HEADS: 4, # If 'True' use key relative embeddings in attention KEY_RELATIVE_ATTENTION: False, - # If 'True' use key relative embeddings in attention + # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, # Max position for relative embeddings MAX_RELATIVE_POSITION: None, @@ -144,7 +144,8 @@ class TEDPolicy(Policy): # If 'True' the algorithm only minimizes maximum similarity over # incorrect intent labels, used only if 'loss_type' is set to 'margin'. USE_MAX_NEG_SIM: True, - # Scale loss inverse proportionally to confidence of correct prediction + # If 'True' scale loss inverse proportionally to the confidence + # of the correct prediction SCALE_LOSS: True, # ## Regularization parameters # The scale of regularization From e97877f5ebf92a0c6223cf69785efc7c0a0644b6 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 11 Mar 2020 11:17:13 +0100 Subject: [PATCH 3/3] fix docstrings --- rasa/utils/tensorflow/layers.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index b2a6a92a5a61..71c42d0db3b7 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -32,6 +32,9 @@ def call( Returns: Output of dropout layer. 
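# Illustrative usage sketch for the full TransformerEncoder: keyword names are
# taken from the docstring above and the remaining constructor arguments are
# assumed to have defaults; pad_mask is omitted here, see call() above for how
# an optional padding mask is handled (example values only).
import tensorflow as tf
from rasa.utils.tensorflow.transformer import TransformerEncoder

x = tf.random.normal((2, 7, 20))  # (batch_size, length, input_size)

encoder = TransformerEncoder(
    num_layers=2,
    units=32,
    num_heads=4,
    filter_units=64,
    reg_lambda=0.002,
    dropout_rate=0.1,
    attention_dropout_rate=0.0,
    sparsity=0.8,
    unidirectional=False,
)
encoded = encoder(x, training=False)  # (2, 7, 32)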
+ + Raises: + A ValueError if inputs is not a sparse tensor """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -79,8 +82,7 @@ class DenseForSparse(tf.keras.layers.Dense): use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. + reg_lambda: Float, regularization factor. bias_regularizer: Regularizer function applied to the bias vector. activity_regularizer: Regularizer function applied to the output of the layer (its "activation").. @@ -115,6 +117,9 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor: Returns: Output of dense layer. + + Raises: + A ValueError if inputs is not a sparse tensor """ if not isinstance(inputs, tf.SparseTensor): raise ValueError("Input tensor should be sparse.") @@ -343,7 +348,18 @@ def call( mask: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: - """Randomly mask input sequences.""" + """Randomly mask input sequences. + + Arguments: + x: Input sequence tensor of rank 3. + mask: A tensor representing sequence mask, + contains `1` for inputs and `0` for padding. + training: Python boolean indicating whether the layer should behave in + training mode (mask inputs) or in inference mode (doing nothing). + + Returns: + A tuple of masked inputs and boolean mask. + """ if training is None: training = K.learning_phase()
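# Illustrative usage sketch for InputMask, assuming a sequence mask of shape
# (batch_size, length, 1) with 1 for real tokens and 0 for padding; during
# training roughly 15% of the positions are masked, similar to masked
# language modelling.
import tensorflow as tf
from rasa.utils.tensorflow.layers import InputMask

x = tf.random.normal((2, 7, 20))  # (batch_size, length, input_dim)
mask = tf.ones((2, 7, 1))

input_mask = InputMask()
x_masked, lm_mask_bool = input_mask(x, mask, training=True)
# x_masked keeps the shape of x; lm_mask_bool marks the masked positions.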