From 0322eebdd3819ac3b37c448c94dd36346294e162 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 5 Jul 2019 16:41:57 +0200 Subject: [PATCH 01/50] copy files from nlu_lstm --- rasa/core/featurizers.py | 12 +- rasa/core/policies/embedding_policy.py | 1336 +++++++++++++++++------- 2 files changed, 961 insertions(+), 387 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 76634cac1517..737b4e22a80e 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -177,7 +177,7 @@ def prepare_from_domain(self, domain: Domain) -> None: """Creates internal vocabularies for user intents and bot actions to use for featurization""" self.user_labels = domain.intent_states + domain.entity_states - self.slot_labels = domain.slot_states + self.slot_labels = domain.slot_states + domain.form_states self.bot_labels = domain.action_names if self.use_shared_vocab: @@ -249,7 +249,7 @@ def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: """Create matrix with all actions from domain encoded in rows as bag of words.""" encoded_all_actions = np.zeros( - (domain.num_actions, len(self.bot_vocab)), dtype=int + (domain.num_actions, len(self.bot_vocab)), dtype=np.int32 ) for idx, name in enumerate(domain.action_names): for t in name.split(self.split_symbol): @@ -361,8 +361,10 @@ def _featurize_labels( labels.append(story_labels) + y = np.array(labels) # if it is MaxHistoryFeaturizer, squeeze out time axis - y = np.array(labels).squeeze() + if y.shape[1] == 1 and isinstance(self, MaxHistoryTrackerFeaturizer): + y = y[:, 0, :] return y @@ -410,7 +412,7 @@ def create_X( def persist(self, path): featurizer_file = os.path.join(path, "featurizer.json") - rasa.utils.io.create_directory_for_file(featurizer_file) + utils.create_dir_for_file(featurizer_file) with open(featurizer_file, "w", encoding="utf-8") as f: # noinspection PyTypeChecker f.write(str(jsonpickle.encode(self))) @@ -566,7 +568,7 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: + ) -> Tuple[List[List[Dict]], List[List[Text]]]: trackers_as_states = [] trackers_as_actions = [] diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index d1e5e1864cf1..bd6f9bcd9fb6 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -18,10 +18,17 @@ TrackerFeaturizer, FullDialogueTrackerFeaturizer, LabelTokenizerSingleStateFeaturizer, + MaxHistoryTrackerFeaturizer ) from rasa.core.policies.policy import Policy import tensorflow as tf +from tensorflow.python.ops import gen_array_ops +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder +from tensor2tensor.models.evolved_transformer import evolved_transformer_encoder + from rasa.core.policies.tf_utils import ( TimeAttentionWrapper, ChronoBiasLayerNormBasicLSTMCell, @@ -33,11 +40,10 @@ from rasa.core.policies.tf_utils import TimeAttentionWrapperState try: - import cPickle as pickle # pytype: disable=import-error + import cPickle as pickle except ImportError: import pickle -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 logger = logging.getLogger(__name__) @@ -75,8 +81,17 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed 
layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_b": [], + + "transformer": False, + "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} + # introduce phase shift in time encodings between transformers + # 0.5 - 0.8 works on small dataset + "pos_max_timescale": 1.0e1, + "max_seq_length": 256, + "num_heads": 4, # number of units in rnn cell - "rnn_size": 64, + "rnn_size": 128, + "num_rnn_layers": 1, # training parameters # flag if to turn on layer normalization for lstm cell "layer_norm": True, @@ -136,44 +151,51 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) - @classmethod - def _standard_featurizer(cls): - return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + @staticmethod + def _standard_featurizer(max_history=None): + if max_history is None: + return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + else: + return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) def __init__( self, - featurizer: Optional[FullDialogueTrackerFeaturizer] = None, + featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, priority: int = 1, - encoded_all_actions: Optional[np.ndarray] = None, - graph: Optional[tf.Graph] = None, - session: Optional[tf.Session] = None, - intent_placeholder: Optional[tf.Tensor] = None, - action_placeholder: Optional[tf.Tensor] = None, - slots_placeholder: Optional[tf.Tensor] = None, - prev_act_placeholder: Optional[tf.Tensor] = None, - dialogue_len: Optional[tf.Tensor] = None, - x_for_no_intent: Optional[tf.Tensor] = None, - y_for_no_action: Optional[tf.Tensor] = None, - y_for_action_listen: Optional[tf.Tensor] = None, - similarity_op: Optional[tf.Tensor] = None, - alignment_history: Optional[tf.Tensor] = None, - user_embed: Optional[tf.Tensor] = None, - bot_embed: Optional[tf.Tensor] = None, - slot_embed: Optional[tf.Tensor] = None, - dial_embed: Optional[tf.Tensor] = None, - rnn_embed: Optional[tf.Tensor] = None, - attn_embed: Optional[tf.Tensor] = None, - copy_attn_debug: Optional[tf.Tensor] = None, - all_time_masks: Optional[tf.Tensor] = None, + encoded_all_actions: Optional['np.ndarray'] = None, + graph: Optional['tf.Graph'] = None, + session: Optional['tf.Session'] = None, + intent_placeholder: Optional['tf.Tensor'] = None, + action_placeholder: Optional['tf.Tensor'] = None, + slots_placeholder: Optional['tf.Tensor'] = None, + prev_act_placeholder: Optional['tf.Tensor'] = None, + dialogue_len: Optional['tf.Tensor'] = None, + x_for_no_intent: Optional['tf.Tensor'] = None, + y_for_no_action: Optional['tf.Tensor'] = None, + y_for_action_listen: Optional['tf.Tensor'] = None, + similarity_op: Optional['tf.Tensor'] = None, + alignment_history: Optional['tf.Tensor'] = None, + user_embed: Optional['tf.Tensor'] = None, + bot_embed: Optional['tf.Tensor'] = None, + slot_embed: Optional['tf.Tensor'] = None, + dial_embed: Optional['tf.Tensor'] = None, + rnn_embed: Optional['tf.Tensor'] = None, + attn_embed: Optional['tf.Tensor'] = None, + copy_attn_debug: Optional['tf.Tensor'] = None, + all_time_masks: Optional['tf.Tensor'] = None, + attention_weights=None, + max_history: Optional[int] = None, **kwargs: Any ) -> None: - if featurizer: - if not isinstance(featurizer, FullDialogueTrackerFeaturizer): - raise TypeError( - "Passed tracker featurizer of type {}, " - "should be FullDialogueTrackerFeaturizer." 
- "".format(type(featurizer).__name__) - ) + # if featurizer: + # if not isinstance(featurizer, FullDialogueTrackerFeaturizer): + # raise TypeError( + # "Passed tracker featurizer of type {}, " + # "should be FullDialogueTrackerFeaturizer." + # "".format(type(featurizer).__name__) + # ) + if not featurizer: + featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) # flag if to use the same embeddings for user and bot @@ -219,7 +241,7 @@ def __init__( self.copy_attn_debug = copy_attn_debug self.all_time_masks = all_time_masks - + self.attention_weights = attention_weights # internal tf instances self._train_op = None self._is_training = None @@ -244,8 +266,14 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] ) ) + self.transformer = config['transformer'] + self.pos_encoding = config['pos_encoding'] + self.pos_max_timescale = config['pos_max_timescale'] + self.max_seq_length = config['max_seq_length'] + self.num_heads = config['num_heads'] self.rnn_size = config["rnn_size"] + self.num_rnn_layers = config["num_rnn_layers"] self.layer_norm = config["layer_norm"] self.batch_size = config["batch_size"] @@ -329,14 +357,21 @@ def _actions_for_Y(data_Y: np.ndarray) -> np.ndarray: def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: """Prepare Y data for training: features for action labels.""" - return np.stack( - [ - np.stack( - [self.encoded_all_actions[action_idx] for action_idx in action_ids] - ) - for action_ids in actions_for_Y - ] - ) + if len(actions_for_Y.shape) == 2: + return np.stack( + [ + np.stack( + [self.encoded_all_actions[action_idx] for action_idx in action_ids] + ) + for action_ids in actions_for_Y + ] + ) + else: + return np.stack( + [ + self.encoded_all_actions[action_idx] for action_idx in actions_for_Y + ] + ) # noinspection PyPep8Naming @staticmethod @@ -382,7 +417,11 @@ def _create_tf_session_data( y_for_action_listen = self._create_y_for_action_listen(domain) # is needed to calculate train accuracy - all_Y_d = self._create_all_Y_d(X.shape[1]) + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + dial_len = X.shape[1] + else: + dial_len = 1 + all_Y_d = self._create_all_Y_d(dial_len) return SessionData( X=X, @@ -396,15 +435,14 @@ def _create_tf_session_data( all_Y_d=all_Y_d, ) - # tf helpers: - + # tf helpers: def _create_tf_nn( self, - x_in: tf.Tensor, + x_in: 'tf.Tensor', layer_sizes: List, droprate: float, layer_name_suffix: Text, - ) -> tf.Tensor: + ) -> 'tf.Tensor': """Create nn with hidden layers and name suffix.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -421,7 +459,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: + def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -435,7 +473,7 @@ def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: ) return embed_x - def _create_tf_user_embed(self, a_in: tf.Tensor) -> tf.Tensor: + def _create_tf_user_embed(self, a_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding user vector.""" layer_name_suffix = "a_and_b" if self.share_embedding else "a" @@ -448,7 +486,7 @@ def _create_tf_user_embed(self, a_in: tf.Tensor) -> tf.Tensor: ) return self._create_embed(a, 
layer_name_suffix=layer_name_suffix) - def _create_tf_bot_embed(self, b_in: tf.Tensor) -> tf.Tensor: + def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" layer_name_suffix = "a_and_b" if self.share_embedding else "b" @@ -461,7 +499,7 @@ def _create_tf_bot_embed(self, b_in: tf.Tensor) -> tf.Tensor: ) return self._create_embed(b, layer_name_suffix=layer_name_suffix) - def _create_tf_no_intent_embed(self, x_for_no_intent_i: tf.Tensor) -> tf.Tensor: + def _create_tf_no_intent_embed(self, x_for_no_intent_i: 'tf.Tensor') -> 'tf.Tensor': """Create embedding user vector for empty intent.""" layer_name_suffix = "a_and_b" if self.share_embedding else "a" @@ -476,7 +514,7 @@ def _create_tf_no_intent_embed(self, x_for_no_intent_i: tf.Tensor) -> tf.Tensor: self._create_embed(x_for_no_intent, layer_name_suffix=layer_name_suffix) ) - def _create_tf_no_action_embed(self, y_for_no_action_in: tf.Tensor) -> tf.Tensor: + def _create_tf_no_action_embed(self, y_for_no_action_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector for empty action and action_listen.""" layer_name_suffix = "a_and_b" if self.share_embedding else "b" @@ -491,7 +529,8 @@ def _create_tf_no_action_embed(self, y_for_no_action_in: tf.Tensor) -> tf.Tensor self._create_embed(y_for_no_action, layer_name_suffix=layer_name_suffix) ) - def _create_rnn_cell(self) -> tf.contrib.rnn.RNNCell: + def _create_rnn_cell(self): + # type: () -> tf.contrib.rnn.RNNCell """Create one rnn cell.""" # chrono initialization for forget bias @@ -524,11 +563,11 @@ def _create_rnn_cell(self) -> tf.contrib.rnn.RNNCell: ) @staticmethod - def _num_units(memory: tf.Tensor) -> int: + def _num_units(memory: 'tf.Tensor') -> int: return memory.shape[-1].value def _create_attn_mech( - self, memory: tf.Tensor, real_length: tf.Tensor + self, memory: 'tf.Tensor', real_length: 'tf.Tensor' ) -> tf.contrib.seq2seq.AttentionMechanism: return tf.contrib.seq2seq.BahdanauAttention( @@ -545,10 +584,10 @@ def _create_attn_mech( def cell_input_fn( self, - rnn_inputs: tf.Tensor, - attention: tf.Tensor, + rnn_inputs: 'tf.Tensor', + attention: 'tf.Tensor', num_cell_input_memory_units: int, - ) -> tf.Tensor: + ) -> 'tf.Tensor': """Combine rnn inputs and attention into cell input. Args: @@ -594,8 +633,8 @@ def cell_input_fn( return rnn_inputs def rnn_and_attn_inputs_fn( - self, inputs: tf.Tensor, cell_state: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: + self, inputs: 'tf.Tensor', cell_state: 'tf.Tensor' + ) -> Tuple['tf.Tensor', 'tf.Tensor']: """Construct rnn input and attention mechanism input. 
Args: @@ -626,19 +665,20 @@ def rnn_and_attn_inputs_fn( def _create_attn_cell( self, cell: tf.contrib.rnn.RNNCell, - embed_utter: tf.Tensor, - embed_prev_action: tf.Tensor, - real_length: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, + embed_utter: 'tf.Tensor', + embed_prev_action: 'tf.Tensor', + real_length: 'tf.Tensor', + embed_for_no_intent: 'tf.Tensor', + embed_for_no_action: 'tf.Tensor', + embed_for_action_listen: 'tf.Tensor', ) -> tf.contrib.rnn.RNNCell: """Wrap cell in attention wrapper with given memory.""" if self.attn_before_rnn: # create attention over previous user input num_memory_units_before_rnn = self._num_units(embed_utter) - attn_mech = self._create_attn_mech(embed_utter, real_length) + with tf.variable_scope('before', reuse=tf.AUTO_REUSE): + attn_mech = self._create_attn_mech(embed_utter, real_length) # create mask for empty user input not to pay attention to it ignore_mask = tf.reduce_all( @@ -655,7 +695,8 @@ def _create_attn_cell( if self.attn_after_rnn: # create attention over previous bot actions - attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) + with tf.variable_scope('after', reuse=tf.AUTO_REUSE): + attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) # create mask for empty bot action or action_listen # not to pay attention to them @@ -713,14 +754,14 @@ def _create_attn_cell( def _create_tf_dial_embed( self, - embed_utter: tf.Tensor, - embed_slots: tf.Tensor, - embed_prev_action: tf.Tensor, - mask: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, - ) -> Tuple[tf.Tensor, Union[tf.Tensor, "TimeAttentionWrapperState"]]: + embed_utter: 'tf.Tensor', + embed_slots: 'tf.Tensor', + embed_prev_action: 'tf.Tensor', + mask: 'tf.Tensor', + embed_for_no_intent: 'tf.Tensor', + embed_for_no_action: 'tf.Tensor', + embed_for_action_listen: 'tf.Tensor', + ) -> Tuple['tf.Tensor', Union['tf.Tensor', "TimeAttentionWrapperState"]]: """Create rnn for dialogue level embedding.""" cell_input = tf.concat([embed_utter, embed_slots, embed_prev_action], -1) @@ -740,16 +781,266 @@ def _create_tf_dial_embed( embed_for_action_listen, ) - return tf.nn.dynamic_rnn( - cell, - cell_input, - dtype=tf.float32, - sequence_length=real_length, - scope="rnn_decoder", + with tf.variable_scope('rnn_decoder', reuse=tf.AUTO_REUSE): + return tf.nn.dynamic_rnn( + cell, + cell_input, + dtype=tf.float32, + sequence_length=real_length, + ) + + def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + x_in = tf.concat([a_in, b_prev_in], -1) + # print(x_in.shape[-1]) + # exit() + + # x = x_in + hparams = transformer_base() + + hparams.num_hidden_layers = self.num_rnn_layers + hparams.hidden_size = self.rnn_size + # it seems to be factor of 4 for transformer architectures in t2t + hparams.filter_size = hparams.hidden_size * 4 + hparams.num_heads = self.num_heads + hparams.relu_dropout = self.droprate["rnn"] + hparams.pos = self.pos_encoding + + hparams.max_length = self.max_seq_length + + hparams.unidirectional_encoder = True + + hparams.self_attention_type = "dot_product_relative_v2" + hparams.max_relative_position = 5 + hparams.add_relative_to_values = True + + # hparams.proximity_bias = True + + # When not in training mode, set all forms of dropout to zero. 
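# [Editor's sketch - not part of the patch] The loop below rescales every dropout-style
# hparam by tf.cast(self._is_training, tf.float32), so a single graph serves both
# training and prediction: the `_is_training` placeholder defaults to False, which
# collapses every dropout rate to 0.0. A minimal standalone TF 1.x illustration of the
# same trick (variable names here are made up for the example):
import tensorflow as tf

is_training = tf.placeholder_with_default(False, shape=())
droprate = 0.1 * tf.cast(is_training, tf.float32)  # 0.0 unless is_training is fed as True

x = tf.placeholder(tf.float32, shape=(None, 16))
y = tf.nn.dropout(x, keep_prob=1.0 - droprate)     # keeps everything when droprate == 0.0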
+ for key, value in hparams.values().items(): + if key.endswith("dropout") or key == "label_smoothing": + setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) + reg = tf.contrib.layers.l2_regularizer(self.C2) + + x = tf.layers.dense(inputs=x_in, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer', + reuse=tf.AUTO_REUSE) + # a = tf.layers.dense(inputs=a_in, + # units=hparams.hidden_size/3, + # use_bias=False, + # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='transformer_embed_layer_a', + # reuse=tf.AUTO_REUSE) + # + c = tf.layers.dense(inputs=c_in, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer_c', + reuse=tf.AUTO_REUSE) + # + # b = tf.layers.dense(inputs=b_prev_in, + # units=hparams.hidden_size/3, + # use_bias=False, + # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='transformer_embed_layer_b', + # reuse=tf.AUTO_REUSE) + + # x = tf.concat([a, c, b], -1) + + x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) + + if hparams.multiply_embedding_mode == "sqrt_depth": + x *= hparams.hidden_size ** 0.5 + c *= hparams.hidden_size ** 0.5 + + x *= tf.expand_dims(mask, -1) + + with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE): + (x, + self_attention_bias, + encoder_decoder_attention_bias + ) = transformer_prepare_encoder(x, None, hparams) + + if hparams.pos == 'custom_timing': + x = common_attention.add_timing_signal_1d(x, max_timescale=self.pos_max_timescale) + + x *= tf.expand_dims(mask, -1) + + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + + attn_bias_for_padding = None + # Otherwise the encoder will just use encoder_self_attention_bias. 
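# [Editor's sketch - not part of the patch] With hparams.unidirectional_encoder = True
# the self-attention is causal: each dialogue turn can only attend to itself and to
# earlier turns. tensor2tensor builds the corresponding bias internally; the snippet
# below is only an equivalent standalone TF 1.x illustration of such a bias, which is
# added to the (query_time x memory_time) attention logits before the softmax:
import tensorflow as tf

seq_len = 5
# 1.0 on and below the diagonal, 0.0 above it
lower_triangle = tf.matrix_band_part(tf.ones((seq_len, seq_len)), -1, 0)
# large negative bias for future positions, 0.0 for allowed ones
causal_bias = -1e9 * (1.0 - lower_triangle)
# reshape so it broadcasts over [batch, num_heads, query_time, memory_time] logits
causal_bias = causal_bias[tf.newaxis, tf.newaxis, :, :]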
+ if hparams.unidirectional_encoder: + attn_bias_for_padding = encoder_decoder_attention_bias + + x = transformer_encoder( + x, + self_attention_bias, + hparams, + nonpadding=mask, + save_weights_to=attention_weights, + attn_bias_for_padding=attn_bias_for_padding, + ) + + # x = tf.concat([x, c_in], -1) + # c_gate = tf.layers.dense(inputs=x, + # # units=hparams.hidden_size, + # # activation=tf.nn.softmax, + # units=1, + # activation=tf.math.sigmoid, + # bias_initializer=tf.constant_initializer(-1), + # # use_bias=False, + # # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='slots_gate_layer_c', + # reuse=tf.AUTO_REUSE) + x += c #* c_gate + # x = common_layers.layer_postprocess(x, c, hparams) + x *= tf.expand_dims(mask, -1) + + return tf.nn.relu(x), self_attention_bias, x_in + + @staticmethod + def _rearrange_fn(list_tensor_1d_mask_1d): + """Rearranges tensor_1d to put all the values + where mask_1d=1 to the right and + where mask_1d=0 to the left""" + tensor_1d, mask_1d = list_tensor_1d_mask_1d + + partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) + + return tf.concat(partitioned_tensor, 0) + + @staticmethod + def _arrange_back_fn(list_tensor_1d_mask_1d): + """Arranges back tensor_1d to restore original order + modified by `_rearrange_fn` according to mask_1d: + - number of 0s in mask_1d values on the left are set to + their corresponding places where mask_1d=0, + - number of 1s in mask_1d values on the right are set to + their corresponding places where mask_1d=1""" + tensor_1d, mask_1d = list_tensor_1d_mask_1d + + mask_indices = tf.dynamic_partition( + tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 ) + mask_sum = tf.reduce_sum(mask_1d, axis=0) + partitioned_tensor = [ + tf.zeros_like(tensor_1d[:-mask_sum]), + tensor_1d[-mask_sum:], + ] + + return tf.dynamic_stitch(mask_indices, partitioned_tensor) + + def _action_to_copy(self, x_in, x, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action): + with tf.variable_scope('copy', reuse=tf.AUTO_REUSE): + ignore_mask_listen = tf.to_float(tf.logical_or( + tf.reduce_all( + tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), + -1, + ), + tf.reduce_all( + tf.equal(tf.expand_dims(embed_for_action_listen, 0), embed_prev_action), + -1, + ), + )) + + triag_mask = tf.expand_dims( + common_attention.attention_bias_to_padding(self_attention_bias[0, 0, :, tf.newaxis, tf.newaxis, :]), 0) + diag_mask = 1 - (1 - triag_mask) * tf.cumprod(triag_mask, axis=-1, exclusive=True, reverse=True) + + bias = self_attention_bias + common_attention.attention_bias_ignore_padding(ignore_mask_listen) * tf.expand_dims(diag_mask, 1) + + copy_weights = {} + common_attention.multihead_attention(x_in, + embed_prev_action, + bias, + self.rnn_size, + self.embed_dim, + self.embed_dim, + 1, + 0, + save_weights_to=copy_weights) + + copy_weights = copy_weights['copy/multihead_attention/dot_product_attention'][:, 0, :, :] + bias = bias[:, 0, :, :] + shape = tf.shape(copy_weights) + copy_weights = tf.reshape(copy_weights, (-1, shape[-1])) + x_flat = tf.reshape(x_in, (-1, x_in.shape[-1])) + bias = tf.reshape(bias, (-1, shape[-1])) + ignore_mask = common_attention.attention_bias_to_padding(bias[:, tf.newaxis, tf.newaxis, :], tf.to_int32) + + s_w = tf.layers.dense( + inputs=x_flat, + units=2 * self.attn_shift_range + 1, + activation=tf.nn.softmax, + name="shift_weight", + reuse=tf.AUTO_REUSE + ) + mask = 1 - ignore_mask + conv_weights = tf.map_fn( + 
self._rearrange_fn, [copy_weights, mask], dtype=copy_weights.dtype + ) + + conv_weights = tf.reverse(conv_weights, axis=[1]) + + # preare probs for tf.nn.depthwise_conv2d + # [in_width, in_channels=batch] + conv_weights = tf.transpose(conv_weights, [1, 0]) + # [batch=1, in_height=1, in_width=time+1, in_channels=batch] + conv_weights = conv_weights[tf.newaxis, tf.newaxis, :, :] + + # [filter_height=1, filter_width=2*attn_shift_range+1, + # in_channels=batch, channel_multiplier=1] + conv_s_w = tf.transpose(s_w, [1, 0]) + conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] + + # perform 1d convolution + # [batch=1, out_height=1, out_width=time+1, out_channels=batch] + conv_weights = tf.nn.depthwise_conv2d_native( + conv_weights, conv_s_w, [1, 1, 1, 1], "SAME" + ) + conv_weights = conv_weights[0, 0, :, :] + conv_weights = tf.transpose(conv_weights, [1, 0]) + + conv_weights = tf.reverse(conv_weights, axis=[1]) + + # arrange probs back to their original time order + copy_weights = tf.map_fn( + self._arrange_back_fn, [conv_weights, mask], dtype=conv_weights.dtype + ) + + # sharpening parameter + g_sh = tf.layers.dense( + inputs=x_flat, + units=1, + activation=lambda a: tf.nn.softplus(a) + 1, + bias_initializer=tf.constant_initializer(1), + name="gamma_sharp", + reuse=tf.AUTO_REUSE + ) + + powed_weights = tf.pow(copy_weights, g_sh) + copy_weights = powed_weights / (tf.reduce_sum(powed_weights, 1, keepdims=True) + 1e-32) + + copy_weights = tf.reshape(copy_weights, shape) + + # remove current time + copy_prev = copy_weights * diag_mask + keep_current = copy_weights * (1 - diag_mask) + dial_embed = self._create_embed(x, layer_name_suffix="out") + return tf.matmul(copy_prev, embed_prev_action) + tf.matmul(keep_current, dial_embed), copy_weights + @staticmethod - def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: + def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': """Extract alignments history form final rnn cell state.""" alignments_from_state = final_state.alignment_history @@ -764,14 +1055,14 @@ def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> tf.Ten return tf.concat(alignment_history, -1) @staticmethod - def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: + def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': """Extract all time masks form final rnn cell state.""" # reshape to (batch, time, memory_time) and ignore last time # because time_mask is created for the next time step return tf.transpose(final_state.all_time_masks.stack(), [1, 0, 2])[:, :-1, :] - def _sims_rnn_to_max_from(self, cell_output: tf.Tensor) -> List[tf.Tensor]: + def _sims_rnn_to_max_from(self, cell_output: 'tf.Tensor') -> List['tf.Tensor']: """Save intermediate tensors for debug purposes.""" if self.attn_after_rnn: @@ -786,7 +1077,7 @@ def _sims_rnn_to_max_from(self, cell_output: tf.Tensor) -> List[tf.Tensor]: else: return [] - def _embed_dialogue_from(self, cell_output: tf.Tensor) -> tf.Tensor: + def _embed_dialogue_from(self, cell_output: 'tf.Tensor') -> 'tf.Tensor': """Extract or calculate dialogue level embedding from cell_output.""" if self.attn_after_rnn: @@ -812,12 +1103,65 @@ def _embed_dialogue_from(self, cell_output: tf.Tensor) -> tf.Tensor: return embed_dialogue + def _tf_sample_neg(self, + pos_b, + neg_bs=None, + neg_ids=None, + batch_size=None, + first_only=False + ) -> 'tf.Tensor': + + all_b = pos_b[tf.newaxis, :, :] + if batch_size is None: + batch_size = 
tf.shape(pos_b)[0] + all_b = tf.tile(all_b, [batch_size, 1, 1]) + if neg_bs is None and neg_ids is None: + return all_b + + def sample_neg_b(): + if neg_bs is not None: + _neg_bs = neg_bs + elif neg_ids is not None: + _neg_bs = tf.batch_gather(all_b, neg_ids) + else: + raise + return tf.concat([pos_b[:, tf.newaxis, :], _neg_bs], 1) + + if first_only: + out_b = pos_b[:, tf.newaxis, :] + else: + out_b = all_b + + if neg_bs is not None: + cond = tf.logical_and(self._is_training, tf.shape(neg_bs)[0] > 1) + elif neg_ids is not None: + cond = tf.logical_and(self._is_training, tf.shape(neg_ids)[0] > 1) + else: + raise + + return tf.cond(cond, sample_neg_b, lambda: out_b) + + def _tf_calc_iou(self, + b_raw, + neg_bs=None, + neg_ids=None + ) -> 'tf.Tensor': + + tiled_intent_raw = self._tf_sample_neg(b_raw, neg_bs=neg_bs, neg_ids=neg_ids) + pos_b_raw = tiled_intent_raw[:, :1, :] + neg_b_raw = tiled_intent_raw[:, 1:, :] + intersection_b_raw = tf.minimum(neg_b_raw, pos_b_raw) + union_b_raw = tf.maximum(neg_b_raw, pos_b_raw) + + return tf.reduce_sum(intersection_b_raw, -1) / tf.reduce_sum(union_b_raw, -1) + def _tf_sim( self, - embed_dialogue: tf.Tensor, - embed_action: tf.Tensor, - mask: Optional[tf.Tensor], - ) -> Tuple[tf.Tensor, tf.Tensor]: + embed_dialogue: 'tf.Tensor', + embed_action: 'tf.Tensor', + mask: Optional['tf.Tensor'], + ) -> Union[Tuple['tf.Tensor', 'tf.Tensor'], + Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor']]: """Define similarity. This method has two roles: @@ -832,93 +1176,183 @@ def _tf_sim( because it is necessary for them to be mathematically identical. """ - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity + if self.similarity_type not in {"cosine", "inner"}: + raise ValueError( + "Wrong similarity type {}, " + "should be 'cosine' or 'inner'" + "".format(self.similarity_type) + ) + + if len(embed_dialogue.shape) == 2 and len(embed_action.shape) == 2: + # calculate similarity between + # two embedding vectors of the same size + + # always use cosine sim for copy mech embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) embed_action = tf.nn.l2_normalize(embed_action, -1) - if self.similarity_type in {"cosine", "inner"}: + cos_sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - if len(embed_dialogue.shape) == len(embed_action.shape): - # calculate similarity between - # two embedding vectors of the same size - sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - bin_sim = tf.where( - sim > (self.mu_pos - self.mu_neg) / 2.0, - tf.ones_like(sim), - tf.zeros_like(sim), - ) + bin_sim = tf.where( + cos_sim > (self.mu_pos - self.mu_neg) / 2.0, + tf.ones_like(cos_sim), + tf.zeros_like(cos_sim), + ) + + # output binary mask and similarity + return bin_sim, cos_sim - # output binary mask and similarity - return bin_sim, sim + else: + # calculate similarity with several + # embedded actions for the loss + if self.similarity_type == "cosine": + # normalize embedding vectors for cosine similarity + embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) + embed_action = tf.nn.l2_normalize(embed_action, -1) + + if len(embed_dialogue.shape) == 4: + embed_dialogue_pos = embed_dialogue[:, :, :1, :] else: - # calculate similarity with several - # embedded actions for the loss - sim = tf.reduce_sum( - tf.expand_dims(embed_dialogue, -2) * embed_action, -1 - ) - sim *= tf.expand_dims(mask, 2) + embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) - sim_act = tf.reduce_sum( - embed_action[:, :, :1, :] * 
embed_action[:, :, 1:, :], -1 - ) - sim_act *= tf.expand_dims(mask, 2) + sim = tf.reduce_sum( + embed_dialogue_pos * embed_action, -1 + ) * tf.expand_dims(mask, 2) + + sim_bot_emb = tf.reduce_sum( + embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) - # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_act + if len(embed_dialogue.shape) == 4: + sim_dial_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) + else: + sim_dial_emb = None + if len(embed_dialogue.shape) == 4: + sim_dial_bot_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) + else: + sim_dial_bot_emb = None + + # output similarities between user input and bot actions + # and similarities between bot actions + return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb + + # noinspection PyPep8Naming + def _scale_loss_by_count_actions( + self, + X, + Y, + slots, + previous_actions, + ) -> Union[np.ndarray, List[List]]: + """Calculate inverse proportionality of repeated actions.""" + + if self.scale_loss_by_action_counts: + # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # full = tf.concat([X, slots, previous_actions, Y], -1) + # else: + full = Y + + flat = tf.reshape(full, (-1, full.shape[-1])) + _, i, c = gen_array_ops.unique_with_counts_v2(flat, axis=[0]) + c = tf.cast(c, tf.float32) + + counts = tf.reshape(tf.gather(c, i), (tf.shape(Y)[0], tf.shape(Y)[1])) + + # do not include [-1 -1 ... -1 0] in averaging + # and smooth it by taking sqrt + + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # action_listen is the top one by an order + max_c = tf.math.top_k(c, 2)[0][1] + else: + max_c = tf.reduce_max(c) + # max_c = tf.math.top_k(c, 2)[0][1] + # max_c = tf.cond(tf.shape(c)[0] > 1, lambda: tf.math.top_k(c, 2)[0][1], lambda: tf.reduce_max(c)) + # max_c = tf.reduce_max(c) + + return tf.maximum(max_c / counts, 1) + # return tf.maximum(tf.square(max_c / counts), 1) + + # exit() + # full_X = tf.concat( + # [X, slots, previous_actions, Y], -1 + # ) + # full_X = tf.reshape(full_X, (-1, full_X.shape[-1])) + # # include [-1 -1 ... -1 0] as first + # # full_X = tf.concat([full_X[-1:], full_X], 0) + # + # _, i, c = gen_array_ops.unique_with_counts_v2(full_X, axis=[0]) + # c = tf.cast(c, tf.float32) + # + # counts = tf.reshape(tf.gather(c, i), (tf.shape(X)[0], tf.shape(X)[1])) + # + # # do not include [-1 -1 ... 
-1 0] in averaging + # # and smooth it by taking sqrt + # return tf.maximum(tf.sqrt(tf.reduce_mean(c) / counts), 1) else: - raise ValueError( - "Wrong similarity type {}, " - "should be 'cosine' or 'inner'" - "".format(self.similarity_type) - ) + return [[None]] - def _regularization_loss(self) -> Union[tf.Tensor, int]: + def _regularization_loss(self): + # type: () -> Union['tf.Tensor', int] """Add regularization to the embed layer inside rnn cell.""" if self.attn_after_rnn: - return self.C2 * tf.add_n( - [ - tf.nn.l2_loss(tf_var) - for tf_var in tf.trainable_variables() - if "cell/out_layer/kernel" in tf_var.name - ] - ) - else: - return 0 + vars_to_reg = [ + tf.nn.l2_loss(tf_var) + for tf_var in tf.trainable_variables() + if "cell/out_layer/kernel" in tf_var.name + ] + if vars_to_reg: + return self.C2 * tf.add_n(vars_to_reg) + + return 0 def _tf_loss( self, - sim: tf.Tensor, - sim_act: tf.Tensor, - sims_rnn_to_max: List[tf.Tensor], - mask: tf.Tensor, - ) -> tf.Tensor: + sim: 'tf.Tensor', + sim_bot_emb: 'tf.Tensor', + sim_dial_emb: 'tf.Tensor', + sims_rnn_to_max: List['tf.Tensor'], + bad_negs, + mask: 'tf.Tensor', + batch_bad_negs + ) -> 'tf.Tensor': """Define loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0.0, self.mu_pos - sim[:, :, 0]) + loss = tf.maximum(0., self.mu_pos - sim[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions + sim_neg = sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions - max_sim_neg = tf.reduce_max(sim[:, :, 1:], -1) - loss += tf.maximum(0.0, self.mu_neg + max_sim_neg) + max_sim_neg = tf.reduce_max(sim_neg, -1) + loss += tf.maximum(0., self.mu_neg + max_sim_neg) else: # minimize all similarities with incorrect actions - max_margin = tf.maximum(0.0, self.mu_neg + sim[:, :, 1:]) + max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - if self.scale_loss_by_action_counts: + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer) and self.scale_loss_by_action_counts: # scale loss inverse proportionally to number of action counts loss *= self._loss_scales - # penalize max similarity between intent embeddings - loss_act = tf.maximum(0.0, tf.reduce_max(sim_act, -1)) - loss += loss_act * self.C_emb + # penalize max similarity between bot embeddings + sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs + max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) + loss += max_sim_bot_emb * self.C_emb + + # penalize max similarity between dial embeddings + if sim_dial_emb is not None: + sim_dial_emb += common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs + max_sim_input_emb = tf.maximum(0., tf.reduce_max(sim_dial_emb, -1)) + loss += max_sim_input_emb * self.C_emb # maximize similarity returned by time attention wrapper for sim_to_add in sims_rnn_to_max: @@ -938,12 +1372,80 @@ def _tf_loss( ) return loss - # training methods + def _tf_loss_2( + self, + sim: 'tf.Tensor', + sim_bot_emb: 'tf.Tensor', + sim_dial_emb: 'tf.Tensor', + sim_dial_bot_emb, + sims_rnn_to_max: List['tf.Tensor'], + bad_negs, + mask: 'tf.Tensor', + batch_bad_negs=None, + ) -> 'tf.Tensor': + """Define loss.""" + + all_sim = [sim[:, :, :1], + sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, + sim_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, + ] + if 
sim_dial_emb is not None: + all_sim.append(sim_dial_emb + common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs) + + if sim_dial_bot_emb is not None: + all_sim.append(sim_dial_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs) + + logits = tf.concat(all_sim, -1) + pos_labels = tf.ones_like(logits[:, :, :1]) + neg_labels = tf.zeros_like(logits[:, :, 1:]) + labels = tf.concat([pos_labels, neg_labels], -1) + + pred = tf.nn.softmax(logits) + # fake_logits = tf.concat([logits[:, :, :1] - common_attention.large_compatible_negative(logits.dtype), + # logits[:, :, 1:] + common_attention.large_compatible_negative(logits.dtype)], -1) + + # ones = tf.ones_like(pred[:, :, 0]) + # zeros = tf.zeros_like(pred[:, :, 0]) + + # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) + already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) + + # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # if self.scale_loss_by_action_counts: + # scale_mask = self._loss_scales * mask + # else: + scale_mask = mask + # else: + # scale_mask = 1.0 + + loss = tf.losses.softmax_cross_entropy(labels, + logits, + scale_mask * already_learned) + # add regularization losses + loss += self._regularization_loss() + tf.losses.get_regularization_loss() + + # maximize similarity returned by time attention wrapper + add_loss = [] + for sim_to_add in sims_rnn_to_max: + add_loss.append(tf.maximum(0.0, 1.0 - sim_to_add)) + + if add_loss: + # mask loss for different length sequences + add_loss = sum(add_loss) * mask + # average the loss over sequence length + add_loss = tf.reduce_sum(add_loss, -1) / tf.reduce_sum(mask, 1) + # average the loss over the batch + add_loss = tf.reduce_mean(add_loss) + loss += add_loss + + return loss + + # training methods def train( self, - training_trackers: List[DialogueStateTracker], - domain: Domain, + training_trackers: List['DialogueStateTracker'], + domain: 'Domain', **kwargs: Any ) -> None: """Train the policy on given training trackers.""" @@ -985,61 +1487,70 @@ def train( # set random seed in tf tf.set_random_seed(self.random_seed) - dialogue_len = None # use dynamic time for rnn - # create placeholders - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), - name="a", - ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), - name="b", - ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) - self._dialogue_len = tf.placeholder( - dtype=tf.int32, shape=(), name="dialogue_len" - ) - self._x_for_no_intent_in = tf.placeholder( + batch_size_in = tf.placeholder(tf.int64) + train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, + session_data.Y, + session_data.slots, + session_data.previous_actions)) + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + train_dataset = train_dataset.batch(batch_size_in) + + if self.evaluate_on_num_examples: + ids = np.random.permutation(len(session_data.X))[:self.evaluate_on_num_examples] + + val_dataset = tf.data.Dataset.from_tensor_slices((session_data.X[ids], + session_data.Y[ids], + session_data.slots[ids], + session_data.previous_actions[ids]) + ).batch(self.evaluate_on_num_examples) + else: + val_dataset = None + + iterator = 
tf.data.Iterator.from_structure(train_dataset.output_types, + train_dataset.output_shapes, + output_classes=train_dataset.output_classes) + + self.a_in, self.b_in, self.c_in, self.b_prev_in = iterator.get_next() + + self.a_in = tf.cast(self.a_in, tf.float32) + self.b_in = tf.cast(self.b_in, tf.float32) + self.c_in = tf.cast(self.c_in, tf.float32) + self.b_prev_in = tf.cast(self.b_prev_in, tf.float32) + + # they don't change + self._x_for_no_intent_in = tf.constant( + session_data.x_for_no_intent, dtype=tf.float32, - shape=(1, session_data.X.shape[-1]), name="x_for_no_intent", ) - self._y_for_no_action_in = tf.placeholder( + self._y_for_no_action_in = tf.constant( + session_data.y_for_no_action, dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), name="y_for_no_action", ) - self._y_for_action_listen_in = tf.placeholder( + self._y_for_action_listen_in = tf.constant( + session_data.y_for_action_listen, dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), name="y_for_action_listen", ) - self._is_training = tf.placeholder_with_default(False, shape=()) + all_actions = tf.constant(self.encoded_all_actions, + dtype=tf.float32, + name="all_actions") - self._loss_scales = tf.placeholder( - dtype=tf.float32, shape=(None, dialogue_len) + # dynamic variables + self._is_training = tf.placeholder_with_default(False, shape=()) + self._dialogue_len = tf.placeholder( + dtype=tf.int32, shape=(), name="dialogue_len" ) - # create embedding vectors - self.user_embed = self._create_tf_user_embed(self.a_in) + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") + all_actions_embed = self._create_tf_bot_embed(all_actions) embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - embed_for_no_intent = self._create_tf_no_intent_embed( - self._x_for_no_intent_in - ) embed_for_no_action = self._create_tf_no_action_embed( self._y_for_no_action_in ) @@ -1047,42 +1558,185 @@ def train( self._y_for_action_listen_in ) - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + if self.transformer: + self.attention_weights = {} + tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action) + self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") #+ self._create_embed(self.c_in, layer_name_suffix="slots") + sims_rnn_to_max = [] + else: + # create embedding vectors + self.user_embed = self._create_tf_user_embed(self.a_in) + self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) + embed_for_no_intent = self._create_tf_no_intent_embed( + self._x_for_no_intent_in + ) - self.all_time_masks = self._all_time_masks_from(final_state) + # get rnn output + cell_output, final_state = self._create_tf_dial_embed( + 
self.user_embed, + self.slot_embed, + embed_prev_action, + mask, + embed_for_no_intent, + embed_for_no_action, + embed_for_action_listen, + ) + # process rnn output + if self.is_using_attention(): + self.alignment_history = self._alignments_history_from(final_state) - sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) - self.dial_embed = self._embed_dialogue_from(cell_output) + self.all_time_masks = self._all_time_masks_from(final_state) + + sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) + self.dial_embed = self._embed_dialogue_from(cell_output) # calculate similarities - self.sim_op, sim_act = self._tf_sim(self.dial_embed, self.bot_embed, mask) + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + self.b_in = tf.expand_dims(self.b_in, 1) + self.bot_embed = tf.expand_dims(self.bot_embed, 1) + self.dial_embed = self.dial_embed[:, -1:, :] + mask = mask[:, -1:] + + b_raw = tf.reshape(self.b_in, (-1, self.b_in.shape[-1])) + + _, i, c = gen_array_ops.unique_with_counts_v2(b_raw, axis=[0]) + counts = tf.expand_dims(tf.reshape(tf.gather(tf.cast(c, tf.float32), i), (tf.shape(b_raw)[0],)), 0) + batch_neg_ids = tf.random.categorical(tf.log((1. - tf.eye(tf.shape(b_raw)[0])/counts)), self.num_neg) + + batch_iou_bot = self._tf_calc_iou(b_raw, neg_ids=batch_neg_ids) + batch_bad_negs = 1. - tf.nn.relu(tf.sign(1. - batch_iou_bot)) + batch_bad_negs = tf.reshape(batch_bad_negs, (tf.shape(self.dial_embed)[0], + tf.shape(self.dial_embed)[1], + -1)) + + neg_ids = tf.random.categorical(tf.log(tf.ones((tf.shape(b_raw)[0], tf.shape(all_actions)[0]))), self.num_neg) + + tiled_all_actions = tf.tile(tf.expand_dims(all_actions, 0), (tf.shape(b_raw)[0], 1, 1)) + neg_bs = tf.batch_gather(tiled_all_actions, neg_ids) + iou_bot = self._tf_calc_iou(b_raw, neg_bs) + bad_negs = 1. - tf.nn.relu(tf.sign(1. 
- iou_bot)) + bad_negs = tf.reshape(bad_negs, (tf.shape(self.bot_embed)[0], + tf.shape(self.bot_embed)[1], + -1)) + + dial_embed_flat = tf.reshape(self.dial_embed, (-1, self.dial_embed.shape[-1])) + + tiled_dial_embed = self._tf_sample_neg(dial_embed_flat, neg_ids=batch_neg_ids, first_only=True) + tiled_dial_embed = tf.reshape(tiled_dial_embed, (tf.shape(self.dial_embed)[0], + tf.shape(self.dial_embed)[1], + -1, + self.dial_embed.shape[-1])) + + bot_embed_flat = tf.reshape(self.bot_embed, (-1, self.bot_embed.shape[-1])) + tiled_all_actions_embed = tf.tile(tf.expand_dims(all_actions_embed, 0), (tf.shape(b_raw)[0], 1, 1)) + neg_embs = tf.batch_gather(tiled_all_actions_embed, neg_ids) + tiled_bot_embed = self._tf_sample_neg(bot_embed_flat, neg_bs=neg_embs) + tiled_bot_embed = tf.reshape(tiled_bot_embed, (tf.shape(self.bot_embed)[0], + tf.shape(self.bot_embed)[1], + -1, + self.bot_embed.shape[-1])) + + # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) + self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) + # construct loss - loss = self._tf_loss(self.sim_op, sim_act, sims_rnn_to_max, mask) + if self.scale_loss_by_action_counts: + self._loss_scales = self._scale_loss_by_count_actions(self.a_in, self.b_in, self.c_in, self.b_prev_in) + else: + self._loss_scales = None + # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) + loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) # define which optimizer to use self._train_op = tf.train.AdamOptimizer( - learning_rate=0.001, epsilon=1e-16 + # learning_rate=0.001, epsilon=1e-16 ).minimize(loss) + + train_init_op = iterator.make_initializer(train_dataset) + if self.evaluate_on_num_examples: + val_init_op = iterator.make_initializer(val_dataset) + else: + val_init_op = None + # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf(session_data, loss, mask) + # self._train_tf(session_data, loss, mask) + self._train_tf_dataset(train_init_op, val_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + + dialogue_len = None # use dynamic time for rnn + # create placeholders + self.a_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.X.shape[-1]), + name="a", + ) + self.b_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + name="b", + ) + self.c_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.slots.shape[-1]), + name="slt", + ) + self.b_prev_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.Y.shape[-1]), + name="b_prev", + ) + + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) + + if self.transformer: + self.attention_weights = {} + tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, + self.attention_weights) + # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, + # embed_prev_action, + # embed_for_action_listen, + # embed_for_no_action) + self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") + + else: + 
self.user_embed = self._create_tf_user_embed(self.a_in) + self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") + + # get rnn output + cell_output, final_state = self._create_tf_dial_embed( + self.user_embed, + self.slot_embed, + embed_prev_action, + mask, + embed_for_no_intent, + embed_for_no_action, + embed_for_action_listen, + ) + # process rnn output + if self.is_using_attention(): + self.alignment_history = self._alignments_history_from(final_state) + + self.all_time_masks = self._all_time_masks_from(final_state) + + self.dial_embed = self._embed_dialogue_from(cell_output) + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + self.dial_embed = self.dial_embed[:, -1:, :] + + self.sim_op, _, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) + + # if self.attention_weights.items(): + # self.attention_weights = tf.concat([tf.expand_dims(t, 0) + # for name, t in self.attention_weights.items() + # if name.endswith('multihead_attention/dot_product_attention')], 0) # training helpers def _linearly_increasing_batch_size(self, epoch: int) -> int: @@ -1102,73 +1756,15 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: else: return int(self.batch_size[0]) - def _create_batch_b( - self, batch_pos_b: np.ndarray, intent_ids: np.ndarray - ) -> np.ndarray: - """Create batch of actions. - - The first is correct action - and the rest are wrong actions sampled randomly. - """ - - batch_pos_b = batch_pos_b[:, :, np.newaxis, :] - - # sample negatives - batch_neg_b = np.zeros( - ( - batch_pos_b.shape[0], - batch_pos_b.shape[1], - self.num_neg, - batch_pos_b.shape[-1], - ), - dtype=int, - ) - for b in range(batch_pos_b.shape[0]): - for h in range(batch_pos_b.shape[1]): - # create negative indexes out of possible ones - # except for correct index of b - negative_indexes = [ - i - for i in range(self.encoded_all_actions.shape[0]) - if i != intent_ids[b, h] - ] - - negs = np.random.choice(negative_indexes, size=self.num_neg) - - batch_neg_b[b, h] = self.encoded_all_actions[negs] - - return np.concatenate([batch_pos_b, batch_neg_b], -2) - - # noinspection PyPep8Naming - def _scale_loss_by_count_actions( - self, - X: np.ndarray, - slots: np.ndarray, - previous_actions: np.ndarray, - actions_for_Y: np.ndarray, - ) -> Union[np.ndarray, List[List]]: - """Calculate inverse proportionality of repeated actions.""" - - if self.scale_loss_by_action_counts: - full_X = np.concatenate( - [X, slots, previous_actions, actions_for_Y[:, :, np.newaxis]], -1 - ) - full_X = full_X.reshape((-1, full_X.shape[-1])) - - _, i, c = np.unique(full_X, return_inverse=True, return_counts=True, axis=0) - - counts = c[i].reshape((X.shape[0], X.shape[1])) - - # do not include [-1 -1 ... 
-1 0] in averaging - # and smooth it by taking sqrt - return np.maximum(np.sqrt(np.mean(c[1:]) / counts), 1) - else: - return [[None]] - - def _train_tf( - self, session_data: SessionData, loss: tf.Tensor, mask: tf.Tensor - ) -> None: - """Train tf graph.""" + def _train_tf_dataset(self, + train_init_op, + val_init_op, + batch_size_in, + loss: 'tf.Tensor', + mask, + dialogue_len, + ) -> None: + """Train tf graph""" self.session.run(tf.global_variables_initializer()) @@ -1178,115 +1774,66 @@ def _train_tf( "".format(self.evaluate_every_num_epochs) ) pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) + train_acc = 0 last_loss = 0 for ep in pbar: - # randomize training data for the current epoch - ids = np.random.permutation(session_data.X.shape[0]) - # calculate batch size for the current epoch batch_size = self._linearly_increasing_batch_size(ep) - # calculate number of batches in the current epoch - batches_per_epoch = session_data.X.shape[0] // batch_size + int( - session_data.X.shape[0] % batch_size > 0 - ) - # collect average loss over the batches - ep_loss = 0 - for i in range(batches_per_epoch): - start_idx = i * batch_size - end_idx = (i + 1) * batch_size - batch_ids = ids[start_idx:end_idx] - - # get randomized data for current batch - batch_a = session_data.X[batch_ids] - batch_pos_b = session_data.Y[batch_ids] - actions_for_b = session_data.actions_for_Y[batch_ids] - - # add negatives - incorrect bot actions predictions - batch_b = self._create_batch_b(batch_pos_b, actions_for_b) - - batch_c = session_data.slots[batch_ids] - batch_b_prev = session_data.previous_actions[batch_ids] - - # calculate how much the loss from each action - # should be scaled based on action rarity - batch_loss_scales = self._scale_loss_by_count_actions( - batch_a, batch_c, batch_b_prev, actions_for_b - ) + self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - # minimize and calculate loss - _loss, _ = self.session.run( - [loss, self._train_op], - feed_dict={ - self.a_in: batch_a, - self.b_in: batch_b, - self.c_in: batch_c, - self.b_prev_in: batch_b_prev, - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - self._is_training: True, - self._loss_scales: batch_loss_scales, - }, - ) - # collect average loss over the batches - ep_loss += _loss / batches_per_epoch - - # calculate train accuracy - if self.evaluate_on_num_examples: - if ( - (ep + 1) == 1 - or (ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs - ): - train_acc = self._calc_train_acc(session_data, mask) + ep_loss = 0 + batches_per_epoch = 0 + while True: + try: + _, batch_loss = self.session.run((self._train_op, loss), + feed_dict={self._is_training: True, + self._dialogue_len: dialogue_len}) + + except tf.errors.OutOfRangeError: + break + + batches_per_epoch += 1 + ep_loss += batch_loss + + ep_loss /= batches_per_epoch + + if self.evaluate_on_num_examples and val_init_op is not None: + if (ep == 0 or + (ep + 1) % self.evaluate_every_num_epochs == 0 or + (ep + 1) == self.epochs): + train_acc = self._output_training_stat_dataset(val_init_op, mask, dialogue_len) last_loss = ep_loss - pbar.set_postfix( - { - "loss": "{:.3f}".format(ep_loss), - "acc": "{:.3f}".format(train_acc), - } - ) + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_loss), + "acc": "{:.3f}".format(train_acc) + }) else: - 
pbar.set_postfix({"loss": "{:.3f}".format(ep_loss)}) + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_loss) + }) if self.evaluate_on_num_examples: - logger.info( - "Finished training embedding policy, " - "loss={:.3f}, train accuracy={:.3f}" - "".format(last_loss, train_acc) - ) + logger.info("Finished training embedding classifier, " + "loss={:.3f}, train accuracy={:.3f}" + "".format(last_loss, train_acc)) - def _calc_train_acc(self, session_data: SessionData, mask: tf.Tensor) -> np.float32: - """Calculate training accuracy.""" + def _output_training_stat_dataset(self, val_init_op, mask, dialogue_len) -> np.ndarray: + """Output training statistics""" - # choose n examples to calculate train accuracy - n = self.evaluate_on_num_examples - ids = np.random.permutation(len(session_data.X))[:n] - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X[ids].shape[0])] - ) + self.session.run(val_init_op) - _sim, _mask = self.session.run( - [self.sim_op, mask], - feed_dict={ - self.a_in: session_data.X[ids], - self.b_in: all_Y_d_x, - self.c_in: session_data.slots[ids], - self.b_prev_in: session_data.previous_actions[ids], - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - }, - ) - return np.sum( - (np.argmax(_sim, -1) == session_data.actions_for_Y[ids]) * _mask - ) / np.sum(_mask) + sim_, mask_ = self.session.run([self.sim_op, mask], + feed_dict={self._is_training: False, + self._dialogue_len: dialogue_len}) + sim_ = sim_.reshape((-1, sim_.shape[-1])) + mask_ = mask_.reshape((-1,)) + + train_acc = np.sum((np.max(sim_, -1) == sim_.diagonal()) * mask_) / np.sum(mask_) + + return train_acc def continue_training( self, @@ -1334,6 +1881,22 @@ def continue_training( }, ) + def tf_feed_dict_for_prediction(self, + tracker: DialogueStateTracker, + domain: Domain) -> Dict: + # noinspection PyPep8Naming + data_X = self.featurizer.create_X([tracker], domain) + session_data = self._create_tf_session_data(domain, data_X) + # noinspection PyPep8Naming + all_Y_d_x = np.stack([session_data.all_Y_d + for _ in range(session_data.X.shape[0])]) + + return {self.a_in: session_data.X, + self.b_in: all_Y_d_x, + self.c_in: session_data.slots, + self.b_prev_in: session_data.previous_actions, + self._dialogue_len: session_data.X.shape[1]} + def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain ) -> List[float]: @@ -1357,7 +1920,9 @@ def predict_action_probabilities( all_Y_d_x = np.stack( [session_data.all_Y_d for _ in range(session_data.X.shape[0])] ) - + # self.similarity_type = 'cosine' + # mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + # self.sim_op, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) _sim = self.session.run( self.sim_op, feed_dict={ @@ -1366,24 +1931,26 @@ def predict_action_probabilities( self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, }, ) + # TODO assume we used inner: + self.similarity_type = "inner" + result = _sim[0, -1, :] if self.similarity_type == "cosine": # clip negative values to zero result[result < 0] = 0 elif self.similarity_type == "inner": - # normalize result to 
[0, 1] with softmax + # normalize result to [0, 1] with softmax but only over 3*num_neg+1 values + low_ids = result.argsort()[::-1][4*self.num_neg+1:] + result[low_ids] += -np.inf result = np.exp(result) result /= np.sum(result) return result.tolist() - def _persist_tensor(self, name: Text, tensor: tf.Tensor) -> None: + def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1408,7 +1975,7 @@ def persist(self, path: Text) -> None: file_name = "tensorflow_embedding.ckpt" checkpoint = os.path.join(path, file_name) - rasa.utils.io.create_directory_for_file(checkpoint) + utils.create_dir_for_file(checkpoint) with self.graph.as_default(): self._persist_tensor("intent_placeholder", self.a_in) @@ -1435,6 +2002,8 @@ def persist(self, path: Text) -> None: self._persist_tensor("all_time_masks", self.all_time_masks) + self._persist_tensor("attention_weights", self.attention_weights) + saver = tf.train.Saver() saver.save(self.session, checkpoint) @@ -1449,7 +2018,7 @@ def persist(self, path: Text) -> None: pickle.dump(self._tf_config, f) @staticmethod - def load_tensor(name: Text) -> Optional[tf.Tensor]: + def load_tensor(name: Text) -> Optional['tf.Tensor']: tensor_list = tf.get_collection(name) return tensor_list[0] if tensor_list else None @@ -1512,6 +2081,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": all_time_masks = cls.load_tensor("all_time_masks") + attention_weights = cls.load_tensor("attention_weights") + encoded_actions_file = os.path.join( path, "{}.encoded_all_actions.pkl".format(file_name) ) @@ -1543,4 +2114,5 @@ def load(cls, path: Text) -> "EmbeddingPolicy": attn_embed=attn_embed, copy_attn_debug=copy_attn_debug, all_time_masks=all_time_masks, + attention_weights=attention_weights ) From 92d24f07b28999e3f4869f9ba35a7011b4426234 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 9 Jul 2019 18:16:41 +0200 Subject: [PATCH 02/50] remove trash --- rasa/core/featurizers.py | 4 +- rasa/core/policies/embedding_policy.py | 1011 +++--------------------- 2 files changed, 118 insertions(+), 897 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 737b4e22a80e..f0e722975078 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -412,7 +412,7 @@ def create_X( def persist(self, path): featurizer_file = os.path.join(path, "featurizer.json") - utils.create_dir_for_file(featurizer_file) + rasa.utils.io.create_directory_for_file(featurizer_file) with open(featurizer_file, "w", encoding="utf-8") as f: # noinspection PyTypeChecker f.write(str(jsonpickle.encode(self))) @@ -568,7 +568,7 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[Dict]], List[List[Text]]]: + ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: trackers_as_states = [] trackers_as_actions = [] diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index bd6f9bcd9fb6..68bc2808a67d 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -12,7 +12,6 @@ import rasa.utils.io from rasa.core import utils -from rasa.core.actions.action import ACTION_LISTEN_NAME from rasa.core.domain import Domain from rasa.core.featurizers import ( TrackerFeaturizer, @@ -21,23 +20,20 @@ MaxHistoryTrackerFeaturizer ) from rasa.core.policies.policy import Policy +from rasa.core.trackers import 
DialogueStateTracker +from rasa.utils.common import is_logging_disabled import tensorflow as tf from tensorflow.python.ops import gen_array_ops -from tensor2tensor.layers import common_attention -from tensor2tensor.layers import common_layers -from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder -from tensor2tensor.models.evolved_transformer import evolved_transformer_encoder - -from rasa.core.policies.tf_utils import ( - TimeAttentionWrapper, - ChronoBiasLayerNormBasicLSTMCell, -) -from rasa.core.trackers import DialogueStateTracker -from rasa.utils.common import is_logging_disabled -if typing.TYPE_CHECKING: - from rasa.core.policies.tf_utils import TimeAttentionWrapperState +try: + from tensor2tensor.layers import common_attention + from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder +except ImportError: + common_attention = None + transformer_base = None + transformer_prepare_encoder = None + transformer_encoder = None try: import cPickle as pickle @@ -56,9 +52,6 @@ "slots", "previous_actions", "actions_for_Y", - "x_for_no_intent", - "y_for_no_action", - "y_for_action_listen", "all_Y_d", ), ) @@ -82,7 +75,6 @@ class EmbeddingPolicy(Policy): # number of hidden layers is equal to the length of this list "hidden_layers_sizes_b": [], - "transformer": False, "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} # introduce phase shift in time encodings between transformers # 0.5 - 0.8 works on small dataset @@ -123,8 +115,6 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # scale loss with inverse frequency of bot actions - "scale_loss_by_action_counts": True, # dropout rate for user nn "droprate_a": 0.0, # dropout rate for bot nn @@ -158,6 +148,11 @@ def _standard_featurizer(max_history=None): else: return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) + @staticmethod + def _check_t2t(): + if common_attention is None: + raise ImportError("Please install tensor2tensor") + def __init__( self, featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, @@ -170,9 +165,6 @@ def __init__( slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, dialogue_len: Optional['tf.Tensor'] = None, - x_for_no_intent: Optional['tf.Tensor'] = None, - y_for_no_action: Optional['tf.Tensor'] = None, - y_for_action_listen: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, alignment_history: Optional['tf.Tensor'] = None, user_embed: Optional['tf.Tensor'] = None, @@ -187,13 +179,9 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: - # if featurizer: - # if not isinstance(featurizer, FullDialogueTrackerFeaturizer): - # raise TypeError( - # "Passed tracker featurizer of type {}, " - # "should be FullDialogueTrackerFeaturizer." 
- # "".format(type(featurizer).__name__) - # ) + # check if t2t is installed + self._check_t2t() + if not featurizer: featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) @@ -221,9 +209,6 @@ def __init__( self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder self._dialogue_len = dialogue_len - self._x_for_no_intent_in = x_for_no_intent - self._y_for_no_action_in = y_for_no_action - self._y_for_action_listen_in = y_for_action_listen self.sim_op = similarity_op # store attention probability distribution as @@ -245,7 +230,6 @@ def __init__( # internal tf instances self._train_op = None self._is_training = None - self._loss_scales = None # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: @@ -266,7 +250,6 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] ) ) - self.transformer = config['transformer'] self.pos_encoding = config['pos_encoding'] self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] @@ -293,7 +276,6 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.scale_loss_by_action_counts = config["scale_loss_by_action_counts"] self.droprate = { "a": config["droprate_a"], "b": config["droprate_b"], @@ -373,18 +355,6 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: ] ) - # noinspection PyPep8Naming - @staticmethod - def _create_zero_vector(X: np.ndarray) -> np.ndarray: - """Create zero vector of shape (1, X.shape[-1]).""" - - return np.zeros((1, X.shape[-1]), X.dtype) - - def _create_y_for_action_listen(self, domain: "Domain") -> np.ndarray: - """Extract feature vector for action_listen""" - action_listen_idx = domain.index_for_action(ACTION_LISTEN_NAME) - return self.encoded_all_actions[action_listen_idx : action_listen_idx + 1] - # noinspection PyPep8Naming def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: """Stack encoded_all_intents on top of each other @@ -396,8 +366,8 @@ def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: return np.stack([self.encoded_all_actions] * dialogue_len) # noinspection PyPep8Naming - def _create_tf_session_data( - self, domain: "Domain", data_X: np.ndarray, data_Y: Optional[np.ndarray] = None + def _create_session_data( + self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None ) -> SessionData: """Combine all tf session related data into a named tuple""" @@ -412,10 +382,6 @@ def _create_tf_session_data( actions_for_Y = None Y = None - x_for_no_intent = self._create_zero_vector(X) - y_for_no_action = self._create_zero_vector(previous_actions) - y_for_action_listen = self._create_y_for_action_listen(domain) - # is needed to calculate train accuracy if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): dial_len = X.shape[1] @@ -429,13 +395,34 @@ def _create_tf_session_data( slots=slots, previous_actions=previous_actions, actions_for_Y=actions_for_Y, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, all_Y_d=all_Y_d, ) + @staticmethod + def _sample_session_data(session_data: 'SessionData', + num_samples: int) -> 'SessionData': + ids = np.random.permutation(len(session_data.X))[:num_samples] + return SessionData( + X=session_data.X[ids], + Y=session_data.Y[ids], 
+ slots=session_data.slots[ids], + previous_actions=session_data.previous_actions[ids], + actions_for_Y=session_data.actions_for_Y[ids], + all_Y_d=session_data.all_Y_d, + ) + # tf helpers: + @staticmethod + def _create_tf_dataset(session_data: 'SessionData', + batch_size: Union['tf.Tensor', int]) -> 'tf.data.Dataset': + train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, + session_data.Y, + session_data.slots, + session_data.previous_actions)) + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + train_dataset = train_dataset.batch(batch_size) + return train_dataset + def _create_tf_nn( self, x_in: 'tf.Tensor', @@ -473,19 +460,6 @@ def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': ) return embed_x - def _create_tf_user_embed(self, a_in: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding user vector.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "a" - - a = self._create_tf_nn( - a_in, - self.hidden_layer_sizes["a"], - self.droprate["a"], - layer_name_suffix=layer_name_suffix, - ) - return self._create_embed(a, layer_name_suffix=layer_name_suffix) - def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" @@ -499,302 +473,7 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_embed(b, layer_name_suffix=layer_name_suffix) - def _create_tf_no_intent_embed(self, x_for_no_intent_i: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding user vector for empty intent.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "a" - - x_for_no_intent = self._create_tf_nn( - x_for_no_intent_i, - self.hidden_layer_sizes["a"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(x_for_no_intent, layer_name_suffix=layer_name_suffix) - ) - - def _create_tf_no_action_embed(self, y_for_no_action_in: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding bot vector for empty action and action_listen.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "b" - - y_for_no_action = self._create_tf_nn( - y_for_no_action_in, - self.hidden_layer_sizes["b"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(y_for_no_action, layer_name_suffix=layer_name_suffix) - ) - - def _create_rnn_cell(self): - # type: () -> tf.contrib.rnn.RNNCell - """Create one rnn cell.""" - - # chrono initialization for forget bias - # assuming that characteristic time is max dialogue length - # left border that initializes forget gate close to 0 - bias_0 = -1.0 - - # right border that initializes forget gate close to 1 - bias_1 = np.log(self.characteristic_time - 1.0) - fbias = (bias_1 - bias_0) * np.random.random(self.rnn_size) + bias_0 - - if self.attn_after_rnn: - # since attention is copied to rnn output, - # embedding should be performed inside the cell - embed_layer_size = self.embed_dim - else: - embed_layer_size = None - - keep_prob = 1.0 - ( - self.droprate["rnn"] * tf.cast(self._is_training, tf.float32) - ) - - return ChronoBiasLayerNormBasicLSTMCell( - num_units=self.rnn_size, - layer_norm=self.layer_norm, - forget_bias=fbias, - input_bias=-fbias, - dropout_keep_prob=keep_prob, - out_layer_size=embed_layer_size, - ) - - @staticmethod - def _num_units(memory: 'tf.Tensor') -> int: - return memory.shape[-1].value - - def _create_attn_mech( - self, memory: 'tf.Tensor', real_length: 'tf.Tensor' - ) -> tf.contrib.seq2seq.AttentionMechanism: - - return 
tf.contrib.seq2seq.BahdanauAttention( - num_units=self._num_units(memory), - memory=memory, - memory_sequence_length=real_length, - normalize=True, - probability_fn=tf.identity, - # we only attend to memory up to a current time step - # it does not affect alignments, but - # is important for interpolation gate - score_mask_value=0, - ) - - def cell_input_fn( - self, - rnn_inputs: 'tf.Tensor', - attention: 'tf.Tensor', - num_cell_input_memory_units: int, - ) -> 'tf.Tensor': - """Combine rnn inputs and attention into cell input. - - Args: - rnn_inputs: Tensor, first output from `rnn_and_attn_inputs_fn`. - - attention: Tensor, concatenated all attentions for one time step. - - num_cell_input_memory_units: int, number of the first units in - `attention` that are responsible for - enhancing cell input. - - Returns: - A Tensor `cell_inputs` to feed to an rnn cell. - """ - - if num_cell_input_memory_units: - if num_cell_input_memory_units == self.embed_dim: - # since attention can contain additional - # attention mechanisms, only attention - # from previous user input is used as an input - # for rnn cell and only if memory before rnn - # is the same size as embed_utter - return tf.concat( - [ - rnn_inputs[:, : self.embed_dim] - + attention[:, :num_cell_input_memory_units], - rnn_inputs[:, self.embed_dim :], - ], - -1, - ) - else: - # in current implementation it cannot fall here, - # but this Exception exists in case - # attention before rnn is changed - raise ValueError( - "Number of memory units {} is not " - "equal to number of utter units {}. " - "Please modify cell input function " - "accordingly." - "".format(num_cell_input_memory_units, self.embed_dim) - ) - else: - return rnn_inputs - - def rnn_and_attn_inputs_fn( - self, inputs: 'tf.Tensor', cell_state: 'tf.Tensor' - ) -> Tuple['tf.Tensor', 'tf.Tensor']: - """Construct rnn input and attention mechanism input. - - Args: - inputs: Tensor, concatenated all embeddings for one time step: - [embed_utter, embed_slots, embed_prev_action]. - - cell_state: Tensor, state of an rnn cell. - - Returns: - Tuple of Tensors `rnn_inputs, attn_inputs` to feed to - rnn and attention mechanisms. 
- """ - - # the hidden state c and slots are not included, - # in hope that algorithm would learn correct attention - # regardless of the hidden state c of an lstm and slots - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state.h], -1) - else: - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state], -1) - - # include slots in inputs but exclude previous action, since - # rnn should get previous action from its hidden state - rnn_inputs = inputs[:, : (self.embed_dim + self.embed_dim)] - - return rnn_inputs, attn_inputs - - def _create_attn_cell( - self, - cell: tf.contrib.rnn.RNNCell, - embed_utter: 'tf.Tensor', - embed_prev_action: 'tf.Tensor', - real_length: 'tf.Tensor', - embed_for_no_intent: 'tf.Tensor', - embed_for_no_action: 'tf.Tensor', - embed_for_action_listen: 'tf.Tensor', - ) -> tf.contrib.rnn.RNNCell: - """Wrap cell in attention wrapper with given memory.""" - - if self.attn_before_rnn: - # create attention over previous user input - num_memory_units_before_rnn = self._num_units(embed_utter) - with tf.variable_scope('before', reuse=tf.AUTO_REUSE): - attn_mech = self._create_attn_mech(embed_utter, real_length) - - # create mask for empty user input not to pay attention to it - ignore_mask = tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_intent, 0), embed_utter), -1 - ) - - # do not use attention by location before rnn - attn_shift_range = 0 - else: - attn_mech = None - ignore_mask = None - num_memory_units_before_rnn = None - attn_shift_range = None - - if self.attn_after_rnn: - # create attention over previous bot actions - with tf.variable_scope('after', reuse=tf.AUTO_REUSE): - attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) - - # create mask for empty bot action or action_listen - # not to pay attention to them - ignore_mask_listen = tf.logical_or( - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), - -1, - ), - tf.reduce_all( - tf.equal( - tf.expand_dims(embed_for_action_listen, 0), embed_prev_action - ), - -1, - ), - ) - - if attn_mech is not None: - # if there is another attention mechanism, - # create a list of attention mechanisms - attn_mech = [attn_mech, attn_mech_after_rnn] - ignore_mask = [ignore_mask, ignore_mask_listen] - attn_shift_range = [attn_shift_range, self.attn_shift_range] - else: - attn_mech = attn_mech_after_rnn - ignore_mask = ignore_mask_listen - attn_shift_range = self.attn_shift_range - - # this particular attention mechanism is unusual - # in the sense that its calculated attention vector is directly - # added to cell output, therefore enabling copy mechanism - - # `index_of_attn_to_copy` is used by `TimeAttentionWrapper`, - # to know which attention to copy - index_of_attn_to_copy = -1 - else: - index_of_attn_to_copy = None - - return TimeAttentionWrapper( - cell=cell, - attention_mechanism=attn_mech, - sequence_len=self._dialogue_len, - attn_shift_range=attn_shift_range, - sparse_attention=self.sparse_attention, - rnn_and_attn_inputs_fn=self.rnn_and_attn_inputs_fn, - ignore_mask=ignore_mask, - cell_input_fn=lambda inputs, attention: ( - self.cell_input_fn(inputs, attention, num_memory_units_before_rnn) - ), - index_of_attn_to_copy=index_of_attn_to_copy, - likelihood_fn=lambda emb_1, emb_2: (self._tf_sim(emb_1, emb_2, None)), - tensor_not_to_copy=embed_for_action_listen, - output_attention=True, - alignment_history=True, - ) - - def _create_tf_dial_embed( - self, - embed_utter: 'tf.Tensor', - 
embed_slots: 'tf.Tensor', - embed_prev_action: 'tf.Tensor', - mask: 'tf.Tensor', - embed_for_no_intent: 'tf.Tensor', - embed_for_no_action: 'tf.Tensor', - embed_for_action_listen: 'tf.Tensor', - ) -> Tuple['tf.Tensor', Union['tf.Tensor', "TimeAttentionWrapperState"]]: - """Create rnn for dialogue level embedding.""" - - cell_input = tf.concat([embed_utter, embed_slots, embed_prev_action], -1) - - cell = self._create_rnn_cell() - - real_length = tf.cast(tf.reduce_sum(mask, 1), tf.int32) - - if self.is_using_attention(): - cell = self._create_attn_cell( - cell, - embed_utter, - embed_prev_action, - real_length, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - - with tf.variable_scope('rnn_decoder', reuse=tf.AUTO_REUSE): - return tf.nn.dynamic_rnn( - cell, - cell_input, - dtype=tf.float32, - sequence_length=real_length, - ) - - def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): - x_in = tf.concat([a_in, b_prev_in], -1) - # print(x_in.shape[-1]) - # exit() - - # x = x_in + def _create_hparams(self): hparams = transformer_base() hparams.num_hidden_layers = self.num_rnn_layers @@ -812,8 +491,12 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei hparams.self_attention_type = "dot_product_relative_v2" hparams.max_relative_position = 5 hparams.add_relative_to_values = True + return hparams - # hparams.proximity_bias = True + def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + hparams = self._create_hparams() + + x_in = tf.concat([a_in, b_prev_in, c_in], -1) # When not in training mode, set all forms of dropout to zero. for key, value in hparams.values().items(): @@ -828,37 +511,11 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei kernel_regularizer=reg, name='transformer_embed_layer', reuse=tf.AUTO_REUSE) - # a = tf.layers.dense(inputs=a_in, - # units=hparams.hidden_size/3, - # use_bias=False, - # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='transformer_embed_layer_a', - # reuse=tf.AUTO_REUSE) - # - c = tf.layers.dense(inputs=c_in, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - kernel_regularizer=reg, - name='transformer_embed_layer_c', - reuse=tf.AUTO_REUSE) - # - # b = tf.layers.dense(inputs=b_prev_in, - # units=hparams.hidden_size/3, - # use_bias=False, - # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='transformer_embed_layer_b', - # reuse=tf.AUTO_REUSE) - - # x = tf.concat([a, c, b], -1) x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 - c *= hparams.hidden_size ** 0.5 x *= tf.expand_dims(mask, -1) @@ -889,219 +546,9 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei attn_bias_for_padding=attn_bias_for_padding, ) - # x = tf.concat([x, c_in], -1) - # c_gate = tf.layers.dense(inputs=x, - # # units=hparams.hidden_size, - # # activation=tf.nn.softmax, - # units=1, - # activation=tf.math.sigmoid, - # bias_initializer=tf.constant_initializer(-1), - # # use_bias=False, - # # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='slots_gate_layer_c', - # reuse=tf.AUTO_REUSE) - 
x += c #* c_gate - # x = common_layers.layer_postprocess(x, c, hparams) x *= tf.expand_dims(mask, -1) - return tf.nn.relu(x), self_attention_bias, x_in - - @staticmethod - def _rearrange_fn(list_tensor_1d_mask_1d): - """Rearranges tensor_1d to put all the values - where mask_1d=1 to the right and - where mask_1d=0 to the left""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) - - return tf.concat(partitioned_tensor, 0) - - @staticmethod - def _arrange_back_fn(list_tensor_1d_mask_1d): - """Arranges back tensor_1d to restore original order - modified by `_rearrange_fn` according to mask_1d: - - number of 0s in mask_1d values on the left are set to - their corresponding places where mask_1d=0, - - number of 1s in mask_1d values on the right are set to - their corresponding places where mask_1d=1""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - mask_indices = tf.dynamic_partition( - tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 - ) - - mask_sum = tf.reduce_sum(mask_1d, axis=0) - partitioned_tensor = [ - tf.zeros_like(tensor_1d[:-mask_sum]), - tensor_1d[-mask_sum:], - ] - - return tf.dynamic_stitch(mask_indices, partitioned_tensor) - - def _action_to_copy(self, x_in, x, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action): - with tf.variable_scope('copy', reuse=tf.AUTO_REUSE): - ignore_mask_listen = tf.to_float(tf.logical_or( - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), - -1, - ), - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_action_listen, 0), embed_prev_action), - -1, - ), - )) - - triag_mask = tf.expand_dims( - common_attention.attention_bias_to_padding(self_attention_bias[0, 0, :, tf.newaxis, tf.newaxis, :]), 0) - diag_mask = 1 - (1 - triag_mask) * tf.cumprod(triag_mask, axis=-1, exclusive=True, reverse=True) - - bias = self_attention_bias + common_attention.attention_bias_ignore_padding(ignore_mask_listen) * tf.expand_dims(diag_mask, 1) - - copy_weights = {} - common_attention.multihead_attention(x_in, - embed_prev_action, - bias, - self.rnn_size, - self.embed_dim, - self.embed_dim, - 1, - 0, - save_weights_to=copy_weights) - - copy_weights = copy_weights['copy/multihead_attention/dot_product_attention'][:, 0, :, :] - bias = bias[:, 0, :, :] - shape = tf.shape(copy_weights) - copy_weights = tf.reshape(copy_weights, (-1, shape[-1])) - x_flat = tf.reshape(x_in, (-1, x_in.shape[-1])) - bias = tf.reshape(bias, (-1, shape[-1])) - ignore_mask = common_attention.attention_bias_to_padding(bias[:, tf.newaxis, tf.newaxis, :], tf.to_int32) - - s_w = tf.layers.dense( - inputs=x_flat, - units=2 * self.attn_shift_range + 1, - activation=tf.nn.softmax, - name="shift_weight", - reuse=tf.AUTO_REUSE - ) - mask = 1 - ignore_mask - conv_weights = tf.map_fn( - self._rearrange_fn, [copy_weights, mask], dtype=copy_weights.dtype - ) - - conv_weights = tf.reverse(conv_weights, axis=[1]) - - # preare probs for tf.nn.depthwise_conv2d - # [in_width, in_channels=batch] - conv_weights = tf.transpose(conv_weights, [1, 0]) - # [batch=1, in_height=1, in_width=time+1, in_channels=batch] - conv_weights = conv_weights[tf.newaxis, tf.newaxis, :, :] - - # [filter_height=1, filter_width=2*attn_shift_range+1, - # in_channels=batch, channel_multiplier=1] - conv_s_w = tf.transpose(s_w, [1, 0]) - conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] - - # perform 1d convolution - # [batch=1, out_height=1, out_width=time+1, out_channels=batch] - conv_weights = 
tf.nn.depthwise_conv2d_native( - conv_weights, conv_s_w, [1, 1, 1, 1], "SAME" - ) - conv_weights = conv_weights[0, 0, :, :] - conv_weights = tf.transpose(conv_weights, [1, 0]) - - conv_weights = tf.reverse(conv_weights, axis=[1]) - - # arrange probs back to their original time order - copy_weights = tf.map_fn( - self._arrange_back_fn, [conv_weights, mask], dtype=conv_weights.dtype - ) - - # sharpening parameter - g_sh = tf.layers.dense( - inputs=x_flat, - units=1, - activation=lambda a: tf.nn.softplus(a) + 1, - bias_initializer=tf.constant_initializer(1), - name="gamma_sharp", - reuse=tf.AUTO_REUSE - ) - - powed_weights = tf.pow(copy_weights, g_sh) - copy_weights = powed_weights / (tf.reduce_sum(powed_weights, 1, keepdims=True) + 1e-32) - - copy_weights = tf.reshape(copy_weights, shape) - - # remove current time - copy_prev = copy_weights * diag_mask - keep_current = copy_weights * (1 - diag_mask) - dial_embed = self._create_embed(x, layer_name_suffix="out") - return tf.matmul(copy_prev, embed_prev_action) + tf.matmul(keep_current, dial_embed), copy_weights - - @staticmethod - def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': - """Extract alignments history form final rnn cell state.""" - - alignments_from_state = final_state.alignment_history - if not isinstance(alignments_from_state, tuple): - alignments_from_state = [alignments_from_state] - - alignment_history = [] - for alignments in alignments_from_state: - # reshape to (batch, time, memory_time) - alignment_history.append(tf.transpose(alignments.stack(), [1, 0, 2])) - - return tf.concat(alignment_history, -1) - - @staticmethod - def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': - """Extract all time masks form final rnn cell state.""" - - # reshape to (batch, time, memory_time) and ignore last time - # because time_mask is created for the next time step - return tf.transpose(final_state.all_time_masks.stack(), [1, 0, 2])[:, :-1, :] - - def _sims_rnn_to_max_from(self, cell_output: 'tf.Tensor') -> List['tf.Tensor']: - """Save intermediate tensors for debug purposes.""" - - if self.attn_after_rnn: - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.copy_attn_debug = cell_output[:, :, -num_add:] - - # extract additional similarity to maximize - sim_attn_to_max = cell_output[:, :, -num_add] - sim_state_to_max = cell_output[:, :, -num_add + 1] - return [sim_attn_to_max, sim_state_to_max] - else: - return [] - - def _embed_dialogue_from(self, cell_output: 'tf.Tensor') -> 'tf.Tensor': - """Extract or calculate dialogue level embedding from cell_output.""" - - if self.attn_after_rnn: - # embedding layer is inside rnn cell - embed_dialogue = cell_output[:, :, : self.embed_dim] - - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.rnn_embed = cell_output[ - :, :, self.embed_dim : (self.embed_dim + self.embed_dim) - ] - self.attn_embed = cell_output[ - :, :, (self.embed_dim + self.embed_dim) : -num_add - ] - else: - # add embedding layer to rnn cell output - embed_dialogue = self._create_embed( - cell_output[:, :, : self.rnn_size], layer_name_suffix="out" - ) - if self.attn_before_rnn: - # extract additional debug tensors - self.attn_embed = cell_output[:, :, self.rnn_size :] - - return embed_dialogue + return tf.nn.relu(x) def _tf_sample_neg(self, pos_b, @@ -1160,8 +607,7 @@ def _tf_sim( embed_dialogue: 'tf.Tensor', embed_action: 'tf.Tensor', mask: Optional['tf.Tensor'], - ) 
-> Union[Tuple['tf.Tensor', 'tf.Tensor'], - Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor']]: + ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: """Define similarity. This method has two roles: @@ -1183,135 +629,44 @@ def _tf_sim( "".format(self.similarity_type) ) - if len(embed_dialogue.shape) == 2 and len(embed_action.shape) == 2: - # calculate similarity between - # two embedding vectors of the same size + # calculate similarity with several + # embedded actions for the loss - # always use cosine sim for copy mech + if self.similarity_type == "cosine": + # normalize embedding vectors for cosine similarity embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) embed_action = tf.nn.l2_normalize(embed_action, -1) - cos_sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - - bin_sim = tf.where( - cos_sim > (self.mu_pos - self.mu_neg) / 2.0, - tf.ones_like(cos_sim), - tf.zeros_like(cos_sim), - ) - - # output binary mask and similarity - return bin_sim, cos_sim - + if len(embed_dialogue.shape) == 4: + embed_dialogue_pos = embed_dialogue[:, :, :1, :] else: - # calculate similarity with several - # embedded actions for the loss + embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity - embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) - embed_action = tf.nn.l2_normalize(embed_action, -1) + sim = tf.reduce_sum( + embed_dialogue_pos * embed_action, -1 + ) * tf.expand_dims(mask, 2) - if len(embed_dialogue.shape) == 4: - embed_dialogue_pos = embed_dialogue[:, :, :1, :] - else: - embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) + sim_bot_emb = tf.reduce_sum( + embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) - sim = tf.reduce_sum( - embed_dialogue_pos * embed_action, -1 + if len(embed_dialogue.shape) == 4: + sim_dial_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 ) * tf.expand_dims(mask, 2) + else: + sim_dial_emb = None - sim_bot_emb = tf.reduce_sum( - embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + if len(embed_dialogue.shape) == 4: + sim_dial_bot_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 ) * tf.expand_dims(mask, 2) - - if len(embed_dialogue.shape) == 4: - sim_dial_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_emb = None - - if len(embed_dialogue.shape) == 4: - sim_dial_bot_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_bot_emb = None - - # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb - - # noinspection PyPep8Naming - def _scale_loss_by_count_actions( - self, - X, - Y, - slots, - previous_actions, - ) -> Union[np.ndarray, List[List]]: - """Calculate inverse proportionality of repeated actions.""" - - if self.scale_loss_by_action_counts: - # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # full = tf.concat([X, slots, previous_actions, Y], -1) - # else: - full = Y - - flat = tf.reshape(full, (-1, full.shape[-1])) - _, i, c = gen_array_ops.unique_with_counts_v2(flat, axis=[0]) - c = tf.cast(c, tf.float32) - - counts = tf.reshape(tf.gather(c, i), (tf.shape(Y)[0], tf.shape(Y)[1])) - - # do not include [-1 -1 ... 
-1 0] in averaging - # and smooth it by taking sqrt - - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # action_listen is the top one by an order - max_c = tf.math.top_k(c, 2)[0][1] - else: - max_c = tf.reduce_max(c) - # max_c = tf.math.top_k(c, 2)[0][1] - # max_c = tf.cond(tf.shape(c)[0] > 1, lambda: tf.math.top_k(c, 2)[0][1], lambda: tf.reduce_max(c)) - # max_c = tf.reduce_max(c) - - return tf.maximum(max_c / counts, 1) - # return tf.maximum(tf.square(max_c / counts), 1) - - # exit() - # full_X = tf.concat( - # [X, slots, previous_actions, Y], -1 - # ) - # full_X = tf.reshape(full_X, (-1, full_X.shape[-1])) - # # include [-1 -1 ... -1 0] as first - # # full_X = tf.concat([full_X[-1:], full_X], 0) - # - # _, i, c = gen_array_ops.unique_with_counts_v2(full_X, axis=[0]) - # c = tf.cast(c, tf.float32) - # - # counts = tf.reshape(tf.gather(c, i), (tf.shape(X)[0], tf.shape(X)[1])) - # - # # do not include [-1 -1 ... -1 0] in averaging - # # and smooth it by taking sqrt - # return tf.maximum(tf.sqrt(tf.reduce_mean(c) / counts), 1) else: - return [[None]] - - def _regularization_loss(self): - # type: () -> Union['tf.Tensor', int] - """Add regularization to the embed layer inside rnn cell.""" - - if self.attn_after_rnn: - vars_to_reg = [ - tf.nn.l2_loss(tf_var) - for tf_var in tf.trainable_variables() - if "cell/out_layer/kernel" in tf_var.name - ] - if vars_to_reg: - return self.C2 * tf.add_n(vars_to_reg) + sim_dial_bot_emb = None - return 0 + # output similarities between user input and bot actions + # and similarities between bot actions + return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb def _tf_loss( self, @@ -1339,10 +694,6 @@ def _tf_loss( max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer) and self.scale_loss_by_action_counts: - # scale loss inverse proportionally to number of action counts - loss *= self._loss_scales - # penalize max similarity between bot embeddings sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) @@ -1410,19 +761,11 @@ def _tf_loss_2( # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # if self.scale_loss_by_action_counts: - # scale_mask = self._loss_scales * mask - # else: - scale_mask = mask - # else: - # scale_mask = 1.0 - loss = tf.losses.softmax_cross_entropy(labels, logits, - scale_mask * already_learned) + mask * already_learned) # add regularization losses - loss += self._regularization_loss() + tf.losses.get_regularization_loss() + loss += tf.losses.get_regularization_loss() # maximize similarity returned by time attention wrapper add_loss = [] @@ -1457,10 +800,6 @@ def train( # dealing with training data training_data = self.featurize_for_training(training_trackers, domain, **kwargs) - # assume that characteristic time is the mean length of the dialogues - self.characteristic_time = np.mean(training_data.true_length) - if self.attn_shift_range is None: - self.attn_shift_range = int(self.characteristic_time / 2) # encode all actions with policies' featurizer self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( @@ -1477,9 +816,7 @@ def train( self.num_neg = min(self.num_neg, domain.num_actions - 1) # extract actual training data to feed to tf session - session_data = 
self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + session_data = self._create_session_data(training_data.X, training_data.y) self.graph = tf.Graph() @@ -1487,52 +824,26 @@ def train( # set random seed in tf tf.set_random_seed(self.random_seed) + # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, - session_data.Y, - session_data.slots, - session_data.previous_actions)) - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) - train_dataset = train_dataset.batch(batch_size_in) + train_dataset = self._create_tf_dataset(session_data, batch_size_in) if self.evaluate_on_num_examples: - ids = np.random.permutation(len(session_data.X))[:self.evaluate_on_num_examples] - - val_dataset = tf.data.Dataset.from_tensor_slices((session_data.X[ids], - session_data.Y[ids], - session_data.slots[ids], - session_data.previous_actions[ids]) - ).batch(self.evaluate_on_num_examples) + eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples) else: - val_dataset = None + eval_train_dataset = None iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes, output_classes=train_dataset.output_classes) - self.a_in, self.b_in, self.c_in, self.b_prev_in = iterator.get_next() - - self.a_in = tf.cast(self.a_in, tf.float32) - self.b_in = tf.cast(self.b_in, tf.float32) - self.c_in = tf.cast(self.c_in, tf.float32) - self.b_prev_in = tf.cast(self.b_prev_in, tf.float32) + # session data are int counts but we need a float tensors + (self.a_in, + self.b_in, + self.c_in, + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) - # they don't change - self._x_for_no_intent_in = tf.constant( - session_data.x_for_no_intent, - dtype=tf.float32, - name="x_for_no_intent", - ) - self._y_for_no_action_in = tf.constant( - session_data.y_for_no_action, - dtype=tf.float32, - name="y_for_no_action", - ) - self._y_for_action_listen_in = tf.constant( - session_data.y_for_action_listen, - dtype=tf.float32, - name="y_for_action_listen", - ) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, name="all_actions") @@ -1547,55 +858,20 @@ def train( # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + self.attention_weights = {} + transformer_out = self._create_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") + sims_rnn_to_max = [] + self.bot_embed = self._create_tf_bot_embed(self.b_in) all_actions_embed = self._create_tf_bot_embed(all_actions) - embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - embed_for_no_action = self._create_tf_no_action_embed( - self._y_for_no_action_in - ) - embed_for_action_listen = self._create_tf_no_action_embed( - self._y_for_action_listen_in - ) - - if self.transformer: - self.attention_weights = {} - tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action) - self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") #+ 
self._create_embed(self.c_in, layer_name_suffix="slots") - sims_rnn_to_max = [] - else: - # create embedding vectors - self.user_embed = self._create_tf_user_embed(self.a_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - - embed_for_no_intent = self._create_tf_no_intent_embed( - self._x_for_no_intent_in - ) - - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) - - self.all_time_masks = self._all_time_masks_from(final_state) - - sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) - self.dial_embed = self._embed_dialogue_from(cell_output) - # calculate similarities if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - self.b_in = tf.expand_dims(self.b_in, 1) - self.bot_embed = tf.expand_dims(self.bot_embed, 1) + # pick last action if max history is used + self.b_in = self.b_in[:, tf.newaxis, :] + self.bot_embed = self.bot_embed[:, tf.newaxis, :] self.dial_embed = self.dial_embed[:, -1:, :] mask = mask[:, -1:] @@ -1641,11 +917,6 @@ def train( # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) - # construct loss - if self.scale_loss_by_action_counts: - self._loss_scales = self._scale_loss_by_count_actions(self.a_in, self.b_in, self.c_in, self.b_prev_in) - else: - self._loss_scales = None # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) @@ -1656,15 +927,15 @@ def train( train_init_op = iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: - val_init_op = iterator.make_initializer(val_dataset) + eval_init_op = iterator.make_initializer(eval_train_dataset) else: - val_init_op = None + eval_init_op = None # train tensorflow graph self.session = tf.Session(config=self._tf_config) # self._train_tf(session_data, loss, mask) - self._train_tf_dataset(train_init_op, val_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) dialogue_len = None # use dynamic time for rnn # create placeholders @@ -1693,40 +964,12 @@ def train( # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - self.bot_embed = self._create_tf_bot_embed(self.b_in) - embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - - if self.transformer: - self.attention_weights = {} - tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, - self.attention_weights) - # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, - # embed_prev_action, - # embed_for_action_listen, - # embed_for_no_action) - self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") - - else: - self.user_embed = self._create_tf_user_embed(self.a_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - 
self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) - - self.all_time_masks = self._all_time_masks_from(final_state) + self.attention_weights = {} + transformer_out = self._create_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - self.dial_embed = self._embed_dialogue_from(cell_output) + self.bot_embed = self._create_tf_bot_embed(self.b_in) if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): self.dial_embed = self.dial_embed[:, -1:, :] @@ -1758,7 +1001,7 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: def _train_tf_dataset(self, train_init_op, - val_init_op, + eval_init_op, batch_size_in, loss: 'tf.Tensor', mask, @@ -1799,11 +1042,11 @@ def _train_tf_dataset(self, ep_loss /= batches_per_epoch - if self.evaluate_on_num_examples and val_init_op is not None: + if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(val_init_op, mask, dialogue_len) + train_acc = self._output_training_stat_dataset(eval_init_op, mask, dialogue_len) last_loss = ep_loss pbar.set_postfix({ @@ -1820,10 +1063,10 @@ def _train_tf_dataset(self, "loss={:.3f}, train accuracy={:.3f}" "".format(last_loss, train_acc)) - def _output_training_stat_dataset(self, val_init_op, mask, dialogue_len) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, mask, dialogue_len) -> np.ndarray: """Output training statistics""" - self.session.run(val_init_op) + self.session.run(eval_init_op) sim_, mask_ = self.session.run([self.sim_op, mask], feed_dict={self._is_training: False, @@ -1851,19 +1094,10 @@ def continue_training( batch_size, training_trackers, domain ) - session_data = self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + session_data = self._create_session_data(training_data.X, training_data.y) b = self._create_batch_b(session_data.Y, session_data.actions_for_Y) - batch_loss_scales = self._scale_loss_by_count_actions( - session_data.X, - session_data.slots, - session_data.previous_actions, - session_data.actions_for_Y, - ) - # fit to one extra example using updated trackers self.session.run( self._train_op, @@ -1873,11 +1107,7 @@ def continue_training( self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, self._is_training: True, - self._loss_scales: batch_loss_scales, }, ) @@ -1886,7 +1116,7 @@ def tf_feed_dict_for_prediction(self, domain: Domain) -> Dict: # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_tf_session_data(domain, data_X) + session_data = self._create_session_data(data_X) # noinspection PyPep8Naming all_Y_d_x = np.stack([session_data.all_Y_d for _ in range(session_data.X.shape[0])]) @@ -1915,7 +1145,7 @@ def predict_action_probabilities( # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) - session_data = 
self._create_tf_session_data(domain, data_X) + session_data = self._create_session_data(data_X) # noinspection PyPep8Naming all_Y_d_x = np.stack( [session_data.all_Y_d for _ in range(session_data.X.shape[0])] @@ -1975,7 +1205,7 @@ def persist(self, path: Text) -> None: file_name = "tensorflow_embedding.ckpt" checkpoint = os.path.join(path, file_name) - utils.create_dir_for_file(checkpoint) + rasa.utils.io.create_directory_for_file(checkpoint) with self.graph.as_default(): self._persist_tensor("intent_placeholder", self.a_in) @@ -1983,9 +1213,6 @@ def persist(self, path: Text) -> None: self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", self.b_prev_in) self._persist_tensor("dialogue_len", self._dialogue_len) - self._persist_tensor("x_for_no_intent", self._x_for_no_intent_in) - self._persist_tensor("y_for_no_action", self._y_for_no_action_in) - self._persist_tensor("y_for_action_listen", self._y_for_action_listen_in) self._persist_tensor("similarity_op", self.sim_op) @@ -2062,9 +1289,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") dialogue_len = cls.load_tensor("dialogue_len") - x_for_no_intent = cls.load_tensor("x_for_no_intent") - y_for_no_action = cls.load_tensor("y_for_no_action") - y_for_action_listen = cls.load_tensor("y_for_action_listen") sim_op = cls.load_tensor("similarity_op") @@ -2101,9 +1325,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": slots_placeholder=c_in, prev_act_placeholder=b_prev_in, dialogue_len=dialogue_len, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, similarity_op=sim_op, alignment_history=alignment_history, user_embed=user_embed, From a39e10c17d99b450bfee420b69f4c56954d4b365 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 16 Jul 2019 17:40:48 +0200 Subject: [PATCH 03/50] refactor neg sampling, sim, acc and loss --- rasa/core/policies/embedding_policy.py | 481 ++++++++++++------------- 1 file changed, 227 insertions(+), 254 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 68bc2808a67d..63a10afcfe3b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -60,7 +60,7 @@ class EmbeddingPolicy(Policy): """Recurrent Embedding Dialogue Policy (REDP) - The policy that is used in our paper https://arxiv.org/abs/1811.11707 + Transformer version of the policy used in our paper https://arxiv.org/abs/1811.11707 """ SUPPORTS_ONLINE_TRAINING = True @@ -103,7 +103,8 @@ class EmbeddingPolicy(Policy): # maximum negative similarity for incorrect actions "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' # the type of the similarity - "similarity_type": "cosine", # string 'cosine' or 'inner' + "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + "loss_type": 'softmax', # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "num_neg": 20, @@ -164,7 +165,6 @@ def __init__( action_placeholder: Optional['tf.Tensor'] = None, slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, - dialogue_len: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, alignment_history: Optional['tf.Tensor'] = None, user_embed: Optional['tf.Tensor'] = None, @@ -208,7 +208,6 @@ def __init__( self.b_in = action_placeholder self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder - self._dialogue_len = dialogue_len self.sim_op = similarity_op # store attention probability distribution as @@ -270,6 +269,13 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.mu_pos = config["mu_pos"] self.mu_neg = config["mu_neg"] self.similarity_type = config["similarity_type"] + self.loss_type = config['loss_type'] + if self.similarity_type == 'auto': + if self.loss_type == 'softmax': + self.similarity_type = 'inner' + elif self.loss_type == 'margin': + self.similarity_type = 'cosine' + self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] @@ -414,12 +420,14 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: @staticmethod def _create_tf_dataset(session_data: 'SessionData', - batch_size: Union['tf.Tensor', int]) -> 'tf.data.Dataset': + batch_size: Union['tf.Tensor', int], + shuffle: bool = True) -> 'tf.data.Dataset': train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, session_data.Y, session_data.slots, session_data.previous_actions)) - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + if shuffle: + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) train_dataset = train_dataset.batch(batch_size) return train_dataset @@ -550,77 +558,55 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei return tf.nn.relu(x) - def _tf_sample_neg(self, - pos_b, - neg_bs=None, - neg_ids=None, - batch_size=None, - first_only=False + @staticmethod + def _tf_make_flat(x): + return tf.reshape(x, (-1, x.shape[-1])) + + @staticmethod + def _tf_sample_neg(batch_size, + all_bs, + neg_ids, ) -> 'tf.Tensor': - all_b = pos_b[tf.newaxis, :, :] - if batch_size is None: - batch_size = tf.shape(pos_b)[0] - all_b = tf.tile(all_b, [batch_size, 1, 1]) - if neg_bs is None and neg_ids is None: - return all_b - - def sample_neg_b(): - if neg_bs is not None: - _neg_bs = neg_bs - elif neg_ids is not None: - _neg_bs = tf.batch_gather(all_b, neg_ids) - else: - raise - return tf.concat([pos_b[:, tf.newaxis, :], _neg_bs], 1) + tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) - if first_only: - out_b = pos_b[:, tf.newaxis, :] - else: - out_b = all_b + return tf.batch_gather(tiled_all_bs, neg_ids) - if neg_bs is not None: - cond = tf.logical_and(self._is_training, tf.shape(neg_bs)[0] > 1) - elif neg_ids is not None: - cond = tf.logical_and(self._is_training, tf.shape(neg_ids)[0] > 1) - else: - raise + def _tf_calc_iou_mask(self, + pos_b, + all_bs, + neg_ids, + ) -> 'tf.Tensor': - return tf.cond(cond, sample_neg_b, lambda: out_b) + pos_b_in_flat = pos_b[:, tf.newaxis, :] + neg_b_in_flat = 
self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - def _tf_calc_iou(self, - b_raw, - neg_bs=None, - neg_ids=None - ) -> 'tf.Tensor': + intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) + union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - tiled_intent_raw = self._tf_sample_neg(b_raw, neg_bs=neg_bs, neg_ids=neg_ids) - pos_b_raw = tiled_intent_raw[:, :1, :] - neg_b_raw = tiled_intent_raw[:, 1:, :] - intersection_b_raw = tf.minimum(neg_b_raw, pos_b_raw) - union_b_raw = tf.maximum(neg_b_raw, pos_b_raw) + iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) + return 1. - tf.nn.relu(tf.sign(1. - iou)) - return tf.reduce_sum(intersection_b_raw, -1) / tf.reduce_sum(union_b_raw, -1) + def _tf_get_negs(self, all_embed, all_raw, raw_pos): - def _tf_sim( - self, - embed_dialogue: 'tf.Tensor', - embed_action: 'tf.Tensor', - mask: Optional['tf.Tensor'], - ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: - """Define similarity. - - This method has two roles: - - calculate similarity between - two embedding vectors of the same size - and output binary mask and similarity; - - calculate similarity with several embedded actions for the loss - and output similarities between user input and bot actions - and similarities between bot actions. - - They are kept in the same helper method, - because it is necessary for them to be mathematically identical. - """ + batch_size = tf.shape(raw_pos)[0] + seq_length = tf.shape(raw_pos)[1] + raw_flat = self._tf_make_flat(raw_pos) + + neg_ids = tf.random.categorical(tf.log(tf.ones((batch_size * seq_length, + tf.shape(all_raw)[0]))), + self.num_neg) + + bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) + bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) + + neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, all_embed, neg_ids) + neg_embed = tf.reshape(neg_embed_flat, + (batch_size, seq_length, -1, all_embed.shape[-1])) + + return neg_embed, bad_negs + + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': if self.similarity_type not in {"cosine", "inner"}: raise ValueError( @@ -629,62 +615,72 @@ def _tf_sim( "".format(self.similarity_type) ) - # calculate similarity with several - # embedded actions for the loss - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity - embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) - embed_action = tf.nn.l2_normalize(embed_action, -1) - - if len(embed_dialogue.shape) == 4: - embed_dialogue_pos = embed_dialogue[:, :, :1, :] + return tf.nn.l2_normalize(a, -1) else: - embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) + return a - sim = tf.reduce_sum( - embed_dialogue_pos * embed_action, -1 - ) * tf.expand_dims(mask, 2) + @staticmethod + def _tf_raw_sim( + a: 'tf.Tensor', + b: 'tf.Tensor', + mask: 'tf.Tensor', + ) -> 'tf.Tensor': - sim_bot_emb = tf.reduce_sum( - embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) + return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) - if len(embed_dialogue.shape) == 4: - sim_dial_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_emb = None + def _tf_sim( + self, + pos_dial_embed: 'tf.Tensor', + pos_bot_embed: 'tf.Tensor', + neg_dial_embed: 'tf.Tensor', + neg_bot_embed: 'tf.Tensor', + dial_bad_negs: 'tf.Tensor', + bot_bad_negs: 'tf.Tensor', + mask: 'tf.Tensor', + ) -> Tuple['tf.Tensor', 'tf.Tensor', 
'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + """Define similarity.""" - if len(embed_dialogue.shape) == 4: - sim_dial_bot_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_bot_emb = None + # calculate similarity with several + # embedded actions for the loss + neg_inf = common_attention.large_compatible_negative(pos_dial_embed.dtype) + + sim_pos = self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) + sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, + mask) + neg_inf * bot_bad_negs + sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, + mask) + neg_inf * bot_bad_negs + sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, + mask) + neg_inf * dial_bad_negs + sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, + mask) + neg_inf * dial_bad_negs # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb + # and similarities between bot actions and similarities between user inputs + return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial - def _tf_loss( + @staticmethod + def _tf_calc_accuracy(sim_pos, sim_neg): + + max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) + return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), + tf.float32)) + + def _tf_loss_margin( self, - sim: 'tf.Tensor', - sim_bot_emb: 'tf.Tensor', - sim_dial_emb: 'tf.Tensor', - sims_rnn_to_max: List['tf.Tensor'], - bad_negs, + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor', - batch_bad_negs ) -> 'tf.Tensor': - """Define loss.""" + """Define max margin loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0., self.mu_pos - sim[:, :, 0]) + loss = tf.maximum(0., self.mu_pos - sim_pos[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions - sim_neg = sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions max_sim_neg = tf.reduce_max(sim_neg, -1) @@ -694,71 +690,55 @@ def _tf_loss( max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - # penalize max similarity between bot embeddings - sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs - max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) - loss += max_sim_bot_emb * self.C_emb + # penalize max similarity between pos bot and neg bot embeddings + max_sim_neg_bot = tf.maximum(0., tf.reduce_max(sim_neg_bot_bot, -1)) + loss += max_sim_neg_bot * self.C_emb - # penalize max similarity between dial embeddings - if sim_dial_emb is not None: - sim_dial_emb += common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs - max_sim_input_emb = tf.maximum(0., tf.reduce_max(sim_dial_emb, -1)) - loss += max_sim_input_emb * self.C_emb + # penalize max similarity between pos dial and neg dial embeddings + max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_dial_dial, -1)) + loss += max_sim_neg_dial * self.C_emb - # maximize similarity returned by time attention wrapper - for sim_to_add in sims_rnn_to_max: - loss += tf.maximum(0.0, 1.0 - sim_to_add) + # penalize max similarity between pos bot and neg dial embeddings + max_sim_neg_dial = 
tf.maximum(0., tf.reduce_max(sim_neg_bot_dial, -1)) + loss += max_sim_neg_dial * self.C_emb # mask loss for different length sequences loss *= mask # average the loss over sequence length loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1) - # average the loss over the batch - loss = ( - tf.reduce_mean(loss) - # add regularization losses - + self._regularization_loss() - + tf.losses.get_regularization_loss() - ) + loss = tf.reduce_mean(loss) + + # add regularization losses + loss += tf.losses.get_regularization_loss() + return loss - def _tf_loss_2( - self, - sim: 'tf.Tensor', - sim_bot_emb: 'tf.Tensor', - sim_dial_emb: 'tf.Tensor', - sim_dial_bot_emb, - sims_rnn_to_max: List['tf.Tensor'], - bad_negs, + @staticmethod + def _tf_loss_softmax( + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor', - batch_bad_negs=None, ) -> 'tf.Tensor': - """Define loss.""" + """Define softmax loss.""" - all_sim = [sim[:, :, :1], - sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, - sim_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, - ] - if sim_dial_emb is not None: - all_sim.append(sim_dial_emb + common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs) + logits = tf.concat([sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial + ], -1) - if sim_dial_bot_emb is not None: - all_sim.append(sim_dial_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs) - - logits = tf.concat(all_sim, -1) + # create labels for softmax pos_labels = tf.ones_like(logits[:, :, :1]) neg_labels = tf.zeros_like(logits[:, :, 1:]) labels = tf.concat([pos_labels, neg_labels], -1) + # mask loss by prediction confidence pred = tf.nn.softmax(logits) - # fake_logits = tf.concat([logits[:, :, :1] - common_attention.large_compatible_negative(logits.dtype), - # logits[:, :, 1:] + common_attention.large_compatible_negative(logits.dtype)], -1) - - # ones = tf.ones_like(pred[:, :, 0]) - # zeros = tf.zeros_like(pred[:, :, 0]) - - # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) loss = tf.losses.softmax_cross_entropy(labels, @@ -767,23 +747,35 @@ def _tf_loss_2( # add regularization losses loss += tf.losses.get_regularization_loss() - # maximize similarity returned by time attention wrapper - add_loss = [] - for sim_to_add in sims_rnn_to_max: - add_loss.append(tf.maximum(0.0, 1.0 - sim_to_add)) - - if add_loss: - # mask loss for different length sequences - add_loss = sum(add_loss) * mask - # average the loss over sequence length - add_loss = tf.reduce_sum(add_loss, -1) / tf.reduce_sum(mask, 1) - # average the loss over the batch - add_loss = tf.reduce_mean(add_loss) - - loss += add_loss - return loss + def _choose_loss(self, + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', + mask: 'tf.Tensor') -> 'tf.Tensor': + + if self.loss_type == 'margin': + return self._tf_loss_margin(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + elif self.loss_type == 'softmax': + return self._tf_loss_softmax(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + else: + raise ValueError( + "Wrong loss type {}, " + "should be 'margin' or 'softmax'" + "".format(self.loss_type) + ) 
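For orientation, the margin and softmax variants dispatched by _choose_loss above differ mainly in how the positive and negative similarities are combined. The following is a minimal NumPy sketch of the softmax branch's confidence-based example weighting (the already_learned term); the shapes, names and toy data are illustrative assumptions, and the actual graph feeds the concatenated similarity tensors to tf.losses.softmax_cross_entropy rather than this hand-rolled cross-entropy.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def softmax_loss_sketch(sim_pos, sim_neg, mask):
    # logits: positive similarity first, then the sampled negative similarities
    logits = np.concatenate([sim_pos, sim_neg], axis=-1)  # (batch, time, 1 + num_neg)
    pred = softmax(logits)
    # examples that are already predicted confidently contribute almost nothing:
    # ((1 - p_pos) / 0.5) ** 4 shrinks towards 0 as p_pos approaches 1
    already_learned = ((1.0 - pred[..., 0]) / 0.5) ** 4
    # cross-entropy against the one-hot label for the positive action
    ce = -np.log(pred[..., 0] + 1e-12)
    weighted = ce * already_learned * mask  # zero out padded time steps
    return weighted.sum() / mask.sum()

# toy usage: batch=2, time=3, num_neg=4
sim_pos = np.random.rand(2, 3, 1)
sim_neg = np.random.rand(2, 3, 4) - 1.0
mask = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 0.0]])
print(softmax_loss_sketch(sim_pos, sim_neg, mask))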
+ # training methods def train( self, @@ -830,7 +822,7 @@ def train( if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) else: eval_train_dataset = None @@ -850,9 +842,6 @@ def train( # dynamic variables self._is_training = tf.placeholder_with_default(False, shape=()) - self._dialogue_len = tf.placeholder( - dtype=tf.int32, shape=(), name="dialogue_len" - ) # mask different length sequences # if there is at least one `-1` it should be masked @@ -862,7 +851,6 @@ def train( transformer_out = self._create_transformer_encoder( self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - sims_rnn_to_max = [] self.bot_embed = self._create_tf_bot_embed(self.b_in) all_actions_embed = self._create_tf_bot_embed(all_actions) @@ -875,55 +863,46 @@ def train( self.dial_embed = self.dial_embed[:, -1:, :] mask = mask[:, -1:] - b_raw = tf.reshape(self.b_in, (-1, self.b_in.shape[-1])) - - _, i, c = gen_array_ops.unique_with_counts_v2(b_raw, axis=[0]) - counts = tf.expand_dims(tf.reshape(tf.gather(tf.cast(c, tf.float32), i), (tf.shape(b_raw)[0],)), 0) - batch_neg_ids = tf.random.categorical(tf.log((1. - tf.eye(tf.shape(b_raw)[0])/counts)), self.num_neg) - - batch_iou_bot = self._tf_calc_iou(b_raw, neg_ids=batch_neg_ids) - batch_bad_negs = 1. - tf.nn.relu(tf.sign(1. - batch_iou_bot)) - batch_bad_negs = tf.reshape(batch_bad_negs, (tf.shape(self.dial_embed)[0], - tf.shape(self.dial_embed)[1], - -1)) - - neg_ids = tf.random.categorical(tf.log(tf.ones((tf.shape(b_raw)[0], tf.shape(all_actions)[0]))), self.num_neg) - - tiled_all_actions = tf.tile(tf.expand_dims(all_actions, 0), (tf.shape(b_raw)[0], 1, 1)) - neg_bs = tf.batch_gather(tiled_all_actions, neg_ids) - iou_bot = self._tf_calc_iou(b_raw, neg_bs) - bad_negs = 1. - tf.nn.relu(tf.sign(1. 
- iou_bot)) - bad_negs = tf.reshape(bad_negs, (tf.shape(self.bot_embed)[0], - tf.shape(self.bot_embed)[1], - -1)) - - dial_embed_flat = tf.reshape(self.dial_embed, (-1, self.dial_embed.shape[-1])) - - tiled_dial_embed = self._tf_sample_neg(dial_embed_flat, neg_ids=batch_neg_ids, first_only=True) - tiled_dial_embed = tf.reshape(tiled_dial_embed, (tf.shape(self.dial_embed)[0], - tf.shape(self.dial_embed)[1], - -1, - self.dial_embed.shape[-1])) - - bot_embed_flat = tf.reshape(self.bot_embed, (-1, self.bot_embed.shape[-1])) - tiled_all_actions_embed = tf.tile(tf.expand_dims(all_actions_embed, 0), (tf.shape(b_raw)[0], 1, 1)) - neg_embs = tf.batch_gather(tiled_all_actions_embed, neg_ids) - tiled_bot_embed = self._tf_sample_neg(bot_embed_flat, neg_bs=neg_embs) - tiled_bot_embed = tf.reshape(tiled_bot_embed, (tf.shape(self.bot_embed)[0], - tf.shape(self.bot_embed)[1], - -1, - self.bot_embed.shape[-1])) - - # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) - self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) - - # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) - loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) + pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + neg_dial_embed, dial_bad_negs = self._tf_get_negs( + self._tf_make_flat(self.dial_embed), + self._tf_make_flat(self.b_in), + self.b_in + ) + pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + neg_bot_embed, bot_bad_negs = self._tf_get_negs( + all_actions_embed, + all_actions, + self.b_in + ) + # normalize embedding vectors for cosine similarity + pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) + pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) + neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) + neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) + + (sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask) + + acc = self._tf_calc_accuracy(sim_pos, sim_neg) + + loss = self._choose_loss(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) # define which optimizer to use - self._train_op = tf.train.AdamOptimizer( - # learning_rate=0.001, epsilon=1e-16 - ).minimize(loss) + self._train_op = tf.train.AdamOptimizer().minimize(loss) train_init_op = iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: @@ -934,8 +913,8 @@ def train( # train tensorflow graph self.session = tf.Session(config=self._tf_config) - # self._train_tf(session_data, loss, mask) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, + loss, acc) dialogue_len = None # use dynamic time for rnn # create placeholders @@ -974,7 +953,10 @@ def train( if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): self.dial_embed = self.dial_embed[:, -1:, :] - self.sim_op, _, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) + self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + self.sim_op = self._tf_raw_sim(self.dial_embed[:, :, tf.newaxis, :], + self.bot_embed, mask) # if 
self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) @@ -1004,8 +986,7 @@ def _train_tf_dataset(self, eval_init_op, batch_size_in, loss: 'tf.Tensor', - mask, - dialogue_len, + acc, ) -> None: """Train tf graph""" @@ -1026,36 +1007,42 @@ def _train_tf_dataset(self, self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - ep_loss = 0 + ep_train_loss = 0 + ep_train_acc = 0 batches_per_epoch = 0 while True: try: - _, batch_loss = self.session.run((self._train_op, loss), - feed_dict={self._is_training: True, - self._dialogue_len: dialogue_len}) + _, batch_train_loss, batch_train_acc = self.session.run( + [self._train_op, loss, acc], + feed_dict={self._is_training: True} + ) except tf.errors.OutOfRangeError: break batches_per_epoch += 1 - ep_loss += batch_loss + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc - ep_loss /= batches_per_epoch + ep_train_loss /= batches_per_epoch + ep_train_acc /= batches_per_epoch if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(eval_init_op, mask, dialogue_len) - last_loss = ep_loss + train_acc = self._output_training_stat_dataset(eval_init_op, acc) + last_loss = ep_train_loss pbar.set_postfix({ - "loss": "{:.3f}".format(ep_loss), - "acc": "{:.3f}".format(train_acc) + "train_loss": "{:.3f}".format(ep_train_loss), + "train_acc": "{:.3f}".format(ep_train_acc), + "acc": "{:.3f}".format(train_acc), }) else: pbar.set_postfix({ - "loss": "{:.3f}".format(ep_loss) + "train_loss": "{:.3f}".format(ep_train_loss), + "train_acc": "{:.3f}".format(ep_train_acc) }) if self.evaluate_on_num_examples: @@ -1063,20 +1050,12 @@ def _train_tf_dataset(self, "loss={:.3f}, train accuracy={:.3f}" "".format(last_loss, train_acc)) - def _output_training_stat_dataset(self, eval_init_op, mask, dialogue_len) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, acc) -> np.ndarray: """Output training statistics""" self.session.run(eval_init_op) - sim_, mask_ = self.session.run([self.sim_op, mask], - feed_dict={self._is_training: False, - self._dialogue_len: dialogue_len}) - sim_ = sim_.reshape((-1, sim_.shape[-1])) - mask_ = mask_.reshape((-1,)) - - train_acc = np.sum((np.max(sim_, -1) == sim_.diagonal()) * mask_) / np.sum(mask_) - - return train_acc + return self.session.run(acc, feed_dict={self._is_training: False}) def continue_training( self, @@ -1106,7 +1085,6 @@ def continue_training( self.b_in: b, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], self._is_training: True, }, ) @@ -1124,8 +1102,7 @@ def tf_feed_dict_for_prediction(self, return {self.a_in: session_data.X, self.b_in: all_Y_d_x, self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1]} + self.b_prev_in: session_data.previous_actions} def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain @@ -1160,7 +1137,6 @@ def predict_action_probabilities( self.b_in: all_Y_d_x, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], }, ) @@ -1212,7 +1188,6 @@ def persist(self, path: Text) -> None: self._persist_tensor("action_placeholder", self.b_in) self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", 
self.b_prev_in) - self._persist_tensor("dialogue_len", self._dialogue_len) self._persist_tensor("similarity_op", self.sim_op) @@ -1288,7 +1263,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": b_in = cls.load_tensor("action_placeholder") c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") - dialogue_len = cls.load_tensor("dialogue_len") sim_op = cls.load_tensor("similarity_op") @@ -1324,7 +1298,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": action_placeholder=b_in, slots_placeholder=c_in, prev_act_placeholder=b_prev_in, - dialogue_len=dialogue_len, similarity_op=sim_op, alignment_history=alignment_history, user_embed=user_embed, From c631646dee76298be777473fc40de19a76f82a0d Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 16:09:50 +0200 Subject: [PATCH 04/50] refactor train --- rasa/core/policies/embedding_policy.py | 428 +++++++++++-------------- 1 file changed, 186 insertions(+), 242 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 63a10afcfe3b..b4192758284c 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -24,7 +24,6 @@ from rasa.utils.common import is_logging_disabled import tensorflow as tf -from tensorflow.python.ops import gen_array_ops try: from tensor2tensor.layers import common_attention @@ -52,7 +51,6 @@ "slots", "previous_actions", "actions_for_Y", - "all_Y_d", ), ) @@ -166,15 +164,9 @@ def __init__( slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, - alignment_history: Optional['tf.Tensor'] = None, - user_embed: Optional['tf.Tensor'] = None, - bot_embed: Optional['tf.Tensor'] = None, - slot_embed: Optional['tf.Tensor'] = None, dial_embed: Optional['tf.Tensor'] = None, - rnn_embed: Optional['tf.Tensor'] = None, - attn_embed: Optional['tf.Tensor'] = None, - copy_attn_debug: Optional['tf.Tensor'] = None, - all_time_masks: Optional['tf.Tensor'] = None, + bot_embed: Optional['tf.Tensor'] = None, + all_bot_embed: Optional['tf.Tensor'] = None, attention_weights=None, max_history: Optional[int] = None, **kwargs: Any @@ -210,21 +202,11 @@ def __init__( self.b_prev_in = prev_act_placeholder self.sim_op = similarity_op - # store attention probability distribution as - # concatenated tensor of each attention types - self.alignment_history = alignment_history - # persisted embeddings - self.user_embed = user_embed - self.bot_embed = bot_embed - self.slot_embed = slot_embed self.dial_embed = dial_embed + self.bot_embed = bot_embed + self.all_bot_embed = all_bot_embed - self.rnn_embed = rnn_embed - self.attn_embed = attn_embed - self.copy_attn_debug = copy_attn_debug - - self.all_time_masks = all_time_masks self.attention_weights = attention_weights # internal tf instances self._train_op = None @@ -361,16 +343,6 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: ] ) - # noinspection PyPep8Naming - def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: - """Stack encoded_all_intents on top of each other - - to create candidates for training examples and - to calculate training accuracy. 
- """ - - return np.stack([self.encoded_all_actions] * dialogue_len) - # noinspection PyPep8Naming def _create_session_data( self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None @@ -393,7 +365,6 @@ def _create_session_data( dial_len = X.shape[1] else: dial_len = 1 - all_Y_d = self._create_all_Y_d(dial_len) return SessionData( X=X, @@ -401,7 +372,6 @@ def _create_session_data( slots=slots, previous_actions=previous_actions, actions_for_Y=actions_for_Y, - all_Y_d=all_Y_d, ) @staticmethod @@ -414,7 +384,6 @@ def _sample_session_data(session_data: 'SessionData', slots=session_data.slots[ids], previous_actions=session_data.previous_actions[ids], actions_for_Y=session_data.actions_for_Y[ids], - all_Y_d=session_data.all_Y_d, ) # tf helpers: @@ -422,15 +391,22 @@ def _sample_session_data(session_data: 'SessionData', def _create_tf_dataset(session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': - train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, - session_data.Y, - session_data.slots, - session_data.previous_actions)) + train_dataset = tf.data.Dataset.from_tensor_slices( + (session_data.X, session_data.Y, + session_data.slots, session_data.previous_actions) + ) if shuffle: train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) train_dataset = train_dataset.batch(batch_size) + return train_dataset + @staticmethod + def _create_tf_iterator(dataset): + return tf.data.Iterator.from_structure(dataset.output_types, + dataset.output_shapes, + output_classes=dataset.output_classes) + def _create_tf_nn( self, x_in: 'tf.Tensor', @@ -454,7 +430,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': + def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -479,7 +455,7 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': self.droprate["b"], layer_name_suffix=layer_name_suffix, ) - return self._create_embed(b, layer_name_suffix=layer_name_suffix) + return self._create_tf_embed(b, layer_name_suffix=layer_name_suffix) def _create_hparams(self): hparams = transformer_base() @@ -501,7 +477,7 @@ def _create_hparams(self): hparams.add_relative_to_values = True return hparams - def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): hparams = self._create_hparams() x_in = tf.concat([a_in, b_prev_in, c_in], -1) @@ -558,6 +534,24 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei return tf.nn.relu(x) + def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + + self.attention_weights = {} + a = self._create_tf_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + + dial_embed = self._create_tf_embed(a, layer_name_suffix="out") + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # pick last action if max history featurizer is used + dial_embed = dial_embed[:, -1:, :] + mask = mask[:, -1:] + + return dial_embed, mask + @staticmethod def _tf_make_flat(x): return tf.reshape(x, (-1, x.shape[-1])) @@ -606,6 
+600,24 @@ def _tf_get_negs(self, all_embed, all_raw, raw_pos): return neg_embed, bad_negs + def _sample_negatives(self, all_actions): + + # sample negatives + pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + neg_dial_embed, dial_bad_negs = self._tf_get_negs( + self._tf_make_flat(self.dial_embed), + self._tf_make_flat(self.b_in), + self.b_in + ) + pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + neg_bot_embed, bot_bad_negs = self._tf_get_negs( + self.all_bot_embed, + all_actions, + self.b_in + ) + return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, + dial_bad_negs, bot_bad_negs) + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': if self.similarity_type not in {"cosine", "inner"}: @@ -776,6 +788,99 @@ def _choose_loss(self, "".format(self.loss_type) ) + def _build_tf_train_graph(self, iterator): + + # session data are int counts but we need a float tensors + (self.a_in, + self.b_in, + self.c_in, + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) + + all_actions = tf.constant(self.encoded_all_actions, + dtype=tf.float32, + name="all_actions") + + self.dial_embed, mask = self._create_tf_dial() + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + self.all_bot_embed = self._create_tf_bot_embed(all_actions) + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # add time dimension if max history featurizer is used + self.b_in = self.b_in[:, tf.newaxis, :] + self.bot_embed = self.bot_embed[:, tf.newaxis, :] + + (pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs) = self._sample_negatives(all_actions) + + # normalize embedding vectors for cosine similarity + pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) + pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) + neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) + neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) + + # calculate similarities + (sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask) + + acc = self._tf_calc_accuracy(sim_pos, sim_neg) + + loss = self._choose_loss(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + return loss, acc + + def _create_tf_placeholders(self, session_data): + dialogue_len = None # use dynamic time for rnn + self.a_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.X.shape[-1]), + name="a", + ) + self.b_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + name="b", + ) + self.c_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.slots.shape[-1]), + name="slt", + ) + self.b_prev_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.Y.shape[-1]), + name="b_prev", + ) + + def _build_tf_pred_graph(self): + self.dial_embed, mask = self._create_tf_dial() + self.bot_embed = self._create_tf_bot_embed(self.b_in) + + self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + + self.sim_op = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], + self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], + mask + ) + # training methods def train( self, @@ -820,144 +925,31 @@ def train( batch_size_in = tf.placeholder(tf.int64) train_dataset = 
self._create_tf_dataset(session_data, batch_size_in) + iterator = self._create_tf_iterator(train_dataset) + + train_init_op = iterator.make_initializer(train_dataset) + if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) + eval_init_op = iterator.make_initializer(eval_train_dataset) else: - eval_train_dataset = None - - iterator = tf.data.Iterator.from_structure(train_dataset.output_types, - train_dataset.output_shapes, - output_classes=train_dataset.output_classes) - - # session data are int counts but we need a float tensors - (self.a_in, - self.b_in, - self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) - - all_actions = tf.constant(self.encoded_all_actions, - dtype=tf.float32, - name="all_actions") + eval_init_op = None - # dynamic variables self._is_training = tf.placeholder_with_default(False, shape=()) + loss, acc = self._build_tf_train_graph(iterator) - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - self.attention_weights = {} - transformer_out = self._create_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - all_actions_embed = self._create_tf_bot_embed(all_actions) - - # calculate similarities - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - # pick last action if max history is used - self.b_in = self.b_in[:, tf.newaxis, :] - self.bot_embed = self.bot_embed[:, tf.newaxis, :] - self.dial_embed = self.dial_embed[:, -1:, :] - mask = mask[:, -1:] - - pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] - neg_dial_embed, dial_bad_negs = self._tf_get_negs( - self._tf_make_flat(self.dial_embed), - self._tf_make_flat(self.b_in), - self.b_in - ) - pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] - neg_bot_embed, bot_bad_negs = self._tf_get_negs( - all_actions_embed, - all_actions, - self.b_in - ) - - # normalize embedding vectors for cosine similarity - pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) - pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) - neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) - neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) - - (sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - mask) - - acc = self._tf_calc_accuracy(sim_pos, sim_neg) - - loss = self._choose_loss(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) # define which optimizer to use self._train_op = tf.train.AdamOptimizer().minimize(loss) - train_init_op = iterator.make_initializer(train_dataset) - if self.evaluate_on_num_examples: - eval_init_op = iterator.make_initializer(eval_train_dataset) - else: - eval_init_op = None - # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, acc) - dialogue_len = None # use dynamic time for rnn - # create placeholders - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), - name="a", 
- ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), - name="b", - ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) - - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - self.attention_weights = {} - transformer_out = self._create_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - self.dial_embed = self.dial_embed[:, -1:, :] - - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) - self.sim_op = self._tf_raw_sim(self.dial_embed[:, :, tf.newaxis, :], - self.bot_embed, mask) - + # rebuild the graph for prediction + self._create_tf_placeholders(session_data) + self._build_tf_pred_graph() # if self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) # for name, t in self.attention_weights.items() @@ -999,8 +991,8 @@ def _train_tf_dataset(self, ) pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - train_acc = 0 - last_loss = 0 + eval_acc = 0 + eval_loss = 0 for ep in pbar: batch_size = self._linearly_increasing_batch_size(ep) @@ -1016,46 +1008,42 @@ def _train_tf_dataset(self, [self._train_op, loss, acc], feed_dict={self._is_training: True} ) + batches_per_epoch += 1 + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc except tf.errors.OutOfRangeError: break - batches_per_epoch += 1 - ep_train_loss += batch_train_loss - ep_train_acc += batch_train_acc - ep_train_loss /= batches_per_epoch ep_train_acc /= batches_per_epoch + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_train_loss), + "acc": "{:.3f}".format(ep_train_acc) + }) + if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(eval_init_op, acc) - last_loss = ep_train_loss - - pbar.set_postfix({ - "train_loss": "{:.3f}".format(ep_train_loss), - "train_acc": "{:.3f}".format(ep_train_acc), - "acc": "{:.3f}".format(train_acc), - }) - else: - pbar.set_postfix({ - "train_loss": "{:.3f}".format(ep_train_loss), - "train_acc": "{:.3f}".format(ep_train_acc) - }) + eval_loss, eval_acc = self._output_training_stat_dataset( + eval_init_op, loss, acc + ) + logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" + "".format(eval_loss, eval_acc)) if self.evaluate_on_num_examples: logger.info("Finished training embedding classifier, " - "loss={:.3f}, train accuracy={:.3f}" - "".format(last_loss, train_acc)) + "loss={:.3f}, accuracy={:.3f}" + "".format(eval_loss, eval_acc)) - def _output_training_stat_dataset(self, eval_init_op, acc) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, float]: """Output training statistics""" self.session.run(eval_init_op) - return self.session.run(acc, feed_dict={self._is_training: False}) + return self.session.run([loss, acc], feed_dict={self._is_training: False}) def 
continue_training( self, @@ -1095,12 +1083,8 @@ def tf_feed_dict_for_prediction(self, # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - # noinspection PyPep8Naming - all_Y_d_x = np.stack([session_data.all_Y_d - for _ in range(session_data.X.shape[0])]) return {self.a_in: session_data.X, - self.b_in: all_Y_d_x, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions} @@ -1120,30 +1104,14 @@ def predict_action_probabilities( ) return [0.0] * domain.num_actions - # noinspection PyPep8Naming - data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_session_data(data_X) - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X.shape[0])] - ) - # self.similarity_type = 'cosine' - # mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - # self.sim_op, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) - _sim = self.session.run( - self.sim_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: all_Y_d_x, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - }, - ) + tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) + + sim_ = self.session.run(self.sim_op, feed_dict=tf_feed_dict) # TODO assume we used inner: self.similarity_type = "inner" - result = _sim[0, -1, :] + result = sim_[0, -1, :] if self.similarity_type == "cosine": # clip negative values to zero result[result < 0] = 0 @@ -1191,18 +1159,9 @@ def persist(self, path: Text) -> None: self._persist_tensor("similarity_op", self.sim_op) - self._persist_tensor("alignment_history", self.alignment_history) - - self._persist_tensor("user_embed", self.user_embed) - self._persist_tensor("bot_embed", self.bot_embed) - self._persist_tensor("slot_embed", self.slot_embed) self._persist_tensor("dial_embed", self.dial_embed) - - self._persist_tensor("rnn_embed", self.rnn_embed) - self._persist_tensor("attn_embed", self.attn_embed) - self._persist_tensor("copy_attn_debug", self.copy_attn_debug) - - self._persist_tensor("all_time_masks", self.all_time_masks) + self._persist_tensor("bot_embed", self.bot_embed) + self._persist_tensor("all_bot_embed", self.all_bot_embed) self._persist_tensor("attention_weights", self.attention_weights) @@ -1266,18 +1225,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": sim_op = cls.load_tensor("similarity_op") - alignment_history = cls.load_tensor("alignment_history") - - user_embed = cls.load_tensor("user_embed") - bot_embed = cls.load_tensor("bot_embed") - slot_embed = cls.load_tensor("slot_embed") dial_embed = cls.load_tensor("dial_embed") - - rnn_embed = cls.load_tensor("rnn_embed") - attn_embed = cls.load_tensor("attn_embed") - copy_attn_debug = cls.load_tensor("copy_attn_debug") - - all_time_masks = cls.load_tensor("all_time_masks") + bot_embed = cls.load_tensor("bot_embed") + all_bot_embed = cls.load_tensor("all_bot_embed") attention_weights = cls.load_tensor("attention_weights") @@ -1299,14 +1249,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": slots_placeholder=c_in, prev_act_placeholder=b_prev_in, similarity_op=sim_op, - alignment_history=alignment_history, - user_embed=user_embed, - bot_embed=bot_embed, - slot_embed=slot_embed, dial_embed=dial_embed, - rnn_embed=rnn_embed, - attn_embed=attn_embed, - copy_attn_debug=copy_attn_debug, - all_time_masks=all_time_masks, + bot_embed=bot_embed, + all_bot_embed=all_bot_embed, attention_weights=attention_weights ) From 
e355baa17110fa7eb70e8adb1e764b2db5858e2c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:08:22 +0200 Subject: [PATCH 05/50] normilize confidence inside pred graph --- rasa/core/policies/embedding_policy.py | 76 +++++++++++++++----------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index b4192758284c..1d3bb8eb7318 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -163,7 +163,9 @@ def __init__( action_placeholder: Optional['tf.Tensor'] = None, slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, - similarity_op: Optional['tf.Tensor'] = None, + similarity_all: Optional['tf.Tensor'] = None, + pred_confidence: Optional['tf.Tensor'] = None, + similarity: Optional['tf.Tensor'] = None, dial_embed: Optional['tf.Tensor'] = None, bot_embed: Optional['tf.Tensor'] = None, all_bot_embed: Optional['tf.Tensor'] = None, @@ -200,7 +202,9 @@ def __init__( self.b_in = action_placeholder self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder - self.sim_op = similarity_op + self.sim_all = similarity_all + self.pred_confidence = pred_confidence + self.sim = similarity # persisted embeddings self.dial_embed = dial_embed @@ -541,8 +545,8 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: self.attention_weights = {} a = self._create_tf_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights + ) dial_embed = self._create_tf_embed(a, layer_name_suffix="out") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): @@ -661,11 +665,11 @@ def _tf_sim( sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs + mask) + neg_inf * bot_bad_negs sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + mask) + neg_inf * dial_bad_negs sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + mask) + neg_inf * dial_bad_negs # output similarities between user input and bot actions # and similarities between bot actions and similarities between user inputs @@ -870,17 +874,32 @@ def _create_tf_placeholders(self, session_data): def _build_tf_pred_graph(self): self.dial_embed, mask = self._create_tf_dial() - self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) - self.sim_op = self._tf_raw_sim( + self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], mask ) + if self.similarity_type == "cosine": + # clip negative values to zero + confidence = tf.nn.relu(self.sim_all) + else: + # normalize result to [0, 1] with softmax + confidence = tf.nn.softmax(self.sim_all) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + + self.sim = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], + self.bot_embed, + mask + ) + + return confidence + # training methods def train( self, @@ -949,7 +968,8 @@ def train( # rebuild the graph for prediction self._create_tf_placeholders(session_data) - self._build_tf_pred_graph() + 
self.pred_confidence = self._build_tf_pred_graph() + # if self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) # for name, t in self.attention_weights.items() @@ -1106,23 +1126,9 @@ def predict_action_probabilities( tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - sim_ = self.session.run(self.sim_op, feed_dict=tf_feed_dict) - - # TODO assume we used inner: - self.similarity_type = "inner" - - result = sim_[0, -1, :] - if self.similarity_type == "cosine": - # clip negative values to zero - result[result < 0] = 0 - elif self.similarity_type == "inner": - # normalize result to [0, 1] with softmax but only over 3*num_neg+1 values - low_ids = result.argsort()[::-1][4*self.num_neg+1:] - result[low_ids] += -np.inf - result = np.exp(result) - result /= np.sum(result) + sim_ = self.session.run(self.sim_all, feed_dict=tf_feed_dict) - return result.tolist() + return sim_[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: @@ -1157,7 +1163,9 @@ def persist(self, path: Text) -> None: self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", self.b_prev_in) - self._persist_tensor("similarity_op", self.sim_op) + self._persist_tensor("similarity_all", self.sim_all) + self._persist_tensor("pred_confidence", self.pred_confidence) + self._persist_tensor("similarity", self.sim) self._persist_tensor("dial_embed", self.dial_embed) self._persist_tensor("bot_embed", self.bot_embed) @@ -1223,7 +1231,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") - sim_op = cls.load_tensor("similarity_op") + sim_all = cls.load_tensor("similarity_all") + pred_confidence = cls.load_tensor("pred_confidence") + sim = cls.load_tensor("similarity") dial_embed = cls.load_tensor("dial_embed") bot_embed = cls.load_tensor("bot_embed") @@ -1248,9 +1258,11 @@ def load(cls, path: Text) -> "EmbeddingPolicy": action_placeholder=b_in, slots_placeholder=c_in, prev_act_placeholder=b_prev_in, - similarity_op=sim_op, + similarity_all=sim_all, + pred_confidence=pred_confidence, + similarity=sim, dial_embed=dial_embed, bot_embed=bot_embed, all_bot_embed=all_bot_embed, - attention_weights=attention_weights + attention_weights=attention_weights, ) From fdb15f6cad3f1ddb3bb519cf985cfa32c187b4d4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:09:23 +0200 Subject: [PATCH 06/50] use pred_confidence in predict --- rasa/core/policies/embedding_policy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 1d3bb8eb7318..cdfd5551618a 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -1126,9 +1126,9 @@ def predict_action_probabilities( tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - sim_ = self.session.run(self.sim_all, feed_dict=tf_feed_dict) + confidence = self.session.run(self.pred_confidence, feed_dict=tf_feed_dict) - return sim_[0, -1, :].tolist() + return confidence[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: From fe5a3adc35449871a6c42b73388fb148a608b62f Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:30:13 +0200 Subject: [PATCH 07/50] move normalization to embed helper --- rasa/core/policies/embedding_policy.py | 39 
+++++++++++--------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index cdfd5551618a..40f42816d199 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -434,6 +434,20 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': + + if self.similarity_type not in {"cosine", "inner"}: + raise ValueError( + "Wrong similarity type {}, " + "should be 'cosine' or 'inner'" + "".format(self.similarity_type) + ) + + if self.similarity_type == "cosine": + return tf.nn.l2_normalize(a, -1) + else: + return a + def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" @@ -446,7 +460,8 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso name="embed_layer_{}".format(layer_name_suffix), reuse=tf.AUTO_REUSE, ) - return embed_x + # normalize embedding vectors for cosine similarity + return self._tf_normalize_if_cosine(embed_x) def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" @@ -622,20 +637,6 @@ def _sample_negatives(self, all_actions): return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, dial_bad_negs, bot_bad_negs) - def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': - - if self.similarity_type not in {"cosine", "inner"}: - raise ValueError( - "Wrong similarity type {}, " - "should be 'cosine' or 'inner'" - "".format(self.similarity_type) - ) - - if self.similarity_type == "cosine": - return tf.nn.l2_normalize(a, -1) - else: - return a - @staticmethod def _tf_raw_sim( a: 'tf.Tensor', @@ -821,12 +822,6 @@ def _build_tf_train_graph(self, iterator): dial_bad_negs, bot_bad_negs) = self._sample_negatives(all_actions) - # normalize embedding vectors for cosine similarity - pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) - pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) - neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) - neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) - # calculate similarities (sim_pos, sim_neg, @@ -874,7 +869,6 @@ def _create_tf_placeholders(self, session_data): def _build_tf_pred_graph(self): self.dial_embed, mask = self._create_tf_dial() - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], @@ -890,7 +884,6 @@ def _build_tf_pred_graph(self): confidence = tf.nn.softmax(self.sim_all) self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) self.sim = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], From 4cff3c07aa6200e72702298eddab1da105cedd5c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 18:11:04 +0200 Subject: [PATCH 08/50] maybe fix continue_training --- rasa/core/policies/embedding_policy.py | 57 ++++++++++++++------------ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 40f42816d199..695db9a094c4 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -213,6 +213,7 @@ def __init__( self.attention_weights = attention_weights # internal tf instances + self._iterator = None self._train_op = None 
self._is_training = None @@ -507,15 +508,19 @@ def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_ setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) reg = tf.contrib.layers.l2_regularizer(self.C2) - x = tf.layers.dense(inputs=x_in, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - kernel_regularizer=reg, - name='transformer_embed_layer', - reuse=tf.AUTO_REUSE) - - x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) + x = tf.nn.relu(x_in) + x = tf.layers.dense( + inputs=x, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer', + reuse=tf.AUTO_REUSE + ) + x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, + training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 @@ -793,13 +798,13 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self, iterator): + def _build_tf_train_graph(self): # session data are int counts but we need a float tensors (self.a_in, self.b_in, self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in self._iterator.get_next()) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -937,19 +942,19 @@ def train( batch_size_in = tf.placeholder(tf.int64) train_dataset = self._create_tf_dataset(session_data, batch_size_in) - iterator = self._create_tf_iterator(train_dataset) + self._iterator = self._create_tf_iterator(train_dataset) - train_init_op = iterator.make_initializer(train_dataset) + train_init_op = self._iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) - eval_init_op = iterator.make_initializer(eval_train_dataset) + eval_init_op = self._iterator.make_initializer(eval_train_dataset) else: eval_init_op = None self._is_training = tf.placeholder_with_default(False, shape=()) - loss, acc = self._build_tf_train_graph(iterator) + loss, acc = self._build_tf_train_graph() # define which optimizer to use self._train_op = tf.train.AdamOptimizer().minimize(loss) @@ -1075,20 +1080,18 @@ def continue_training( ) session_data = self._create_session_data(training_data.X, training_data.y) - - b = self._create_batch_b(session_data.Y, session_data.actions_for_Y) + train_dataset = self._create_tf_dataset(session_data, batch_size) + train_init_op = self._iterator.make_initializer(train_dataset) + self.session.run(train_init_op) # fit to one extra example using updated trackers - self.session.run( - self._train_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: b, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._is_training: True, - }, - ) + while True: + try: + self.session.run(self._train_op, + feed_dict={self._is_training: True}) + + except tf.errors.OutOfRangeError: + break def tf_feed_dict_for_prediction(self, tracker: DialogueStateTracker, From 2b1843cab3b77a4c6c3e0694f15eaefa574dbccf Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 18:21:11 +0200 Subject: [PATCH 09/50] fix 
extract attention --- rasa/core/policies/embedding_policy.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 695db9a094c4..8c6b1ed1105e 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -365,12 +365,6 @@ def _create_session_data( actions_for_Y = None Y = None - # is needed to calculate train accuracy - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - dial_len = X.shape[1] - else: - dial_len = 1 - return SessionData( X=X, Y=Y, @@ -898,6 +892,16 @@ def _build_tf_pred_graph(self): return confidence + def _extract_attention(self): + attention = [tf.expand_dims(t, 0) + for name, t in self.attention_weights.items() + if name.endswith('multihead_attention/dot_product_attention')] + + if attention: + return tf.concat(attention, 0) + else: + return + # training methods def train( self, @@ -968,10 +972,7 @@ def train( self._create_tf_placeholders(session_data) self.pred_confidence = self._build_tf_pred_graph() - # if self.attention_weights.items(): - # self.attention_weights = tf.concat([tf.expand_dims(t, 0) - # for name, t in self.attention_weights.items() - # if name.endswith('multihead_attention/dot_product_attention')], 0) + self.attention_weights = self._extract_attention() # training helpers def _linearly_increasing_batch_size(self, epoch: int) -> int: From d1fae48b5e5cb66e2f4a785ca74aad00b75f61c5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 20:46:13 +0200 Subject: [PATCH 10/50] add types --- rasa/core/policies/embedding_policy.py | 213 ++++++++++--------------- 1 file changed, 88 insertions(+), 125 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 8c6b1ed1105e..caf653be37b9 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -39,7 +39,11 @@ except ImportError: import pickle +if typing.TYPE_CHECKING: + from tensor2tensor.utils.hparam import HParams +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None logger = logging.getLogger(__name__) # namedtuple for all tf session related data @@ -66,12 +70,9 @@ class EmbeddingPolicy(Policy): # default properties (DOC MARKER - don't remove) defaults = { # nn architecture - # a list of hidden layers sizes before user embed layer - # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_a": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_b": [], + "hidden_layers_sizes_bot": [], "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} # introduce phase shift in time encodings between transformers @@ -79,12 +80,10 @@ class EmbeddingPolicy(Policy): "pos_max_timescale": 1.0e1, "max_seq_length": 256, "num_heads": 4, - # number of units in rnn cell - "rnn_size": 128, - "num_rnn_layers": 1, + # number of units in transformer + "transformer_size": 128, + "num_transformer_layers": 1, # training parameters - # flag if to turn on layer normalization for lstm cell - "layer_norm": True, # initial and final batch sizes - batch size will be # linearly increased for each epoch "batch_size": [8, 32], @@ -114,23 +113,10 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # dropout rate for 
user nn - "droprate_a": 0.0, # dropout rate for bot nn - "droprate_b": 0.0, - # dropout rate for rnn - "droprate_rnn": 0.1, - # attention parameters - # flag to use attention over user input - # as an input to rnn - "attn_before_rnn": True, - # flag to use attention over prev bot actions - # and copy it to output bypassing rnn - "attn_after_rnn": True, - # flag to use `sparsemax` instead of `softmax` for attention - "sparse_attention": False, # flag to use sparsemax for probs - # the range of allowed location-based attention shifts - "attn_shift_range": None, # if None, set to mean dialogue length / 2 + "droprate_bot": 0.0, + # dropout rate for dial nn + "droprate_dial": 0.1, # visualization of accuracy # how often calculate train accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance @@ -141,20 +127,20 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) @staticmethod - def _standard_featurizer(max_history=None): + def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturizer': if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) @staticmethod - def _check_t2t(): + def _check_t2t() -> None: if common_attention is None: raise ImportError("Please install tensor2tensor") def __init__( self, - featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, + featurizer: Optional['TrackerFeaturizer'] = None, priority: int = 1, encoded_all_actions: Optional['np.ndarray'] = None, graph: Optional['tf.Graph'] = None, @@ -180,12 +166,6 @@ def __init__( featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) - # flag if to use the same embeddings for user and bot - try: - self.share_embedding = self.featurizer.state_featurizer.use_shared_vocab - except AttributeError: - self.share_embedding = False - self._load_params(**kwargs) # chrono initialization for forget bias @@ -219,31 +199,15 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes = { - "a": config["hidden_layers_sizes_a"], - "b": config["hidden_layers_sizes_b"], - } + self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] - if self.share_embedding: - if self.hidden_layer_sizes["a"] != self.hidden_layer_sizes["b"]: - raise ValueError( - "Due to sharing vocabulary " - "in the featurizer, embedding weights " - "are shared as well. 
" - "So hidden_layers_sizes_a={} should be " - "equal to hidden_layers_sizes_b={}" - "".format( - self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] - ) - ) self.pos_encoding = config['pos_encoding'] self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] self.num_heads = config['num_heads'] - self.rnn_size = config["rnn_size"] - self.num_rnn_layers = config["num_rnn_layers"] - self.layer_norm = config["layer_norm"] + self.transformer_size = config["transformer_size"] + self.num_transformer_layers = config["num_transformer_layers"] self.batch_size = config["batch_size"] @@ -270,20 +234,10 @@ def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] self.droprate = { - "a": config["droprate_a"], - "b": config["droprate_b"], - "rnn": config["droprate_rnn"], + "bot": config["droprate_bot"], + "dial": config["droprate_dial"], } - def _load_attn_params(self, config: Dict[Text, Any]) -> None: - self.sparse_attention = config["sparse_attention"] - self.attn_shift_range = config["attn_shift_range"] - self.attn_after_rnn = config["attn_after_rnn"] - self.attn_before_rnn = config["attn_before_rnn"] - - def is_using_attention(self): - return self.attn_after_rnn or self.attn_before_rnn - def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] if self.evaluate_every_num_epochs < 1: @@ -298,14 +252,13 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self._load_nn_architecture_params(config) self._load_embedding_params(config) self._load_regularization_params(config) - self._load_attn_params(config) self._load_visual_params(config) # data helpers # noinspection PyPep8Naming def _create_X_slots_previous_actions( - self, data_X: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + self, data_X: 'np.ndarray' + ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: """Extract feature vectors for user input (X), slots and @@ -324,12 +277,12 @@ def _create_X_slots_previous_actions( # noinspection PyPep8Naming @staticmethod - def _actions_for_Y(data_Y: np.ndarray) -> np.ndarray: + def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: + def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" if len(actions_for_Y.shape) == 2: @@ -350,8 +303,8 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: # noinspection PyPep8Naming def _create_session_data( - self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None - ) -> SessionData: + self, data_X: 'np.ndarray', data_Y: Optional['np.ndarray'] = None + ) -> 'SessionData': """Combine all tf session related data into a named tuple""" X, slots, previous_actions = self._create_X_slots_previous_actions(data_X) @@ -401,7 +354,7 @@ def _create_tf_dataset(session_data: 'SessionData', return train_dataset @staticmethod - def _create_tf_iterator(dataset): + def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': return tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes, output_classes=dataset.output_classes) @@ -409,7 +362,7 @@ def _create_tf_iterator(dataset): def _create_tf_nn( self, x_in: 'tf.Tensor', - 
layer_sizes: List, + layer_sizes: List[int], droprate: float, layer_name_suffix: Text, ) -> 'tf.Tensor': @@ -461,25 +414,23 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" - layer_name_suffix = "a_and_b" if self.share_embedding else "b" - b = self._create_tf_nn( b_in, - self.hidden_layer_sizes["b"], - self.droprate["b"], - layer_name_suffix=layer_name_suffix, + self.hidden_layer_sizes_bot, + self.droprate["bot"], + layer_name_suffix="bot", ) - return self._create_tf_embed(b, layer_name_suffix=layer_name_suffix) + return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_hparams(self): + def _create_hparams(self) -> 'HParams': hparams = transformer_base() - hparams.num_hidden_layers = self.num_rnn_layers - hparams.hidden_size = self.rnn_size + hparams.num_hidden_layers = self.num_transformer_layers + hparams.hidden_size = self.transformer_size # it seems to be factor of 4 for transformer architectures in t2t hparams.filter_size = hparams.hidden_size * 4 hparams.num_heads = self.num_heads - hparams.relu_dropout = self.droprate["rnn"] + hparams.relu_dropout = self.droprate["dial"] hparams.pos = self.pos_encoding hparams.max_length = self.max_seq_length @@ -489,13 +440,17 @@ def _create_hparams(self): hparams.self_attention_type = "dot_product_relative_v2" hparams.max_relative_position = 5 hparams.add_relative_to_values = True + return hparams - def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + # noinspection PyUnresolvedReferences + def _create_tf_transformer_encoder(self, + x_in: 'tf.Tensor', + mask: 'tf.Tensor', + attention_weights: Dict[Text, 'tf.Tensor'], + ) -> 'tf.Tensor': hparams = self._create_hparams() - x_in = tf.concat([a_in, b_prev_in, c_in], -1) - # When not in training mode, set all forms of dropout to zero. 
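As a side note on the masking used in `_create_tf_dial` above: padded dialogue turns are assumed to be all `-1` vectors, so the sequence mask can be recovered with a max-and-sign trick. Below is a minimal NumPy sketch of that idea with made-up feature values; it is illustrative only and not part of the patch.

import numpy as np

def sequence_mask_from_padding(a_in: np.ndarray) -> np.ndarray:
    """Return 1.0 for real dialogue turns and 0.0 for turns padded with -1."""
    # mirrors tf.sign(tf.reduce_max(a_in, -1) + 1)
    return np.sign(a_in.max(axis=-1) + 1).astype(np.float32)

# batch of 2 dialogues, 3 turns, 3 features; rows of -1 are padding
batch = np.array([[[1, 0, 0], [0, 1, 0], [-1, -1, -1]],
                  [[0, 0, 1], [-1, -1, -1], [-1, -1, -1]]], dtype=np.float32)
print(sequence_mask_from_padding(batch))  # -> [[1. 1. 0.] [1. 0. 0.]]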
for key, value in hparams.values().items(): if key.endswith("dropout") or key == "label_smoothing": @@ -557,11 +512,12 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) + self.attention_weights = {} - a = self._create_tf_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights - ) - dial_embed = self._create_tf_embed(a, layer_name_suffix="out") + x = self._create_tf_transformer_encoder(x_in, mask, self.attention_weights) + + dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # pick last action if max history featurizer is used @@ -571,24 +527,22 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return dial_embed, mask @staticmethod - def _tf_make_flat(x): + def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': return tf.reshape(x, (-1, x.shape[-1])) @staticmethod - def _tf_sample_neg(batch_size, - all_bs, - neg_ids, - ) -> 'tf.Tensor': + def _tf_sample_neg(batch_size: 'tf.Tensor', + all_bs: 'tf.Tensor', + neg_ids: 'tf.Tensor') -> 'tf.Tensor': tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) return tf.batch_gather(tiled_all_bs, neg_ids) def _tf_calc_iou_mask(self, - pos_b, - all_bs, - neg_ids, - ) -> 'tf.Tensor': + pos_b: 'tf.Tensor', + all_bs: 'tf.Tensor', + neg_ids: 'tf.Tensor') -> 'tf.Tensor': pos_b_in_flat = pos_b[:, tf.newaxis, :] neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) @@ -599,7 +553,10 @@ def _tf_calc_iou_mask(self, iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) return 1. - tf.nn.relu(tf.sign(1. 
- iou)) - def _tf_get_negs(self, all_embed, all_raw, raw_pos): + def _tf_get_negs(self, + all_embed: 'tf.Tensor', + all_raw: 'tf.Tensor', + raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: batch_size = tf.shape(raw_pos)[0] seq_length = tf.shape(raw_pos)[1] @@ -618,7 +575,12 @@ def _tf_get_negs(self, all_embed, all_raw, raw_pos): return neg_embed, bad_negs - def _sample_negatives(self, all_actions): + def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor']: # sample negatives pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] @@ -637,11 +599,7 @@ def _sample_negatives(self, all_actions): dial_bad_negs, bot_bad_negs) @staticmethod - def _tf_raw_sim( - a: 'tf.Tensor', - b: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) @@ -676,7 +634,7 @@ def _tf_sim( return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial @staticmethod - def _tf_calc_accuracy(sim_pos, sim_neg): + def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), @@ -792,7 +750,7 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self): + def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # session data are int counts but we need a float tensors (self.a_in, @@ -843,8 +801,8 @@ def _build_tf_train_graph(self): mask) return loss, acc - def _create_tf_placeholders(self, session_data): - dialogue_len = None # use dynamic time for rnn + def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, shape=(None, dialogue_len, session_data.X.shape[-1]), @@ -866,7 +824,7 @@ def _create_tf_placeholders(self, session_data): name="b_prev", ) - def _build_tf_pred_graph(self): + def _build_tf_pred_graph(self) -> 'tf.Tensor': self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( @@ -892,7 +850,7 @@ def _build_tf_pred_graph(self): return confidence - def _extract_attention(self): + def _extract_attention(self) -> Optional['tf.Tensor']: attention = [tf.expand_dims(t, 0) for name, t in self.attention_weights.items() if name.endswith('multihead_attention/dot_product_attention')] @@ -931,6 +889,7 @@ def train( "else set num_neg to the number of actions - 1" "".format(self.num_neg, domain.num_actions) ) + # noinspection PyAttributeOutsideInit self.num_neg = min(self.num_neg, domain.num_actions - 1) # extract actual training data to feed to tf session @@ -993,9 +952,9 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: return int(self.batch_size[0]) def _train_tf_dataset(self, - train_init_op, - eval_init_op, - batch_size_in, + train_init_op: 'tf.Operation', + eval_init_op: 'tf.Operation', + batch_size_in: 'tf.Tensor', loss: 'tf.Tensor', acc, ) -> None: @@ -1057,7 +1016,10 @@ def _train_tf_dataset(self, "loss={:.3f}, accuracy={:.3f}" "".format(eval_loss, eval_acc)) - def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, float]: + def _output_training_stat_dataset(self, + eval_init_op: 'tf.Operation', + loss: 'tf.Tensor', + acc: 'tf.Tensor') -> Tuple[float, float]: """Output training statistics""" 
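For intuition on `_tf_calc_iou_mask` above: negatives are sampled by index, so a sampled "negative" can coincide with the positive label, and the IOU of the binary bag-of-words vectors is used to flag such accidental positives. A rough NumPy sketch with invented vectors (illustrative only, not taken from the patch):

import numpy as np

def iou_bad_neg_mask(pos: np.ndarray, negs: np.ndarray) -> np.ndarray:
    """Return 1.0 for sampled negatives that are identical to the positive."""
    intersection = np.minimum(negs, pos).sum(-1)
    union = np.maximum(negs, pos).sum(-1)
    iou = intersection / union
    # mirrors 1 - relu(sign(1 - iou)): only iou == 1 is flagged as a "bad" negative
    return 1.0 - np.maximum(np.sign(1.0 - iou), 0.0)

pos = np.array([1., 0., 1., 0.])
negs = np.array([[1., 0., 1., 0.],    # same action as the positive -> flagged
                 [1., 1., 0., 0.],    # partial overlap -> kept
                 [0., 1., 0., 1.]])   # disjoint -> kept
print(iou_bad_neg_mask(pos, negs))  # -> [1. 0. 0.]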
self.session.run(eval_init_op) @@ -1066,8 +1028,8 @@ def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, def continue_training( self, - training_trackers: List[DialogueStateTracker], - domain: Domain, + training_trackers: List['DialogueStateTracker'], + domain: 'Domain', **kwargs: Any ) -> None: """Continue training an already trained policy.""" @@ -1095,8 +1057,9 @@ def continue_training( break def tf_feed_dict_for_prediction(self, - tracker: DialogueStateTracker, - domain: Domain) -> Dict: + tracker: 'DialogueStateTracker', + domain: 'Domain' + ) -> Dict['tf.Tensor', 'np.ndarray']: # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1106,7 +1069,7 @@ def tf_feed_dict_for_prediction(self, self.b_prev_in: session_data.previous_actions} def predict_action_probabilities( - self, tracker: DialogueStateTracker, domain: Domain + self, tracker: 'DialogueStateTracker', domain: 'Domain' ) -> List[float]: """Predict the next action the bot should take. @@ -1218,10 +1181,10 @@ def load(cls, path: Text) -> "EmbeddingPolicy": graph = tf.Graph() with graph.as_default(): - sess = tf.Session(config=_tf_config) + session = tf.Session(config=_tf_config) saver = tf.train.import_meta_graph(checkpoint + ".meta") - saver.restore(sess, checkpoint) + saver.restore(session, checkpoint) a_in = cls.load_tensor("intent_placeholder") b_in = cls.load_tensor("action_placeholder") @@ -1250,7 +1213,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy": priority=meta["priority"], encoded_all_actions=encoded_all_actions, graph=graph, - session=sess, + session=session, intent_placeholder=a_in, action_placeholder=b_in, slots_placeholder=c_in, From 3ae9b77c9d6ec6d4a3f60ec5b63fde8f3d215fd7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 21:04:21 +0200 Subject: [PATCH 11/50] Add method descriptions --- rasa/core/policies/embedding_policy.py | 85 ++++++++++++++++++-------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index caf653be37b9..2f5041222b9d 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -60,9 +60,9 @@ class EmbeddingPolicy(Policy): - """Recurrent Embedding Dialogue Policy (REDP) + """Transformer Embedding Dialogue Policy (TEDP) - Transformer version of the policy used in our paper https://arxiv.org/abs/1811.11707 + Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707 """ SUPPORTS_ONLINE_TRAINING = True @@ -259,10 +259,9 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: def _create_X_slots_previous_actions( self, data_X: 'np.ndarray' ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: - """Extract feature vectors + """Extract feature vectors from training data. - for user input (X), slots and - previously executed actions from training data. + For user input (X), slots and previously executed actions. 
""" featurizer = self.featurizer.state_featurizer @@ -279,6 +278,7 @@ def _create_X_slots_previous_actions( @staticmethod def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" + return data_Y.argmax(axis=-1) # noinspection PyPep8Naming @@ -329,6 +329,8 @@ def _create_session_data( @staticmethod def _sample_session_data(session_data: 'SessionData', num_samples: int) -> 'SessionData': + """Sample session data.""" + ids = np.random.permutation(len(session_data.X))[:num_samples] return SessionData( X=session_data.X[ids], @@ -343,6 +345,8 @@ def _sample_session_data(session_data: 'SessionData', def _create_tf_dataset(session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': + """Create tf dataset.""" + train_dataset = tf.data.Dataset.from_tensor_slices( (session_data.X, session_data.Y, session_data.slots, session_data.previous_actions) @@ -355,6 +359,8 @@ def _create_tf_dataset(session_data: 'SessionData', @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': + """Create tf iterator.""" + return tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes, output_classes=dataset.output_classes) @@ -382,20 +388,20 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': + def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': + """Normalize embedding if similarity type is cosine.""" - if self.similarity_type not in {"cosine", "inner"}: + if self.similarity_type == "cosine": + return tf.nn.l2_normalize(x, -1) + elif self.similarity_type == "inner": + return x + else: raise ValueError( "Wrong similarity type {}, " "should be 'cosine' or 'inner'" "".format(self.similarity_type) ) - if self.similarity_type == "cosine": - return tf.nn.l2_normalize(a, -1) - else: - return a - def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" @@ -422,7 +428,9 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_hparams(self) -> 'HParams': + def _create_t2t_hparams(self) -> 'HParams': + """Create parameters for t2t transformer.""" + hparams = transformer_base() hparams.num_hidden_layers = self.num_transformer_layers @@ -444,12 +452,14 @@ def _create_hparams(self) -> 'HParams': return hparams # noinspection PyUnresolvedReferences - def _create_tf_transformer_encoder(self, - x_in: 'tf.Tensor', - mask: 'tf.Tensor', - attention_weights: Dict[Text, 'tf.Tensor'], - ) -> 'tf.Tensor': - hparams = self._create_hparams() + def _create_t2t_transformer_encoder(self, + x_in: 'tf.Tensor', + mask: 'tf.Tensor', + attention_weights: Dict[Text, 'tf.Tensor'], + ) -> 'tf.Tensor': + """Create t2t transformer encoder.""" + + hparams = self._create_t2t_hparams() # When not in training mode, set all forms of dropout to zero. 
for key, value in hparams.values().items(): @@ -508,6 +518,8 @@ def _create_tf_transformer_encoder(self, return tf.nn.relu(x) def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + """Create dialogue level embedding and mask.""" + # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) @@ -515,7 +527,7 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) self.attention_weights = {} - x = self._create_tf_transformer_encoder(x_in, mask, self.attention_weights) + x = self._create_t2t_transformer_encoder(x_in, mask, self.attention_weights) dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") @@ -528,12 +540,15 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: @staticmethod def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': + """Make tensor 2D.""" + return tf.reshape(x, (-1, x.shape[-1])) @staticmethod def _tf_sample_neg(batch_size: 'tf.Tensor', all_bs: 'tf.Tensor', neg_ids: 'tf.Tensor') -> 'tf.Tensor': + """Sample negative examples for given indices""" tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) @@ -543,6 +558,7 @@ def _tf_calc_iou_mask(self, pos_b: 'tf.Tensor', all_bs: 'tf.Tensor', neg_ids: 'tf.Tensor') -> 'tf.Tensor': + """Calculate IOU mask for given indices""" pos_b_in_flat = pos_b[:, tf.newaxis, :] neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) @@ -557,6 +573,7 @@ def _tf_get_negs(self, all_embed: 'tf.Tensor', all_raw: 'tf.Tensor', raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: + """Get negative examples from given tensor.""" batch_size = tf.shape(raw_pos)[0] seq_length = tf.shape(raw_pos)[1] @@ -581,8 +598,8 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + """Sample negative examples.""" - # sample negatives pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), @@ -600,6 +617,7 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', @staticmethod def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + """Calculate similarity between given tensors.""" return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) @@ -635,6 +653,7 @@ def _tf_sim( @staticmethod def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': + """Calculate accuracy""" max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), @@ -730,6 +749,7 @@ def _choose_loss(self, sim_neg_dial_dial: 'tf.Tensor', sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + """Use loss depending on given option.""" if self.loss_type == 'margin': return self._tf_loss_margin(sim_pos, sim_neg, @@ -751,6 +771,7 @@ def _choose_loss(self, ) def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + """Bulid train graph using iterator.""" # session data are int counts but we need a float tensors (self.a_in, @@ -802,6 +823,8 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return loss, acc def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + """Create placeholders for prediction.""" + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, @@ -824,7 +847,11 @@ def _create_tf_placeholders(self, 
session_data: 'SessionData') -> None: name="b_prev", ) - def _build_tf_pred_graph(self) -> 'tf.Tensor': + def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': + """Rebuild tf graph for prediction.""" + + self._create_tf_placeholders(session_data) + self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( @@ -851,6 +878,8 @@ def _build_tf_pred_graph(self) -> 'tf.Tensor': return confidence def _extract_attention(self) -> Optional['tf.Tensor']: + """Extract attention probabilities from t2t dict""" + attention = [tf.expand_dims(t, 0) for name, t in self.attention_weights.items() if name.endswith('multihead_attention/dot_product_attention')] @@ -928,8 +957,7 @@ def train( loss, acc) # rebuild the graph for prediction - self._create_tf_placeholders(session_data) - self.pred_confidence = self._build_tf_pred_graph() + self.pred_confidence = self._build_tf_pred_graph(session_data) self.attention_weights = self._extract_attention() @@ -1060,6 +1088,8 @@ def tf_feed_dict_for_prediction(self, tracker: 'DialogueStateTracker', domain: 'Domain' ) -> Dict['tf.Tensor', 'np.ndarray']: + """Create feed dictionary for tf session.""" + # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1091,6 +1121,8 @@ def predict_action_probabilities( return confidence[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: + """Add tensor to collection if it is not None""" + if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1148,6 +1180,8 @@ def persist(self, path: Text) -> None: @staticmethod def load_tensor(name: Text) -> Optional['tf.Tensor']: + """Load tensor or set it to None""" + tensor_list = tf.get_collection(name) return tensor_list[0] if tensor_list else None @@ -1155,7 +1189,8 @@ def load_tensor(name: Text) -> Optional['tf.Tensor']: def load(cls, path: Text) -> "EmbeddingPolicy": """Loads a policy from the storage. 
- **Needs to load its featurizer**""" + **Needs to load its featurizer** + """ if not os.path.exists(path): raise Exception( From 1d28d63427d5a73f49c062def26ee6a2db8b26ca Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 21:18:13 +0200 Subject: [PATCH 12/50] break long lines --- rasa/core/policies/embedding_policy.py | 40 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 2f5041222b9d..06aa26e473fc 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -27,7 +27,9 @@ try: from tensor2tensor.layers import common_attention - from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder + from tensor2tensor.models.transformer import (transformer_base, + transformer_prepare_encoder, + transformer_encoder) except ImportError: common_attention = None transformer_base = None @@ -131,7 +133,8 @@ def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturize if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: - return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) + return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), + max_history=max_history) @staticmethod def _check_t2t() -> None: @@ -289,7 +292,8 @@ def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': return np.stack( [ np.stack( - [self.encoded_all_actions[action_idx] for action_idx in action_ids] + [self.encoded_all_actions[action_idx] + for action_idx in action_ids] ) for action_ids in actions_for_Y ] @@ -493,7 +497,8 @@ def _create_t2t_transformer_encoder(self, ) = transformer_prepare_encoder(x, None, hparams) if hparams.pos == 'custom_timing': - x = common_attention.add_timing_signal_1d(x, max_timescale=self.pos_max_timescale) + x = common_attention.add_timing_signal_1d( + x, max_timescale=self.pos_max_timescale) x *= tf.expand_dims(mask, -1) @@ -566,7 +571,8 @@ def _tf_calc_iou_mask(self, intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) + iou = (tf.reduce_sum(intersection_b_in_flat, -1) + / tf.reduce_sum(union_b_in_flat, -1)) return 1. - tf.nn.relu(tf.sign(1. 
- iou)) def _tf_get_negs(self, @@ -586,7 +592,8 @@ def _tf_get_negs(self, bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) - neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, all_embed, neg_ids) + neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, + all_embed, neg_ids) neg_embed = tf.reshape(neg_embed_flat, (batch_size, seq_length, -1, all_embed.shape[-1])) @@ -777,7 +784,8 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: (self.a_in, self.b_in, self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in self._iterator.get_next()) + self.b_prev_in) = (tf.cast(x_in, tf.float32) + for x_in in self._iterator.get_next()) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -907,9 +915,8 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( - domain - ) + self.encoded_all_actions = \ + self.featurizer.state_featurizer.create_encoded_all_actions(domain) # check if number of negatives is less than number of actions logger.debug( @@ -939,8 +946,10 @@ def train( train_init_op = self._iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: - eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) + eval_session_data = self._sample_session_data( + session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset( + eval_session_data, self.evaluate_on_num_examples, shuffle=False) eval_init_op = self._iterator.make_initializer(eval_train_dataset) else: eval_init_op = None @@ -1030,12 +1039,13 @@ def _train_tf_dataset(self, }) if self.evaluate_on_num_examples and eval_init_op is not None: - if (ep == 0 or - (ep + 1) % self.evaluate_every_num_epochs == 0 or - (ep + 1) == self.epochs): + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + or (ep + 1) == self.epochs): eval_loss, eval_acc = self._output_training_stat_dataset( eval_init_op, loss, acc ) + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + and (ep + 1) != self.epochs): logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" "".format(eval_loss, eval_acc)) From 00604d5afc3e3fc92fe435271f82620523cb6215 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 18 Jul 2019 12:46:44 +0200 Subject: [PATCH 13/50] make methods more generic, by using expand_dims instead of tf.newaxis --- rasa/core/policies/embedding_policy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 06aa26e473fc..6756b6a77cad 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -351,15 +351,15 @@ def _create_tf_dataset(session_data: 'SessionData', shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - train_dataset = tf.data.Dataset.from_tensor_slices( + dataset = tf.data.Dataset.from_tensor_slices( (session_data.X, session_data.Y, session_data.slots, session_data.previous_actions) ) if shuffle: - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) - train_dataset = train_dataset.batch(batch_size) + dataset = dataset.shuffle(buffer_size=len(session_data.X)) + 
dataset = dataset.batch(batch_size) - return train_dataset + return dataset @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': @@ -565,7 +565,7 @@ def _tf_calc_iou_mask(self, neg_ids: 'tf.Tensor') -> 'tf.Tensor': """Calculate IOU mask for given indices""" - pos_b_in_flat = pos_b[:, tf.newaxis, :] + pos_b_in_flat = tf.expand_dims(pos_b, -2) neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) @@ -607,13 +607,13 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: """Sample negative examples.""" - pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + pos_dial_embed = tf.expand_dims(self.dial_embed, -2) neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), self._tf_make_flat(self.b_in), self.b_in ) - pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + pos_bot_embed = tf.expand_dims(self.bot_embed, -2) neg_bot_embed, bot_bad_negs = self._tf_get_negs( self.all_bot_embed, all_actions, From f2b0c91ff4c4597fe81395889bc47a68563959f7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 14:39:24 +0200 Subject: [PATCH 14/50] add first version of stratified batching --- rasa/core/featurizers.py | 8 - rasa/core/policies/embedding_policy.py | 197 ++++++++++++++++--------- 2 files changed, 127 insertions(+), 78 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index f0e722975078..4bdcbb9c6384 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -26,11 +26,6 @@ class SingleStateFeaturizer(object): the conversation state to a format which a classifier can read: feature vector.""" - def __init__(self): - """Declares instant variables.""" - self.user_feature_len = None - self.slot_feature_len = None - def prepare_from_domain(self, domain: Domain) -> None: """Helper method to init based on domain""" pass @@ -73,9 +68,6 @@ def prepare_from_domain(self, domain: Domain) -> None: self.num_features = domain.num_states self.input_state_map = domain.input_state_map - self.user_feature_len = len(domain.intent_states) + len(domain.entity_states) - self.slot_feature_len = len(domain.slot_states) - def encode(self, state: Dict[Text, float]) -> np.ndarray: """Returns a binary vector indicating which features are active. diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 6756b6a77cad..00030c260b4f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -5,6 +5,7 @@ import os import warnings +import pandas as pd import numpy as np import typing from tqdm import tqdm @@ -54,9 +55,7 @@ ( "X", "Y", - "slots", - "previous_actions", - "actions_for_Y", + "labels_for_Y", ), ) @@ -258,50 +257,31 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self._load_visual_params(config) # data helpers - # noinspection PyPep8Naming - def _create_X_slots_previous_actions( - self, data_X: 'np.ndarray' - ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: - """Extract feature vectors from training data. - - For user input (X), slots and previously executed actions. 
- """ - - featurizer = self.featurizer.state_featurizer - slot_start = featurizer.user_feature_len - previous_start = slot_start + featurizer.slot_feature_len - - X = data_X[:, :, :slot_start] - slots = data_X[:, :, slot_start:previous_start] - previous_actions = data_X[:, :, previous_start:] - - return X, slots, previous_actions - # noinspection PyPep8Naming @staticmethod - def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': + def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels_for_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" - if len(actions_for_Y.shape) == 2: + if len(labels_for_Y.shape) == 2: return np.stack( [ np.stack( [self.encoded_all_actions[action_idx] for action_idx in action_ids] ) - for action_ids in actions_for_Y + for action_ids in labels_for_Y ] ) else: return np.stack( [ - self.encoded_all_actions[action_idx] for action_idx in actions_for_Y + self.encoded_all_actions[action_idx] for action_idx in labels_for_Y ] ) @@ -311,23 +291,19 @@ def _create_session_data( ) -> 'SessionData': """Combine all tf session related data into a named tuple""" - X, slots, previous_actions = self._create_X_slots_previous_actions(data_X) - if data_Y is not None: # training time - actions_for_Y = self._actions_for_Y(data_Y) - Y = self._action_features_for_Y(actions_for_Y) + labels_for_Y = self._labels_for_Y(data_Y) + Y = self._action_features_for_Y(labels_for_Y) else: # prediction time - actions_for_Y = None + labels_for_Y = None Y = None return SessionData( - X=X, + X=data_X, Y=Y, - slots=slots, - previous_actions=previous_actions, - actions_for_Y=actions_for_Y, + labels_for_Y=labels_for_Y, ) @staticmethod @@ -339,25 +315,121 @@ def _sample_session_data(session_data: 'SessionData', return SessionData( X=session_data.X[ids], Y=session_data.Y[ids], - slots=session_data.slots[ids], - previous_actions=session_data.previous_actions[ids], - actions_for_Y=session_data.actions_for_Y[ids], + labels_for_Y=session_data.labels_for_Y[ids], ) # tf helpers: + # noinspection PyPep8Naming + @staticmethod + def gen_stratified_batch(session_data, batch_size): + + num_examples = len(session_data.X) + ids = np.random.permutation(num_examples) + X = session_data.X[ids] + Y = session_data.Y[ids] + labels_for_Y = session_data.labels_for_Y[ids] + + labels = list(set(labels_for_Y)) + np.random.shuffle(labels) + + class_data = [] + for label in labels: + label_X = X[labels_for_Y == label] + label_Y = Y[labels_for_Y == label] + label_labels_for_Y = labels_for_Y[labels_for_Y == label] + session_data_label = SessionData( + X=label_X, + Y=label_Y, + labels_for_Y=label_labels_for_Y, + ) + + class_data.append(session_data_label) + + num_classes = len(class_data) + + data_idx = [0] * num_classes + num_data_cycles = [0] * num_classes + print(batch_size) + print(X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0)) + # print([len(class_i.X) / num_examples for class_i in class_data]) + class_idx = 0 + bbb = 0 + while min(num_data_cycles) == 0: + batch_x = [] + batch_y = [] + batch_len = 0 + while batch_len < batch_size: + + class_i = class_data[class_idx] + + num_i = int(len(class_i.X) / num_examples * batch_size) + 1 + + if batch_len + num_i > batch_size: + num_i = batch_size - batch_len + + if data_idx[class_idx] + num_i > 
len(class_i.X): + num_i = len(class_i.X) - data_idx[class_idx] + + batch_x.append(class_i.X[data_idx[class_idx]:data_idx[class_idx]+num_i]) + batch_y.append(class_i.Y[data_idx[class_idx]:data_idx[class_idx]+num_i]) + batch_len += num_i + + data_idx[class_idx] += num_i + if data_idx[class_idx] >= len(class_i.X): + num_data_cycles[class_idx] += 1 + data_idx[class_idx] = 0 + + class_idx += 1 + if class_idx >= num_classes: + class_idx = 0 + if max(num_data_cycles) > 0 and max(num_data_cycles) == num_data_cycles[class_idx]: + class_idx += 1 + if class_idx >= num_classes: + class_idx = 0 + bbb+=1 + if min(num_data_cycles) > 0: + print(num_data_cycles) + print(bbb) + yield np.concatenate(batch_x), np.concatenate(batch_y) + + # noinspection PyPep8Naming @staticmethod - def _create_tf_dataset(session_data: 'SessionData', + def gen_sequence_batch(session_data, batch_size): + + ids = np.random.permutation(len(session_data.X)) + X = session_data.X[ids] + Y = session_data.Y[ids] + + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + + for batch_num in range(num_batches): + batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + + yield batch_x, batch_y + + # @staticmethod + def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - dataset = tf.data.Dataset.from_tensor_slices( - (session_data.X, session_data.Y, - session_data.slots, session_data.previous_actions) - ) - if shuffle: - dataset = dataset.shuffle(buffer_size=len(session_data.X)) - dataset = dataset.batch(batch_size) + def train_gen_func(batch_size_): + return self.gen_stratified_batch(session_data, batch_size_) + # return self.gen_sequence_batch(session_data, batch_size_) + + dpt_types = (tf.float32, tf.float32) + dpt_shapes = ([None] + list(session_data.X[0].shape), + [None] + list(session_data.Y[0].shape)) + + dataset = tf.data.Dataset.from_generator(train_gen_func, dpt_types, dpt_shapes, args=([batch_size])) + # dataset = tf.data.Dataset.from_tensor_slices( + # (session_data.X, session_data.Y, + # session_data.slots, session_data.previous_actions) + # ) + # if shuffle: + # dataset = dataset.shuffle(buffer_size=len(session_data.X)) + # dataset = dataset.batch(batch_size) return dataset @@ -520,7 +592,8 @@ def _create_t2t_transformer_encoder(self, x *= tf.expand_dims(mask, -1) - return tf.nn.relu(x) + return tf.nn.dropout(tf.nn.relu(x), + 1.0 - hparams.layer_prepostprocess_dropout) def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: """Create dialogue level embedding and mask.""" @@ -529,12 +602,12 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) - self.attention_weights = {} - x = self._create_t2t_transformer_encoder(x_in, mask, self.attention_weights) + a = self._create_t2t_transformer_encoder(self.a_in, + mask, + self.attention_weights) - dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") + dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # pick last action if max history featurizer is used @@ -781,11 +854,7 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: """Bulid train graph using iterator.""" # session data are int counts but we 
need a float tensors - (self.a_in, - self.b_in, - self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) - for x_in in self._iterator.get_next()) + self.a_in, self.b_in = self._iterator.get_next() all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -844,16 +913,6 @@ def _create_tf_placeholders(self, session_data: 'SessionData') -> None: shape=(None, dialogue_len, None, session_data.Y.shape[-1]), name="b", ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': """Rebuild tf graph for prediction.""" @@ -1104,9 +1163,7 @@ def tf_feed_dict_for_prediction(self, data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions} + return {self.a_in: session_data.X} def predict_action_probabilities( self, tracker: 'DialogueStateTracker', domain: 'Domain' From 435611f9643987a99997e7f63dd860eaa8f52413 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 17:40:13 +0200 Subject: [PATCH 15/50] update batching --- rasa/core/policies/embedding_policy.py | 141 ++++++++++++------------- 1 file changed, 66 insertions(+), 75 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 00030c260b4f..3f0311a924f0 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -5,7 +5,6 @@ import os import warnings -import pandas as pd import numpy as np import typing from tqdm import tqdm @@ -55,7 +54,7 @@ ( "X", "Y", - "labels_for_Y", + "labels", ), ) @@ -265,23 +264,23 @@ def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, labels_for_Y: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" - if len(labels_for_Y.shape) == 2: + if len(labels.shape) == 2: return np.stack( [ np.stack( [self.encoded_all_actions[action_idx] for action_idx in action_ids] ) - for action_ids in labels_for_Y + for action_ids in labels ] ) else: return np.stack( [ - self.encoded_all_actions[action_idx] for action_idx in labels_for_Y + self.encoded_all_actions[action_idx] for action_idx in labels ] ) @@ -293,17 +292,17 @@ def _create_session_data( if data_Y is not None: # training time - labels_for_Y = self._labels_for_Y(data_Y) - Y = self._action_features_for_Y(labels_for_Y) + labels = self._labels_for_Y(data_Y) + Y = self._action_features_for_Y(labels) else: # prediction time - labels_for_Y = None + labels = None Y = None return SessionData( X=data_X, Y=Y, - labels_for_Y=labels_for_Y, + labels=labels, ) @staticmethod @@ -315,7 +314,7 @@ def _sample_session_data(session_data: 'SessionData', return SessionData( X=session_data.X[ids], Y=session_data.Y[ids], - labels_for_Y=session_data.labels_for_Y[ids], + labels=session_data.labels[ids], ) # tf helpers: @@ -327,70 +326,60 @@ def gen_stratified_batch(session_data, batch_size): ids = np.random.permutation(num_examples) X = session_data.X[ids] Y = session_data.Y[ids] - labels_for_Y = session_data.labels_for_Y[ids] - - labels = 
list(set(labels_for_Y)) - np.random.shuffle(labels) - - class_data = [] - for label in labels: - label_X = X[labels_for_Y == label] - label_Y = Y[labels_for_Y == label] - label_labels_for_Y = labels_for_Y[labels_for_Y == label] - session_data_label = SessionData( - X=label_X, - Y=label_Y, - labels_for_Y=label_labels_for_Y, - ) + labels = session_data.labels[ids] + + unique_labels, counts_labels = np.unique(labels, return_counts=True) + num_labels = len(unique_labels) + ids = np.random.permutation(num_labels) + unique_labels = unique_labels[ids] + counts_labels = counts_labels[ids] + + label_data = [] + for label in unique_labels: + label_data.append(SessionData(X=X[labels == label], + Y=Y[labels == label], + labels=labels[labels == label])) + + data_idx = [0] * num_labels + num_data_cycles = [0] * num_labels + skipped = [False] * num_labels + new_X = [] + new_Y = [] + while min(num_data_cycles) == 0: + for i in range(num_labels): + if num_data_cycles[i] > 0 and not skipped[i]: + skipped[i] = True + continue + else: + skipped[i] = False - class_data.append(session_data_label) + num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - num_classes = len(class_data) + new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) + new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) - data_idx = [0] * num_classes - num_data_cycles = [0] * num_classes - print(batch_size) - print(X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0)) - # print([len(class_i.X) / num_examples for class_i in class_data]) - class_idx = 0 - bbb = 0 - while min(num_data_cycles) == 0: - batch_x = [] - batch_y = [] - batch_len = 0 - while batch_len < batch_size: - - class_i = class_data[class_idx] - - num_i = int(len(class_i.X) / num_examples * batch_size) + 1 - - if batch_len + num_i > batch_size: - num_i = batch_size - batch_len - - if data_idx[class_idx] + num_i > len(class_i.X): - num_i = len(class_i.X) - data_idx[class_idx] - - batch_x.append(class_i.X[data_idx[class_idx]:data_idx[class_idx]+num_i]) - batch_y.append(class_i.Y[data_idx[class_idx]:data_idx[class_idx]+num_i]) - batch_len += num_i - - data_idx[class_idx] += num_i - if data_idx[class_idx] >= len(class_i.X): - num_data_cycles[class_idx] += 1 - data_idx[class_idx] = 0 - - class_idx += 1 - if class_idx >= num_classes: - class_idx = 0 - if max(num_data_cycles) > 0 and max(num_data_cycles) == num_data_cycles[class_idx]: - class_idx += 1 - if class_idx >= num_classes: - class_idx = 0 - bbb+=1 - if min(num_data_cycles) > 0: - print(num_data_cycles) - print(bbb) - yield np.concatenate(batch_x), np.concatenate(batch_y) + data_idx[i] += num_i + if data_idx[i] >= counts_labels[i]: + num_data_cycles[i] += 1 + data_idx[i] = 0 + + if min(num_data_cycles) > 0: + break + + print(num_data_cycles) + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + print(num_batches) + + X = np.concatenate(new_X) + Y = np.concatenate(new_Y) + + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + print(num_batches) + for batch_num in range(num_batches): + batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + + yield batch_x, batch_y # noinspection PyPep8Naming @staticmethod @@ -658,9 +647,11 @@ def _tf_get_negs(self, seq_length = tf.shape(raw_pos)[1] raw_flat = self._tf_make_flat(raw_pos) - neg_ids = tf.random.categorical(tf.log(tf.ones((batch_size * seq_length, - tf.shape(all_raw)[0]))), - self.num_neg) + total_cands = 
tf.shape(all_embed)[0] + + all_indices = tf.tile(tf.expand_dims(tf.range(0, total_cands, 1), 0), (batch_size * seq_length, 1)) + shuffled_indices = tf.transpose(tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)) + neg_ids = shuffled_indices[:, :self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) From 0f48690cf4adc36fc833d2a59a5a1730c53f2569 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 17:41:01 +0200 Subject: [PATCH 16/50] remove prints --- rasa/core/policies/embedding_policy.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 3f0311a924f0..ba528d7f38de 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -366,15 +366,11 @@ def gen_stratified_batch(session_data, batch_size): if min(num_data_cycles) > 0: break - print(num_data_cycles) - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) - print(num_batches) - X = np.concatenate(new_X) Y = np.concatenate(new_Y) num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) - print(num_batches) + for batch_num in range(num_batches): batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] From c6634be85c188687742924ae05f77b570835124e Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 10:14:28 +0200 Subject: [PATCH 17/50] add random perturbation of labels --- rasa/core/policies/embedding_policy.py | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index ba528d7f38de..334e56fdd10a 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -87,6 +87,7 @@ class EmbeddingPolicy(Policy): # initial and final batch sizes - batch size will be # linearly increased for each epoch "batch_size": [8, 32], + "batch_strategy": 'sequence', # string 'sequence' or 'balanced' # number of epochs "epochs": 1, # set random seed to any int to get reproducible results @@ -211,6 +212,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.num_transformer_layers = config["num_transformer_layers"] self.batch_size = config["batch_size"] + self.batch_strategy = config["batch_strategy"] self.epochs = config["epochs"] @@ -320,7 +322,7 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: # noinspection PyPep8Naming @staticmethod - def gen_stratified_batch(session_data, batch_size): + def gen_balanced_batch(session_data, batch_size): num_examples = len(session_data.X) ids = np.random.permutation(num_examples) @@ -330,9 +332,6 @@ def gen_stratified_batch(session_data, batch_size): unique_labels, counts_labels = np.unique(labels, return_counts=True) num_labels = len(unique_labels) - ids = np.random.permutation(num_labels) - unique_labels = unique_labels[ids] - counts_labels = counts_labels[ids] label_data = [] for label in unique_labels: @@ -346,7 +345,8 @@ def gen_stratified_batch(session_data, batch_size): new_X = [] new_Y = [] while min(num_data_cycles) == 0: - for i in range(num_labels): + ids = np.random.permutation(num_labels) + for i in ids: if num_data_cycles[i] > 0 and not skipped[i]: skipped[i] = True continue @@ -393,28 +393,28 @@ def gen_sequence_batch(session_data, 
batch_size): yield batch_x, batch_y - # @staticmethod + def train_gen_func(self, session_data, batch_size): + if self.batch_strategy == 'sequence': + return self.gen_sequence_batch(session_data, batch_size) + elif self.batch_strategy == 'balanced': + return self.gen_balanced_batch(session_data, batch_size) + else: + raise ValueError( + "Wrong batch strategy '{}', " + "should be 'sequence' or 'balanced'" + "".format(self.batch_strategy) + ) + def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - def train_gen_func(batch_size_): - return self.gen_stratified_batch(session_data, batch_size_) - # return self.gen_sequence_batch(session_data, batch_size_) - dpt_types = (tf.float32, tf.float32) dpt_shapes = ([None] + list(session_data.X[0].shape), [None] + list(session_data.Y[0].shape)) - dataset = tf.data.Dataset.from_generator(train_gen_func, dpt_types, dpt_shapes, args=([batch_size])) - # dataset = tf.data.Dataset.from_tensor_slices( - # (session_data.X, session_data.Y, - # session_data.slots, session_data.previous_actions) - # ) - # if shuffle: - # dataset = dataset.shuffle(buffer_size=len(session_data.X)) - # dataset = dataset.batch(batch_size) + dataset = tf.data.Dataset.from_generator(lambda x: self.train_gen_func(session_data, x), dpt_types, dpt_shapes, args=([batch_size])) return dataset @@ -458,7 +458,7 @@ def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': return x else: raise ValueError( - "Wrong similarity type {}, " + "Wrong similarity type '{}', " "should be 'cosine' or 'inner'" "".format(self.similarity_type) ) @@ -832,7 +832,7 @@ def _choose_loss(self, mask) else: raise ValueError( - "Wrong loss type {}, " + "Wrong loss type '{}', " "should be 'margin' or 'softmax'" "".format(self.loss_type) ) @@ -1248,7 +1248,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy": if not os.path.exists(path): raise Exception( - "Failed to load dialogue model. Path {} " + "Failed to load dialogue model. 
Path '{}' " "doesn't exist".format(os.path.abspath(path)) ) From b917e37b132c0fb198f5b188806d8b7f465d1266 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 16:49:29 +0200 Subject: [PATCH 18/50] add validation split --- rasa/core/policies/embedding_policy.py | 473 ++++++++++++++----------- 1 file changed, 266 insertions(+), 207 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 334e56fdd10a..687136c4363b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -8,7 +8,7 @@ import numpy as np import typing from tqdm import tqdm -from typing import Any, List, Optional, Text, Dict, Tuple, Union +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator import rasa.utils.io from rasa.core import utils @@ -23,18 +23,20 @@ from rasa.core.trackers import DialogueStateTracker from rasa.utils.common import is_logging_disabled +from sklearn.model_selection import train_test_split import tensorflow as tf try: - from tensor2tensor.layers import common_attention from tensor2tensor.models.transformer import (transformer_base, transformer_prepare_encoder, transformer_encoder) + from tensor2tensor.layers.common_attention import large_compatible_negative + except ImportError: - common_attention = None transformer_base = None transformer_prepare_encoder = None transformer_encoder = None + large_compatible_negative = None try: import cPickle as pickle @@ -73,20 +75,21 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], - - "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} - # introduce phase shift in time encodings between transformers - # 0.5 - 0.8 works on small dataset - "pos_max_timescale": 1.0e1, + # type of positional encoding in transformer + "pos_encoding": "timing", # {"timing", "emb"} + # max sequence length if pos_encoding='emb' "max_seq_length": 256, + # number of attention heads in transformer "num_heads": 4, # number of units in transformer "transformer_size": 128, + # number of transformer layers "num_transformer_layers": 1, # training parameters - # initial and final batch sizes - batch size will be - # linearly increased for each epoch + # initial and final batch sizes: + # batch size will be linearly increased for each epoch "batch_size": [8, 32], + # how to create batches "batch_strategy": 'sequence', # string 'sequence' or 'balanced' # number of epochs "epochs": 1, @@ -102,6 +105,7 @@ class EmbeddingPolicy(Policy): "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' # the type of the similarity "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + # the type of the loss function "loss_type": 'softmax', # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training @@ -119,10 +123,10 @@ class EmbeddingPolicy(Policy): # dropout rate for dial nn "droprate_dial": 0.1, # visualization of accuracy - # how often calculate train accuracy + # how often calculate validation accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for calculation of train accuracy - "evaluate_on_num_examples": 100, # large values may hurt performance + # how many examples to use for hold out validation set + "evaluate_on_num_examples": 0, # large values may hurt performance } # end default properties (DOC MARKER - don't remove) @@ -137,7 +141,7 @@ def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturize @staticmethod def _check_t2t() -> None: - if common_attention is None: + if transformer_base is None: raise ImportError("Please install tensor2tensor") def __init__( @@ -204,7 +208,6 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] self.pos_encoding = config['pos_encoding'] - self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] self.num_heads = config['num_heads'] @@ -296,6 +299,12 @@ def _create_session_data( # training time labels = self._labels_for_Y(data_Y) Y = self._action_features_for_Y(labels) + + # idea taken from sklearn's stratify split + if labels.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + labels = np.array([' '.join(row.astype('str')) for row in labels]) else: # prediction time labels = None @@ -307,12 +316,43 @@ def _create_session_data( labels=labels, ) + # noinspection PyPep8Naming + def _train_val_split(self, session_data: 'SessionData' + ) -> Tuple['SessionData', 'SessionData']: + """Create random hold out validation set using stratified split.""" + + label_counts = dict(zip(*np.unique(session_data.labels, + return_counts=True, axis=0))) + counts = np.array([label_counts[label] for label in session_data.labels]) + + multi_X = session_data.X[counts > 1] + multi_Y = session_data.Y[counts > 1] + multi_labels = session_data.labels[counts > 1] + + solo_X = session_data.X[counts == 1] + solo_Y = session_data.Y[counts == 1] + solo_labels = session_data.labels[counts == 1] + + (X_train, X_val, + Y_train, Y_val, + labels_train, labels_val) = train_test_split( + multi_X, multi_Y, multi_labels, + test_size=self.evaluate_on_num_examples, + random_state=self.random_seed, + stratify=multi_labels + ) + X_train = np.concatenate([X_train, solo_X]) + Y_train = np.concatenate([Y_train, solo_Y]) + labels_train = np.concatenate([labels_train, solo_labels]) + + return (SessionData(X=X_train, Y=Y_train, labels=labels_train), + SessionData(X=X_val, Y=Y_val, labels=labels_val)) + @staticmethod - def _sample_session_data(session_data: 'SessionData', - num_samples: int) -> 'SessionData': - """Sample session data.""" + def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': + """Shuffle session data.""" - ids = np.random.permutation(len(session_data.X))[:num_samples] + ids = np.random.permutation(len(session_data.X)) return SessionData( 
X=session_data.X[ids], Y=session_data.Y[ids], @@ -321,102 +361,94 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: # noinspection PyPep8Naming - @staticmethod - def gen_balanced_batch(session_data, batch_size): - - num_examples = len(session_data.X) - ids = np.random.permutation(num_examples) - X = session_data.X[ids] - Y = session_data.Y[ids] - labels = session_data.labels[ids] - - unique_labels, counts_labels = np.unique(labels, return_counts=True) - num_labels = len(unique_labels) - - label_data = [] - for label in unique_labels: - label_data.append(SessionData(X=X[labels == label], - Y=Y[labels == label], - labels=labels[labels == label])) - - data_idx = [0] * num_labels - num_data_cycles = [0] * num_labels - skipped = [False] * num_labels - new_X = [] - new_Y = [] - while min(num_data_cycles) == 0: - ids = np.random.permutation(num_labels) - for i in ids: - if num_data_cycles[i] > 0 and not skipped[i]: - skipped[i] = True - continue + def _gen_batch(self, + session_data: 'SessionData', + batch_size: int, + batch_strategy: Text = 'sequence', + shuffle: bool = False + ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: + """Generate batches.""" + + if shuffle: + session_data = self._shuffle_session_data(session_data) + + if batch_strategy == 'balanced': + num_examples = len(session_data.X) + unique_labels, counts_labels = np.unique(session_data.labels, + return_counts=True, + axis=0) + num_labels = len(unique_labels) + + label_data = [] + for label in unique_labels: + label_data.append(SessionData( + X=session_data.X[session_data.labels == label], + Y=session_data.Y[session_data.labels == label], + labels=None # ignore new labels + )) + + data_idx = [0] * num_labels + num_data_cycles = [0] * num_labels + skipped = [False] * num_labels + new_X = [] + new_Y = [] + while min(num_data_cycles) == 0: + if shuffle: + ids = np.random.permutation(num_labels) else: - skipped[i] = False + ids = range(num_labels) - num_i = int(counts_labels[i] / num_examples * batch_size) + 1 + for i in ids: + if num_data_cycles[i] > 0 and not skipped[i]: + skipped[i] = True + continue + else: + skipped[i] = False - new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) - new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) + num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - data_idx[i] += num_i - if data_idx[i] >= counts_labels[i]: - num_data_cycles[i] += 1 - data_idx[i] = 0 + new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) + new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) - if min(num_data_cycles) > 0: - break + data_idx[i] += num_i + if data_idx[i] >= counts_labels[i]: + num_data_cycles[i] += 1 + data_idx[i] = 0 - X = np.concatenate(new_X) - Y = np.concatenate(new_Y) + if min(num_data_cycles) > 0: + break - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + session_data = SessionData(X=np.concatenate(new_X), + Y=np.concatenate(new_Y), + labels=None) # ignore new labels - for batch_num in range(num_batches): - batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] - batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] - - yield batch_x, batch_y - - # noinspection PyPep8Naming - @staticmethod - def gen_sequence_batch(session_data, batch_size): - - ids = np.random.permutation(len(session_data.X)) - X = session_data.X[ids] - Y = session_data.Y[ids] - - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + num_batches = (session_data.X.shape[0] // batch_size + + 
int(session_data.X.shape[0] % batch_size > 0)) for batch_num in range(num_batches): - batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] - batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_x = session_data.X[ + batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = session_data.Y[ + batch_num * batch_size: (batch_num + 1) * batch_size] yield batch_x, batch_y - def train_gen_func(self, session_data, batch_size): - if self.batch_strategy == 'sequence': - return self.gen_sequence_batch(session_data, batch_size) - elif self.batch_strategy == 'balanced': - return self.gen_balanced_batch(session_data, batch_size) - else: - raise ValueError( - "Wrong batch strategy '{}', " - "should be 'sequence' or 'balanced'" - "".format(self.batch_strategy) - ) - def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], - shuffle: bool = True) -> 'tf.data.Dataset': + batch_strategy: Text = 'sequence', + shuffle: bool = False) -> 'tf.data.Dataset': """Create tf dataset.""" - dpt_types = (tf.float32, tf.float32) - dpt_shapes = ([None] + list(session_data.X[0].shape), - [None] + list(session_data.Y[0].shape)) - - dataset = tf.data.Dataset.from_generator(lambda x: self.train_gen_func(session_data, x), dpt_types, dpt_shapes, args=([batch_size])) - - return dataset + return tf.data.Dataset.from_generator( + lambda batch_size_: self._gen_batch(session_data, + batch_size_, + batch_strategy, + shuffle), + output_types=(tf.float32, tf.float32), + output_shapes=([None] + list(session_data.X[0].shape), # set batch to None + [None] + list(session_data.Y[0].shape)), # set batch to None + args=([batch_size]) + ) @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': @@ -553,10 +585,6 @@ def _create_t2t_transformer_encoder(self, encoder_decoder_attention_bias ) = transformer_prepare_encoder(x, None, hparams) - if hparams.pos == 'custom_timing': - x = common_attention.add_timing_signal_1d( - x, max_timescale=self.pos_max_timescale) - x *= tf.expand_dims(mask, -1) x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) @@ -643,10 +671,13 @@ def _tf_get_negs(self, seq_length = tf.shape(raw_pos)[1] raw_flat = self._tf_make_flat(raw_pos) - total_cands = tf.shape(all_embed)[0] + total_candidates = tf.shape(all_embed)[0] - all_indices = tf.tile(tf.expand_dims(tf.range(0, total_cands, 1), 0), (batch_size * seq_length, 1)) - shuffled_indices = tf.transpose(tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)) + all_indices = tf.tile(tf.expand_dims(tf.range(0, total_candidates, 1), 0), + (batch_size * seq_length, 1)) + shuffled_indices = tf.transpose( + tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) + ) neg_ids = shuffled_indices[:, :self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) @@ -702,7 +733,7 @@ def _tf_sim( # calculate similarity with several # embedded actions for the loss - neg_inf = common_attention.large_compatible_negative(pos_dial_embed.dtype) + neg_inf = large_compatible_negative(pos_dial_embed.dtype) sim_pos = self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, @@ -886,6 +917,119 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: mask) return loss, acc + # training helpers + def _linearly_increasing_batch_size(self, epoch: int) -> int: + """Linearly increase batch size with every epoch. + + The idea comes from https://arxiv.org/abs/1711.00489. 
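        A minimal illustration (an added sketch, not part of the original
        patch), assuming the hypothetical values batch_size = [8, 32] and
        epochs = 10, the schedule above evaluates to

            >>> [int(8 + e * (32 - 8) / 9) for e in range(10)]
            [8, 10, 13, 16, 18, 21, 24, 26, 29, 32]

        i.e. the batch size grows linearly from the first to the second
        configured value over the training run.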
+ """ + + if not isinstance(self.batch_size, list): + return int(self.batch_size) + + if self.epochs > 1: + return int( + self.batch_size[0] + + epoch * (self.batch_size[1] - self.batch_size[0]) / (self.epochs - 1) + ) + else: + return int(self.batch_size[0]) + + def _train_tf_dataset(self, + train_init_op: 'tf.Operation', + eval_init_op: 'tf.Operation', + batch_size_in: 'tf.Tensor', + loss: 'tf.Tensor', + acc, + ) -> None: + """Train tf graph""" + + self.session.run(tf.global_variables_initializer()) + + if self.evaluate_on_num_examples: + logger.info( + "Validation accuracy is calculated every {} epochs" + "".format(self.evaluate_every_num_epochs) + ) + pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) + + train_loss = 0 + train_acc = 0 + eval_loss = 0 + eval_acc = 0 + for ep in pbar: + + batch_size = self._linearly_increasing_batch_size(ep) + + self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) + + ep_train_loss = 0 + ep_train_acc = 0 + batches_per_epoch = 0 + while True: + try: + _, batch_train_loss, batch_train_acc = self.session.run( + [self._train_op, loss, acc], feed_dict={self._is_training: True} + ) + batches_per_epoch += 1 + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc + + except tf.errors.OutOfRangeError: + break + + train_loss = ep_train_loss / batches_per_epoch + train_acc = ep_train_acc / batches_per_epoch + + pbar.set_postfix({ + "loss": "{:.3f}".format(train_loss), + "acc": "{:.3f}".format(train_acc) + }) + + if eval_init_op is not None: + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + or (ep + 1) == self.epochs): + eval_loss, eval_acc = self._output_training_stat_dataset( + eval_init_op, loss, acc + ) + if (ep + 1) != self.epochs: + logger.info("Evaluation results: " + "validation loss: {:.3f}, " + "validation accuracy: {:.3f}" + "".format(eval_loss, eval_acc)) + + final_message = ("Finished training embedding policy, " + "train loss={:.3f}, train accuracy={:.3f}" + "".format(train_loss, train_acc)) + if eval_init_op is not None: + final_message += (", validation loss={:.3f}, validation accuracy={:.3f}" + "".format(eval_loss, eval_acc)) + logger.info(final_message) + + def _output_training_stat_dataset(self, + eval_init_op: 'tf.Operation', + loss: 'tf.Tensor', + acc: 'tf.Tensor') -> Tuple[float, float]: + """Output training statistics""" + + self.session.run(eval_init_op) + ep_val_loss = 0 + ep_val_acc = 0 + batches_per_epoch = 0 + while True: + try: + batch_val_loss, batch_val_acc = self.session.run( + [loss, acc], feed_dict={self._is_training: False} + ) + batches_per_epoch += 1 + ep_val_loss += batch_val_loss + ep_val_acc += batch_val_acc + except tf.errors.OutOfRangeError: + break + + return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch + + # prepare for prediction def _create_tf_placeholders(self, session_data: 'SessionData') -> None: """Create placeholders for prediction.""" @@ -977,6 +1121,11 @@ def train( # extract actual training data to feed to tf session session_data = self._create_session_data(training_data.X, training_data.y) + if self.evaluate_on_num_examples: + session_data, eval_session_data = self._train_val_split(session_data) + else: + eval_session_data = None + self.graph = tf.Graph() with self.graph.as_default(): @@ -985,18 +1134,22 @@ def train( # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = self._create_tf_dataset(session_data, batch_size_in) + train_dataset = self._create_tf_dataset(session_data, + batch_size_in, 
+ batch_strategy=self.batch_strategy, + shuffle=True) self._iterator = self._create_tf_iterator(train_dataset) train_init_op = self._iterator.make_initializer(train_dataset) - if self.evaluate_on_num_examples: - eval_session_data = self._sample_session_data( - session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset( - eval_session_data, self.evaluate_on_num_examples, shuffle=False) - eval_init_op = self._iterator.make_initializer(eval_train_dataset) + if eval_session_data is not None: + eval_init_op = self._iterator.make_initializer( + self._create_tf_dataset( + eval_session_data, + # pick maximum batch_size for eval + self._linearly_increasing_batch_size(self.epochs)) + ) else: eval_init_op = None @@ -1016,100 +1169,6 @@ def train( self.attention_weights = self._extract_attention() - # training helpers - def _linearly_increasing_batch_size(self, epoch: int) -> int: - """Linearly increase batch size with every epoch. - - The idea comes from https://arxiv.org/abs/1711.00489. - """ - - if not isinstance(self.batch_size, list): - return int(self.batch_size) - - if self.epochs > 1: - return int( - self.batch_size[0] - + epoch * (self.batch_size[1] - self.batch_size[0]) / (self.epochs - 1) - ) - else: - return int(self.batch_size[0]) - - def _train_tf_dataset(self, - train_init_op: 'tf.Operation', - eval_init_op: 'tf.Operation', - batch_size_in: 'tf.Tensor', - loss: 'tf.Tensor', - acc, - ) -> None: - """Train tf graph""" - - self.session.run(tf.global_variables_initializer()) - - if self.evaluate_on_num_examples: - logger.info( - "Accuracy is updated every {} epochs" - "".format(self.evaluate_every_num_epochs) - ) - pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - - eval_acc = 0 - eval_loss = 0 - for ep in pbar: - - batch_size = self._linearly_increasing_batch_size(ep) - - self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - - ep_train_loss = 0 - ep_train_acc = 0 - batches_per_epoch = 0 - while True: - try: - _, batch_train_loss, batch_train_acc = self.session.run( - [self._train_op, loss, acc], - feed_dict={self._is_training: True} - ) - batches_per_epoch += 1 - ep_train_loss += batch_train_loss - ep_train_acc += batch_train_acc - - except tf.errors.OutOfRangeError: - break - - ep_train_loss /= batches_per_epoch - ep_train_acc /= batches_per_epoch - - pbar.set_postfix({ - "loss": "{:.3f}".format(ep_train_loss), - "acc": "{:.3f}".format(ep_train_acc) - }) - - if self.evaluate_on_num_examples and eval_init_op is not None: - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs): - eval_loss, eval_acc = self._output_training_stat_dataset( - eval_init_op, loss, acc - ) - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - and (ep + 1) != self.epochs): - logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" - "".format(eval_loss, eval_acc)) - - if self.evaluate_on_num_examples: - logger.info("Finished training embedding classifier, " - "loss={:.3f}, accuracy={:.3f}" - "".format(eval_loss, eval_acc)) - - def _output_training_stat_dataset(self, - eval_init_op: 'tf.Operation', - loss: 'tf.Tensor', - acc: 'tf.Tensor') -> Tuple[float, float]: - """Output training statistics""" - - self.session.run(eval_init_op) - - return self.session.run([loss, acc], feed_dict={self._is_training: False}) - def continue_training( self, training_trackers: List['DialogueStateTracker'], From 717d6e0f7bd60987fe167e8a161c2ef1640f40c3 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 16:56:50 
+0200 Subject: [PATCH 19/50] black --- rasa/core/policies/embedding_policy.py | 711 +++++++++++++------------ 1 file changed, 377 insertions(+), 334 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 687136c4363b..5145039c7a02 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -17,7 +17,7 @@ TrackerFeaturizer, FullDialogueTrackerFeaturizer, LabelTokenizerSingleStateFeaturizer, - MaxHistoryTrackerFeaturizer + MaxHistoryTrackerFeaturizer, ) from rasa.core.policies.policy import Policy from rasa.core.trackers import DialogueStateTracker @@ -27,9 +27,11 @@ import tensorflow as tf try: - from tensor2tensor.models.transformer import (transformer_base, - transformer_prepare_encoder, - transformer_encoder) + from tensor2tensor.models.transformer import ( + transformer_base, + transformer_prepare_encoder, + transformer_encoder, + ) from tensor2tensor.layers.common_attention import large_compatible_negative except ImportError: @@ -51,14 +53,7 @@ logger = logging.getLogger(__name__) # namedtuple for all tf session related data -SessionData = namedtuple( - "SessionData", - ( - "X", - "Y", - "labels", - ), -) +SessionData = namedtuple("SessionData", ("X", "Y", "labels")) class EmbeddingPolicy(Policy): @@ -90,7 +85,7 @@ class EmbeddingPolicy(Policy): # batch size will be linearly increased for each epoch "batch_size": [8, 32], # how to create batches - "batch_strategy": 'sequence', # string 'sequence' or 'balanced' + "batch_strategy": "sequence", # string 'sequence' or 'balanced' # number of epochs "epochs": 1, # set random seed to any int to get reproducible results @@ -106,7 +101,7 @@ class EmbeddingPolicy(Policy): # the type of the similarity "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' # the type of the loss function - "loss_type": 'softmax', # string 'softmax' or 'margin' + "loss_type": "softmax", # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "num_neg": 20, @@ -132,12 +127,13 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) @staticmethod - def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturizer': + def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturizer": if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: - return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), - max_history=max_history) + return MaxHistoryTrackerFeaturizer( + LabelTokenizerSingleStateFeaturizer(), max_history=max_history + ) @staticmethod def _check_t2t() -> None: @@ -146,22 +142,22 @@ def _check_t2t() -> None: def __init__( self, - featurizer: Optional['TrackerFeaturizer'] = None, + featurizer: Optional["TrackerFeaturizer"] = None, priority: int = 1, - encoded_all_actions: Optional['np.ndarray'] = None, - graph: Optional['tf.Graph'] = None, - session: Optional['tf.Session'] = None, - intent_placeholder: Optional['tf.Tensor'] = None, - action_placeholder: Optional['tf.Tensor'] = None, - slots_placeholder: Optional['tf.Tensor'] = None, - prev_act_placeholder: Optional['tf.Tensor'] = None, - similarity_all: Optional['tf.Tensor'] = None, - pred_confidence: Optional['tf.Tensor'] = None, - similarity: Optional['tf.Tensor'] = None, - dial_embed: Optional['tf.Tensor'] = None, - bot_embed: Optional['tf.Tensor'] = None, - all_bot_embed: 
Optional['tf.Tensor'] = None, - attention_weights=None, + encoded_all_actions: Optional["np.ndarray"] = None, + graph: Optional["tf.Graph"] = None, + session: Optional["tf.Session"] = None, + intent_placeholder: Optional["tf.Tensor"] = None, + action_placeholder: Optional["tf.Tensor"] = None, + slots_placeholder: Optional["tf.Tensor"] = None, + prev_act_placeholder: Optional["tf.Tensor"] = None, + similarity_all: Optional["tf.Tensor"] = None, + pred_confidence: Optional["tf.Tensor"] = None, + similarity: Optional["tf.Tensor"] = None, + dial_embed: Optional["tf.Tensor"] = None, + bot_embed: Optional["tf.Tensor"] = None, + all_bot_embed: Optional["tf.Tensor"] = None, + attention_weights: Optional["tf.Tensor"] = None, max_history: Optional[int] = None, **kwargs: Any ) -> None: @@ -207,9 +203,9 @@ def __init__( def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] - self.pos_encoding = config['pos_encoding'] - self.max_seq_length = config['max_seq_length'] - self.num_heads = config['num_heads'] + self.pos_encoding = config["pos_encoding"] + self.max_seq_length = config["max_seq_length"] + self.num_heads = config["num_heads"] self.transformer_size = config["transformer_size"] self.num_transformer_layers = config["num_transformer_layers"] @@ -226,12 +222,12 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.mu_pos = config["mu_pos"] self.mu_neg = config["mu_neg"] self.similarity_type = config["similarity_type"] - self.loss_type = config['loss_type'] - if self.similarity_type == 'auto': - if self.loss_type == 'softmax': - self.similarity_type = 'inner' - elif self.loss_type == 'margin': - self.similarity_type = 'cosine' + self.loss_type = config["loss_type"] + if self.similarity_type == "auto": + if self.loss_type == "softmax": + self.similarity_type = "inner" + elif self.loss_type == "margin": + self.similarity_type = "cosine" self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] @@ -239,10 +235,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.droprate = { - "bot": config["droprate_bot"], - "dial": config["droprate_dial"], - } + self.droprate = {"bot": config["droprate_bot"], "dial": config["droprate_dial"]} def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] @@ -263,36 +256,36 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: # data helpers # noinspection PyPep8Naming @staticmethod - def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': + def _labels_for_Y(data_Y: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, labels: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: features for action labels.""" if len(labels.shape) == 2: return np.stack( [ np.stack( - [self.encoded_all_actions[action_idx] - for action_idx in action_ids] + [ + self.encoded_all_actions[action_idx] + for action_idx in action_ids + ] ) for action_ids in labels ] ) else: return np.stack( - [ - self.encoded_all_actions[action_idx] for action_idx in labels - ] + [self.encoded_all_actions[action_idx] for action_idx in labels] ) # 
noinspection PyPep8Naming def _create_session_data( - self, data_X: 'np.ndarray', data_Y: Optional['np.ndarray'] = None - ) -> 'SessionData': + self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None + ) -> "SessionData": """Combine all tf session related data into a named tuple""" if data_Y is not None: @@ -304,25 +297,23 @@ def _create_session_data( if labels.ndim == 2: # for multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 - labels = np.array([' '.join(row.astype('str')) for row in labels]) + labels = np.array([" ".join(row.astype("str")) for row in labels]) else: # prediction time labels = None Y = None - return SessionData( - X=data_X, - Y=Y, - labels=labels, - ) + return SessionData(X=data_X, Y=Y, labels=labels) # noinspection PyPep8Naming - def _train_val_split(self, session_data: 'SessionData' - ) -> Tuple['SessionData', 'SessionData']: + def _train_val_split( + self, session_data: "SessionData" + ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" - label_counts = dict(zip(*np.unique(session_data.labels, - return_counts=True, axis=0))) + label_counts = dict( + zip(*np.unique(session_data.labels, return_counts=True, axis=0)) + ) counts = np.array([label_counts[label] for label in session_data.labels]) multi_X = session_data.X[counts > 1] @@ -333,23 +324,25 @@ def _train_val_split(self, session_data: 'SessionData' solo_Y = session_data.Y[counts == 1] solo_labels = session_data.labels[counts == 1] - (X_train, X_val, - Y_train, Y_val, - labels_train, labels_val) = train_test_split( - multi_X, multi_Y, multi_labels, + (X_train, X_val, Y_train, Y_val, labels_train, labels_val) = train_test_split( + multi_X, + multi_Y, + multi_labels, test_size=self.evaluate_on_num_examples, random_state=self.random_seed, - stratify=multi_labels + stratify=multi_labels, ) X_train = np.concatenate([X_train, solo_X]) Y_train = np.concatenate([Y_train, solo_Y]) labels_train = np.concatenate([labels_train, solo_labels]) - return (SessionData(X=X_train, Y=Y_train, labels=labels_train), - SessionData(X=X_val, Y=Y_val, labels=labels_val)) + return ( + SessionData(X=X_train, Y=Y_train, labels=labels_train), + SessionData(X=X_val, Y=Y_val, labels=labels_val), + ) @staticmethod - def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': + def _shuffle_session_data(session_data: "SessionData") -> "SessionData": """Shuffle session data.""" ids = np.random.permutation(len(session_data.X)) @@ -361,31 +354,34 @@ def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': # tf helpers: # noinspection PyPep8Naming - def _gen_batch(self, - session_data: 'SessionData', - batch_size: int, - batch_strategy: Text = 'sequence', - shuffle: bool = False - ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: + def _gen_batch( + self, + session_data: "SessionData", + batch_size: int, + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: """Generate batches.""" if shuffle: session_data = self._shuffle_session_data(session_data) - if batch_strategy == 'balanced': + if batch_strategy == "balanced": num_examples = len(session_data.X) - unique_labels, counts_labels = np.unique(session_data.labels, - return_counts=True, - axis=0) + unique_labels, counts_labels = np.unique( + session_data.labels, return_counts=True, axis=0 + ) num_labels = len(unique_labels) label_data = [] for label in 
unique_labels: - label_data.append(SessionData( - X=session_data.X[session_data.labels == label], - Y=session_data.Y[session_data.labels == label], - labels=None # ignore new labels - )) + label_data.append( + SessionData( + X=session_data.X[session_data.labels == label], + Y=session_data.Y[session_data.labels == label], + labels=None, # ignore new labels + ) + ) data_idx = [0] * num_labels num_data_cycles = [0] * num_labels @@ -407,8 +403,8 @@ def _gen_batch(self, num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) - new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) + new_X.append(label_data[i].X[data_idx[i] : data_idx[i] + num_i]) + new_Y.append(label_data[i].Y[data_idx[i] : data_idx[i] + num_i]) data_idx[i] += num_i if data_idx[i] >= counts_labels[i]: @@ -418,53 +414,62 @@ def _gen_batch(self, if min(num_data_cycles) > 0: break - session_data = SessionData(X=np.concatenate(new_X), - Y=np.concatenate(new_Y), - labels=None) # ignore new labels + session_data = SessionData( + X=np.concatenate(new_X), Y=np.concatenate(new_Y), labels=None + ) # ignore new labels - num_batches = (session_data.X.shape[0] // batch_size - + int(session_data.X.shape[0] % batch_size > 0)) + num_batches = session_data.X.shape[0] // batch_size + int( + session_data.X.shape[0] % batch_size > 0 + ) for batch_num in range(num_batches): batch_x = session_data.X[ - batch_num * batch_size: (batch_num + 1) * batch_size] + batch_num * batch_size : (batch_num + 1) * batch_size + ] batch_y = session_data.Y[ - batch_num * batch_size: (batch_num + 1) * batch_size] + batch_num * batch_size : (batch_num + 1) * batch_size + ] yield batch_x, batch_y - def _create_tf_dataset(self, session_data: 'SessionData', - batch_size: Union['tf.Tensor', int], - batch_strategy: Text = 'sequence', - shuffle: bool = False) -> 'tf.data.Dataset': + def _create_tf_dataset( + self, + session_data: "SessionData", + batch_size: Union["tf.Tensor", int], + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> "tf.data.Dataset": """Create tf dataset.""" return tf.data.Dataset.from_generator( - lambda batch_size_: self._gen_batch(session_data, - batch_size_, - batch_strategy, - shuffle), + lambda batch_size_: self._gen_batch( + session_data, batch_size_, batch_strategy, shuffle + ), output_types=(tf.float32, tf.float32), - output_shapes=([None] + list(session_data.X[0].shape), # set batch to None - [None] + list(session_data.Y[0].shape)), # set batch to None - args=([batch_size]) + output_shapes=( + [None] + list(session_data.X[0].shape), # set batch to None + [None] + list(session_data.Y[0].shape), # set batch to None + ), + args=([batch_size]), ) @staticmethod - def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': + def _create_tf_iterator(dataset: "tf.data.Dataset") -> "tf.data.Iterator": """Create tf iterator.""" - return tf.data.Iterator.from_structure(dataset.output_types, - dataset.output_shapes, - output_classes=dataset.output_classes) + return tf.data.Iterator.from_structure( + dataset.output_types, + dataset.output_shapes, + output_classes=dataset.output_classes, + ) def _create_tf_nn( self, - x_in: 'tf.Tensor', + x_in: "tf.Tensor", layer_sizes: List[int], droprate: float, layer_name_suffix: Text, - ) -> 'tf.Tensor': + ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -481,7 +486,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, 
training=self._is_training) return x - def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': + def _tf_normalize_if_cosine(self, x: "tf.Tensor") -> "tf.Tensor": """Normalize embedding if similarity type is cosine.""" if self.similarity_type == "cosine": @@ -495,7 +500,7 @@ def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': "".format(self.similarity_type) ) - def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': + def _create_tf_embed(self, x: "tf.Tensor", layer_name_suffix: Text) -> "tf.Tensor": """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -510,7 +515,7 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso # normalize embedding vectors for cosine similarity return self._tf_normalize_if_cosine(embed_x) - def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': + def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" b = self._create_tf_nn( @@ -521,9 +526,9 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_t2t_hparams(self) -> 'HParams': + def _create_t2t_hparams(self) -> "HParams": """Create parameters for t2t transformer.""" - + hparams = transformer_base() hparams.num_hidden_layers = self.num_transformer_layers @@ -545,13 +550,14 @@ def _create_t2t_hparams(self) -> 'HParams': return hparams # noinspection PyUnresolvedReferences - def _create_t2t_transformer_encoder(self, - x_in: 'tf.Tensor', - mask: 'tf.Tensor', - attention_weights: Dict[Text, 'tf.Tensor'], - ) -> 'tf.Tensor': + def _create_t2t_transformer_encoder( + self, + x_in: "tf.Tensor", + mask: "tf.Tensor", + attention_weights: Dict[Text, "tf.Tensor"], + ) -> "tf.Tensor": """Create t2t transformer encoder.""" - + hparams = self._create_t2t_hparams() # When not in training mode, set all forms of dropout to zero. 
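A minimal NumPy sketch (not part of the patch, values made up for illustration) of the padding mask used by the dialogue-level transformer in this file: padded dialogue turns are filled with -1, so sign(max(features) + 1) is 0 for padding and 1 for real turns, and the encoder activations are multiplied by this mask:

    import numpy as np

    a_in = np.array([[[0., 1., 0.],        # real turn
                      [1., 0., 1.],        # real turn
                      [-1., -1., -1.]]])   # padded turn
    mask = np.sign(a_in.max(-1) + 1)       # -> array([[1., 1., 0.]])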
@@ -566,24 +572,27 @@ def _create_t2t_transformer_encoder(self, units=hparams.hidden_size, use_bias=False, kernel_initializer=tf.random_normal_initializer( - 0.0, hparams.hidden_size ** -0.5), + 0.0, hparams.hidden_size ** -0.5 + ), kernel_regularizer=reg, - name='transformer_embed_layer', - reuse=tf.AUTO_REUSE + name="transformer_embed_layer", + reuse=tf.AUTO_REUSE, + ) + x = tf.layers.dropout( + x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training ) - x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, - training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 x *= tf.expand_dims(mask, -1) - with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE): - (x, - self_attention_bias, - encoder_decoder_attention_bias - ) = transformer_prepare_encoder(x, None, hparams) + with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + ( + x, + self_attention_bias, + encoder_decoder_attention_bias, + ) = transformer_prepare_encoder(x, None, hparams) x *= tf.expand_dims(mask, -1) @@ -605,20 +614,21 @@ def _create_t2t_transformer_encoder(self, x *= tf.expand_dims(mask, -1) - return tf.nn.dropout(tf.nn.relu(x), - 1.0 - hparams.layer_prepostprocess_dropout) + return tf.nn.dropout( + tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout + ) - def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + def _create_tf_dial(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Create dialogue level embedding and mask.""" - + # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) self.attention_weights = {} - a = self._create_t2t_transformer_encoder(self.a_in, - mask, - self.attention_weights) + a = self._create_t2t_transformer_encoder( + self.a_in, mask, self.attention_weights + ) dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") @@ -630,25 +640,24 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return dial_embed, mask @staticmethod - def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': + def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor": """Make tensor 2D.""" return tf.reshape(x, (-1, x.shape[-1])) @staticmethod - def _tf_sample_neg(batch_size: 'tf.Tensor', - all_bs: 'tf.Tensor', - neg_ids: 'tf.Tensor') -> 'tf.Tensor': + def _tf_sample_neg( + batch_size: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": """Sample negative examples for given indices""" tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) return tf.batch_gather(tiled_all_bs, neg_ids) - def _tf_calc_iou_mask(self, - pos_b: 'tf.Tensor', - all_bs: 'tf.Tensor', - neg_ids: 'tf.Tensor') -> 'tf.Tensor': + def _tf_calc_iou_mask( + self, pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": """Calculate IOU mask for given indices""" pos_b_in_flat = tf.expand_dims(pos_b, -2) @@ -657,14 +666,14 @@ def _tf_calc_iou_mask(self, intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - iou = (tf.reduce_sum(intersection_b_in_flat, -1) - / tf.reduce_sum(union_b_in_flat, -1)) - return 1. - tf.nn.relu(tf.sign(1. 
- iou)) + iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum( + union_b_in_flat, -1 + ) + return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) - def _tf_get_negs(self, - all_embed: 'tf.Tensor', - all_raw: 'tf.Tensor', - raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: + def _tf_get_negs( + self, all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor" + ) -> Tuple["tf.Tensor", "tf.Tensor"]: """Get negative examples from given tensor.""" batch_size = tf.shape(raw_pos)[0] @@ -673,62 +682,69 @@ def _tf_get_negs(self, total_candidates = tf.shape(all_embed)[0] - all_indices = tf.tile(tf.expand_dims(tf.range(0, total_candidates, 1), 0), - (batch_size * seq_length, 1)) + all_indices = tf.tile( + tf.expand_dims(tf.range(0, total_candidates, 1), 0), + (batch_size * seq_length, 1), + ) shuffled_indices = tf.transpose( tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) ) - neg_ids = shuffled_indices[:, :self.num_neg] + neg_ids = shuffled_indices[:, : self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) - neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, - all_embed, neg_ids) - neg_embed = tf.reshape(neg_embed_flat, - (batch_size, seq_length, -1, all_embed.shape[-1])) + neg_embed_flat = self._tf_sample_neg( + batch_size * seq_length, all_embed, neg_ids + ) + neg_embed = tf.reshape( + neg_embed_flat, (batch_size, seq_length, -1, all_embed.shape[-1]) + ) return neg_embed, bad_negs - def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor']: + def _sample_negatives( + self, all_actions: "tf.Tensor" + ) -> Tuple[ + "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor" + ]: """Sample negative examples.""" pos_dial_embed = tf.expand_dims(self.dial_embed, -2) neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), self._tf_make_flat(self.b_in), - self.b_in + self.b_in, ) pos_bot_embed = tf.expand_dims(self.bot_embed, -2) neg_bot_embed, bot_bad_negs = self._tf_get_negs( - self.all_bot_embed, - all_actions, - self.b_in + self.all_bot_embed, all_actions, self.b_in + ) + return ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, ) - return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, - dial_bad_negs, bot_bad_negs) @staticmethod - def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + def _tf_raw_sim(a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor": """Calculate similarity between given tensors.""" return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) def _tf_sim( self, - pos_dial_embed: 'tf.Tensor', - pos_bot_embed: 'tf.Tensor', - neg_dial_embed: 'tf.Tensor', - neg_bot_embed: 'tf.Tensor', - dial_bad_negs: 'tf.Tensor', - bot_bad_negs: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + pos_dial_embed: "tf.Tensor", + pos_bot_embed: "tf.Tensor", + neg_dial_embed: "tf.Tensor", + neg_bot_embed: "tf.Tensor", + dial_bad_negs: "tf.Tensor", + bot_bad_negs: "tf.Tensor", + mask: "tf.Tensor", + ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]: """Define similarity.""" # calculate similarity with several @@ -736,61 +752,70 @@ def _tf_sim( neg_inf = large_compatible_negative(pos_dial_embed.dtype) sim_pos = 
self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) - sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs - sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs - sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs - sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + sim_neg = ( + self._tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_bot_bot = ( + self._tf_raw_sim(pos_bot_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_dial_dial = ( + self._tf_raw_sim(pos_dial_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) + sim_neg_bot_dial = ( + self._tf_raw_sim(pos_bot_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) # output similarities between user input and bot actions # and similarities between bot actions and similarities between user inputs return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial @staticmethod - def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': + def _tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor": """Calculate accuracy""" max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) - return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), - tf.float32)) + return tf.reduce_mean( + tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), tf.float32) + ) def _tf_loss_margin( self, - sim_pos: 'tf.Tensor', - sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Define max margin loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0., self.mu_pos - sim_pos[:, :, 0]) + loss = tf.maximum(0.0, self.mu_pos - sim_pos[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions max_sim_neg = tf.reduce_max(sim_neg, -1) - loss += tf.maximum(0., self.mu_neg + max_sim_neg) + loss += tf.maximum(0.0, self.mu_neg + max_sim_neg) else: # minimize all similarities with incorrect actions - max_margin = tf.maximum(0., self.mu_neg + sim_neg) + max_margin = tf.maximum(0.0, self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) # penalize max similarity between pos bot and neg bot embeddings - max_sim_neg_bot = tf.maximum(0., tf.reduce_max(sim_neg_bot_bot, -1)) + max_sim_neg_bot = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_bot, -1)) loss += max_sim_neg_bot * self.C_emb # penalize max similarity between pos dial and neg dial embeddings - max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_dial_dial, -1)) + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_dial_dial, -1)) loss += max_sim_neg_dial * self.C_emb # penalize max similarity between pos bot and neg dial embeddings - max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_bot_dial, -1)) + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_dial, -1)) loss += max_sim_neg_dial * self.C_emb # mask loss for different length sequences @@ -807,21 +832,18 @@ def _tf_loss_margin( @staticmethod def _tf_loss_softmax( - sim_pos: 'tf.Tensor', 
- sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Define softmax loss.""" - logits = tf.concat([sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial - ], -1) + logits = tf.concat( + [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1 + ) # create labels for softmax pos_labels = tf.ones_like(logits[:, :, :1]) @@ -832,35 +854,41 @@ def _tf_loss_softmax( pred = tf.nn.softmax(logits) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - loss = tf.losses.softmax_cross_entropy(labels, - logits, - mask * already_learned) + loss = tf.losses.softmax_cross_entropy(labels, logits, mask * already_learned) # add regularization losses loss += tf.losses.get_regularization_loss() return loss - def _choose_loss(self, - sim_pos: 'tf.Tensor', - sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor') -> 'tf.Tensor': + def _choose_loss( + self, + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Use loss depending on given option.""" - if self.loss_type == 'margin': - return self._tf_loss_margin(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) - elif self.loss_type == 'softmax': - return self._tf_loss_softmax(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) + if self.loss_type == "margin": + return self._tf_loss_margin( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, + ) + elif self.loss_type == "softmax": + return self._tf_loss_softmax( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, + ) else: raise ValueError( "Wrong loss type '{}', " @@ -868,15 +896,15 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Bulid train graph using iterator.""" # session data are int counts but we need a float tensors self.a_in, self.b_in = self._iterator.get_next() - all_actions = tf.constant(self.encoded_all_actions, - dtype=tf.float32, - name="all_actions") + all_actions = tf.constant( + self.encoded_all_actions, dtype=tf.float32, name="all_actions" + ) self.dial_embed, mask = self._create_tf_dial() @@ -888,33 +916,37 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: self.b_in = self.b_in[:, tf.newaxis, :] self.bot_embed = self.bot_embed[:, tf.newaxis, :] - (pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs) = self._sample_negatives(all_actions) + ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + ) = self._sample_negatives(all_actions) # calculate similarities - (sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - mask) + ( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + 
sim_neg_bot_dial, + ) = self._tf_sim( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask, + ) acc = self._tf_calc_accuracy(sim_pos, sim_neg) - loss = self._choose_loss(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) + loss = self._choose_loss( + sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial, mask + ) return loss, acc # training helpers @@ -935,13 +967,14 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: else: return int(self.batch_size[0]) - def _train_tf_dataset(self, - train_init_op: 'tf.Operation', - eval_init_op: 'tf.Operation', - batch_size_in: 'tf.Tensor', - loss: 'tf.Tensor', - acc, - ) -> None: + def _train_tf_dataset( + self, + train_init_op: "tf.Operation", + eval_init_op: "tf.Operation", + batch_size_in: "tf.Tensor", + loss: "tf.Tensor", + acc: "tf.Tensor", + ) -> None: """Train tf graph""" self.session.run(tf.global_variables_initializer()) @@ -981,35 +1014,40 @@ def _train_tf_dataset(self, train_loss = ep_train_loss / batches_per_epoch train_acc = ep_train_acc / batches_per_epoch - pbar.set_postfix({ - "loss": "{:.3f}".format(train_loss), - "acc": "{:.3f}".format(train_acc) - }) + pbar.set_postfix( + {"loss": "{:.3f}".format(train_loss), "acc": "{:.3f}".format(train_acc)} + ) if eval_init_op is not None: - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs): + if (ep + 1) % self.evaluate_every_num_epochs == 0 or ( + ep + 1 + ) == self.epochs: eval_loss, eval_acc = self._output_training_stat_dataset( eval_init_op, loss, acc ) if (ep + 1) != self.epochs: - logger.info("Evaluation results: " - "validation loss: {:.3f}, " - "validation accuracy: {:.3f}" - "".format(eval_loss, eval_acc)) - - final_message = ("Finished training embedding policy, " - "train loss={:.3f}, train accuracy={:.3f}" - "".format(train_loss, train_acc)) + logger.info( + "Evaluation results: " + "validation loss: {:.3f}, " + "validation accuracy: {:.3f}" + "".format(eval_loss, eval_acc) + ) + + final_message = ( + "Finished training embedding policy, " + "train loss={:.3f}, train accuracy={:.3f}" + "".format(train_loss, train_acc) + ) if eval_init_op is not None: - final_message += (", validation loss={:.3f}, validation accuracy={:.3f}" - "".format(eval_loss, eval_acc)) + final_message += ( + ", validation loss={:.3f}, validation accuracy={:.3f}" + "".format(eval_loss, eval_acc) + ) logger.info(final_message) - def _output_training_stat_dataset(self, - eval_init_op: 'tf.Operation', - loss: 'tf.Tensor', - acc: 'tf.Tensor') -> Tuple[float, float]: + def _output_training_stat_dataset( + self, eval_init_op: "tf.Operation", loss: "tf.Tensor", acc: "tf.Tensor" + ) -> Tuple[float, float]: """Output training statistics""" self.session.run(eval_init_op) @@ -1030,9 +1068,9 @@ def _output_training_stat_dataset(self, return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch # prepare for prediction - def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + def _create_tf_placeholders(self, session_data: "SessionData") -> None: """Create placeholders for prediction.""" - + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, @@ -1045,17 +1083,17 @@ def _create_tf_placeholders(self, session_data: 'SessionData') -> None: name="b", ) - def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': + def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": """Rebuild tf 
graph for prediction.""" - + self._create_tf_placeholders(session_data) - + self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], - mask + mask, ) if self.similarity_type == "cosine": @@ -1068,19 +1106,19 @@ def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': self.bot_embed = self._create_tf_bot_embed(self.b_in) self.sim = self._tf_raw_sim( - self.dial_embed[:, :, tf.newaxis, :], - self.bot_embed, - mask + self.dial_embed[:, :, tf.newaxis, :], self.bot_embed, mask ) return confidence - def _extract_attention(self) -> Optional['tf.Tensor']: + def _extract_attention(self) -> Optional["tf.Tensor"]: """Extract attention probabilities from t2t dict""" - - attention = [tf.expand_dims(t, 0) - for name, t in self.attention_weights.items() - if name.endswith('multihead_attention/dot_product_attention')] + + attention = [ + tf.expand_dims(t, 0) + for name, t in self.attention_weights.items() + if name.endswith("multihead_attention/dot_product_attention") + ] if attention: return tf.concat(attention, 0) @@ -1090,8 +1128,8 @@ def _extract_attention(self) -> Optional['tf.Tensor']: # training methods def train( self, - training_trackers: List['DialogueStateTracker'], - domain: 'Domain', + training_trackers: List["DialogueStateTracker"], + domain: "Domain", **kwargs: Any ) -> None: """Train the policy on given training trackers.""" @@ -1105,8 +1143,9 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = \ - self.featurizer.state_featurizer.create_encoded_all_actions(domain) + self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( + domain + ) # check if number of negatives is less than number of actions logger.debug( @@ -1134,10 +1173,12 @@ def train( # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = self._create_tf_dataset(session_data, - batch_size_in, - batch_strategy=self.batch_strategy, - shuffle=True) + train_dataset = self._create_tf_dataset( + session_data, + batch_size_in, + batch_strategy=self.batch_strategy, + shuffle=True, + ) self._iterator = self._create_tf_iterator(train_dataset) @@ -1148,7 +1189,8 @@ def train( self._create_tf_dataset( eval_session_data, # pick maximum batch_size for eval - self._linearly_increasing_batch_size(self.epochs)) + self._linearly_increasing_batch_size(self.epochs), + ) ) else: eval_init_op = None @@ -1161,8 +1203,9 @@ def train( # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, - loss, acc) + self._train_tf_dataset( + train_init_op, eval_init_op, batch_size_in, loss, acc + ) # rebuild the graph for prediction self.pred_confidence = self._build_tf_pred_graph(session_data) @@ -1171,8 +1214,8 @@ def train( def continue_training( self, - training_trackers: List['DialogueStateTracker'], - domain: 'Domain', + training_trackers: List["DialogueStateTracker"], + domain: "Domain", **kwargs: Any ) -> None: """Continue training an already trained policy.""" @@ -1193,18 +1236,18 @@ def continue_training( # fit to one extra example using updated trackers while True: try: - self.session.run(self._train_op, - feed_dict={self._is_training: True}) + self.session.run( + self._train_op, feed_dict={self._is_training: True} + ) except tf.errors.OutOfRangeError: break - 
def tf_feed_dict_for_prediction(self, - tracker: 'DialogueStateTracker', - domain: 'Domain' - ) -> Dict['tf.Tensor', 'np.ndarray']: + def tf_feed_dict_for_prediction( + self, tracker: "DialogueStateTracker", domain: "Domain" + ) -> Dict["tf.Tensor", "np.ndarray"]: """Create feed dictionary for tf session.""" - + # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1212,7 +1255,7 @@ def tf_feed_dict_for_prediction(self, return {self.a_in: session_data.X} def predict_action_probabilities( - self, tracker: 'DialogueStateTracker', domain: 'Domain' + self, tracker: "DialogueStateTracker", domain: "Domain" ) -> List[float]: """Predict the next action the bot should take. @@ -1233,9 +1276,9 @@ def predict_action_probabilities( return confidence[0, -1, :].tolist() - def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: + def _persist_tensor(self, name: Text, tensor: "tf.Tensor") -> None: """Add tensor to collection if it is not None""" - + if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1292,7 +1335,7 @@ def persist(self, path: Text) -> None: pickle.dump(self._tf_config, f) @staticmethod - def load_tensor(name: Text) -> Optional['tf.Tensor']: + def load_tensor(name: Text) -> Optional["tf.Tensor"]: """Load tensor or set it to None""" tensor_list = tf.get_collection(name) From 7d30f55ec072ca8c15928f5d1a7d6fa99234fcf5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 17:25:44 +0200 Subject: [PATCH 20/50] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f219c13016e8..30116b0c1107 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,8 @@ Changed - ``Agent.update_model()`` and ``Agent.handle_message()`` now work without needing to set a domain or a policy ensemble - Update pytype to ``2019.7.11`` +- Substitute LSTM with Transformer in ``EmbeddingPolicy`` +- ``EmbeddingPolicy`` can now use ``MaxHistoryTrackerFeaturizer`` Removed ------- From dc27bfcf0b7e5d3ae305c54eaf889e2e411efa32 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 17:36:32 +0200 Subject: [PATCH 21/50] update tests --- tests/core/test_policies.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 483e797deead..95b22d96beb1 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -393,43 +393,27 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicyNoAttention(PolicyTestCollection): +class TestEmbeddingPolicyWithFeaturizer(PolicyTestCollection): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=False - ) + p = EmbeddingPolicy(featurizer=featurizer, priority=priority) return p -class TestEmbeddingPolicyAttentionBeforeRNN(PolicyTestCollection): +class TestEmbeddingPolicyWithFullDialogue(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=False - ) + # if max_history is not specified + p = EmbeddingPolicy(priority=priority) return 
p -class TestEmbeddingPolicyAttentionAfterRNN(PolicyTestCollection): +class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=True - ) - return p - - -class TestEmbeddingPolicyAttentionBoth(PolicyTestCollection): - def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=True - ) + # since it is using MaxHistoryTrackerFeaturizer + # if max_history is specified + p = EmbeddingPolicy(priority=priority, max_history=self.max_history) return p @@ -437,6 +421,7 @@ class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer + # if max_history is not specified p = EmbeddingPolicy(priority=priority, **tf_defaults()) return p From 2faf24a2ac58dfe58aaff878374d69752c1e8bd4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:38:36 +0200 Subject: [PATCH 22/50] fix featurizer, add t2t requirements --- docs/core/policies.rst | 41 +- rasa/core/featurizers.py | 2 +- rasa/core/policies/tf_utils.py | 957 --------------------------------- requirements.txt | 3 +- 4 files changed, 9 insertions(+), 994 deletions(-) delete mode 100644 rasa/core/policies/tf_utils.py diff --git a/docs/core/policies.rst b/docs/core/policies.rst index a64dc15fac64..97606621df81 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -165,8 +165,10 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer. Embedding Policy ^^^^^^^^^^^^^^^^ -The Recurrent Embedding Dialogue Policy (REDP) -described in our paper: ``_ +Transformer Embedding Dialogue Policy (TEDP) + +Transformer version of the Recurrent Embedding Dialogue Policy (REDP) +used in our paper: ``_ This policy has a pre-defined architecture, which comprises the following steps: @@ -201,11 +203,6 @@ following steps: This step is based on the `StarSpace `_ idea. -.. note:: - - This policy only works with - ``FullDialogueTrackerFeaturizer(state_featurizer)``. - It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` (see :ref:`featurization` for details). @@ -219,40 +216,14 @@ It is recommended to use Pass an appropriate number of ``epochs`` to the ``EmbeddingPolicy``, otherwise the policy will be trained only for ``1`` - epoch. Since this is an embedding based policy, it requires a large - number of epochs, which depends on the complexity of the - training data and whether attention is used or not. - - The main feature of this policy is an **attention** mechanism over - previous user input and system actions. 
- **Attention is turned on by default**; in order to turn it off, - configure the following parameters: - - - ``attn_before_rnn`` if ``true`` the algorithm will use - attention mechanism over previous user input, default ``true``; - - ``attn_after_rnn`` if ``true`` the algorithm will use - attention mechanism over previous system actions and will be - able to copy previously executed action together with LSTM's - hidden state from its history, default ``true``; - - ``sparse_attention`` if ``true`` ``sparsemax`` will be used - instead of ``softmax`` for attention probabilities, default - ``false``; - - ``attn_shift_range`` the range of allowed location-based - attention shifts for system memory (``attn_after_rnn``), see - ``_ for details; + epoch. - .. note:: - - Attention requires larger values of ``epochs`` and takes longer - to train. But it can learn more complicated and nonlinear behaviour. + The main feature of this policy is **transformer**. The algorithm also has hyper-parameters to control: - neural network's architecture: - - ``hidden_layers_sizes_a`` sets a list of hidden layers - sizes before embedding layer for user inputs, the number - of hidden layers is equal to the length of the list; - ``hidden_layers_sizes_b`` sets a list of hidden layers sizes before embedding layer for system actions, the number of hidden layers is equal to the length of the list; diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 4bdcbb9c6384..4c32158260ee 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -355,7 +355,7 @@ def _featurize_labels( y = np.array(labels) # if it is MaxHistoryFeaturizer, squeeze out time axis - if y.shape[1] == 1 and isinstance(self, MaxHistoryTrackerFeaturizer): + if y.ndim == 3 and isinstance(self, MaxHistoryTrackerFeaturizer): y = y[:, 0, :] return y diff --git a/rasa/core/policies/tf_utils.py b/rasa/core/policies/tf_utils.py deleted file mode 100644 index 2cfddda81bdd..000000000000 --- a/rasa/core/policies/tf_utils.py +++ /dev/null @@ -1,957 +0,0 @@ -from collections import namedtuple -import tensorflow as tf - -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 - - -class TimedNTM(object): - """Timed Neural Turing Machine - - Inspired by paper: - https://arxiv.org/pdf/1410.5401.pdf - Implementation inspired by: - https://github.com/carpedm20/NTM-tensorflow/blob/master/ntm_cell.py - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__(self, attn_shift_range, sparse_attention, name): - """Construct the `TimedNTM`. - - Args: - attn_shift_range: Python int. - A time range within which to attend to the memory by location - sparse_attention: Python bool. - If `True` use sparsemax instead of softmax for probs - name: Name to use when creating ops. 
- """ - - # interpolation gate - self.name = "timed_ntm_" + name - - self._inter_gate = tf.layers.Dense( - units=1, activation=tf.sigmoid, name=self.name + "/inter_gate" - ) - # if use sparsemax instead of softmax for probs - self._sparse_attention = sparse_attention - - if sparse_attention: - # sparsemax doesn't support inf - self._inf = float(5000) - else: - self._inf = float("inf") - - # shift weighting if range is provided - if attn_shift_range: - self._shift_weight = tf.layers.Dense( - units=2 * attn_shift_range + 1, - activation=tf.nn.softmax, - name=self.name + "/shift_weight", - ) - else: - self._shift_weight = None - - # sharpening parameter - self._gamma_sharp = tf.layers.Dense( - units=1, - activation=lambda a: tf.nn.softplus(a) + 1, - bias_initializer=tf.constant_initializer(1), - name=self.name + "/gamma_sharp", - ) - - def __call__(self, attn_inputs, scores, scores_state, mask): - # apply exponential moving average with interpolation gate weight - # to scores from previous time which are equal to probs at this point - # different from original NTM where it is applied after softmax - i_g = self._inter_gate(attn_inputs) - - # scores limited by time - scores = tf.concat( - [i_g * scores[:, :-1] + (1 - i_g) * scores_state, scores[:, -1:]], 1 - ) - next_scores_state = scores - - if mask is not None: - # apply mask to scores - if self._shift_weight is not None: - # rearrange scores to make them continuous for convolution - scores = tf.map_fn( - self._rearrange_fn, [scores, mask], dtype=scores.dtype - ) - else: - scores = tf.where(mask > 0, scores, -self._inf * tf.ones_like(scores)) - - # create probabilities for attention - if self._sparse_attention: - probs = tf.contrib.sparsemax.sparsemax(scores) - else: - probs = tf.nn.softmax(scores) - - if self._shift_weight is not None: - s_w = self._shift_weight(attn_inputs) - - # we want to go back in time during convolution - conv_probs = tf.reverse(probs, axis=[1]) - - # preare probs for tf.nn.depthwise_conv2d - # [in_width, in_channels=batch] - conv_probs = tf.transpose(conv_probs, [1, 0]) - # [batch=1, in_height=1, in_width=time+1, in_channels=batch] - conv_probs = conv_probs[tf.newaxis, tf.newaxis, :, :] - - # [filter_height=1, filter_width=2*attn_shift_range+1, - # in_channels=batch, channel_multiplier=1] - conv_s_w = tf.transpose(s_w, [1, 0]) - conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] - - # perform 1d convolution - # [batch=1, out_height=1, out_width=time+1, out_channels=batch] - conv_probs = tf.nn.depthwise_conv2d_native( - conv_probs, conv_s_w, [1, 1, 1, 1], "SAME" - ) - conv_probs = conv_probs[0, 0, :, :] - conv_probs = tf.transpose(conv_probs, [1, 0]) - - probs = tf.reverse(conv_probs, axis=[1]) - - if mask is not None: - # arrange probs back to their original time order - probs = tf.map_fn( - self._arrange_back_fn, [probs, mask], dtype=probs.dtype - ) - - # sharpening - g_sh = self._gamma_sharp(attn_inputs) - - powed_probs = tf.pow(probs, g_sh) - probs = powed_probs / (tf.reduce_sum(powed_probs, 1, keepdims=True) + 1e-32) - - return probs, next_scores_state - - def _rearrange_fn(self, list_tensor_1d_mask_1d): - """Rearranges tensor_1d to put all the values - where mask_1d=1 to the right and - where mask_1d=0 to the left and sets them to -infinity""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) - partitioned_tensor[0] = -self._inf * tf.ones_like(partitioned_tensor[0]) - - return tf.concat(partitioned_tensor, 0) - - @staticmethod - def 
_arrange_back_fn(list_tensor_1d_mask_1d): - """Arranges back tensor_1d to restore original order - modified by `_rearrange_fn` according to mask_1d: - - number of 0s in mask_1d values on the left are set to - their corresponding places where mask_1d=0, - - number of 1s in mask_1d values on the right are set to - their corresponding places where mask_1d=1""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - mask_indices = tf.dynamic_partition( - tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 - ) - - mask_sum = tf.reduce_sum(mask_1d, axis=0) - partitioned_tensor = [ - tf.zeros_like(tensor_1d[:-mask_sum]), - tensor_1d[-mask_sum:], - ] - - return tf.dynamic_stitch(mask_indices, partitioned_tensor) - - -def _compute_time_attention( - attention_mechanism, - attn_inputs, - attention_state, - # time is added to calculate time attention - time, - timed_ntm, - time_mask, - ignore_mask, - attention_layer, -): - """Computes the attention and alignments limited by time - for a given attention_mechanism. - - Modified helper method from tensorflow.""" - - scores, _ = attention_mechanism(attn_inputs, state=attention_state) - - # take only scores from current and past times - timed_scores = scores[:, : time + 1] - timed_scores_state = attention_state[:, :time] - - # get mask for past times - timed_time_mask = time_mask[:, :time] - if ignore_mask is not None: - timed_time_mask *= 1 - ignore_mask[:, :time] - - # set mask for current time to 1 - timed_time_mask = tf.concat([timed_time_mask, tf.ones_like(time_mask[:, :1])], 1) - - # pass these scores to NTM - probs, next_scores_state = timed_ntm( - attn_inputs, timed_scores, timed_scores_state, timed_time_mask - ) - - # concatenate probs with zeros to get new alignments - zeros = tf.zeros_like(scores) - # remove current time from attention - alignments = tf.concat([probs[:, :-1], zeros[:, time:]], 1) - - # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] - expanded_alignments = tf.expand_dims(alignments, 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. - # alignments shape is - # [batch_size, 1, memory_time] - # attention_mechanism.values shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - context = tf.matmul(expanded_alignments, attention_mechanism.values) - context = tf.squeeze(context, [1]) - - if attention_layer is not None: - attention = attention_layer(tf.concat([attn_inputs, context], 1)) - else: - attention = context - - # return current time to attention - alignments = tf.concat([probs, zeros[:, time + 1 :]], 1) - next_attention_state = tf.concat([next_scores_state, zeros[:, time + 1 :]], 1) - return attention, alignments, next_attention_state - - -# noinspection PyProtectedMember -class TimeAttentionWrapperState( - namedtuple( - "TimeAttentionWrapperState", - tf.contrib.seq2seq.AttentionWrapperState._fields - + ("all_time_masks", "all_cell_states"), - ) -): # added - """Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters - - Additional fields: - - `all_time_masks`: A mask applied to a memory - that filters certain time steps - - `all_cell_states`: All states of the wrapped `RNNCell` - at all the previous time steps. 
- """ - - def clone(self, **kwargs): - """Copied from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters""" - - def with_same_shape(old, new): - """Check and set new tensor's shape.""" - if isinstance(old, tf.Tensor) and isinstance(new, tf.Tensor): - return tf.contrib.framework.with_same_shape(old, new) - return new - - return tf.contrib.framework.nest.map_structure( - with_same_shape, - self, - super(TimeAttentionWrapperState, self)._replace(**kwargs), - ) - - -class TimeAttentionWrapper(tf.contrib.seq2seq.AttentionWrapper): - """Custom AttentionWrapper that takes into account time - when calculating attention. - Attention is calculated before calling rnn cell. - - Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapper. - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__( - self, - cell, - attention_mechanism, - sequence_len, - attn_shift_range=0, - sparse_attention=False, - attention_layer_size=None, - alignment_history=False, - rnn_and_attn_inputs_fn=None, - ignore_mask=None, - cell_input_fn=None, - index_of_attn_to_copy=None, - likelihood_fn=None, - tensor_not_to_copy=None, - output_attention=False, - initial_cell_state=None, - name=None, - attention_layer=None, - ): - """Construct the `TimeAttentionWrapper`. - See the super class for the original arguments description. - - Additional args: - sequence_len: Python integer. - Maximum length of the sequence, used to create - appropriate TensorArray for all cell states - in TimeAttentionWrapperState - attn_shift_range: Python integer (`0` by default). - A time range within which to attend to the memory - by location in Neural Turing Machine. - sparse_attention: Python bool. - A flag to use sparsemax (if `True`) instead of - softmax (if `False`, default) for probabilities - inputs_and_attn_inputs_fn: (optional) A `callable`. - A function that creates inputs and attention inputs tensors. - ignore_mask: (optional) Boolean Tensor. - Determines which time steps to ignore in attention - index_of_attn_to_copy: (optional) Python integer. - An index of attention mechanism that picks - which part of attention tensor to use for copying to output, - the default is `None`, which turns off copying mechanism. - Copy inspired by: https://arxiv.org/pdf/1603.06393.pdf - likelihood_fn: (optional) A `callable`. - A method to perform likelihood calculation to - filter time step in copy mechanism. - Returns a tuple of binary likelihood and likelihood - tensor_not_to_copy: (optional) A Tensor. - A tensor, which shouldn't be copied from previous time steps - - Modified args: - output_attention: Python bool. If `True`, the output at each - time step is the concatenated cell outputs, - attention values and additional values described in - `additional_output_size()`, used in copy mechanism. 
- """ - super(TimeAttentionWrapper, self).__init__( - cell, - attention_mechanism, - attention_layer_size, - alignment_history, - cell_input_fn, - output_attention, - initial_cell_state, - name, - attention_layer, - ) - self._sequence_len = sequence_len - - if not isinstance(attn_shift_range, list): - # attn_shift_range might not be a list - attn_shift_range = [attn_shift_range] - self._timed_ntms = [TimedNTM(attn_shift_range[0], sparse_attention, name="0")] - if self._is_multi: - # if there are several attention mechanisms, - # create additional TimedNTMs for them - if len(attn_shift_range) == 1: - # original attn_shift_range might not be a list - attn_shift_range *= len(attention_mechanism) - elif len(attn_shift_range) != len(attention_mechanism): - raise ValueError( - "If provided, `attn_shift_range` must contain exactly one " - "integer per attention_mechanism, saw: {} vs {}" - "".format(len(attn_shift_range), len(attention_mechanism)) - ) - for i in range(1, len(attention_mechanism)): - self._timed_ntms.append( - TimedNTM(attn_shift_range[i], sparse_attention, name=str(i)) - ) - - if rnn_and_attn_inputs_fn is None: - rnn_and_attn_inputs_fn = self._default_rnn_and_attn_inputs_fn - else: - if not callable(rnn_and_attn_inputs_fn): - raise TypeError( - "`rnn_and_attn_inputs_fn` must be callable, saw type: {}" - "".format(type(rnn_and_attn_inputs_fn).__name__) - ) - self._rnn_and_attn_inputs_fn = rnn_and_attn_inputs_fn - - if not isinstance(ignore_mask, list): - self._ignore_mask = [tf.cast(ignore_mask, tf.int32)] - else: - self._ignore_mask = [tf.cast(i_m, tf.int32) for i_m in ignore_mask] - - self._index_of_attn_to_copy = index_of_attn_to_copy - - self._likelihood_fn = likelihood_fn - self._tensor_not_to_copy = tensor_not_to_copy - - @staticmethod - def _default_rnn_and_attn_inputs_fn(inputs, cell_state): - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - return inputs, tf.concat([inputs, cell_state.h], -1) - else: - return inputs, tf.concat([inputs, cell_state], -1) - - @staticmethod - def additional_output_size(): - """Number of additional outputs: - - likelihoods: - attn_likelihood, state_likelihood - debugging info: - current_time_prob, - bin_likelihood_not_to_copy, bin_likelihood_to_copy - - **Method should be static** - """ - return 2 + 3 - - @property - def output_size(self): - if self._output_attention: - if self._index_of_attn_to_copy is not None: - # output both raw rnn cell_output and - # cell_output with copied attention - # together with attention vector itself - # and additional output - return ( - 2 * self._cell.output_size - + self._attention_layer_size - + self.additional_output_size() - ) - else: - return self._cell.output_size + self._attention_layer_size - else: - return self._cell.output_size - - @property - def state_size(self): - """The `state_size` property of `TimeAttentionWrapper`. - Returns: - A `TimeAttentionWrapperState` tuple containing shapes - used by this object. 
- """ - - # use AttentionWrapperState from superclass - state_size = super(TimeAttentionWrapper, self).state_size - - all_cell_states = self._cell.state_size - - return TimeAttentionWrapperState( - cell_state=state_size.cell_state, - time=state_size.time, - attention=state_size.attention, - alignments=state_size.alignments, - attention_state=state_size.attention_state, - alignment_history=state_size.alignment_history, - all_time_masks=self._sequence_len, - all_cell_states=all_cell_states, - ) - - def zero_state(self, batch_size, dtype): - """Modified from tensorflow's zero_state - see there for description of the parameters""" - - # use AttentionWrapperState from superclass - zero_state = super(TimeAttentionWrapper, self).zero_state(batch_size, dtype) - - with tf.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): - # store time masks - all_time_masks = tf.TensorArray( - tf.int32, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, tf.zeros([batch_size, self.state_size.all_time_masks], tf.int32)) - - # store all cell states into a tensor array to allow - # copy mechanism to go back in time - if isinstance(self._cell.state_size, tf.contrib.rnn.LSTMStateTuple): - all_cell_states = tf.contrib.rnn.LSTMStateTuple( - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.c), - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.h), - ) - else: - all_cell_states = tf.TensorArray( - dtype, size=0, dynamic_size=False, clear_after_read=False - ).write(0, zero_state.cell_state) - - return TimeAttentionWrapperState( - cell_state=zero_state.cell_state, - time=zero_state.time, - attention=zero_state.attention, - alignments=zero_state.alignments, - attention_state=zero_state.attention_state, - alignment_history=zero_state.alignment_history, - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - - def call(self, inputs, state): - """Perform a step of attention-wrapped RNN. - - The order has changed: - - Step 1: Calculate attention inputs based on the previous cell state - and current inputs - - Step 2: Score the output with `attention_mechanism`. - - Step 3: Calculate the alignments by passing the score through the - `normalizer` and limit them by time. - - Step 4: Calculate the context vector as the inner product between the - alignments and the attention_mechanism's values (memory). - - Step 5: Calculate the attention output by concatenating - the cell output and context through the attention layer - (a linear layer with `attention_layer_size` outputs). - - Step 6: Mix the `inputs` and `attention` output via - `cell_input_fn` to get cell inputs. - - Step 7: Call the wrapped `cell` with these cell inputs and - its previous state. - - Step 8: (optional) Maybe copy output and cell state from history - - Args: - inputs: (Possibly nested tuple of) Tensor, - the input at this time step. - state: An instance of `TimeAttentionWrapperState` - containing tensors from the previous time step. - - Returns: - A tuple `(attention_or_cell_output, next_state)`, where: - - - `attention_or_cell_output` depending on `output_attention`. - - `next_state` is an instance of `TimeAttentionWrapperState` - containing the state calculated at this time step. - - Raises: - TypeError: If `state` is not an instance of - `TimeAttentionWrapperState`. 
- """ - if not isinstance(state, TimeAttentionWrapperState): - raise TypeError( - "Expected state to be instance of " - "TimeAttentionWrapperState. " - "Received type {} instead.".format(type(state)) - ) - - # Step 1: Calculate attention based on - # the previous output and current input - cell_state = state.cell_state - - rnn_inputs, attn_inputs = self._rnn_and_attn_inputs_fn(inputs, cell_state) - - cell_batch_size = attn_inputs.shape[0].value or tf.shape(attn_inputs)[0] - error_message = ( - "When applying AttentionWrapper %s: " % self.name - + "Non-matching batch sizes between the memory " - "(encoder output) and the query (decoder output). " - "Are you using " - "the BeamSearchDecoder? " - "You may need to tile your memory input via " - "the tf.contrib.seq2seq.tile_batch function with argument " - "multiple=beam_width." - ) - with tf.control_dependencies( - self._batch_size_checks(cell_batch_size, error_message) - ): - attn_inputs = tf.identity(attn_inputs, name="checked_attn_inputs") - - if self._is_multi: - previous_attention_state = state.attention_state - previous_alignment_history = state.alignment_history - else: - previous_attention_state = [state.attention_state] - previous_alignment_history = [state.alignment_history] - - all_alignments = [] - all_attentions = [] - all_attention_states = [] - maybe_all_histories = [] - - prev_time_masks = self._read_from_tensor_array(state.all_time_masks, state.time) - prev_time_mask = prev_time_masks[:, -1, :] - - for i, attention_mechanism in enumerate(self._attention_mechanisms): - # Steps 2 - 5 are performed inside `_compute_time_attention` - (attention, alignments, next_attention_state) = _compute_time_attention( - attention_mechanism, - attn_inputs, - previous_attention_state[i], - # time is added to calculate time attention - state.time, - self._timed_ntms[i], - # provide boolean masks, to ignore some time steps - prev_time_mask, - self._ignore_mask[i], - self._attention_layers[i] if self._attention_layers else None, - ) - - alignment_history = ( - previous_alignment_history[i].write(state.time, alignments) - if self._alignment_history - else () - ) - - all_attention_states.append(next_attention_state) - all_alignments.append(alignments) - all_attentions.append(attention) - maybe_all_histories.append(alignment_history) - - attention = tf.concat(all_attentions, 1) - - # Step 6: Mix the `inputs` and `attention` output via - # `cell_input_fn` to get cell inputs. - cell_inputs = self._cell_input_fn(rnn_inputs, attention) - - # Step 7: Call the wrapped `cell` with these cell inputs and - # its previous state. 
- cell_output, next_cell_state = self._cell(cell_inputs, cell_state) - - prev_all_cell_states = state.all_cell_states - - time_mask = tf.concat( - [ - prev_time_mask[:, : state.time], - tf.ones_like(prev_time_mask[:, :1]), - prev_time_mask[:, state.time + 1 :], - ], - 1, - ) - - if self._index_of_attn_to_copy is not None: - # Step 8: Maybe copy output and cell state from history - - # get relevant previous outputs from history - attn_to_copy = all_attentions[self._index_of_attn_to_copy] - # copy them to current output - cell_output_with_attn = cell_output + attn_to_copy - - memory_probs = self._get_memory_probs(all_alignments, state.time) - - # check that we do not pay attention to `tensor_not_to_copy` - bin_likelihood_not_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, self._tensor_not_to_copy - ) - # recalculate probs - memory_probs *= 1 - bin_likelihood_not_to_copy - - history_alignments = self._history_alignments(memory_probs) - - # get previous output from the history - prev_output = self._prev_output( - cell_output_with_attn, history_alignments, state.time - ) - - # check that current output is close to - # the one in the history to which we pay attention to - bin_likelihood_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, prev_output - ) - # recalculate probs - memory_probs *= bin_likelihood_to_copy - - history_alignments = self._history_alignments(memory_probs) - current_time_prob = history_alignments[:, -1:] - - # create additional likelihoods to maximize - attn_likelihood = self._additional_likelihood( - attn_to_copy, prev_output, current_time_prob - ) - state_likelihood = self._additional_likelihood( - cell_output + tf.stop_gradient(attn_to_copy), - prev_output, - current_time_prob, - ) - - # recalculate time_mask - time_mask = self._apply_alignments_to_history( - tf.cast(history_alignments, time_mask.dtype), - prev_time_masks[:, :-1, :], - time_mask, - ) - - # recalculate new next_cell_state based on history_alignments - next_cell_state = self._new_next_cell_state( - prev_all_cell_states, - next_cell_state, - cell_output_with_attn, - history_alignments, - state.time, - ) - - all_cell_states = self._all_cell_states( - prev_all_cell_states, next_cell_state, state.time - ) - - if self._output_attention: - # concatenate cell outputs, attention, additional likelihoods - # and copy_attn_debug - output = tf.concat( - [ - cell_output_with_attn, - cell_output, - attention, - # additional likelihoods - attn_likelihood, - state_likelihood, - # copy_attn_debug - bin_likelihood_not_to_copy, - bin_likelihood_to_copy, - current_time_prob, - ], - 1, - ) - else: - output = cell_output_with_attn - - else: - # do not waste resources on storing history - all_cell_states = prev_all_cell_states - - if self._output_attention: - output = tf.concat([cell_output, attention], 1) - else: - output = cell_output - - all_time_masks = state.all_time_masks.write(state.time + 1, time_mask) - - next_state = TimeAttentionWrapperState( - time=state.time + 1, - cell_state=next_cell_state, - attention=attention, - attention_state=self._item_or_tuple(all_attention_states), - alignments=self._item_or_tuple(all_alignments), - alignment_history=self._item_or_tuple(maybe_all_histories), - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - return output, next_state - - # helper for TensorArray - @staticmethod - def _read_from_tensor_array(tensor_array, time): - """TensorArray time reader""" - return tf.transpose(tensor_array.gather(tf.range(0, time + 1)), [1, 0, 2]) - - # helper 
methods for copy mechanism - def _get_memory_probs(self, all_alignments, time): - """Helper method to get memory_probs from all_alignments""" - - memory_probs = tf.stop_gradient( - all_alignments[self._index_of_attn_to_copy][:, :time] - ) - - # binarize memory_probs only if max value is larger than margin=0.1 - memory_probs_max = tf.reduce_max(memory_probs, axis=1, keepdims=True) - memory_probs_max = tf.where( - memory_probs_max > 0.1, memory_probs_max, -memory_probs_max - ) - - return tf.where( - tf.equal(memory_probs, memory_probs_max), - tf.ones_like(memory_probs), - tf.zeros_like(memory_probs), - ) - - @staticmethod - def _history_alignments(memory_probs): - """Helper method to apply binary mask to memory_probs""" - - current_time_prob = 1 - tf.reduce_sum(memory_probs, 1, keepdims=True) - return tf.concat([memory_probs, current_time_prob], 1) - - @staticmethod - def _apply_alignments_to_history(alignments, history_states, state): - """Helper method to apply attention probabilities to rnn history - - copied from tf's `_compute_attention(...)`""" - - expanded_alignments = tf.stop_gradient(tf.expand_dims(alignments, 1)) - - history_states = tf.concat([history_states, tf.expand_dims(state, 1)], 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. - # expanded_alignments shape is - # [batch_size, 1, memory_time] - # history_states shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - - return tf.squeeze(tf.matmul(expanded_alignments, history_states), [1]) - - def _prev_output(self, state, alignments, time): - """Helper method to get previous output from memory""" - - # get all previous outputs from appropriate - # attention mechanism's memory limited by current time - prev_outputs = tf.stop_gradient( - self._attention_mechanisms[self._index_of_attn_to_copy].values[:, :time, :] - ) - - # multiply by alignments to get one vector from one time step - return self._apply_alignments_to_history(alignments, prev_outputs, state) - - def _additional_likelihood(self, output, prev_output, current_time_prob): - """Helper method to create additional likelihood to maximize""" - - _, likelihood = self._likelihood_fn(output, tf.stop_gradient(prev_output)) - return tf.where(current_time_prob < 0.5, likelihood, tf.ones_like(likelihood)) - - def _new_hidden_state(self, prev_all_cell_states, new_state, alignments, time): - """Helper method to look into rnn history""" - - # reshape to (batch, time, memory_time) and - # do not include current time because - # we do not want to pay attention to it, - # but we need to read it instead of - # adding conditional flow if time == 0 - prev_cell_states = self._read_from_tensor_array(prev_all_cell_states, time)[ - :, :-1, : - ] - - return self._apply_alignments_to_history( - alignments, prev_cell_states, new_state - ) - - def _new_next_cell_state( - self, prev_all_cell_states, next_cell_state, new_cell_output, alignments, time - ): - """Helper method to recalculate new next_cell_state""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - next_cell_state_c = self._new_hidden_state( - prev_all_cell_states.c, next_cell_state.c, alignments, time - ) - next_cell_state_h = self._new_hidden_state( - prev_all_cell_states.h, new_cell_output, alignments, time - ) - return tf.contrib.rnn.LSTMStateTuple(next_cell_state_c, next_cell_state_h) - else: - return 
self._new_hidden_state( - prev_all_cell_states, alignments, new_cell_output, time - ) - - @staticmethod - def _all_cell_states(prev_all_cell_states, next_cell_state, time): - """Helper method to recalculate all_cell_states tensor array""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - return tf.contrib.rnn.LSTMStateTuple( - prev_all_cell_states.c.write(time + 1, next_cell_state.c), - prev_all_cell_states.h.write(time + 1, next_cell_state.h), - ) - else: - return prev_all_cell_states.write(time + 1, next_cell_state) - - -class ChronoBiasLayerNormBasicLSTMCell(tf.contrib.rnn.LayerNormBasicLSTMCell): - """Custom LayerNormBasicLSTMCell that allows chrono initialization - of gate biases. - - See super class for description. - - See https://arxiv.org/abs/1804.11188 - for details about chrono initialization - """ - - def __init__( - self, - num_units, - forget_bias=1.0, - input_bias=0.0, - activation=tf.tanh, - layer_norm=True, - norm_gain=1.0, - norm_shift=0.0, - dropout_keep_prob=1.0, - dropout_prob_seed=None, - out_layer_size=None, - reuse=None, - ): - """Initializes the basic LSTM cell - - Additional args: - input_bias: float, The bias added to input gates. - out_layer_size: (optional) integer, The number of units in - the optional additional output layer. - """ - super(ChronoBiasLayerNormBasicLSTMCell, self).__init__( - num_units, - forget_bias=forget_bias, - activation=activation, - layer_norm=layer_norm, - norm_gain=norm_gain, - norm_shift=norm_shift, - dropout_keep_prob=dropout_keep_prob, - dropout_prob_seed=dropout_prob_seed, - reuse=reuse, - ) - self._input_bias = input_bias - self._out_layer_size = out_layer_size - - @property - def output_size(self): - return self._out_layer_size or self._num_units - - @property - def state_size(self): - return tf.contrib.rnn.LSTMStateTuple(self._num_units, self.output_size) - - @staticmethod - def _dense_layer(args, layer_size): - """Optional out projection layer""" - proj_size = args.get_shape()[-1] - dtype = args.dtype - weights = tf.get_variable("kernel", [proj_size, layer_size], dtype=dtype) - bias = tf.get_variable("bias", [layer_size], dtype=dtype) - out = tf.nn.bias_add(tf.matmul(args, weights), bias) - return out - - def call(self, inputs, state): - """LSTM cell with layer normalization and recurrent dropout.""" - c, h = state - args = tf.concat([inputs, h], 1) - concat = self._linear(args) - dtype = args.dtype - - i, j, f, o = tf.split(value=concat, num_or_size_splits=4, axis=1) - if self._layer_norm: - i = self._norm(i, "input", dtype=dtype) - j = self._norm(j, "transform", dtype=dtype) - f = self._norm(f, "forget", dtype=dtype) - o = self._norm(o, "output", dtype=dtype) - - g = self._activation(j) - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - g = tf.nn.dropout(g, self._keep_prob, seed=self._seed) - - new_c = c * tf.sigmoid(f + self._forget_bias) + g * tf.sigmoid( - i + self._input_bias - ) # added input_bias - - # do not do layer normalization on the new c, - # because there are no trainable weights - # if self._layer_norm: - # new_c = self._norm(new_c, "state", dtype=dtype) - - new_h = self._activation(new_c) * tf.sigmoid(o) - - # added dropout to the hidden state h - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - new_h = tf.nn.dropout(new_h, self._keep_prob, seed=self._seed) - - # add postprocessing of the output - if self._out_layer_size is not None: - with tf.variable_scope("out_layer"): - new_h = self._dense_layer(new_h, self._out_layer_size) - - new_state = 
tf.contrib.rnn.LSTMStateTuple(new_c, new_h) - return new_h, new_state diff --git a/requirements.txt b/requirements.txt index cff22f35bc60..04edbadfa16c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,8 @@ fakeredis==1.0.3 pymongo==3.8.0 numpy==1.16.3 scipy==1.2.1 -tensorflow==1.13.1 +tensorflow==1.14.0 +tensor2tensor=1.13.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.3 From bcd3bffd6b634c9e88eb908b47e2015fdbec55ee Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:38:39 +0200 Subject: [PATCH 23/50] fix featurizer, add t2t requirements --- rasa/core/policies/embedding_policy.py | 27 ++++++-------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 5145039c7a02..8c033315aa1f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -25,20 +25,12 @@ from sklearn.model_selection import train_test_split import tensorflow as tf - -try: - from tensor2tensor.models.transformer import ( - transformer_base, - transformer_prepare_encoder, - transformer_encoder, - ) - from tensor2tensor.layers.common_attention import large_compatible_negative - -except ImportError: - transformer_base = None - transformer_prepare_encoder = None - transformer_encoder = None - large_compatible_negative = None +from tensor2tensor.models.transformer import ( + transformer_base, + transformer_prepare_encoder, + transformer_encoder, +) +from tensor2tensor.layers.common_attention import large_compatible_negative try: import cPickle as pickle @@ -135,11 +127,6 @@ def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturize LabelTokenizerSingleStateFeaturizer(), max_history=max_history ) - @staticmethod - def _check_t2t() -> None: - if transformer_base is None: - raise ImportError("Please install tensor2tensor") - def __init__( self, featurizer: Optional["TrackerFeaturizer"] = None, @@ -161,8 +148,6 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: - # check if t2t is installed - self._check_t2t() if not featurizer: featurizer = self._standard_featurizer(max_history) From 9342569cab780f4e1fd646f16c8fbc83f2c0b7b7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:41:26 +0200 Subject: [PATCH 24/50] add tfp for t2t to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 04edbadfa16c..51d6de84a827 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ pymongo==3.8.0 numpy==1.16.3 scipy==1.2.1 tensorflow==1.14.0 +tensorflow-probability==0.7.0 tensor2tensor=1.13.4 apscheduler==3.6.0 tqdm==4.31.0 From 3c204b98849598f3815c4720c4e4613f9fec265a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:44:46 +0200 Subject: [PATCH 25/50] fix requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 51d6de84a827..bc58d8d43640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ numpy==1.16.3 scipy==1.2.1 tensorflow==1.14.0 tensorflow-probability==0.7.0 -tensor2tensor=1.13.4 +tensor2tensor==1.13.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.3 From 2cd22a865b6daa215193c0803a29c91fae8643a4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 22:14:25 +0200 Subject: [PATCH 26/50] remove check tf, update setup.py --- alt_requirements/requirements_full.txt | 2 +- rasa/core/policies/embedding_policy.py | 
7 ++---- .../embedding_intent_classifier.py | 23 ++++--------------- setup.py | 3 ++- 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt index e9114035ccea..a700141c6b13 100644 --- a/alt_requirements/requirements_full.txt +++ b/alt_requirements/requirements_full.txt @@ -1,4 +1,4 @@ -# Minimum Instal Requirements +# Minimum Install Requirements -r ../requirements.txt # Spacy Requirements diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 8c033315aa1f..c4f63d295230 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -3,6 +3,7 @@ import json import logging import os +import pickle import warnings import numpy as np @@ -32,11 +33,6 @@ ) from tensor2tensor.layers.common_attention import large_compatible_negative -try: - import cPickle as pickle -except ImportError: - import pickle - if typing.TYPE_CHECKING: from tensor2tensor.utils.hparam import HParams @@ -148,6 +144,7 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: + """Declare instant variables with default values""" if not featurizer: featurizer = self._standard_featurizer(max_history) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index a613321cb6ba..3fa504c3c79d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -11,6 +11,11 @@ from rasa.nlu.components import Component from rasa.utils.common import is_logging_disabled +import tensorflow as tf + +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None + logger = logging.getLogger(__name__) if typing.TYPE_CHECKING: @@ -20,14 +25,6 @@ from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message -try: - import tensorflow as tf - - # avoid warning println on contrib import - remove for tf 2 - tf.contrib._warning = None -except ImportError: - tf = None - class EmbeddingIntentClassifier(Component): """Intent classifier using supervised embeddings. @@ -120,7 +117,6 @@ def __init__( ) -> None: """Declare instant variables with default values""" - self._check_tensorflow() super(EmbeddingIntentClassifier, self).__init__(component_config) self._load_params() @@ -195,15 +191,6 @@ def _load_params(self) -> None: def required_packages(cls) -> List[Text]: return ["tensorflow"] - @staticmethod - def _check_tensorflow(): - if tf is None: - raise ImportError( - "Failed to import `tensorflow`. " - "Please install `tensorflow`. " - "For example with `pip install tensorflow`." 
- ) - # training data helpers: @staticmethod def _create_intent_dict(training_data: "TrainingData") -> Dict[Text, int]: diff --git a/setup.py b/setup.py index 7efa8ec2e8cf..310bf0214908 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,8 @@ "pymongo~=3.8", "numpy~=1.16", "scipy~=1.2", - "tensorflow~=1.13.0", + "tensorflow~=1.14.0", + "tensor2tensor~=1.13.4", "apscheduler~=3.0", "tqdm~=4.0", "networkx~=2.3", From 0c089678e1e8924d00b5ee8301e19775b6ccb37d Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 22:37:08 +0200 Subject: [PATCH 27/50] remove unused variables --- rasa/core/policies/embedding_policy.py | 30 +++++++++----------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index c4f63d295230..200118f63466 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -130,10 +130,8 @@ def __init__( encoded_all_actions: Optional["np.ndarray"] = None, graph: Optional["tf.Graph"] = None, session: Optional["tf.Session"] = None, - intent_placeholder: Optional["tf.Tensor"] = None, - action_placeholder: Optional["tf.Tensor"] = None, - slots_placeholder: Optional["tf.Tensor"] = None, - prev_act_placeholder: Optional["tf.Tensor"] = None, + user_placeholder: Optional["tf.Tensor"] = None, + bot_placeholder: Optional["tf.Tensor"] = None, similarity_all: Optional["tf.Tensor"] = None, pred_confidence: Optional["tf.Tensor"] = None, similarity: Optional["tf.Tensor"] = None, @@ -162,10 +160,8 @@ def __init__( # tf related instances self.graph = graph self.session = session - self.a_in = intent_placeholder - self.b_in = action_placeholder - self.c_in = slots_placeholder - self.b_prev_in = prev_act_placeholder + self.a_in = user_placeholder + self.b_in = bot_placeholder self.sim_all = similarity_all self.pred_confidence = pred_confidence self.sim = similarity @@ -1288,10 +1284,8 @@ def persist(self, path: Text) -> None: rasa.utils.io.create_directory_for_file(checkpoint) with self.graph.as_default(): - self._persist_tensor("intent_placeholder", self.a_in) - self._persist_tensor("action_placeholder", self.b_in) - self._persist_tensor("slots_placeholder", self.c_in) - self._persist_tensor("prev_act_placeholder", self.b_prev_in) + self._persist_tensor("user_placeholder", self.a_in) + self._persist_tensor("bot_placeholder", self.b_in) self._persist_tensor("similarity_all", self.sim_all) self._persist_tensor("pred_confidence", self.pred_confidence) @@ -1359,10 +1353,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": saver.restore(session, checkpoint) - a_in = cls.load_tensor("intent_placeholder") - b_in = cls.load_tensor("action_placeholder") - c_in = cls.load_tensor("slots_placeholder") - b_prev_in = cls.load_tensor("prev_act_placeholder") + a_in = cls.load_tensor("user_placeholder") + b_in = cls.load_tensor("bot_placeholder") sim_all = cls.load_tensor("similarity_all") pred_confidence = cls.load_tensor("pred_confidence") @@ -1387,10 +1379,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": encoded_all_actions=encoded_all_actions, graph=graph, session=session, - intent_placeholder=a_in, - action_placeholder=b_in, - slots_placeholder=c_in, - prev_act_placeholder=b_prev_in, + user_placeholder=a_in, + bot_placeholder=b_in, similarity_all=sim_all, pred_confidence=pred_confidence, similarity=sim, From 8bfd238e34a3871587ca44637f759c1edca14d3e Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 00:18:26 +0200 Subject: [PATCH 28/50] update setuptools --- 
requirements.txt | 1 + setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index bc58d8d43640..56d5bfe5c6eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,3 +49,4 @@ SQLAlchemy~=1.3.3 kafka-python==1.4.6 sklearn-crfsuite==0.3.6 psycopg2-binary==2.8.2 +setuptools==41.0.1 diff --git a/setup.py b/setup.py index 310bf0214908..71853a681f3f 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ "numpy~=1.16", "scipy~=1.2", "tensorflow~=1.14.0", + "tensorflow-probability~=0.7.0", "tensor2tensor~=1.13.4", "apscheduler~=3.0", "tqdm~=4.0", @@ -75,6 +76,7 @@ "SQLAlchemy~=1.3.0", "kafka-python~=1.4", "sklearn-crfsuite~=0.3.6", + "setuptools~=41.0.1" ] extras_requires = { From 185d38fff2e2d192cf064ea9e5f76ffcb0598ea1 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 10:01:39 +0200 Subject: [PATCH 29/50] change default tf config test --- tests/core/test_policies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 95b22d96beb1..70440b2531a8 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -149,12 +149,13 @@ def test_persist_and_load_empty_policy(self, tmpdir): def test_tf_config(self, trained_policy, tmpdir): if hasattr(trained_policy, "session"): + import tensorflow as tf # noinspection PyProtectedMember - assert trained_policy.session._config is None + assert trained_policy.session._config == tf.Session()._config trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) # noinspection PyProtectedMember - assert loaded.session._config is None + assert loaded.session._config == tf.Session()._config class TestKerasPolicy(PolicyTestCollection): From 45b225b550c57f0cdb37d5a3e88563aae4cc6e02 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 11:32:27 +0200 Subject: [PATCH 30/50] black it --- setup.py | 2 +- tests/core/test_policies.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71853a681f3f..785138b45300 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ "SQLAlchemy~=1.3.0", "kafka-python~=1.4", "sklearn-crfsuite~=0.3.6", - "setuptools~=41.0.1" + "setuptools~=41.0.1", ] extras_requires = { diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 70440b2531a8..80f34fad89ff 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -150,6 +150,7 @@ def test_persist_and_load_empty_policy(self, tmpdir): def test_tf_config(self, trained_policy, tmpdir): if hasattr(trained_policy, "session"): import tensorflow as tf + # noinspection PyProtectedMember assert trained_policy.session._config == tf.Session()._config trained_policy.persist(tmpdir.strpath) From 7c3a1eaeafe5cf79dbf1878e5235b01512082f5a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 12:29:25 +0200 Subject: [PATCH 31/50] refactor pre transformer embedding --- docs/core/policies.rst | 7 ++-- rasa/core/policies/embedding_policy.py | 46 ++++++++++++-------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 97606621df81..8049e280362f 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -175,10 +175,9 @@ following steps: - apply dense layers to create embeddings for user intents, entities and system actions including previous actions and slots; - - use the embeddings of previous user inputs as a user memory - and embeddings of previous 
system actions as a system memory; - - concatenate user input, previous system action and slots - embeddings for current time into an input vector to rnn; + - concatenate user input (user intents and entities), + previous system action and slots + for current time into an input vector to pre-transformer embedding layer; - using user and previous system action embeddings from the input vector, calculate attention probabilities over the user and system memories (for system memory, this policy uses diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 200118f63466..0afc2146eecf 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -9,7 +9,7 @@ import numpy as np import typing from tqdm import tqdm -from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator, Callable import rasa.utils.io from rasa.core import utils @@ -447,6 +447,9 @@ def _create_tf_nn( layer_sizes: List[int], droprate: float, layer_name_suffix: Text, + activation: Optional[Callable] = tf.nn.relu, + use_bias: bool = True, + kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" @@ -456,7 +459,9 @@ def _create_tf_nn( x = tf.layers.dense( inputs=x, units=layer_size, - activation=tf.nn.relu, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, kernel_regularizer=reg, name="hidden_layer_{}_{}".format(layer_name_suffix, i), reuse=tf.AUTO_REUSE, @@ -542,30 +547,23 @@ def _create_t2t_transformer_encoder( for key, value in hparams.values().items(): if key.endswith("dropout") or key == "label_smoothing": setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) - reg = tf.contrib.layers.l2_regularizer(self.C2) - - x = tf.nn.relu(x_in) - x = tf.layers.dense( - inputs=x, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer( - 0.0, hparams.hidden_size ** -0.5 - ), - kernel_regularizer=reg, - name="transformer_embed_layer", - reuse=tf.AUTO_REUSE, - ) - x = tf.layers.dropout( - x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training - ) - - if hparams.multiply_embedding_mode == "sqrt_depth": - x *= hparams.hidden_size ** 0.5 - - x *= tf.expand_dims(mask, -1) with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + x = self._create_tf_nn( + x_in, + [hparams.hidden_size], + hparams.layer_prepostprocess_dropout, + layer_name_suffix="pre_embed", + activation=None, + use_bias=False, + kernel_initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size ** -0.5 + ), + ) + if hparams.multiply_embedding_mode == "sqrt_depth": + x *= hparams.hidden_size ** 0.5 + + x *= tf.expand_dims(mask, -1) ( x, self_attention_bias, From 5765fd0041358c31dfe7a88694af6d064660ba27 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 13:01:21 +0200 Subject: [PATCH 32/50] update docs --- docs/core/policies.rst | 84 +++++++++++++------------- rasa/core/policies/embedding_policy.py | 30 ++++----- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 8049e280362f..7cf7a4b8882d 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -173,31 +173,14 @@ used in our paper: ``_ This policy has a pre-defined architecture, which comprises the following steps: - - apply dense layers to create embeddings for 
user intents, - entities and system actions including previous actions and slots; - - concatenate user input (user intents and entities), - previous system action and slots - for current time into an input vector to pre-transformer embedding layer; - - using user and previous system action embeddings from the input - vector, calculate attention probabilities over the user and - system memories (for system memory, this policy uses - `NTM mechanism `_ with attention - by location); - - sum the user embedding and user attention vector and feed it - and the embeddings of the slots as an input to an LSTM cell; - - apply a dense layer to the output of the LSTM to get a raw - recurrent embedding of a dialogue; - - sum this raw recurrent embedding of a dialogue with system - attention vector to create dialogue level embedding, this step - allows the algorithm to repeat previous system action by copying - its embedding vector directly to the current time output; - - weight previous LSTM states with system attention probabilities - to get the previous action embedding, the policy is likely payed - attention to; - - if the similarity between this previous action embedding and - current time dialogue embedding is high, overwrite current LSTM - state with the one from the time when this action happened; - - for each LSTM time step, calculate the similarity between the + - concatenate user input (user intent and entities), + previous system action and slots for current time into an input vector + to pre-transformer embedding layer; + - feed it to tranformer; + - apply a dense layer to the output of the transformer + to get embeddings of a dialogue for each time step; + - apply a dense layer to create embeddings for system actions for each time step; + - calculate the similarity between the dialogue embedding and embedded system actions. This step is based on the `StarSpace `_ idea. 
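A minimal NumPy sketch of this final, StarSpace-style step (illustrative only: the array names and sizes below are invented, and this is not the policy's actual TensorFlow graph)::

    import numpy as np

    rng = np.random.RandomState(0)

    # hypothetical sizes; only embed_dim mirrors the "embed_dim" default
    num_time_steps, dial_feat_dim = 5, 32    # stand-in for the transformer output
    num_actions, bot_feat_dim = 10, 24       # stand-in for the encoded system actions
    embed_dim = 20

    transformer_out = rng.randn(num_time_steps, dial_feat_dim)
    encoded_all_actions = rng.randn(num_actions, bot_feat_dim)

    # dense layers project dialogue states and system actions into one embedding space
    dial_embed = transformer_out @ rng.randn(dial_feat_dim, embed_dim)
    bot_embed = encoded_all_actions @ rng.randn(bot_feat_dim, embed_dim)

    # inner-product similarity between every time step and every action,
    # turned into confidences by a softmax over the actions
    sim = dial_embed @ bot_embed.T                      # shape (time, num_actions)
    confidence = np.exp(sim) / np.exp(sim).sum(axis=-1, keepdims=True)
    print(confidence[-1])                               # confidences for the last time step

During training the similarity to the correct action is increased while the similarities to ``num_neg`` sampled incorrect actions are decreased, which is the idea borrowed from StarSpace.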
@@ -226,15 +209,21 @@ It is recommended to use - ``hidden_layers_sizes_b`` sets a list of hidden layers sizes before embedding layer for system actions, the number of hidden layers is equal to the length of the list; - - ``rnn_size`` sets the number of units in the LSTM cell; + - ``transformer_size`` sets the number of units in the transfomer; + - ``num_transformer_layers`` sets the number of transformer layers; + - ``pos_encoding`` sets the type of positional encoding in transformer, + it should be either ``timing`` or ``emb``; + - ``max_seq_length`` sets maximum sequence length + if embedding positional encodings are used; + - ``num_heads`` sets the number of heads in multihead attention; - training: - - ``layer_norm`` if ``true`` layer normalization for lstm - cell is turned on, default ``true``; - ``batch_size`` sets the number of training examples in one forward/backward pass, the higher the batch size, the more memory space you'll need; + - ``batch_strategy`` sets the type of batching strategy, + it should be either ``sequence`` or ``balanced``; - ``epochs`` sets the number of times the algorithm will see training data, where one ``epoch`` equals one forward pass and one backward pass of all the training examples; @@ -244,38 +233,49 @@ It is recommended to use - embedding: - ``embed_dim`` sets the dimension of embedding space; - - ``mu_pos`` controls how similar the algorithm should try - to make embedding vectors for correct intent labels; - - ``mu_neg`` controls maximum negative similarity for - incorrect intents; - - ``similarity_type`` sets the type of the similarity, - it should be either ``cosine`` or ``inner``; - ``num_neg`` sets the number of incorrect intent labels, the algorithm will minimize their similarity to the user input during training; + - ``similarity_type`` sets the type of the similarity, + it should be either ``auto``, ``cosine`` or ``inner``, + if ``auto``, it will be set depending on ``loss_type``, + ``inner`` for ``softmax``, ``cosine`` for ``margin``; + - ``loss_type`` sets the type of the loss function, + it should be either ``softmax`` or ``margin``; + - ``mu_pos`` controls how similar the algorithm should try + to make embedding vectors for correct intent labels, + used only if ``loss_type`` is set to ``margin``; + - ``mu_neg`` controls maximum negative similarity for + incorrect intents, + used only if ``loss_type`` is set to ``margin``; - ``use_max_sim_neg`` if ``true`` the algorithm only - minimizes maximum similarity over incorrect intent labels; + minimizes maximum similarity over incorrect intent labels, + used only if ``loss_type`` is set to ``margin``; - regularization: - ``C2`` sets the scale of L2 regularization - ``C_emb`` sets the scale of how important is to minimize the maximum similarity between embeddings of different - intent labels; - - ``droprate_a`` sets the dropout rate between hidden + intent labels, used only if ``loss_type`` is set to ``margin``; + - ``droprate_a`` sets the dropout rate between layers before embedding layer for user inputs; - - ``droprate_b`` sets the dropout rate between hidden layers + - ``droprate_b`` sets the dropout rate between layers before embedding layer for system actions; - - ``droprate_rnn`` sets the recurrent dropout rate on - the LSTM hidden state ``_; - train accuracy calculation: - ``evaluate_every_num_epochs`` sets how often to calculate train accuracy, small values may hurt performance; - ``evaluate_on_num_examples`` how many examples to use for - calculation of train accuracy, large values may 
hurt - performance. + hold out validation set to calculate of validation accuracy, + large values may hurt performance. + + .. warning:: + + if ``evaluate_on_num_examples`` is non zero, random examples will be + picked by stratified split and used as **hold out** validation set, + so they will be excluded from training data. .. note:: diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 0afc2146eecf..b8534c7e22c6 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -58,16 +58,16 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], + # number of units in transformer + "transformer_sizes": 128, + # number of transformer layers + "num_transformer_layers": 1, # type of positional encoding in transformer - "pos_encoding": "timing", # {"timing", "emb"} + "pos_encoding": "timing", # string 'timing' or 'emb' # max sequence length if pos_encoding='emb' "max_seq_length": 256, # number of attention heads in transformer "num_heads": 4, - # number of units in transformer - "transformer_size": 128, - # number of transformer layers - "num_transformer_layers": 1, # training parameters # initial and final batch sizes: # batch size will be linearly increased for each epoch @@ -81,19 +81,19 @@ class EmbeddingPolicy(Policy): # embedding parameters # dimension size of embedding vectors "embed_dim": 20, + # the type of the similarity + "num_neg": 20, + # flag if minimize only maximum similarity over incorrect actions + "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + # the type of the loss function + "loss_type": "softmax", # string 'softmax' or 'margin' # how similar the algorithm should try # to make embedding vectors for correct actions "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' # maximum negative similarity for incorrect actions "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' - # the type of the similarity - "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' - # the type of the loss function - "loss_type": "softmax", # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect actions "use_max_sim_neg": True, # flag which loss function to use # regularization # the scale of L2 regularization @@ -101,10 +101,10 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # dropout rate for bot nn - "droprate_bot": 0.0, # dropout rate for dial nn - "droprate_dial": 0.1, + "droprate_a": 0.1, + # dropout rate for bot nn + "droprate_b": 0.0, # visualization of accuracy # how often calculate validation accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance @@ -213,7 +213,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.droprate = {"bot": config["droprate_bot"], "dial": config["droprate_dial"]} + self.droprate = {"bot": config["droprate_b"], "dial": config["droprate_a"]} def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] From bed12fa69a740f3592306c7e3f7b49e9cb6b5429 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 13:02:36 +0200 Subject: [PATCH 33/50] update docs --- docs/core/policies.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 7cf7a4b8882d..068ebe9d1510 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -174,7 +174,8 @@ This policy has a pre-defined architecture, which comprises the following steps: - concatenate user input (user intent and entities), - previous system action and slots for current time into an input vector + previous system action, slots and active form + for each time step into an input vector to pre-transformer embedding layer; - feed it to tranformer; - apply a dense layer to the output of the transformer From aaa4dc883b8db9554e6104b9d4a99405caf2931b Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 14:24:52 +0200 Subject: [PATCH 34/50] fix typo in defaults --- rasa/core/policies/embedding_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index b8534c7e22c6..f8c08470e4a4 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -59,7 +59,7 @@ class EmbeddingPolicy(Policy): # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], # number of units in transformer - "transformer_sizes": 128, + "transformer_size": 128, # number of transformer layers "num_transformer_layers": 1, # type of positional encoding in transformer From 7cbba6ec0f4af301f4ae565ee3313bbe4b97262c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 14:40:19 +0200 Subject: [PATCH 35/50] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e07da8e4a551..e87e1cee3578 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,8 @@ Changed - Update pytype to 
``2019.7.11`` - Substitute LSTM with Transformer in ``EmbeddingPolicy`` - ``EmbeddingPolicy`` can now use ``MaxHistoryTrackerFeaturizer`` +- in ``EmbeddingPolicy``, non zero ``evaluate_on_num_examples`` is the size of + hold out validation set that is excluded from training data Removed ------- From f9917dcacf200d3fddfd8478b2675ad529f50e2c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 15:06:46 +0200 Subject: [PATCH 36/50] update docstrings in featurizers --- rasa/core/featurizers.py | 149 ++++++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 55 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 4c32158260ee..2d6b0c750631 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -19,18 +19,21 @@ class SingleStateFeaturizer(object): - """Base class for mechanisms to transform the conversations state - into machine learning formats. + """Base class for mechanisms to transform the conversations state into ML formats. Subclasses of SingleStateFeaturizer decide how the bot will transform the conversation state to a format which a classifier can read: - feature vector.""" + feature vector. + """ def prepare_from_domain(self, domain: Domain) -> None: - """Helper method to init based on domain""" + """Helper method to init based on domain.""" + pass def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Encode user input.""" + raise NotImplementedError( "SingleStateFeaturizer must have " "the capacity to " @@ -39,6 +42,8 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: @staticmethod def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: + """Encode system action as one-hot vector.""" + if action is None: return np.ones(domain.num_actions, dtype=int) * -1 @@ -47,46 +52,50 @@ def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: return y def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows.""" + """Create matrix with all actions from domain encoded in rows.""" + pass class BinarySingleStateFeaturizer(SingleStateFeaturizer): """Assumes all features are binary. - All features should be either on or off, denoting them with 1 or 0.""" + All features should be either on or off, denoting them with 1 or 0. + """ def __init__(self): """Declares instant variables.""" + super(BinarySingleStateFeaturizer, self).__init__() self.num_features = None self.input_state_map = None def prepare_from_domain(self, domain: Domain) -> None: + """Use Domain to prepare featurizer.""" + self.num_features = domain.num_states self.input_state_map = domain.input_state_map def encode(self, state: Dict[Text, float]) -> np.ndarray: """Returns a binary vector indicating which features are active. - Given a dictionary of states (e.g. 'intent_greet', - 'prev_action_listen',...) return a binary vector indicating which - features of `self.input_features` are in the bag. NB it's a - regular double precision float array type. + Given a dictionary of states (e.g. 'intent_greet', + 'prev_action_listen',...) return a binary vector indicating which + features of `self.input_features` are in the bag. NB it's a + regular double precision float array type. 
- For example with two active features out of five possible features - this would return a vector like `[0 0 1 0 1]` + For example with two active features out of five possible features + this would return a vector like `[0 0 1 0 1]` - If intent features are given with a probability, for example - with two active features and two uncertain intents out - of five possible features this would return a vector - like `[0.3, 0.7, 1.0, 0, 1.0]`. + If intent features are given with a probability, for example + with two active features and two uncertain intents out + of five possible features this would return a vector + like `[0.3, 0.7, 1.0, 0, 1.0]`. - If this is just a padding vector we set all values to `-1`. - padding vectors are specified by a `None` or `[None]` - value for states. + If this is just a padding vector we set all values to `-1`. + padding vectors are specified by a `None` or `[None]` + value for states. """ if not self.num_features: @@ -119,15 +128,16 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + return np.eye(domain.num_actions) class LabelTokenizerSingleStateFeaturizer(SingleStateFeaturizer): - """SingleStateFeaturizer that splits user intents and - bot action names into tokens and uses these tokens to - create bag-of-words feature vectors. + """Creates bag-of-words feature vectors. + + User intents and bot action names are split into tokens + and used to create bag-of-words feature vectors. Args: split_symbol: The symbol that separates words in @@ -157,8 +167,10 @@ def __init__( @staticmethod def _create_label_token_dict(labels, split_symbol="_"): """Splits labels into tokens by using provided symbol. + Creates the lookup dictionary for this tokens. - Values in this dict are used for featurization.""" + Values in this dict are used for featurization. 
+ """ distinct_tokens = set( [token for label in labels for token in label.split(split_symbol)] @@ -166,8 +178,8 @@ def _create_label_token_dict(labels, split_symbol="_"): return {token: idx for idx, token in enumerate(sorted(distinct_tokens))} def prepare_from_domain(self, domain: Domain) -> None: - """Creates internal vocabularies for user intents - and bot actions to use for featurization""" + """Creates internal vocabularies for user intents and bot actions.""" + self.user_labels = domain.intent_states + domain.entity_states self.slot_labels = domain.slot_states + domain.form_states self.bot_labels = domain.action_names @@ -189,10 +201,9 @@ def prepare_from_domain(self, domain: Domain) -> None: len(self.user_vocab) + len(self.slot_labels) + len(self.bot_vocab) ) - self.user_feature_len = len(self.user_vocab) - self.slot_feature_len = len(self.slot_labels) - def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Returns a binary vector indicating which tokens are present.""" + if not self.num_features: raise Exception( "LabelTokenizerSingleStateFeaturizer " @@ -238,8 +249,8 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + encoded_all_actions = np.zeros( (domain.num_actions, len(self.bot_vocab)), dtype=np.int32 ) @@ -250,7 +261,7 @@ def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: class TrackerFeaturizer(object): - """Base class for actual tracker featurizers""" + """Base class for actual tracker featurizers.""" def __init__( self, @@ -268,9 +279,12 @@ def _create_states( is_binary_training: bool = False, ) -> List[Dict[Text, float]]: """Create states: a list of dictionaries. - If use_intent_probabilities is False (default behaviour), - pick the most probable intent out of all provided ones and - set its probability to 1.0, while all the others to 0.0.""" + + If use_intent_probabilities is False (default behaviour), + pick the most probable intent out of all provided ones and + set its probability to 1.0, while all the others to 0.0. 
+ """ + states = tracker.past_states(domain) # during training we encounter only 1 or 0 @@ -304,12 +318,15 @@ def _create_states( return [dict(state) for state in states] def _pad_states(self, states: List[Any]) -> List[Any]: + """Pads states.""" + return states def _featurize_states( self, trackers_as_states: List[List[Dict[Text, float]]] ) -> Tuple[np.ndarray, List[int]]: - """Create X""" + """Create X.""" + features = [] true_lengths = [] @@ -338,7 +355,7 @@ def _featurize_states( def _featurize_labels( self, trackers_as_actions: List[List[Text]], domain: Domain ) -> np.ndarray: - """Create y""" + """Create y.""" labels = [] for tracker_actions in trackers_as_actions: @@ -363,7 +380,8 @@ def _featurize_labels( def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions""" + """Transforms list of trackers to lists of states and actions.""" + raise NotImplementedError( "Featurizer must have the capacity to encode trackers to feature vectors" ) @@ -371,7 +389,8 @@ def training_states_and_actions( def featurize_trackers( self, trackers: List[DialogueStateTracker], domain: Domain ) -> DialogueTrainingData: - """Create training data""" + """Create training data.""" + self.state_featurizer.prepare_from_domain(domain) (trackers_as_states, trackers_as_actions) = self.training_states_and_actions( @@ -387,7 +406,8 @@ def featurize_trackers( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: - """Transforms list of trackers to lists of states for prediction""" + """Transforms list of trackers to lists of states for prediction.""" + raise NotImplementedError( "Featurizer must have the capacity to create feature vector" ) @@ -396,7 +416,7 @@ def prediction_states( def create_X( self, trackers: List[DialogueStateTracker], domain: Domain ) -> np.ndarray: - """Create X for prediction""" + """Create X for prediction.""" trackers_as_states = self.prediction_states(trackers, domain) X, _ = self._featurize_states(trackers_as_states) @@ -411,6 +431,8 @@ def persist(self, path): @staticmethod def load(path): + """Loads the featurizer from file.""" + featurizer_file = os.path.join(path, "featurizer.json") if os.path.isfile(featurizer_file): return jsonpickle.decode(rasa.utils.io.read_file(featurizer_file)) @@ -423,17 +445,18 @@ def load(path): class FullDialogueTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers - and creates full dialogue training data for - time distributed rnn. - Training data is padded up to the length of the longest - dialogue with -1""" + """Creates full dialogue training data for time distributed architectures. + + Creates training data that uses each time output for prediction. + Training data is padded up to the length of the longest dialogue with -1. 
+ """ def __init__( self, state_featurizer: SingleStateFeaturizer, use_intent_probabilities: bool = False, ) -> None: + super(FullDialogueTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -441,13 +464,15 @@ def __init__( @staticmethod def _calculate_max_len(trackers_as_actions): + """Calculate the length of the longest dialogue.""" + if trackers_as_actions: return max([len(states) for states in trackers_as_actions]) else: return None def _pad_states(self, states: List[Any]) -> List[Any]: - """Pads states up to max_len""" + """Pads states up to max_len.""" if len(states) < self.max_len: states += [None] * (self.max_len - len(states)) @@ -457,6 +482,10 @@ def _pad_states(self, states: List[Any]) -> List[Any]: def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the length of the longest dialogue with -1. + """ trackers_as_states = [] trackers_as_actions = [] @@ -504,6 +533,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers @@ -513,11 +543,11 @@ def prediction_states( class MaxHistoryTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers, - slices them into max_history batches and - creates training data for rnn that uses last output - for prediction. - Training data is padded up to the max_history with -1""" + """Slices the tracker history into max_history batches. + + Creates training data that uses last output for prediction. + Training data is padded up to the max_history with -1. + """ MAX_HISTORY_DEFAULT = 5 @@ -528,6 +558,7 @@ def __init__( remove_duplicates: bool = True, use_intent_probabilities: bool = False, ) -> None: + super(MaxHistoryTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -541,7 +572,8 @@ def slice_state_history( """Slices states from the trackers history. If the slice is at the array borders, padding will be added to ensure - the slice length.""" + the slice length. + """ slice_end = len(states) slice_start = max(0, slice_end - slice_length) @@ -552,6 +584,8 @@ def slice_state_history( @staticmethod def _hash_example(states, action): + """Hash states for efficient deduplication.""" + frozen_states = tuple( (s if s is None else frozenset(s.items()) for s in states) ) @@ -561,6 +595,10 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the max_history with -1. 
+ """ trackers_as_states = [] trackers_as_actions = [] @@ -615,6 +653,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers From 8364807241ff815fbda8c779428a6d301f1874e7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 15:14:37 +0200 Subject: [PATCH 37/50] do not persist encoded_all_actions --- rasa/core/policies/embedding_policy.py | 29 +++++--------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index f8c08470e4a4..bf834d14bcfe 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -127,7 +127,6 @@ def __init__( self, featurizer: Optional["TrackerFeaturizer"] = None, priority: int = 1, - encoded_all_actions: Optional["np.ndarray"] = None, graph: Optional["tf.Graph"] = None, session: Optional["tf.Session"] = None, user_placeholder: Optional["tf.Tensor"] = None, @@ -150,12 +149,8 @@ def __init__( self._load_params(**kwargs) - # chrono initialization for forget bias - self.characteristic_time = None - # encode all actions with numbers - # persist this array for prediction time - self.encoded_all_actions = encoded_all_actions + self._encoded_all_actions = None # tf related instances self.graph = graph @@ -248,7 +243,7 @@ def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": [ np.stack( [ - self.encoded_all_actions[action_idx] + self._encoded_all_actions[action_idx] for action_idx in action_ids ] ) @@ -257,7 +252,7 @@ def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": ) else: return np.stack( - [self.encoded_all_actions[action_idx] for action_idx in labels] + [self._encoded_all_actions[action_idx] for action_idx in labels] ) # noinspection PyPep8Naming @@ -879,7 +874,7 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self.a_in, self.b_in = self._iterator.get_next() all_actions = tf.constant( - self.encoded_all_actions, dtype=tf.float32, name="all_actions" + self._encoded_all_actions, dtype=tf.float32, name="all_actions" ) self.dial_embed, mask = self._create_tf_dial() @@ -1119,7 +1114,7 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( + self._encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( domain ) @@ -1298,12 +1293,6 @@ def persist(self, path: Text) -> None: saver = tf.train.Saver() saver.save(self.session, checkpoint) - encoded_actions_file = os.path.join( - path, file_name + ".encoded_all_actions.pkl" - ) - with open(encoded_actions_file, "wb") as f: - pickle.dump(self.encoded_all_actions, f) - tf_config_file = os.path.join(path, file_name + ".tf_config.pkl") with open(tf_config_file, "wb") as f: pickle.dump(self._tf_config, f) @@ -1364,17 +1353,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": attention_weights = cls.load_tensor("attention_weights") - encoded_actions_file = os.path.join( - path, "{}.encoded_all_actions.pkl".format(file_name) - ) - - with open(encoded_actions_file, "rb") as f: - encoded_all_actions = pickle.load(f) - return cls( featurizer=featurizer, priority=meta["priority"], - 
encoded_all_actions=encoded_all_actions, graph=graph, session=session, user_placeholder=a_in, From 1d4470f37d3740bf028c63603ba9fa37f0a84867 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 17:33:30 +0200 Subject: [PATCH 38/50] refactor test_policies, add test for policy.featurizer types --- rasa/core/featurizers.py | 2 +- rasa/core/policies/mapping_policy.py | 4 + tests/core/test_policies.py | 264 +++++++++++++++++---------- 3 files changed, 169 insertions(+), 101 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 2d6b0c750631..ae1dab2f49d0 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -269,7 +269,7 @@ def __init__( use_intent_probabilities: bool = False, ) -> None: - self.state_featurizer = state_featurizer or SingleStateFeaturizer() + self.state_featurizer = state_featurizer self.use_intent_probabilities = use_intent_probabilities def _create_states( diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 1d194fba9cd2..a6b653b70118 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -27,6 +27,10 @@ class MappingPolicy(Policy): executed whenever the intent is detected. This policy takes precedence over any other policy.""" + @staticmethod + def _standard_featurizer(): + return None + def __init__(self, priority: int = 3) -> None: """Create a new Mapping policy.""" diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 80f34fad89ff..7ba4b42cc5e3 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -5,7 +5,7 @@ import pytest import rasa.utils.io -from rasa.core import training, utils +from rasa.core import training from rasa.core.actions.action import ( ACTION_DEFAULT_ASK_AFFIRMATION_NAME, ACTION_DEFAULT_ASK_REPHRASE_NAME, @@ -18,7 +18,9 @@ from rasa.core.events import ActionExecuted from rasa.core.featurizers import ( BinarySingleStateFeaturizer, + LabelTokenizerSingleStateFeaturizer, MaxHistoryTrackerFeaturizer, + FullDialogueTrackerFeaturizer, ) from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy from rasa.core.policies.embedding_policy import EmbeddingPolicy @@ -113,6 +115,20 @@ async def trained_policy(self, featurizer, priority): policy.train(training_trackers, default_domain) return policy + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is self.featurizer().__class__ + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is self.featurizer().state_featurizer.__class__ + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is self.featurizer().__class__ + assert ( + loaded.featurizer.state_featurizer.__class__ + is self.featurizer().state_featurizer.__class__ + ) + async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) @@ -179,99 +195,6 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFallbackPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = FallbackPolicy(priority=priority) - return p - - @pytest.mark.parametrize( - "nlu_confidence, last_action_name, should_nlu_fallback", - [ - (0.1, "some_action", False), - (0.1, "action_listen", True), - (0.9, "some_action", False), - (0.9, "action_listen", False), - ], - 
) - def test_should_nlu_fallback( - self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback - ): - assert ( - trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) - is should_nlu_fallback - ) - - -class TestMappingPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = MappingPolicy() - return p - - -class TestMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = MemoizationPolicy(priority=priority, max_history=max_history) - return p - - async def test_memorise(self, trained_policy, default_domain): - trackers = await train_trackers(default_domain, augmentation_factor=20) - trained_policy.train(trackers, default_domain) - lookup_with_augmentation = trained_policy.lookup - - trackers = [ - t for t in trackers if not hasattr(t, "is_augmented") or not t.is_augmented - ] - - ( - all_states, - all_actions, - ) = trained_policy.featurizer.training_states_and_actions( - trackers, default_domain - ) - - for tracker, states, actions in zip(trackers, all_states, all_actions): - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled == default_domain.index_for_action(actions[0]) - - nums = np.random.randn(default_domain.num_states) - random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] - assert trained_policy._recall_states(random_states) is None - - # compare augmentation for augmentation_factor of 0 and 20: - trackers_no_augmentation = await train_trackers( - default_domain, augmentation_factor=0 - ) - trained_policy.train(trackers_no_augmentation, default_domain) - lookup_no_augmentation = trained_policy.lookup - - assert lookup_no_augmentation == lookup_with_augmentation - - def test_memorise_with_nlu(self, trained_policy, default_domain): - filename = "data/test_dialogues/default.json" - dialogue = read_dialogue_file(filename) - - tracker = DialogueStateTracker(dialogue.name, default_domain.slots) - tracker.recreate_from_dialogue(dialogue) - states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ - 0 - ] - - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled is not None - - -class TestAugmentedMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) - return p - - class TestSklearnPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority, **kwargs): p = SklearnPolicy(featurizer, priority, **kwargs) @@ -409,6 +332,20 @@ def create_policy(self, featurizer, priority): p = EmbeddingPolicy(priority=priority) return p + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is FullDialogueTrackerFeaturizer + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is FullDialogueTrackerFeaturizer + assert ( + loaded.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): def create_policy(self, featurizer, 
priority): @@ -418,13 +355,24 @@ def create_policy(self, featurizer, priority): p = EmbeddingPolicy(priority=priority, max_history=self.max_history) return p + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert ( + loaded.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - # if max_history is not specified - p = EmbeddingPolicy(priority=priority, **tf_defaults()) + p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) return p def test_tf_config(self, trained_policy, tmpdir): @@ -436,7 +384,79 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFormPolicy(PolicyTestCollection): +class TestMemoizationPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = MemoizationPolicy(priority=priority, max_history=max_history) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert trained_policy.featurizer.state_featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert loaded.featurizer.state_featurizer is None + + async def test_memorise(self, trained_policy, default_domain): + trackers = await train_trackers(default_domain, augmentation_factor=20) + trained_policy.train(trackers, default_domain) + lookup_with_augmentation = trained_policy.lookup + + trackers = [ + t for t in trackers if not hasattr(t, "is_augmented") or not t.is_augmented + ] + + ( + all_states, + all_actions, + ) = trained_policy.featurizer.training_states_and_actions( + trackers, default_domain + ) + + for tracker, states, actions in zip(trackers, all_states, all_actions): + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled == default_domain.index_for_action(actions[0]) + + nums = np.random.randn(default_domain.num_states) + random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] + assert trained_policy._recall_states(random_states) is None + + # compare augmentation for augmentation_factor of 0 and 20: + trackers_no_augmentation = await train_trackers( + default_domain, augmentation_factor=0 + ) + trained_policy.train(trackers_no_augmentation, default_domain) + lookup_no_augmentation = trained_policy.lookup + + assert lookup_no_augmentation == lookup_with_augmentation + + def test_memorise_with_nlu(self, trained_policy, default_domain): + filename = "data/test_dialogues/default.json" + dialogue = read_dialogue_file(filename) + + tracker = DialogueStateTracker(dialogue.name, default_domain.slots) + tracker.recreate_from_dialogue(dialogue) + states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ + 
0 + ] + + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled is not None + + +class TestAugmentedMemoizationPolicy(TestMemoizationPolicy): + def create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) + return p + + +class TestFormPolicy(TestMemoizationPolicy): def create_policy(self, featurizer, priority): p = FormPolicy(priority=priority) return p @@ -499,8 +519,52 @@ async def test_memorise(self, trained_policy, default_domain): random_states = [{f: num for f, num in zip(domain.input_states, nums)}] assert trained_policy.recall(random_states, None, domain) is None + def test_memorise_with_nlu(self, trained_policy, default_domain): + pass + + +class TestMappingPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = MappingPolicy() + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + +class TestFallbackPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = FallbackPolicy(priority=priority) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + @pytest.mark.parametrize( + "nlu_confidence, last_action_name, should_nlu_fallback", + [ + (0.1, "some_action", False), + (0.1, "action_listen", True), + (0.9, "some_action", False), + (0.9, "action_listen", False), + ], + ) + def test_should_nlu_fallback( + self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback + ): + assert ( + trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) + is should_nlu_fallback + ) + -class TestTwoStageFallbackPolicy(PolicyTestCollection): +class TestTwoStageFallbackPolicy(TestFallbackPolicy): def create_policy(self, featurizer, priority): p = TwoStageFallbackPolicy( priority=priority, deny_suggestion_intent_name="deny" From 565f7303b92159f6f2543e5beb3885cd2f4eb6be Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 18:26:23 +0200 Subject: [PATCH 39/50] fix test_policies --- tests/core/test_policies.py | 48 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 7ba4b42cc5e3..13e6d89f2719 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -116,17 +116,15 @@ async def trained_policy(self, featurizer, priority): return policy def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is self.featurizer().__class__ - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is self.featurizer().state_featurizer.__class__ + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is self.featurizer().__class__ - assert ( - loaded.featurizer.state_featurizer.__class__ - is self.featurizer().state_featurizer.__class__ 
+ assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): @@ -333,17 +331,16 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is FullDialogueTrackerFeaturizer - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(trained_policy.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is FullDialogueTrackerFeaturizer - assert ( - loaded.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(loaded.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) @@ -356,17 +353,16 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer - assert ( - loaded.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) @@ -393,11 +389,11 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) assert trained_policy.featurizer.state_featurizer is None trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) assert loaded.featurizer.state_featurizer is None async def test_memorise(self, trained_policy, default_domain): From 03f0892cdd66e0d8330e4d6f4fe646d687e32931 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 18:29:22 +0200 Subject: [PATCH 40/50] also check for max_history --- tests/core/test_policies.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 13e6d89f2719..1551583a68a9 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -117,12 +117,14 @@ async def trained_policy(self, featurizer, priority): def test_featurizer(self, trained_policy, tmpdir): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer 
) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history assert isinstance( loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) @@ -354,6 +356,7 @@ def create_policy(self, featurizer, priority): def test_featurizer(self, trained_policy, tmpdir): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer, @@ -361,6 +364,7 @@ def test_featurizer(self, trained_policy, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history assert isinstance( loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) From 4123d65855b8bdea2932ea3bbdcc71e6a50f235f Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 17:57:59 +0200 Subject: [PATCH 41/50] add continue_training tests --- rasa/core/featurizers.py | 5 ++-- rasa/core/policies/embedding_policy.py | 37 ++++++++++++++------------ tests/core/test_policies.py | 36 ++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index ae1dab2f49d0..fa728b669103 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -525,8 +525,9 @@ def training_states_and_actions( trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) - self.max_len = self._calculate_max_len(trackers_as_actions) - logger.debug("The longest dialogue has {} actions.".format(self.max_len)) + if self.max_len is None: + self.max_len = self._calculate_max_len(trackers_as_actions) + logger.debug("The longest dialogue has {} actions.".format(self.max_len)) return trackers_as_states, trackers_as_actions diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index bf834d14bcfe..f996bc5ab6dd 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -1194,25 +1194,28 @@ def continue_training( batch_size = kwargs.get("batch_size", 5) epochs = kwargs.get("epochs", 50) - for _ in range(epochs): - training_data = self._training_data_for_continue_training( - batch_size, training_trackers, domain - ) - - session_data = self._create_session_data(training_data.X, training_data.y) - train_dataset = self._create_tf_dataset(session_data, batch_size) - train_init_op = self._iterator.make_initializer(train_dataset) - self.session.run(train_init_op) + with self.graph.as_default(): + for _ in range(epochs): + training_data = self._training_data_for_continue_training( + batch_size, training_trackers, domain + ) - # fit to one extra example using updated trackers - while True: - try: - self.session.run( - self._train_op, feed_dict={self._is_training: True} - ) + session_data = self._create_session_data( + training_data.X, training_data.y + ) + train_dataset = self._create_tf_dataset(session_data, batch_size) + train_init_op = self._iterator.make_initializer(train_dataset) + self.session.run(train_init_op) + + # fit to one extra example using updated trackers + while True: + try: + self.session.run( + self._train_op, feed_dict={self._is_training: True} + ) - except 
tf.errors.OutOfRangeError: - break + except tf.errors.OutOfRangeError: + break def tf_feed_dict_for_prediction( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 1551583a68a9..b8a64a270891 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -129,6 +129,12 @@ def test_featurizer(self, trained_policy, tmpdir): loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) + async def test_continue_training(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + trained_policy.continue_training( + training_trackers, default_domain, **{"epochs": 1} + ) + async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) @@ -318,13 +324,35 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicyWithFeaturizer(PolicyTestCollection): +class TestEmbeddingPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority): p = EmbeddingPolicy(featurizer=featurizer, priority=priority) return p + def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "inner" + + +class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"loss_type": "margin"} + ) + return p + + def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "cosine" + + +class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"evaluate_on_num_examples": 4} + ) + return p + -class TestEmbeddingPolicyWithFullDialogue(PolicyTestCollection): +class TestEmbeddingPolicyWithFullDialogue(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer @@ -346,7 +374,7 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): +class TestEmbeddingPolicyWithMaxHistory(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using MaxHistoryTrackerFeaturizer @@ -370,7 +398,7 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): +class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) return p From d06aeeeb67c18327ed7070630a26df2140e9c5e8 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 18:18:49 +0200 Subject: [PATCH 42/50] use dynamic sequence length, because of continue_trainig --- rasa/core/featurizers.py | 5 ++--- rasa/core/policies/embedding_policy.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index fa728b669103..ae1dab2f49d0 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -525,9 +525,8 @@ def training_states_and_actions( trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) - if self.max_len is None: - self.max_len = 
self._calculate_max_len(trackers_as_actions) - logger.debug("The longest dialogue has {} actions.".format(self.max_len)) + self.max_len = self._calculate_max_len(trackers_as_actions) + logger.debug("The longest dialogue has {} actions.".format(self.max_len)) return trackers_as_states, trackers_as_actions diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index f996bc5ab6dd..03995eaf807f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -405,6 +405,7 @@ def _gen_batch( yield batch_x, batch_y + # noinspection PyPep8Naming def _create_tf_dataset( self, session_data: "SessionData", @@ -414,15 +415,20 @@ def _create_tf_dataset( ) -> "tf.data.Dataset": """Create tf dataset.""" + # set batch and sequence length to None + shape_X = (None, None, session_data.X[0].shape[-1]) + + if session_data.Y[0].ndim == 1: + shape_Y = (None, session_data.Y[0].shape[-1]) + else: + shape_Y = (None, None, session_data.Y[0].shape[-1]) + return tf.data.Dataset.from_generator( lambda batch_size_: self._gen_batch( session_data, batch_size_, batch_strategy, shuffle ), output_types=(tf.float32, tf.float32), - output_shapes=( - [None] + list(session_data.X[0].shape), # set batch to None - [None] + list(session_data.Y[0].shape), # set batch to None - ), + output_shapes=(shape_X, shape_Y), args=([batch_size]), ) From 5ae7f112012457a7a4d59c07f96b0cacdb4c1e13 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 22:55:32 +0200 Subject: [PATCH 43/50] add one more test --- tests/core/test_policies.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b8a64a270891..31f65211cdc7 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -333,6 +333,14 @@ def test_similarity_type(self, trained_policy): assert trained_policy.similarity_type == "inner" +class TestEmbeddingPolicySequence(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"batch_strategy": "sequence"} + ) + return p + + class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( From 956ca961d2d2675bb58beef31b1a536084a514ec Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 23:20:38 +0200 Subject: [PATCH 44/50] add test_gen_batch --- tests/core/test_policies.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 31f65211cdc7..3bb2c745ab24 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -332,13 +332,36 @@ def create_policy(self, featurizer, priority): def test_similarity_type(self, trained_policy): assert trained_policy.similarity_type == "inner" - -class TestEmbeddingPolicySequence(TestEmbeddingPolicy): - def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"batch_strategy": "sequence"} + async def test_gen_batch(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + training_data = trained_policy.featurize_for_training( + training_trackers, default_domain + ) + session_data = trained_policy._create_session_data( + training_data.X, training_data.y + ) + batch_size = 2 + batch_x, batch_y = next( + trained_policy._gen_batch(session_data=session_data, 
batch_size=batch_size) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape + ) + batch_x, batch_y = next( + trained_policy._gen_batch( + session_data=session_data, + batch_size=batch_size, + batch_strategy="balanced", + shuffle=True, + ) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape ) - return p class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): From b97bf0bacb66308408fcedc56250d29d7eda833a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:23:59 +0200 Subject: [PATCH 45/50] return pre dial nn --- rasa/core/policies/embedding_policy.py | 27 ++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 03995eaf807f..2846f6d0a4f5 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -55,6 +55,9 @@ class EmbeddingPolicy(Policy): # default properties (DOC MARKER - don't remove) defaults = { # nn architecture + # a list of hidden layers sizes before user embed layer + # number of hidden layers is equal to the length of this list + "hidden_layers_sizes_dial": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], @@ -174,7 +177,10 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] + self.hidden_layers_sizes = { + "a": config["hidden_layers_sizes_dial"], + "b": config["hidden_layers_sizes_bot"], + } self.pos_encoding = config["pos_encoding"] self.max_seq_length = config["max_seq_length"] @@ -504,7 +510,7 @@ def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": b = self._create_tf_nn( b_in, - self.hidden_layer_sizes_bot, + self.hidden_layers_sizes["b"], self.droprate["bot"], layer_name_suffix="bot", ) @@ -595,18 +601,23 @@ def _create_t2t_transformer_encoder( tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout ) - def _create_tf_dial(self) -> Tuple["tf.Tensor", "tf.Tensor"]: + def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: """Create dialogue level embedding and mask.""" # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - self.attention_weights = {} - a = self._create_t2t_transformer_encoder( - self.a_in, mask, self.attention_weights + a = self._create_tf_nn( + a_in, + self.hidden_layers_sizes["a"], + self.droprate["dial"], + layer_name_suffix="dial", ) + self.attention_weights = {} + a = self._create_t2t_transformer_encoder(a, mask, self.attention_weights) + dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): @@ -883,7 +894,7 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self._encoded_all_actions, dtype=tf.float32, name="all_actions" ) - self.dial_embed, mask = self._create_tf_dial() + self.dial_embed, mask = self._create_tf_dial(self.a_in) self.bot_embed = self._create_tf_bot_embed(self.b_in) self.all_bot_embed = self._create_tf_bot_embed(all_actions) @@ -1065,7 +1076,7 @@ def _build_tf_pred_graph(self, session_data: 
"SessionData") -> "tf.Tensor": self._create_tf_placeholders(session_data) - self.dial_embed, mask = self._create_tf_dial() + self.dial_embed, mask = self._create_tf_dial(self.a_in) self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], From d9ea8be8d7dc5aeb79ded4fc19d293191098f142 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:25:35 +0200 Subject: [PATCH 46/50] return pre dial nn --- rasa/core/policies/embedding_policy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 2846f6d0a4f5..75be9ab62878 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -57,7 +57,7 @@ class EmbeddingPolicy(Policy): # nn architecture # a list of hidden layers sizes before user embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_dial": [], + "hidden_layers_sizes_pre_dial": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], @@ -178,8 +178,8 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layers_sizes = { - "a": config["hidden_layers_sizes_dial"], - "b": config["hidden_layers_sizes_bot"], + "pre_dial": config["hidden_layers_sizes_pre_dial"], + "bot": config["hidden_layers_sizes_bot"], } self.pos_encoding = config["pos_encoding"] @@ -510,7 +510,7 @@ def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": b = self._create_tf_nn( b_in, - self.hidden_layers_sizes["b"], + self.hidden_layers_sizes["bot"], self.droprate["bot"], layer_name_suffix="bot", ) @@ -610,9 +610,9 @@ def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: a = self._create_tf_nn( a_in, - self.hidden_layers_sizes["a"], + self.hidden_layers_sizes["pre_dial"], self.droprate["dial"], - layer_name_suffix="dial", + layer_name_suffix="pre_dial", ) self.attention_weights = {} From 4ef6660b8d51fef9d288c93929df3c93af0bb739 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:41:48 +0200 Subject: [PATCH 47/50] add scale_loss option --- docs/core/policies.rst | 3 +++ rasa/core/policies/embedding_policy.py | 14 +++++++++----- tests/core/test_policies.py | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 068ebe9d1510..f5250b2479af 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -252,6 +252,9 @@ It is recommended to use - ``use_max_sim_neg`` if ``true`` the algorithm only minimizes maximum similarity over incorrect intent labels, used only if ``loss_type`` is set to ``margin``; + - ``scale_loss`` if ``true`` the algorithm will downscale the loss + for examples where correct label is predicted with high confidence, + used only if ``loss_type`` is set to ``softmax``; - regularization: diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 75be9ab62878..47b9f473293b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -98,6 +98,8 @@ class EmbeddingPolicy(Policy): # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "use_max_sim_neg": True, # flag which loss function to use + # scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True, # regularization # 
the scale of L2 regularization "C2": 0.001, @@ -210,6 +212,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] + self.scale_loss = config["scale_loss"] def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] @@ -818,8 +821,8 @@ def _tf_loss_margin( return loss - @staticmethod def _tf_loss_softmax( + self, sim_pos: "tf.Tensor", sim_neg: "tf.Tensor", sim_neg_bot_bot: "tf.Tensor", @@ -838,11 +841,12 @@ def _tf_loss_softmax( neg_labels = tf.zeros_like(logits[:, :, 1:]) labels = tf.concat([pos_labels, neg_labels], -1) - # mask loss by prediction confidence - pred = tf.nn.softmax(logits) - already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) + if self.scale_loss: + # mask loss by prediction confidence + pred = tf.nn.softmax(logits) + mask *= tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - loss = tf.losses.softmax_cross_entropy(labels, logits, mask * already_learned) + loss = tf.losses.softmax_cross_entropy(labels, logits, mask) # add regularization losses loss += tf.losses.get_regularization_loss() diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 3bb2c745ab24..b894aac5c7cd 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -378,7 +378,8 @@ def test_similarity_type(self, trained_policy): class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"evaluate_on_num_examples": 4} + featurizer=featurizer, priority=priority, **{"scale_loss": False, + "evaluate_on_num_examples": 4} ) return p From 54950c1458acc7486eb53dbef4974651b1eee560 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:42:54 +0200 Subject: [PATCH 48/50] black --- tests/core/test_policies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b894aac5c7cd..ca0db2c987c0 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -378,8 +378,9 @@ def test_similarity_type(self, trained_policy): class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"scale_loss": False, - "evaluate_on_num_examples": 4} + featurizer=featurizer, + priority=priority, + **{"scale_loss": False, "evaluate_on_num_examples": 4} ) return p From b2f9f890c92be36bf0f2f4412b14573fcaa75728 Mon Sep 17 00:00:00 2001 From: Vladimir Vlasov Date: Mon, 5 Aug 2019 17:11:58 +0200 Subject: [PATCH 49/50] Update docs/core/policies.rst Co-Authored-By: Tanja --- docs/core/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 1bdfa6a34baa..f4a02d392f52 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -177,7 +177,7 @@ following steps: previous system action, slots and active form for each time step into an input vector to pre-transformer embedding layer; - - feed it to tranformer; + - feed it to transformer; - apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step; - apply a dense layer to create embeddings for system actions for each time step; From 1a5221936dfd22edf2d9f7a5cb63e4fe6929e8b7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 5 Aug 2019 17:19:55 +0200 Subject: [PATCH 50/50] fix changelog, remove unneeded 
else

---
 CHANGELOG.rst                          | 3 ---
 rasa/core/policies/embedding_policy.py | 2 --
 2 files changed, 5 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fd24dc0af564..97ee7bbdd0f9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -43,9 +43,6 @@ Changed
 -------
 - new event broker class: ``SQLProducer``. This event broker is now used when
   running locally with Rasa X
-
-Removed
--------
 - API requests are not longer logged to ``rasa_core.log`` by default in order to
   avoid problems when running on OpenShift (use ``--log-file rasa_core.log``
   to retain the old behavior)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 47b9f473293b..6224513951cc 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1114,8 +1114,6 @@ def _extract_attention(self) -> Optional["tf.Tensor"]:

         if attention:
             return tf.concat(attention, 0)
-        else:
-            return

     # training methods
     def train(