diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 92378e48a579..46c262e99b8d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,7 +15,10 @@ Added Changed ------- - +- substitute LSTM with Transformer in ``EmbeddingPolicy`` +- ``EmbeddingPolicy`` can now use ``MaxHistoryTrackerFeaturizer`` +- non zero ``evaluate_on_num_examples`` in ``EmbeddingPolicy`` is the size of + hold out validation set that is excluded from training data Removed ------- @@ -23,7 +26,9 @@ Removed Fixed ----- - +- ``MappingPolicy`` standard featurizer is set to ``None`` +- ``Flood control exceeded`` error in Telegram connector which happened because the + webhook was set twice [1.2.2] - 2019-08-07 ^^^^^^^^^^^^^^^^^^^^ @@ -66,8 +71,8 @@ Changed Fixed ----- - ``rasa test core`` can handle compressed model files -- Rasa can handle story files containing multi line comments -- Template will retain `{` if escaped with `{`. e.g. `{{"foo": {bar}}}` will result in `{"foo": "replaced value"}` +- rasa can handle story files containing multi line comments +- template will retain `{` if escaped with `{`. e.g. `{{"foo": {bar}}}` will result in `{"foo": "replaced value"}` [1.1.8] - 2019-07-25 ^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 89d71db64bef..a178cbac5897 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -167,47 +167,27 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer. Embedding Policy ^^^^^^^^^^^^^^^^ -The Recurrent Embedding Dialogue Policy (REDP) -described in our paper: ``_ +Transformer Embedding Dialogue Policy (TEDP) + +Transformer version of the Recurrent Embedding Dialogue Policy (REDP) +used in our paper: ``_ This policy has a pre-defined architecture, which comprises the following steps: - - apply dense layers to create embeddings for user intents, - entities and system actions including previous actions and slots; - - use the embeddings of previous user inputs as a user memory - and embeddings of previous system actions as a system memory; - - concatenate user input, previous system action and slots - embeddings for current time into an input vector to rnn; - - using user and previous system action embeddings from the input - vector, calculate attention probabilities over the user and - system memories (for system memory, this policy uses - `NTM mechanism `_ with attention - by location); - - sum the user embedding and user attention vector and feed it - and the embeddings of the slots as an input to an LSTM cell; - - apply a dense layer to the output of the LSTM to get a raw - recurrent embedding of a dialogue; - - sum this raw recurrent embedding of a dialogue with system - attention vector to create dialogue level embedding, this step - allows the algorithm to repeat previous system action by copying - its embedding vector directly to the current time output; - - weight previous LSTM states with system attention probabilities - to get the previous action embedding, the policy is likely payed - attention to; - - if the similarity between this previous action embedding and - current time dialogue embedding is high, overwrite current LSTM - state with the one from the time when this action happened; - - for each LSTM time step, calculate the similarity between the + - concatenate user input (user intent and entities), + previous system action, slots and active form + for each time step into an input vector + to pre-transformer embedding layer; + - feed it to transformer; + - apply a dense layer to the output of the transformer 
+ to get embeddings of a dialogue for each time step; + - apply a dense layer to create embeddings for system actions for each time step; + - calculate the similarity between the dialogue embedding and embedded system actions. This step is based on the `StarSpace `_ idea. -.. note:: - - This policy only works with - ``FullDialogueTrackerFeaturizer(state_featurizer)``. - It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` (see :ref:`featurization` for details). @@ -221,52 +201,32 @@ It is recommended to use Pass an appropriate number of ``epochs`` to the ``EmbeddingPolicy``, otherwise the policy will be trained only for ``1`` - epoch. Since this is an embedding based policy, it requires a large - number of epochs, which depends on the complexity of the - training data and whether attention is used or not. - - The main feature of this policy is an **attention** mechanism over - previous user input and system actions. - **Attention is turned on by default**; in order to turn it off, - configure the following parameters: - - - ``attn_before_rnn`` if ``true`` the algorithm will use - attention mechanism over previous user input, default ``true``; - - ``attn_after_rnn`` if ``true`` the algorithm will use - attention mechanism over previous system actions and will be - able to copy previously executed action together with LSTM's - hidden state from its history, default ``true``; - - ``sparse_attention`` if ``true`` ``sparsemax`` will be used - instead of ``softmax`` for attention probabilities, default - ``false``; - - ``attn_shift_range`` the range of allowed location-based - attention shifts for system memory (``attn_after_rnn``), see - ``_ for details; + epoch. - .. note:: - - Attention requires larger values of ``epochs`` and takes longer - to train. But it can learn more complicated and nonlinear behaviour. + The main feature of this policy is **transformer**. 
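As a minimal sketch (assuming the Rasa 1.x ``Agent`` API and illustrative parameter values, none of which are prescribed by this change), the policy can be constructed in code with the hyper-parameters described below passed as keyword arguments; passing ``max_history`` switches it to the ``MaxHistoryTrackerFeaturizer``:

.. code-block:: python

    from rasa.core.agent import Agent
    from rasa.core.policies.embedding_policy import EmbeddingPolicy

    # Hypothetical configuration; any omitted parameter keeps its default value.
    policy = EmbeddingPolicy(
        max_history=5,               # use MaxHistoryTrackerFeaturizer instead of full dialogues
        transformer_size=128,        # number of units in the transformer
        num_transformer_layers=1,
        pos_encoding="timing",       # or "emb" together with max_seq_length
        num_heads=4,
        batch_strategy="balanced",   # or "sequence"
        loss_type="softmax",         # or "margin"
        evaluate_on_num_examples=0,  # non-zero creates a hold out validation set
        epochs=100,
    )

    agent = Agent("domain.yml", policies=[policy])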
The algorithm also has hyper-parameters to control: - neural network's architecture: - - ``hidden_layers_sizes_a`` sets a list of hidden layers - sizes before embedding layer for user inputs, the number - of hidden layers is equal to the length of the list; - ``hidden_layers_sizes_b`` sets a list of hidden layers sizes before embedding layer for system actions, the number of hidden layers is equal to the length of the list; - - ``rnn_size`` sets the number of units in the LSTM cell; + - ``transformer_size`` sets the number of units in the transformer; + - ``num_transformer_layers`` sets the number of transformer layers; + - ``pos_encoding`` sets the type of positional encoding in transformer, + it should be either ``timing`` or ``emb``; + - ``max_seq_length`` sets the maximum sequence length + if embedding positional encodings are used; + - ``num_heads`` sets the number of heads in multi-head attention; - training: - - ``layer_norm`` if ``true`` layer normalization for lstm - cell is turned on, default ``true``; - ``batch_size`` sets the number of training examples in one forward/backward pass, the higher the batch size, the more memory space you'll need; + - ``batch_strategy`` sets the type of batching strategy, + it should be either ``sequence`` or ``balanced``; - ``epochs`` sets the number of times the algorithm will see training data, where one ``epoch`` equals one forward pass and one backward pass of all the training examples; @@ -276,38 +236,52 @@ It is recommended to use - embedding: - ``embed_dim`` sets the dimension of embedding space; - - ``mu_pos`` controls how similar the algorithm should try - to make embedding vectors for correct intent labels; - - ``mu_neg`` controls maximum negative similarity for - incorrect intents; - - ``similarity_type`` sets the type of the similarity, - it should be either ``cosine`` or ``inner``; - ``num_neg`` sets the number of incorrect intent labels, the algorithm will minimize their similarity to the user input during training; + - ``similarity_type`` sets the type of the similarity, + it should be either ``auto``, ``cosine`` or ``inner``, + if ``auto``, it will be set depending on ``loss_type``, + ``inner`` for ``softmax``, ``cosine`` for ``margin``; + - ``loss_type`` sets the type of the loss function, + it should be either ``softmax`` or ``margin``; + - ``mu_pos`` controls how similar the algorithm should try + to make embedding vectors for correct intent labels, + used only if ``loss_type`` is set to ``margin``; + - ``mu_neg`` controls maximum negative similarity for + incorrect intents, + used only if ``loss_type`` is set to ``margin``; - ``use_max_sim_neg`` if ``true`` the algorithm only - minimizes maximum similarity over incorrect intent labels; + minimizes maximum similarity over incorrect intent labels, + used only if ``loss_type`` is set to ``margin``; + - ``scale_loss`` if ``true`` the algorithm will downscale the loss + for examples where the correct label is predicted with high confidence, + used only if ``loss_type`` is set to ``softmax``; - regularization: - ``C2`` sets the scale of L2 regularization - ``C_emb`` sets the scale of how important is to minimize the maximum similarity between embeddings of different - intent labels; - - ``droprate_a`` sets the dropout rate between hidden + intent labels, used only if ``loss_type`` is set to ``margin``; + - ``droprate_a`` sets the dropout rate between layers before embedding layer for user inputs; - - ``droprate_b`` sets the dropout rate between hidden layers + - ``droprate_b`` sets the dropout 
rate between layers before embedding layer for system actions; - - ``droprate_rnn`` sets the recurrent dropout rate on - the LSTM hidden state ``_; - train accuracy calculation: - ``evaluate_every_num_epochs`` sets how often to calculate train accuracy, small values may hurt performance; - ``evaluate_on_num_examples`` how many examples to use for - calculation of train accuracy, large values may hurt - performance. + the hold out validation set to calculate validation accuracy, + large values may hurt performance. + + .. warning:: + + If ``evaluate_on_num_examples`` is non-zero, random examples will be + picked by a stratified split and used as a **hold out** validation set, + so they will be excluded from the training data. .. note:: diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 76634cac1517..ae1dab2f49d0 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -19,23 +19,21 @@ class SingleStateFeaturizer(object): - """Base class for mechanisms to transform the conversations state - into machine learning formats. + """Base class for mechanisms to transform the conversation state into ML formats. Subclasses of SingleStateFeaturizer decide how the bot will transform the conversation state to a format which a classifier can read: - feature vector.""" - - def __init__(self): - """Declares instant variables.""" - self.user_feature_len = None - self.slot_feature_len = None + feature vector. + """ def prepare_from_domain(self, domain: Domain) -> None: - """Helper method to init based on domain""" + """Helper method to init based on domain.""" + pass def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Encode user input.""" + raise NotImplementedError( "SingleStateFeaturizer must have " "the capacity to " @@ -44,6 +42,8 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: @staticmethod def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: + """Encode system action as one-hot vector.""" + if action is None: return np.ones(domain.num_actions, dtype=int) * -1 @@ -52,49 +52,50 @@ def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: return y def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows.""" + """Create matrix with all actions from domain encoded in rows.""" + pass class BinarySingleStateFeaturizer(SingleStateFeaturizer): """Assumes all features are binary. - All features should be either on or off, denoting them with 1 or 0.""" + All features should be either on or off, denoting them with 1 or 0. + """ def __init__(self): """Declares instant variables.""" + super(BinarySingleStateFeaturizer, self).__init__() self.num_features = None self.input_state_map = None def prepare_from_domain(self, domain: Domain) -> None: + """Use Domain to prepare featurizer.""" + self.num_features = domain.num_states self.input_state_map = domain.input_state_map - self.user_feature_len = len(domain.intent_states) + len(domain.entity_states) - self.slot_feature_len = len(domain.slot_states) - def encode(self, state: Dict[Text, float]) -> np.ndarray: """Returns a binary vector indicating which features are active. - Given a dictionary of states (e.g. 'intent_greet', - 'prev_action_listen',...) 
return a binary vector indicating which + features of `self.input_features` are in the bag. NB it's a + regular double precision float array type. - For example with two active features out of five possible features - this would return a vector like `[0 0 1 0 1]` + For example with two active features out of five possible features + this would return a vector like `[0 0 1 0 1]` - If intent features are given with a probability, for example - with two active features and two uncertain intents out - of five possible features this would return a vector - like `[0.3, 0.7, 1.0, 0, 1.0]`. + If intent features are given with a probability, for example + with two active features and two uncertain intents out + of five possible features this would return a vector + like `[0.3, 0.7, 1.0, 0, 1.0]`. - If this is just a padding vector we set all values to `-1`. - padding vectors are specified by a `None` or `[None]` - value for states. + If this is just a padding vector we set all values to `-1`. + padding vectors are specified by a `None` or `[None]` + value for states. """ if not self.num_features: @@ -127,15 +128,16 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + return np.eye(domain.num_actions) class LabelTokenizerSingleStateFeaturizer(SingleStateFeaturizer): - """SingleStateFeaturizer that splits user intents and - bot action names into tokens and uses these tokens to - create bag-of-words feature vectors. + """Creates bag-of-words feature vectors. + + User intents and bot action names are split into tokens + and used to create bag-of-words feature vectors. Args: split_symbol: The symbol that separates words in @@ -165,8 +167,10 @@ def __init__( @staticmethod def _create_label_token_dict(labels, split_symbol="_"): """Splits labels into tokens by using provided symbol. + Creates the lookup dictionary for this tokens. - Values in this dict are used for featurization.""" + Values in this dict are used for featurization. 
+ """ distinct_tokens = set( [token for label in labels for token in label.split(split_symbol)] @@ -174,10 +178,10 @@ def _create_label_token_dict(labels, split_symbol="_"): return {token: idx for idx, token in enumerate(sorted(distinct_tokens))} def prepare_from_domain(self, domain: Domain) -> None: - """Creates internal vocabularies for user intents - and bot actions to use for featurization""" + """Creates internal vocabularies for user intents and bot actions.""" + self.user_labels = domain.intent_states + domain.entity_states - self.slot_labels = domain.slot_states + self.slot_labels = domain.slot_states + domain.form_states self.bot_labels = domain.action_names if self.use_shared_vocab: @@ -197,10 +201,9 @@ def prepare_from_domain(self, domain: Domain) -> None: len(self.user_vocab) + len(self.slot_labels) + len(self.bot_vocab) ) - self.user_feature_len = len(self.user_vocab) - self.slot_feature_len = len(self.slot_labels) - def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Returns a binary vector indicating which tokens are present.""" + if not self.num_features: raise Exception( "LabelTokenizerSingleStateFeaturizer " @@ -246,10 +249,10 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + encoded_all_actions = np.zeros( - (domain.num_actions, len(self.bot_vocab)), dtype=int + (domain.num_actions, len(self.bot_vocab)), dtype=np.int32 ) for idx, name in enumerate(domain.action_names): for t in name.split(self.split_symbol): @@ -258,7 +261,7 @@ def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: class TrackerFeaturizer(object): - """Base class for actual tracker featurizers""" + """Base class for actual tracker featurizers.""" def __init__( self, @@ -266,7 +269,7 @@ def __init__( use_intent_probabilities: bool = False, ) -> None: - self.state_featurizer = state_featurizer or SingleStateFeaturizer() + self.state_featurizer = state_featurizer self.use_intent_probabilities = use_intent_probabilities def _create_states( @@ -276,9 +279,12 @@ def _create_states( is_binary_training: bool = False, ) -> List[Dict[Text, float]]: """Create states: a list of dictionaries. - If use_intent_probabilities is False (default behaviour), - pick the most probable intent out of all provided ones and - set its probability to 1.0, while all the others to 0.0.""" + + If use_intent_probabilities is False (default behaviour), + pick the most probable intent out of all provided ones and + set its probability to 1.0, while all the others to 0.0. 
+ """ + states = tracker.past_states(domain) # during training we encounter only 1 or 0 @@ -312,12 +318,15 @@ def _create_states( return [dict(state) for state in states] def _pad_states(self, states: List[Any]) -> List[Any]: + """Pads states.""" + return states def _featurize_states( self, trackers_as_states: List[List[Dict[Text, float]]] ) -> Tuple[np.ndarray, List[int]]: - """Create X""" + """Create X.""" + features = [] true_lengths = [] @@ -346,7 +355,7 @@ def _featurize_states( def _featurize_labels( self, trackers_as_actions: List[List[Text]], domain: Domain ) -> np.ndarray: - """Create y""" + """Create y.""" labels = [] for tracker_actions in trackers_as_actions: @@ -361,15 +370,18 @@ def _featurize_labels( labels.append(story_labels) + y = np.array(labels) # if it is MaxHistoryFeaturizer, squeeze out time axis - y = np.array(labels).squeeze() + if y.ndim == 3 and isinstance(self, MaxHistoryTrackerFeaturizer): + y = y[:, 0, :] return y def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions""" + """Transforms list of trackers to lists of states and actions.""" + raise NotImplementedError( "Featurizer must have the capacity to encode trackers to feature vectors" ) @@ -377,7 +389,8 @@ def training_states_and_actions( def featurize_trackers( self, trackers: List[DialogueStateTracker], domain: Domain ) -> DialogueTrainingData: - """Create training data""" + """Create training data.""" + self.state_featurizer.prepare_from_domain(domain) (trackers_as_states, trackers_as_actions) = self.training_states_and_actions( @@ -393,7 +406,8 @@ def featurize_trackers( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: - """Transforms list of trackers to lists of states for prediction""" + """Transforms list of trackers to lists of states for prediction.""" + raise NotImplementedError( "Featurizer must have the capacity to create feature vector" ) @@ -402,7 +416,7 @@ def prediction_states( def create_X( self, trackers: List[DialogueStateTracker], domain: Domain ) -> np.ndarray: - """Create X for prediction""" + """Create X for prediction.""" trackers_as_states = self.prediction_states(trackers, domain) X, _ = self._featurize_states(trackers_as_states) @@ -417,6 +431,8 @@ def persist(self, path): @staticmethod def load(path): + """Loads the featurizer from file.""" + featurizer_file = os.path.join(path, "featurizer.json") if os.path.isfile(featurizer_file): return jsonpickle.decode(rasa.utils.io.read_file(featurizer_file)) @@ -429,17 +445,18 @@ def load(path): class FullDialogueTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers - and creates full dialogue training data for - time distributed rnn. - Training data is padded up to the length of the longest - dialogue with -1""" + """Creates full dialogue training data for time distributed architectures. + + Creates training data that uses each time output for prediction. + Training data is padded up to the length of the longest dialogue with -1. 
+ """ def __init__( self, state_featurizer: SingleStateFeaturizer, use_intent_probabilities: bool = False, ) -> None: + super(FullDialogueTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -447,13 +464,15 @@ def __init__( @staticmethod def _calculate_max_len(trackers_as_actions): + """Calculate the length of the longest dialogue.""" + if trackers_as_actions: return max([len(states) for states in trackers_as_actions]) else: return None def _pad_states(self, states: List[Any]) -> List[Any]: - """Pads states up to max_len""" + """Pads states up to max_len.""" if len(states) < self.max_len: states += [None] * (self.max_len - len(states)) @@ -463,6 +482,10 @@ def _pad_states(self, states: List[Any]) -> List[Any]: def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the length of the longest dialogue with -1. + """ trackers_as_states = [] trackers_as_actions = [] @@ -510,6 +533,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers @@ -519,11 +543,11 @@ def prediction_states( class MaxHistoryTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers, - slices them into max_history batches and - creates training data for rnn that uses last output - for prediction. - Training data is padded up to the max_history with -1""" + """Slices the tracker history into max_history batches. + + Creates training data that uses last output for prediction. + Training data is padded up to the max_history with -1. + """ MAX_HISTORY_DEFAULT = 5 @@ -534,6 +558,7 @@ def __init__( remove_duplicates: bool = True, use_intent_probabilities: bool = False, ) -> None: + super(MaxHistoryTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -547,7 +572,8 @@ def slice_state_history( """Slices states from the trackers history. If the slice is at the array borders, padding will be added to ensure - the slice length.""" + the slice length. + """ slice_end = len(states) slice_start = max(0, slice_end - slice_length) @@ -558,6 +584,8 @@ def slice_state_history( @staticmethod def _hash_example(states, action): + """Hash states for efficient deduplication.""" + frozen_states = tuple( (s if s is None else frozenset(s.items()) for s in states) ) @@ -567,6 +595,10 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the max_history with -1. 
+ """ trackers_as_states = [] trackers_as_actions = [] @@ -621,6 +653,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index d1e5e1864cf1..6224513951cc 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -3,65 +3,51 @@ import json import logging import os +import pickle import warnings import numpy as np import typing from tqdm import tqdm -from typing import Any, List, Optional, Text, Dict, Tuple, Union +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator, Callable import rasa.utils.io from rasa.core import utils -from rasa.core.actions.action import ACTION_LISTEN_NAME from rasa.core.domain import Domain from rasa.core.featurizers import ( TrackerFeaturizer, FullDialogueTrackerFeaturizer, LabelTokenizerSingleStateFeaturizer, + MaxHistoryTrackerFeaturizer, ) from rasa.core.policies.policy import Policy +from rasa.core.trackers import DialogueStateTracker +from rasa.utils.common import is_logging_disabled +from sklearn.model_selection import train_test_split import tensorflow as tf -from rasa.core.policies.tf_utils import ( - TimeAttentionWrapper, - ChronoBiasLayerNormBasicLSTMCell, +from tensor2tensor.models.transformer import ( + transformer_base, + transformer_prepare_encoder, + transformer_encoder, ) -from rasa.core.trackers import DialogueStateTracker -from rasa.utils.common import is_logging_disabled +from tensor2tensor.layers.common_attention import large_compatible_negative if typing.TYPE_CHECKING: - from rasa.core.policies.tf_utils import TimeAttentionWrapperState - -try: - import cPickle as pickle # pytype: disable=import-error -except ImportError: - import pickle - -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 + from tensor2tensor.utils.hparam import HParams +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None logger = logging.getLogger(__name__) # namedtuple for all tf session related data -SessionData = namedtuple( - "SessionData", - ( - "X", - "Y", - "slots", - "previous_actions", - "actions_for_Y", - "x_for_no_intent", - "y_for_no_action", - "y_for_action_listen", - "all_Y_d", - ), -) +SessionData = namedtuple("SessionData", ("X", "Y", "labels")) class EmbeddingPolicy(Policy): - """Recurrent Embedding Dialogue Policy (REDP) + """Transformer Embedding Dialogue Policy (TEDP) - The policy that is used in our paper https://arxiv.org/abs/1811.11707 + Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707 """ SUPPORTS_ONLINE_TRAINING = True @@ -71,18 +57,26 @@ class EmbeddingPolicy(Policy): # nn architecture # a list of hidden layers sizes before user embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_a": [], + "hidden_layers_sizes_pre_dial": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_b": [], - # number of units in rnn cell - "rnn_size": 64, + "hidden_layers_sizes_bot": [], + # number of units in transformer + "transformer_size": 128, + # number of transformer layers + "num_transformer_layers": 1, + # type of 
positional encoding in transformer + "pos_encoding": "timing", # string 'timing' or 'emb' + # max sequence length if pos_encoding='emb' + "max_seq_length": 256, + # number of attention heads in transformer + "num_heads": 4, # training parameters - # flag if to turn on layer normalization for lstm cell - "layer_norm": True, - # initial and final batch sizes - batch size will be - # linearly increased for each epoch + # initial and final batch sizes: + # batch size will be linearly increased for each epoch "batch_size": [8, 32], + # how to create batches + "batch_strategy": "sequence", # string 'sequence' or 'balanced' # number of epochs "epochs": 1, # set random seed to any int to get reproducible results @@ -90,165 +84,115 @@ class EmbeddingPolicy(Policy): # embedding parameters # dimension size of embedding vectors "embed_dim": 20, + # the type of the similarity + "num_neg": 20, + # flag if minimize only maximum similarity over incorrect actions + "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + # the type of the loss function + "loss_type": "softmax", # string 'softmax' or 'margin' # how similar the algorithm should try # to make embedding vectors for correct actions "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' # maximum negative similarity for incorrect actions "mu_neg": -0.2, # should be -1.0 < ... < 1.0 for 'cosine' - # the type of the similarity - "similarity_type": "cosine", # string 'cosine' or 'inner' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect actions "use_max_sim_neg": True, # flag which loss function to use + # scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True, # regularization # the scale of L2 regularization "C2": 0.001, # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # scale loss with inverse frequency of bot actions - "scale_loss_by_action_counts": True, - # dropout rate for user nn - "droprate_a": 0.0, + # dropout rate for dial nn + "droprate_a": 0.1, # dropout rate for bot nn "droprate_b": 0.0, - # dropout rate for rnn - "droprate_rnn": 0.1, - # attention parameters - # flag to use attention over user input - # as an input to rnn - "attn_before_rnn": True, - # flag to use attention over prev bot actions - # and copy it to output bypassing rnn - "attn_after_rnn": True, - # flag to use `sparsemax` instead of `softmax` for attention - "sparse_attention": False, # flag to use sparsemax for probs - # the range of allowed location-based attention shifts - "attn_shift_range": None, # if None, set to mean dialogue length / 2 # visualization of accuracy - # how often calculate train accuracy + # how often calculate validation accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for calculation of train accuracy - "evaluate_on_num_examples": 100, # large values may hurt performance + # how many examples to use for hold out validation set + "evaluate_on_num_examples": 0, # large values may hurt performance } # end default properties (DOC MARKER - don't remove) - @classmethod - def _standard_featurizer(cls): - return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + @staticmethod + def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturizer": + if max_history is None: + return 
FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + else: + return MaxHistoryTrackerFeaturizer( + LabelTokenizerSingleStateFeaturizer(), max_history=max_history + ) def __init__( self, - featurizer: Optional[FullDialogueTrackerFeaturizer] = None, + featurizer: Optional["TrackerFeaturizer"] = None, priority: int = 1, - encoded_all_actions: Optional[np.ndarray] = None, - graph: Optional[tf.Graph] = None, - session: Optional[tf.Session] = None, - intent_placeholder: Optional[tf.Tensor] = None, - action_placeholder: Optional[tf.Tensor] = None, - slots_placeholder: Optional[tf.Tensor] = None, - prev_act_placeholder: Optional[tf.Tensor] = None, - dialogue_len: Optional[tf.Tensor] = None, - x_for_no_intent: Optional[tf.Tensor] = None, - y_for_no_action: Optional[tf.Tensor] = None, - y_for_action_listen: Optional[tf.Tensor] = None, - similarity_op: Optional[tf.Tensor] = None, - alignment_history: Optional[tf.Tensor] = None, - user_embed: Optional[tf.Tensor] = None, - bot_embed: Optional[tf.Tensor] = None, - slot_embed: Optional[tf.Tensor] = None, - dial_embed: Optional[tf.Tensor] = None, - rnn_embed: Optional[tf.Tensor] = None, - attn_embed: Optional[tf.Tensor] = None, - copy_attn_debug: Optional[tf.Tensor] = None, - all_time_masks: Optional[tf.Tensor] = None, + graph: Optional["tf.Graph"] = None, + session: Optional["tf.Session"] = None, + user_placeholder: Optional["tf.Tensor"] = None, + bot_placeholder: Optional["tf.Tensor"] = None, + similarity_all: Optional["tf.Tensor"] = None, + pred_confidence: Optional["tf.Tensor"] = None, + similarity: Optional["tf.Tensor"] = None, + dial_embed: Optional["tf.Tensor"] = None, + bot_embed: Optional["tf.Tensor"] = None, + all_bot_embed: Optional["tf.Tensor"] = None, + attention_weights: Optional["tf.Tensor"] = None, + max_history: Optional[int] = None, **kwargs: Any ) -> None: - if featurizer: - if not isinstance(featurizer, FullDialogueTrackerFeaturizer): - raise TypeError( - "Passed tracker featurizer of type {}, " - "should be FullDialogueTrackerFeaturizer." 
- "".format(type(featurizer).__name__) - ) - super(EmbeddingPolicy, self).__init__(featurizer, priority) + """Declare instant variables with default values""" - # flag if to use the same embeddings for user and bot - try: - self.share_embedding = self.featurizer.state_featurizer.use_shared_vocab - except AttributeError: - self.share_embedding = False + if not featurizer: + featurizer = self._standard_featurizer(max_history) + super(EmbeddingPolicy, self).__init__(featurizer, priority) self._load_params(**kwargs) - # chrono initialization for forget bias - self.characteristic_time = None - # encode all actions with numbers - # persist this array for prediction time - self.encoded_all_actions = encoded_all_actions + self._encoded_all_actions = None # tf related instances self.graph = graph self.session = session - self.a_in = intent_placeholder - self.b_in = action_placeholder - self.c_in = slots_placeholder - self.b_prev_in = prev_act_placeholder - self._dialogue_len = dialogue_len - self._x_for_no_intent_in = x_for_no_intent - self._y_for_no_action_in = y_for_no_action - self._y_for_action_listen_in = y_for_action_listen - self.sim_op = similarity_op - - # store attention probability distribution as - # concatenated tensor of each attention types - self.alignment_history = alignment_history + self.a_in = user_placeholder + self.b_in = bot_placeholder + self.sim_all = similarity_all + self.pred_confidence = pred_confidence + self.sim = similarity # persisted embeddings - self.user_embed = user_embed - self.bot_embed = bot_embed - self.slot_embed = slot_embed self.dial_embed = dial_embed + self.bot_embed = bot_embed + self.all_bot_embed = all_bot_embed - self.rnn_embed = rnn_embed - self.attn_embed = attn_embed - self.copy_attn_debug = copy_attn_debug - - self.all_time_masks = all_time_masks - + self.attention_weights = attention_weights # internal tf instances + self._iterator = None self._train_op = None self._is_training = None - self._loss_scales = None # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes = { - "a": config["hidden_layers_sizes_a"], - "b": config["hidden_layers_sizes_b"], + self.hidden_layers_sizes = { + "pre_dial": config["hidden_layers_sizes_pre_dial"], + "bot": config["hidden_layers_sizes_bot"], } - if self.share_embedding: - if self.hidden_layer_sizes["a"] != self.hidden_layer_sizes["b"]: - raise ValueError( - "Due to sharing vocabulary " - "in the featurizer, embedding weights " - "are shared as well. 
" - "So hidden_layers_sizes_a={} should be " - "equal to hidden_layers_sizes_b={}" - "".format( - self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] - ) - ) + self.pos_encoding = config["pos_encoding"] + self.max_seq_length = config["max_seq_length"] + self.num_heads = config["num_heads"] - self.rnn_size = config["rnn_size"] - self.layer_norm = config["layer_norm"] + self.transformer_size = config["transformer_size"] + self.num_transformer_layers = config["num_transformer_layers"] self.batch_size = config["batch_size"] + self.batch_strategy = config["batch_strategy"] self.epochs = config["epochs"] @@ -259,27 +203,21 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.mu_pos = config["mu_pos"] self.mu_neg = config["mu_neg"] self.similarity_type = config["similarity_type"] + self.loss_type = config["loss_type"] + if self.similarity_type == "auto": + if self.loss_type == "softmax": + self.similarity_type = "inner" + elif self.loss_type == "margin": + self.similarity_type = "cosine" + self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] + self.scale_loss = config["scale_loss"] def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.scale_loss_by_action_counts = config["scale_loss_by_action_counts"] - self.droprate = { - "a": config["droprate_a"], - "b": config["droprate_b"], - "rnn": config["droprate_rnn"], - } - - def _load_attn_params(self, config: Dict[Text, Any]) -> None: - self.sparse_attention = config["sparse_attention"] - self.attn_shift_range = config["attn_shift_range"] - self.attn_after_rnn = config["attn_after_rnn"] - self.attn_before_rnn = config["attn_before_rnn"] - - def is_using_attention(self): - return self.attn_after_rnn or self.attn_before_rnn + self.droprate = {"bot": config["droprate_b"], "dial": config["droprate_a"]} def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] @@ -295,116 +233,234 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self._load_nn_architecture_params(config) self._load_embedding_params(config) self._load_regularization_params(config) - self._load_attn_params(config) self._load_visual_params(config) # data helpers - # noinspection PyPep8Naming - def _create_X_slots_previous_actions( - self, data_X: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Extract feature vectors - - for user input (X), slots and - previously executed actions from training data. 
- """ - - featurizer = self.featurizer.state_featurizer - slot_start = featurizer.user_feature_len - previous_start = slot_start + featurizer.slot_feature_len - - X = data_X[:, :, :slot_start] - slots = data_X[:, :, slot_start:previous_start] - previous_actions = data_X[:, :, previous_start:] - - return X, slots, previous_actions - # noinspection PyPep8Naming @staticmethod - def _actions_for_Y(data_Y: np.ndarray) -> np.ndarray: + def _labels_for_Y(data_Y: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: extract actions indices.""" + return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: + def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: features for action labels.""" - return np.stack( - [ - np.stack( - [self.encoded_all_actions[action_idx] for action_idx in action_ids] - ) - for action_ids in actions_for_Y - ] - ) + if len(labels.shape) == 2: + return np.stack( + [ + np.stack( + [ + self._encoded_all_actions[action_idx] + for action_idx in action_ids + ] + ) + for action_ids in labels + ] + ) + else: + return np.stack( + [self._encoded_all_actions[action_idx] for action_idx in labels] + ) # noinspection PyPep8Naming - @staticmethod - def _create_zero_vector(X: np.ndarray) -> np.ndarray: - """Create zero vector of shape (1, X.shape[-1]).""" + def _create_session_data( + self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None + ) -> "SessionData": + """Combine all tf session related data into a named tuple""" - return np.zeros((1, X.shape[-1]), X.dtype) + if data_Y is not None: + # training time + labels = self._labels_for_Y(data_Y) + Y = self._action_features_for_Y(labels) + + # idea taken from sklearn's stratify split + if labels.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + labels = np.array([" ".join(row.astype("str")) for row in labels]) + else: + # prediction time + labels = None + Y = None - def _create_y_for_action_listen(self, domain: "Domain") -> np.ndarray: - """Extract feature vector for action_listen""" - action_listen_idx = domain.index_for_action(ACTION_LISTEN_NAME) - return self.encoded_all_actions[action_listen_idx : action_listen_idx + 1] + return SessionData(X=data_X, Y=Y, labels=labels) # noinspection PyPep8Naming - def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: - """Stack encoded_all_intents on top of each other + def _train_val_split( + self, session_data: "SessionData" + ) -> Tuple["SessionData", "SessionData"]: + """Create random hold out validation set using stratified split.""" - to create candidates for training examples and - to calculate training accuracy. 
- """ + label_counts = dict( + zip(*np.unique(session_data.labels, return_counts=True, axis=0)) + ) + counts = np.array([label_counts[label] for label in session_data.labels]) + + multi_X = session_data.X[counts > 1] + multi_Y = session_data.Y[counts > 1] + multi_labels = session_data.labels[counts > 1] + + solo_X = session_data.X[counts == 1] + solo_Y = session_data.Y[counts == 1] + solo_labels = session_data.labels[counts == 1] + + (X_train, X_val, Y_train, Y_val, labels_train, labels_val) = train_test_split( + multi_X, + multi_Y, + multi_labels, + test_size=self.evaluate_on_num_examples, + random_state=self.random_seed, + stratify=multi_labels, + ) + X_train = np.concatenate([X_train, solo_X]) + Y_train = np.concatenate([Y_train, solo_Y]) + labels_train = np.concatenate([labels_train, solo_labels]) + + return ( + SessionData(X=X_train, Y=Y_train, labels=labels_train), + SessionData(X=X_val, Y=Y_val, labels=labels_val), + ) - return np.stack([self.encoded_all_actions] * dialogue_len) + @staticmethod + def _shuffle_session_data(session_data: "SessionData") -> "SessionData": + """Shuffle session data.""" + ids = np.random.permutation(len(session_data.X)) + return SessionData( + X=session_data.X[ids], + Y=session_data.Y[ids], + labels=session_data.labels[ids], + ) + + # tf helpers: # noinspection PyPep8Naming - def _create_tf_session_data( - self, domain: "Domain", data_X: np.ndarray, data_Y: Optional[np.ndarray] = None - ) -> SessionData: - """Combine all tf session related data into a named tuple""" + def _gen_batch( + self, + session_data: "SessionData", + batch_size: int, + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: + """Generate batches.""" + + if shuffle: + session_data = self._shuffle_session_data(session_data) + + if batch_strategy == "balanced": + num_examples = len(session_data.X) + unique_labels, counts_labels = np.unique( + session_data.labels, return_counts=True, axis=0 + ) + num_labels = len(unique_labels) + + label_data = [] + for label in unique_labels: + label_data.append( + SessionData( + X=session_data.X[session_data.labels == label], + Y=session_data.Y[session_data.labels == label], + labels=None, # ignore new labels + ) + ) - X, slots, previous_actions = self._create_X_slots_previous_actions(data_X) + data_idx = [0] * num_labels + num_data_cycles = [0] * num_labels + skipped = [False] * num_labels + new_X = [] + new_Y = [] + while min(num_data_cycles) == 0: + if shuffle: + ids = np.random.permutation(num_labels) + else: + ids = range(num_labels) + + for i in ids: + if num_data_cycles[i] > 0 and not skipped[i]: + skipped[i] = True + continue + else: + skipped[i] = False + + num_i = int(counts_labels[i] / num_examples * batch_size) + 1 + + new_X.append(label_data[i].X[data_idx[i] : data_idx[i] + num_i]) + new_Y.append(label_data[i].Y[data_idx[i] : data_idx[i] + num_i]) + + data_idx[i] += num_i + if data_idx[i] >= counts_labels[i]: + num_data_cycles[i] += 1 + data_idx[i] = 0 + + if min(num_data_cycles) > 0: + break + + session_data = SessionData( + X=np.concatenate(new_X), Y=np.concatenate(new_Y), labels=None + ) # ignore new labels + + num_batches = session_data.X.shape[0] // batch_size + int( + session_data.X.shape[0] % batch_size > 0 + ) - if data_Y is not None: - # training time - actions_for_Y = self._actions_for_Y(data_Y) - Y = self._action_features_for_Y(actions_for_Y) - else: - # prediction time - actions_for_Y = None - Y = None + for batch_num in range(num_batches): + batch_x = 
session_data.X[ + batch_num * batch_size : (batch_num + 1) * batch_size + ] + batch_y = session_data.Y[ + batch_num * batch_size : (batch_num + 1) * batch_size + ] - x_for_no_intent = self._create_zero_vector(X) - y_for_no_action = self._create_zero_vector(previous_actions) - y_for_action_listen = self._create_y_for_action_listen(domain) + yield batch_x, batch_y - # is needed to calculate train accuracy - all_Y_d = self._create_all_Y_d(X.shape[1]) + # noinspection PyPep8Naming + def _create_tf_dataset( + self, + session_data: "SessionData", + batch_size: Union["tf.Tensor", int], + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> "tf.data.Dataset": + """Create tf dataset.""" + + # set batch and sequence length to None + shape_X = (None, None, session_data.X[0].shape[-1]) + + if session_data.Y[0].ndim == 1: + shape_Y = (None, session_data.Y[0].shape[-1]) + else: + shape_Y = (None, None, session_data.Y[0].shape[-1]) - return SessionData( - X=X, - Y=Y, - slots=slots, - previous_actions=previous_actions, - actions_for_Y=actions_for_Y, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, - all_Y_d=all_Y_d, + return tf.data.Dataset.from_generator( + lambda batch_size_: self._gen_batch( + session_data, batch_size_, batch_strategy, shuffle + ), + output_types=(tf.float32, tf.float32), + output_shapes=(shape_X, shape_Y), + args=([batch_size]), ) - # tf helpers: + @staticmethod + def _create_tf_iterator(dataset: "tf.data.Dataset") -> "tf.data.Iterator": + """Create tf iterator.""" + + return tf.data.Iterator.from_structure( + dataset.output_types, + dataset.output_shapes, + output_classes=dataset.output_classes, + ) def _create_tf_nn( self, - x_in: tf.Tensor, - layer_sizes: List, + x_in: "tf.Tensor", + layer_sizes: List[int], droprate: float, layer_name_suffix: Text, - ) -> tf.Tensor: + activation: Optional[Callable] = tf.nn.relu, + use_bias: bool = True, + kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, + ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -413,7 +469,9 @@ def _create_tf_nn( x = tf.layers.dense( inputs=x, units=layer_size, - activation=tf.nn.relu, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, kernel_regularizer=reg, name="hidden_layer_{}_{}".format(layer_name_suffix, i), reuse=tf.AUTO_REUSE, @@ -421,7 +479,21 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: + def _tf_normalize_if_cosine(self, x: "tf.Tensor") -> "tf.Tensor": + """Normalize embedding if similarity type is cosine.""" + + if self.similarity_type == "cosine": + return tf.nn.l2_normalize(x, -1) + elif self.similarity_type == "inner": + return x + else: + raise ValueError( + "Wrong similarity type '{}', " + "should be 'cosine' or 'inner'" + "".format(self.similarity_type) + ) + + def _create_tf_embed(self, x: "tf.Tensor", layer_name_suffix: Text) -> "tf.Tensor": """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -433,656 +505,441 @@ def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: name="embed_layer_{}".format(layer_name_suffix), reuse=tf.AUTO_REUSE, ) - return embed_x - - def _create_tf_user_embed(self, a_in: tf.Tensor) -> tf.Tensor: - """Create embedding user vector.""" + # normalize embedding vectors for 
cosine similarity + return self._tf_normalize_if_cosine(embed_x) - layer_name_suffix = "a_and_b" if self.share_embedding else "a" - - a = self._create_tf_nn( - a_in, - self.hidden_layer_sizes["a"], - self.droprate["a"], - layer_name_suffix=layer_name_suffix, - ) - return self._create_embed(a, layer_name_suffix=layer_name_suffix) - - def _create_tf_bot_embed(self, b_in: tf.Tensor) -> tf.Tensor: + def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" - layer_name_suffix = "a_and_b" if self.share_embedding else "b" - b = self._create_tf_nn( b_in, - self.hidden_layer_sizes["b"], - self.droprate["b"], - layer_name_suffix=layer_name_suffix, + self.hidden_layers_sizes["bot"], + self.droprate["bot"], + layer_name_suffix="bot", ) - return self._create_embed(b, layer_name_suffix=layer_name_suffix) + return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_tf_no_intent_embed(self, x_for_no_intent_i: tf.Tensor) -> tf.Tensor: - """Create embedding user vector for empty intent.""" + def _create_t2t_hparams(self) -> "HParams": + """Create parameters for t2t transformer.""" - layer_name_suffix = "a_and_b" if self.share_embedding else "a" + hparams = transformer_base() - x_for_no_intent = self._create_tf_nn( - x_for_no_intent_i, - self.hidden_layer_sizes["a"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(x_for_no_intent, layer_name_suffix=layer_name_suffix) - ) + hparams.num_hidden_layers = self.num_transformer_layers + hparams.hidden_size = self.transformer_size + # it seems to be factor of 4 for transformer architectures in t2t + hparams.filter_size = hparams.hidden_size * 4 + hparams.num_heads = self.num_heads + hparams.relu_dropout = self.droprate["dial"] + hparams.pos = self.pos_encoding - def _create_tf_no_action_embed(self, y_for_no_action_in: tf.Tensor) -> tf.Tensor: - """Create embedding bot vector for empty action and action_listen.""" + hparams.max_length = self.max_seq_length - layer_name_suffix = "a_and_b" if self.share_embedding else "b" + hparams.unidirectional_encoder = True - y_for_no_action = self._create_tf_nn( - y_for_no_action_in, - self.hidden_layer_sizes["b"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(y_for_no_action, layer_name_suffix=layer_name_suffix) - ) + hparams.self_attention_type = "dot_product_relative_v2" + hparams.max_relative_position = 5 + hparams.add_relative_to_values = True - def _create_rnn_cell(self) -> tf.contrib.rnn.RNNCell: - """Create one rnn cell.""" + return hparams - # chrono initialization for forget bias - # assuming that characteristic time is max dialogue length - # left border that initializes forget gate close to 0 - bias_0 = -1.0 + # noinspection PyUnresolvedReferences + def _create_t2t_transformer_encoder( + self, + x_in: "tf.Tensor", + mask: "tf.Tensor", + attention_weights: Dict[Text, "tf.Tensor"], + ) -> "tf.Tensor": + """Create t2t transformer encoder.""" + + hparams = self._create_t2t_hparams() + + # When not in training mode, set all forms of dropout to zero. 
+ for key, value in hparams.values().items(): + if key.endswith("dropout") or key == "label_smoothing": + setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) + + with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + x = self._create_tf_nn( + x_in, + [hparams.hidden_size], + hparams.layer_prepostprocess_dropout, + layer_name_suffix="pre_embed", + activation=None, + use_bias=False, + kernel_initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size ** -0.5 + ), + ) + if hparams.multiply_embedding_mode == "sqrt_depth": + x *= hparams.hidden_size ** 0.5 - # right border that initializes forget gate close to 1 - bias_1 = np.log(self.characteristic_time - 1.0) - fbias = (bias_1 - bias_0) * np.random.random(self.rnn_size) + bias_0 + x *= tf.expand_dims(mask, -1) + ( + x, + self_attention_bias, + encoder_decoder_attention_bias, + ) = transformer_prepare_encoder(x, None, hparams) + + x *= tf.expand_dims(mask, -1) + + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + + attn_bias_for_padding = None + # Otherwise the encoder will just use encoder_self_attention_bias. + if hparams.unidirectional_encoder: + attn_bias_for_padding = encoder_decoder_attention_bias + + x = transformer_encoder( + x, + self_attention_bias, + hparams, + nonpadding=mask, + save_weights_to=attention_weights, + attn_bias_for_padding=attn_bias_for_padding, + ) - if self.attn_after_rnn: - # since attention is copied to rnn output, - # embedding should be performed inside the cell - embed_layer_size = self.embed_dim - else: - embed_layer_size = None + x *= tf.expand_dims(mask, -1) - keep_prob = 1.0 - ( - self.droprate["rnn"] * tf.cast(self._is_training, tf.float32) - ) + return tf.nn.dropout( + tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout + ) - return ChronoBiasLayerNormBasicLSTMCell( - num_units=self.rnn_size, - layer_norm=self.layer_norm, - forget_bias=fbias, - input_bias=-fbias, - dropout_keep_prob=keep_prob, - out_layer_size=embed_layer_size, - ) + def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: + """Create dialogue level embedding and mask.""" - @staticmethod - def _num_units(memory: tf.Tensor) -> int: - return memory.shape[-1].value - - def _create_attn_mech( - self, memory: tf.Tensor, real_length: tf.Tensor - ) -> tf.contrib.seq2seq.AttentionMechanism: - - return tf.contrib.seq2seq.BahdanauAttention( - num_units=self._num_units(memory), - memory=memory, - memory_sequence_length=real_length, - normalize=True, - probability_fn=tf.identity, - # we only attend to memory up to a current time step - # it does not affect alignments, but - # is important for interpolation gate - score_mask_value=0, - ) + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - def cell_input_fn( - self, - rnn_inputs: tf.Tensor, - attention: tf.Tensor, - num_cell_input_memory_units: int, - ) -> tf.Tensor: - """Combine rnn inputs and attention into cell input. + a = self._create_tf_nn( + a_in, + self.hidden_layers_sizes["pre_dial"], + self.droprate["dial"], + layer_name_suffix="pre_dial", + ) - Args: - rnn_inputs: Tensor, first output from `rnn_and_attn_inputs_fn`. + self.attention_weights = {} + a = self._create_t2t_transformer_encoder(a, mask, self.attention_weights) - attention: Tensor, concatenated all attentions for one time step. 
+ dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") - num_cell_input_memory_units: int, number of the first units in - `attention` that are responsible for - enhancing cell input. + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # pick last action if max history featurizer is used + dial_embed = dial_embed[:, -1:, :] + mask = mask[:, -1:] - Returns: - A Tensor `cell_inputs` to feed to an rnn cell. - """ + return dial_embed, mask - if num_cell_input_memory_units: - if num_cell_input_memory_units == self.embed_dim: - # since attention can contain additional - # attention mechanisms, only attention - # from previous user input is used as an input - # for rnn cell and only if memory before rnn - # is the same size as embed_utter - return tf.concat( - [ - rnn_inputs[:, : self.embed_dim] - + attention[:, :num_cell_input_memory_units], - rnn_inputs[:, self.embed_dim :], - ], - -1, - ) - else: - # in current implementation it cannot fall here, - # but this Exception exists in case - # attention before rnn is changed - raise ValueError( - "Number of memory units {} is not " - "equal to number of utter units {}. " - "Please modify cell input function " - "accordingly." - "".format(num_cell_input_memory_units, self.embed_dim) - ) - else: - return rnn_inputs + @staticmethod + def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor": + """Make tensor 2D.""" - def rnn_and_attn_inputs_fn( - self, inputs: tf.Tensor, cell_state: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: - """Construct rnn input and attention mechanism input. + return tf.reshape(x, (-1, x.shape[-1])) - Args: - inputs: Tensor, concatenated all embeddings for one time step: - [embed_utter, embed_slots, embed_prev_action]. + @staticmethod + def _tf_sample_neg( + batch_size: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": + """Sample negative examples for given indices""" - cell_state: Tensor, state of an rnn cell. + tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) - Returns: - Tuple of Tensors `rnn_inputs, attn_inputs` to feed to - rnn and attention mechanisms. 
- """ + return tf.batch_gather(tiled_all_bs, neg_ids) - # the hidden state c and slots are not included, - # in hope that algorithm would learn correct attention - # regardless of the hidden state c of an lstm and slots - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state.h], -1) - else: - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state], -1) + def _tf_calc_iou_mask( + self, pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": + """Calculate IOU mask for given indices""" - # include slots in inputs but exclude previous action, since - # rnn should get previous action from its hidden state - rnn_inputs = inputs[:, : (self.embed_dim + self.embed_dim)] + pos_b_in_flat = tf.expand_dims(pos_b, -2) + neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - return rnn_inputs, attn_inputs + intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) + union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - def _create_attn_cell( - self, - cell: tf.contrib.rnn.RNNCell, - embed_utter: tf.Tensor, - embed_prev_action: tf.Tensor, - real_length: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, - ) -> tf.contrib.rnn.RNNCell: - """Wrap cell in attention wrapper with given memory.""" - - if self.attn_before_rnn: - # create attention over previous user input - num_memory_units_before_rnn = self._num_units(embed_utter) - attn_mech = self._create_attn_mech(embed_utter, real_length) - - # create mask for empty user input not to pay attention to it - ignore_mask = tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_intent, 0), embed_utter), -1 - ) + iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum( + union_b_in_flat, -1 + ) + return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) - # do not use attention by location before rnn - attn_shift_range = 0 - else: - attn_mech = None - ignore_mask = None - num_memory_units_before_rnn = None - attn_shift_range = None - - if self.attn_after_rnn: - # create attention over previous bot actions - attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) - - # create mask for empty bot action or action_listen - # not to pay attention to them - ignore_mask_listen = tf.logical_or( - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), - -1, - ), - tf.reduce_all( - tf.equal( - tf.expand_dims(embed_for_action_listen, 0), embed_prev_action - ), - -1, - ), - ) + def _tf_get_negs( + self, all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor" + ) -> Tuple["tf.Tensor", "tf.Tensor"]: + """Get negative examples from given tensor.""" - if attn_mech is not None: - # if there is another attention mechanism, - # create a list of attention mechanisms - attn_mech = [attn_mech, attn_mech_after_rnn] - ignore_mask = [ignore_mask, ignore_mask_listen] - attn_shift_range = [attn_shift_range, self.attn_shift_range] - else: - attn_mech = attn_mech_after_rnn - ignore_mask = ignore_mask_listen - attn_shift_range = self.attn_shift_range + batch_size = tf.shape(raw_pos)[0] + seq_length = tf.shape(raw_pos)[1] + raw_flat = self._tf_make_flat(raw_pos) - # this particular attention mechanism is unusual - # in the sense that its calculated attention vector is directly - # added to cell output, therefore enabling copy mechanism + total_candidates = tf.shape(all_embed)[0] - # `index_of_attn_to_copy` is used by 
`TimeAttentionWrapper`, - # to know which attention to copy - index_of_attn_to_copy = -1 - else: - index_of_attn_to_copy = None - - return TimeAttentionWrapper( - cell=cell, - attention_mechanism=attn_mech, - sequence_len=self._dialogue_len, - attn_shift_range=attn_shift_range, - sparse_attention=self.sparse_attention, - rnn_and_attn_inputs_fn=self.rnn_and_attn_inputs_fn, - ignore_mask=ignore_mask, - cell_input_fn=lambda inputs, attention: ( - self.cell_input_fn(inputs, attention, num_memory_units_before_rnn) - ), - index_of_attn_to_copy=index_of_attn_to_copy, - likelihood_fn=lambda emb_1, emb_2: (self._tf_sim(emb_1, emb_2, None)), - tensor_not_to_copy=embed_for_action_listen, - output_attention=True, - alignment_history=True, + all_indices = tf.tile( + tf.expand_dims(tf.range(0, total_candidates, 1), 0), + (batch_size * seq_length, 1), + ) + shuffled_indices = tf.transpose( + tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) ) + neg_ids = shuffled_indices[:, : self.num_neg] - def _create_tf_dial_embed( - self, - embed_utter: tf.Tensor, - embed_slots: tf.Tensor, - embed_prev_action: tf.Tensor, - mask: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, - ) -> Tuple[tf.Tensor, Union[tf.Tensor, "TimeAttentionWrapperState"]]: - """Create rnn for dialogue level embedding.""" - - cell_input = tf.concat([embed_utter, embed_slots, embed_prev_action], -1) - - cell = self._create_rnn_cell() - - real_length = tf.cast(tf.reduce_sum(mask, 1), tf.int32) - - if self.is_using_attention(): - cell = self._create_attn_cell( - cell, - embed_utter, - embed_prev_action, - real_length, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) + bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) + bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) - return tf.nn.dynamic_rnn( - cell, - cell_input, - dtype=tf.float32, - sequence_length=real_length, - scope="rnn_decoder", + neg_embed_flat = self._tf_sample_neg( + batch_size * seq_length, all_embed, neg_ids + ) + neg_embed = tf.reshape( + neg_embed_flat, (batch_size, seq_length, -1, all_embed.shape[-1]) ) - @staticmethod - def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: - """Extract alignments history form final rnn cell state.""" - - alignments_from_state = final_state.alignment_history - if not isinstance(alignments_from_state, tuple): - alignments_from_state = [alignments_from_state] + return neg_embed, bad_negs - alignment_history = [] - for alignments in alignments_from_state: - # reshape to (batch, time, memory_time) - alignment_history.append(tf.transpose(alignments.stack(), [1, 0, 2])) + def _sample_negatives( + self, all_actions: "tf.Tensor" + ) -> Tuple[ + "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor" + ]: + """Sample negative examples.""" - return tf.concat(alignment_history, -1) + pos_dial_embed = tf.expand_dims(self.dial_embed, -2) + neg_dial_embed, dial_bad_negs = self._tf_get_negs( + self._tf_make_flat(self.dial_embed), + self._tf_make_flat(self.b_in), + self.b_in, + ) + pos_bot_embed = tf.expand_dims(self.bot_embed, -2) + neg_bot_embed, bot_bad_negs = self._tf_get_negs( + self.all_bot_embed, all_actions, self.b_in + ) + return ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + ) @staticmethod - def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: - """Extract all time masks 
form final rnn cell state.""" - - # reshape to (batch, time, memory_time) and ignore last time - # because time_mask is created for the next time step - return tf.transpose(final_state.all_time_masks.stack(), [1, 0, 2])[:, :-1, :] - - def _sims_rnn_to_max_from(self, cell_output: tf.Tensor) -> List[tf.Tensor]: - """Save intermediate tensors for debug purposes.""" - - if self.attn_after_rnn: - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.copy_attn_debug = cell_output[:, :, -num_add:] - - # extract additional similarity to maximize - sim_attn_to_max = cell_output[:, :, -num_add] - sim_state_to_max = cell_output[:, :, -num_add + 1] - return [sim_attn_to_max, sim_state_to_max] - else: - return [] - - def _embed_dialogue_from(self, cell_output: tf.Tensor) -> tf.Tensor: - """Extract or calculate dialogue level embedding from cell_output.""" - - if self.attn_after_rnn: - # embedding layer is inside rnn cell - embed_dialogue = cell_output[:, :, : self.embed_dim] - - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.rnn_embed = cell_output[ - :, :, self.embed_dim : (self.embed_dim + self.embed_dim) - ] - self.attn_embed = cell_output[ - :, :, (self.embed_dim + self.embed_dim) : -num_add - ] - else: - # add embedding layer to rnn cell output - embed_dialogue = self._create_embed( - cell_output[:, :, : self.rnn_size], layer_name_suffix="out" - ) - if self.attn_before_rnn: - # extract additional debug tensors - self.attn_embed = cell_output[:, :, self.rnn_size :] + def _tf_raw_sim(a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor": + """Calculate similarity between given tensors.""" - return embed_dialogue + return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) def _tf_sim( self, - embed_dialogue: tf.Tensor, - embed_action: tf.Tensor, - mask: Optional[tf.Tensor], - ) -> Tuple[tf.Tensor, tf.Tensor]: - """Define similarity. - - This method has two roles: - - calculate similarity between - two embedding vectors of the same size - and output binary mask and similarity; - - calculate similarity with several embedded actions for the loss - and output similarities between user input and bot actions - and similarities between bot actions. - - They are kept in the same helper method, - because it is necessary for them to be mathematically identical. 
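The `_tf_calc_iou_mask` helper marks sampled negatives whose binary action features are identical to the positive action (intersection over union equals one), so that they can later be pushed towards negative infinity before the loss, while `_tf_raw_sim` is a masked dot product. A rough NumPy restatement, assuming binary bot-action features (names are illustrative):

import numpy as np

def bad_negative_mask(pos_b, neg_b):
    # pos_b: (d,) binary features of the correct action
    # neg_b: (num_neg, d) binary features of the sampled negatives
    intersection = np.minimum(neg_b, pos_b).sum(-1)
    union = np.maximum(neg_b, pos_b).sum(-1)
    iou = intersection / union
    # `1.0 - relu(sign(1.0 - iou))` above is 1 exactly when iou >= 1,
    # i.e. when the "negative" is in fact the same action as the positive
    return (iou >= 1.0).astype(np.float32)

def raw_sim(a, b, mask):
    # dot product along the embedding axis, zeroed out for padded time steps
    return (a * b).sum(-1) * mask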
- """ - - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity - embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) - embed_action = tf.nn.l2_normalize(embed_action, -1) - - if self.similarity_type in {"cosine", "inner"}: - - if len(embed_dialogue.shape) == len(embed_action.shape): - # calculate similarity between - # two embedding vectors of the same size - sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - bin_sim = tf.where( - sim > (self.mu_pos - self.mu_neg) / 2.0, - tf.ones_like(sim), - tf.zeros_like(sim), - ) - - # output binary mask and similarity - return bin_sim, sim - - else: - # calculate similarity with several - # embedded actions for the loss - sim = tf.reduce_sum( - tf.expand_dims(embed_dialogue, -2) * embed_action, -1 - ) - sim *= tf.expand_dims(mask, 2) - - sim_act = tf.reduce_sum( - embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) - sim_act *= tf.expand_dims(mask, 2) - - # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_act + pos_dial_embed: "tf.Tensor", + pos_bot_embed: "tf.Tensor", + neg_dial_embed: "tf.Tensor", + neg_bot_embed: "tf.Tensor", + dial_bad_negs: "tf.Tensor", + bot_bad_negs: "tf.Tensor", + mask: "tf.Tensor", + ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]: + """Define similarity.""" + + # calculate similarity with several + # embedded actions for the loss + neg_inf = large_compatible_negative(pos_dial_embed.dtype) + + sim_pos = self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) + sim_neg = ( + self._tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_bot_bot = ( + self._tf_raw_sim(pos_bot_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_dial_dial = ( + self._tf_raw_sim(pos_dial_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) + sim_neg_bot_dial = ( + self._tf_raw_sim(pos_bot_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) - else: - raise ValueError( - "Wrong similarity type {}, " - "should be 'cosine' or 'inner'" - "".format(self.similarity_type) - ) + # output similarities between user input and bot actions + # and similarities between bot actions and similarities between user inputs + return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial - def _regularization_loss(self) -> Union[tf.Tensor, int]: - """Add regularization to the embed layer inside rnn cell.""" + @staticmethod + def _tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor": + """Calculate accuracy""" - if self.attn_after_rnn: - return self.C2 * tf.add_n( - [ - tf.nn.l2_loss(tf_var) - for tf_var in tf.trainable_variables() - if "cell/out_layer/kernel" in tf_var.name - ] - ) - else: - return 0 + max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) + return tf.reduce_mean( + tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), tf.float32) + ) - def _tf_loss( + def _tf_loss_margin( self, - sim: tf.Tensor, - sim_act: tf.Tensor, - sims_rnn_to_max: List[tf.Tensor], - mask: tf.Tensor, - ) -> tf.Tensor: - """Define loss.""" + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": + """Define max margin loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0.0, self.mu_pos - sim[:, :, 0]) + loss = tf.maximum(0.0, self.mu_pos - 
sim_pos[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions - max_sim_neg = tf.reduce_max(sim[:, :, 1:], -1) + max_sim_neg = tf.reduce_max(sim_neg, -1) loss += tf.maximum(0.0, self.mu_neg + max_sim_neg) else: # minimize all similarities with incorrect actions - max_margin = tf.maximum(0.0, self.mu_neg + sim[:, :, 1:]) + max_margin = tf.maximum(0.0, self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - if self.scale_loss_by_action_counts: - # scale loss inverse proportionally to number of action counts - loss *= self._loss_scales + # penalize max similarity between pos bot and neg bot embeddings + max_sim_neg_bot = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_bot, -1)) + loss += max_sim_neg_bot * self.C_emb - # penalize max similarity between intent embeddings - loss_act = tf.maximum(0.0, tf.reduce_max(sim_act, -1)) - loss += loss_act * self.C_emb + # penalize max similarity between pos dial and neg dial embeddings + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_dial_dial, -1)) + loss += max_sim_neg_dial * self.C_emb - # maximize similarity returned by time attention wrapper - for sim_to_add in sims_rnn_to_max: - loss += tf.maximum(0.0, 1.0 - sim_to_add) + # penalize max similarity between pos bot and neg dial embeddings + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_dial, -1)) + loss += max_sim_neg_dial * self.C_emb # mask loss for different length sequences loss *= mask # average the loss over sequence length loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1) - # average the loss over the batch - loss = ( - tf.reduce_mean(loss) - # add regularization losses - + self._regularization_loss() - + tf.losses.get_regularization_loss() - ) - return loss + loss = tf.reduce_mean(loss) - # training methods + # add regularization losses + loss += tf.losses.get_regularization_loss() - def train( - self, - training_trackers: List[DialogueStateTracker], - domain: Domain, - **kwargs: Any - ) -> None: - """Train the policy on given training trackers.""" - - logger.debug("Started training embedding policy.") - - # set numpy random seed - np.random.seed(self.random_seed) - - # dealing with training data - training_data = self.featurize_for_training(training_trackers, domain, **kwargs) - # assume that characteristic time is the mean length of the dialogues - self.characteristic_time = np.mean(training_data.true_length) - if self.attn_shift_range is None: - self.attn_shift_range = int(self.characteristic_time / 2) + return loss - # encode all actions with policies' featurizer - self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( - domain + def _tf_loss_softmax( + self, + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": + """Define softmax loss.""" + + logits = tf.concat( + [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1 ) - # check if number of negatives is less than number of actions - logger.debug( - "Check if num_neg {} is smaller " - "than number of actions {}, " - "else set num_neg to the number of actions - 1" - "".format(self.num_neg, domain.num_actions) - ) - self.num_neg = min(self.num_neg, domain.num_actions - 1) + # create labels for softmax + pos_labels = tf.ones_like(logits[:, :, :1]) + neg_labels = tf.zeros_like(logits[:, :, 1:]) + labels = 
tf.concat([pos_labels, neg_labels], -1) - # extract actual training data to feed to tf session - session_data = self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + if self.scale_loss: + # mask loss by prediction confidence + pred = tf.nn.softmax(logits) + mask *= tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - self.graph = tf.Graph() + loss = tf.losses.softmax_cross_entropy(labels, logits, mask) + # add regularization losses + loss += tf.losses.get_regularization_loss() - with self.graph.as_default(): - # set random seed in tf - tf.set_random_seed(self.random_seed) - - dialogue_len = None # use dynamic time for rnn - # create placeholders - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), - name="a", - ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), - name="b", - ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) - self._dialogue_len = tf.placeholder( - dtype=tf.int32, shape=(), name="dialogue_len" - ) - self._x_for_no_intent_in = tf.placeholder( - dtype=tf.float32, - shape=(1, session_data.X.shape[-1]), - name="x_for_no_intent", - ) - self._y_for_no_action_in = tf.placeholder( - dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), - name="y_for_no_action", - ) - self._y_for_action_listen_in = tf.placeholder( - dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), - name="y_for_action_listen", - ) - self._is_training = tf.placeholder_with_default(False, shape=()) - - self._loss_scales = tf.placeholder( - dtype=tf.float32, shape=(None, dialogue_len) - ) - - # create embedding vectors - self.user_embed = self._create_tf_user_embed(self.a_in) - self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") + return loss - embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - embed_for_no_intent = self._create_tf_no_intent_embed( - self._x_for_no_intent_in + def _choose_loss( + self, + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": + """Use loss depending on given option.""" + + if self.loss_type == "margin": + return self._tf_loss_margin( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, ) - embed_for_no_action = self._create_tf_no_action_embed( - self._y_for_no_action_in + elif self.loss_type == "softmax": + return self._tf_loss_softmax( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, ) - embed_for_action_listen = self._create_tf_no_action_embed( - self._y_for_action_listen_in + else: + raise ValueError( + "Wrong loss type '{}', " + "should be 'margin' or 'softmax'" + "".format(self.loss_type) ) - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = 
self._alignments_history_from(final_state) + def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: + """Bulid train graph using iterator.""" - self.all_time_masks = self._all_time_masks_from(final_state) + # session data are int counts but we need a float tensors + self.a_in, self.b_in = self._iterator.get_next() - sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) - self.dial_embed = self._embed_dialogue_from(cell_output) + all_actions = tf.constant( + self._encoded_all_actions, dtype=tf.float32, name="all_actions" + ) - # calculate similarities - self.sim_op, sim_act = self._tf_sim(self.dial_embed, self.bot_embed, mask) - # construct loss - loss = self._tf_loss(self.sim_op, sim_act, sims_rnn_to_max, mask) + self.dial_embed, mask = self._create_tf_dial(self.a_in) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + self.all_bot_embed = self._create_tf_bot_embed(all_actions) + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # add time dimension if max history featurizer is used + self.b_in = self.b_in[:, tf.newaxis, :] + self.bot_embed = self.bot_embed[:, tf.newaxis, :] + + ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + ) = self._sample_negatives(all_actions) + + # calculate similarities + ( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + ) = self._tf_sim( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask, + ) - # define which optimizer to use - self._train_op = tf.train.AdamOptimizer( - learning_rate=0.001, epsilon=1e-16 - ).minimize(loss) - # train tensorflow graph - self.session = tf.Session(config=self._tf_config) + acc = self._tf_calc_accuracy(sim_pos, sim_neg) - self._train_tf(session_data, loss, mask) + loss = self._choose_loss( + sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial, mask + ) + return loss, acc # training helpers def _linearly_increasing_batch_size(self, epoch: int) -> int: @@ -1102,196 +959,253 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: else: return int(self.batch_size[0]) - def _create_batch_b( - self, batch_pos_b: np.ndarray, intent_ids: np.ndarray - ) -> np.ndarray: - """Create batch of actions. + def _train_tf_dataset( + self, + train_init_op: "tf.Operation", + eval_init_op: "tf.Operation", + batch_size_in: "tf.Tensor", + loss: "tf.Tensor", + acc: "tf.Tensor", + ) -> None: + """Train tf graph""" - The first is correct action - and the rest are wrong actions sampled randomly. 
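For reference, the linearly increasing batch size used by the training loop interpolates between the two values of the `batch_size` parameter over the epochs. A standalone sketch of that schedule, assuming `batch_size` is a `[min, max]` pair as in the policy defaults:

def linearly_increasing_batch_size(epoch, batch_size, epochs):
    # grow the batch size linearly from batch_size[0] to batch_size[1]
    if not isinstance(batch_size, list):
        return int(batch_size)
    if epochs > 1:
        return int(
            batch_size[0] + epoch * (batch_size[1] - batch_size[0]) / (epochs - 1)
        )
    return int(batch_size[0])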
- """ + self.session.run(tf.global_variables_initializer()) - batch_pos_b = batch_pos_b[:, :, np.newaxis, :] + if self.evaluate_on_num_examples: + logger.info( + "Validation accuracy is calculated every {} epochs" + "".format(self.evaluate_every_num_epochs) + ) + pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - # sample negatives - batch_neg_b = np.zeros( - ( - batch_pos_b.shape[0], - batch_pos_b.shape[1], - self.num_neg, - batch_pos_b.shape[-1], - ), - dtype=int, - ) - for b in range(batch_pos_b.shape[0]): - for h in range(batch_pos_b.shape[1]): - # create negative indexes out of possible ones - # except for correct index of b - negative_indexes = [ - i - for i in range(self.encoded_all_actions.shape[0]) - if i != intent_ids[b, h] - ] + train_loss = 0 + train_acc = 0 + eval_loss = 0 + eval_acc = 0 + for ep in pbar: - negs = np.random.choice(negative_indexes, size=self.num_neg) + batch_size = self._linearly_increasing_batch_size(ep) - batch_neg_b[b, h] = self.encoded_all_actions[negs] + self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - return np.concatenate([batch_pos_b, batch_neg_b], -2) + ep_train_loss = 0 + ep_train_acc = 0 + batches_per_epoch = 0 + while True: + try: + _, batch_train_loss, batch_train_acc = self.session.run( + [self._train_op, loss, acc], feed_dict={self._is_training: True} + ) + batches_per_epoch += 1 + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc - # noinspection PyPep8Naming - def _scale_loss_by_count_actions( - self, - X: np.ndarray, - slots: np.ndarray, - previous_actions: np.ndarray, - actions_for_Y: np.ndarray, - ) -> Union[np.ndarray, List[List]]: - """Calculate inverse proportionality of repeated actions.""" - - if self.scale_loss_by_action_counts: - full_X = np.concatenate( - [X, slots, previous_actions, actions_for_Y[:, :, np.newaxis]], -1 + except tf.errors.OutOfRangeError: + break + + train_loss = ep_train_loss / batches_per_epoch + train_acc = ep_train_acc / batches_per_epoch + + pbar.set_postfix( + {"loss": "{:.3f}".format(train_loss), "acc": "{:.3f}".format(train_acc)} + ) + + if eval_init_op is not None: + if (ep + 1) % self.evaluate_every_num_epochs == 0 or ( + ep + 1 + ) == self.epochs: + eval_loss, eval_acc = self._output_training_stat_dataset( + eval_init_op, loss, acc + ) + if (ep + 1) != self.epochs: + logger.info( + "Evaluation results: " + "validation loss: {:.3f}, " + "validation accuracy: {:.3f}" + "".format(eval_loss, eval_acc) + ) + + final_message = ( + "Finished training embedding policy, " + "train loss={:.3f}, train accuracy={:.3f}" + "".format(train_loss, train_acc) + ) + if eval_init_op is not None: + final_message += ( + ", validation loss={:.3f}, validation accuracy={:.3f}" + "".format(eval_loss, eval_acc) ) - full_X = full_X.reshape((-1, full_X.shape[-1])) + logger.info(final_message) + + def _output_training_stat_dataset( + self, eval_init_op: "tf.Operation", loss: "tf.Tensor", acc: "tf.Tensor" + ) -> Tuple[float, float]: + """Output training statistics""" + + self.session.run(eval_init_op) + ep_val_loss = 0 + ep_val_acc = 0 + batches_per_epoch = 0 + while True: + try: + batch_val_loss, batch_val_acc = self.session.run( + [loss, acc], feed_dict={self._is_training: False} + ) + batches_per_epoch += 1 + ep_val_loss += batch_val_loss + ep_val_acc += batch_val_acc + except tf.errors.OutOfRangeError: + break + + return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch + + # prepare for prediction + def _create_tf_placeholders(self, session_data: 
"SessionData") -> None: + """Create placeholders for prediction.""" + + dialogue_len = None # use dynamic time + self.a_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.X.shape[-1]), + name="a", + ) + self.b_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + name="b", + ) + + def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": + """Rebuild tf graph for prediction.""" - _, i, c = np.unique(full_X, return_inverse=True, return_counts=True, axis=0) + self._create_tf_placeholders(session_data) - counts = c[i].reshape((X.shape[0], X.shape[1])) + self.dial_embed, mask = self._create_tf_dial(self.a_in) - # do not include [-1 -1 ... -1 0] in averaging - # and smooth it by taking sqrt - return np.maximum(np.sqrt(np.mean(c[1:]) / counts), 1) + self.sim_all = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], + self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], + mask, + ) + + if self.similarity_type == "cosine": + # clip negative values to zero + confidence = tf.nn.relu(self.sim_all) else: - return [[None]] + # normalize result to [0, 1] with softmax + confidence = tf.nn.softmax(self.sim_all) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + + self.sim = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], self.bot_embed, mask + ) + + return confidence - def _train_tf( - self, session_data: SessionData, loss: tf.Tensor, mask: tf.Tensor + def _extract_attention(self) -> Optional["tf.Tensor"]: + """Extract attention probabilities from t2t dict""" + + attention = [ + tf.expand_dims(t, 0) + for name, t in self.attention_weights.items() + if name.endswith("multihead_attention/dot_product_attention") + ] + + if attention: + return tf.concat(attention, 0) + + # training methods + def train( + self, + training_trackers: List["DialogueStateTracker"], + domain: "Domain", + **kwargs: Any ) -> None: - """Train tf graph.""" + """Train the policy on given training trackers.""" - self.session.run(tf.global_variables_initializer()) + logger.debug("Started training embedding policy.") - if self.evaluate_on_num_examples: - logger.info( - "Accuracy is updated every {} epochs" - "".format(self.evaluate_every_num_epochs) - ) - pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - train_acc = 0 - last_loss = 0 - for ep in pbar: - # randomize training data for the current epoch - ids = np.random.permutation(session_data.X.shape[0]) + # set numpy random seed + np.random.seed(self.random_seed) - # calculate batch size for the current epoch - batch_size = self._linearly_increasing_batch_size(ep) - # calculate number of batches in the current epoch - batches_per_epoch = session_data.X.shape[0] // batch_size + int( - session_data.X.shape[0] % batch_size > 0 - ) + # dealing with training data + training_data = self.featurize_for_training(training_trackers, domain, **kwargs) - # collect average loss over the batches - ep_loss = 0 - for i in range(batches_per_epoch): - start_idx = i * batch_size - end_idx = (i + 1) * batch_size - batch_ids = ids[start_idx:end_idx] + # encode all actions with policies' featurizer + self._encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( + domain + ) - # get randomized data for current batch - batch_a = session_data.X[batch_ids] - batch_pos_b = session_data.Y[batch_ids] - actions_for_b = session_data.actions_for_Y[batch_ids] + # check if number of negatives is less than number of actions + logger.debug( + 
"Check if num_neg {} is smaller " + "than number of actions {}, " + "else set num_neg to the number of actions - 1" + "".format(self.num_neg, domain.num_actions) + ) + # noinspection PyAttributeOutsideInit + self.num_neg = min(self.num_neg, domain.num_actions - 1) - # add negatives - incorrect bot actions predictions - batch_b = self._create_batch_b(batch_pos_b, actions_for_b) + # extract actual training data to feed to tf session + session_data = self._create_session_data(training_data.X, training_data.y) - batch_c = session_data.slots[batch_ids] - batch_b_prev = session_data.previous_actions[batch_ids] + if self.evaluate_on_num_examples: + session_data, eval_session_data = self._train_val_split(session_data) + else: + eval_session_data = None - # calculate how much the loss from each action - # should be scaled based on action rarity - batch_loss_scales = self._scale_loss_by_count_actions( - batch_a, batch_c, batch_b_prev, actions_for_b - ) + self.graph = tf.Graph() - # minimize and calculate loss - _loss, _ = self.session.run( - [loss, self._train_op], - feed_dict={ - self.a_in: batch_a, - self.b_in: batch_b, - self.c_in: batch_c, - self.b_prev_in: batch_b_prev, - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - self._is_training: True, - self._loss_scales: batch_loss_scales, - }, - ) - # collect average loss over the batches - ep_loss += _loss / batches_per_epoch - - # calculate train accuracy - if self.evaluate_on_num_examples: - if ( - (ep + 1) == 1 - or (ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs - ): - train_acc = self._calc_train_acc(session_data, mask) - last_loss = ep_loss - - pbar.set_postfix( - { - "loss": "{:.3f}".format(ep_loss), - "acc": "{:.3f}".format(train_acc), - } + with self.graph.as_default(): + # set random seed in tf + tf.set_random_seed(self.random_seed) + + # allows increasing batch size + batch_size_in = tf.placeholder(tf.int64) + train_dataset = self._create_tf_dataset( + session_data, + batch_size_in, + batch_strategy=self.batch_strategy, + shuffle=True, + ) + + self._iterator = self._create_tf_iterator(train_dataset) + + train_init_op = self._iterator.make_initializer(train_dataset) + + if eval_session_data is not None: + eval_init_op = self._iterator.make_initializer( + self._create_tf_dataset( + eval_session_data, + # pick maximum batch_size for eval + self._linearly_increasing_batch_size(self.epochs), + ) ) else: - pbar.set_postfix({"loss": "{:.3f}".format(ep_loss)}) + eval_init_op = None - if self.evaluate_on_num_examples: - logger.info( - "Finished training embedding policy, " - "loss={:.3f}, train accuracy={:.3f}" - "".format(last_loss, train_acc) - ) + self._is_training = tf.placeholder_with_default(False, shape=()) + loss, acc = self._build_tf_train_graph() - def _calc_train_acc(self, session_data: SessionData, mask: tf.Tensor) -> np.float32: - """Calculate training accuracy.""" + # define which optimizer to use + self._train_op = tf.train.AdamOptimizer().minimize(loss) - # choose n examples to calculate train accuracy - n = self.evaluate_on_num_examples - ids = np.random.permutation(len(session_data.X))[:n] - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X[ids].shape[0])] - ) + # train tensorflow graph + self.session = tf.Session(config=self._tf_config) + self._train_tf_dataset( + 
train_init_op, eval_init_op, batch_size_in, loss, acc + ) - _sim, _mask = self.session.run( - [self.sim_op, mask], - feed_dict={ - self.a_in: session_data.X[ids], - self.b_in: all_Y_d_x, - self.c_in: session_data.slots[ids], - self.b_prev_in: session_data.previous_actions[ids], - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - }, - ) - return np.sum( - (np.argmax(_sim, -1) == session_data.actions_for_Y[ids]) * _mask - ) / np.sum(_mask) + # rebuild the graph for prediction + self.pred_confidence = self._build_tf_pred_graph(session_data) + + self.attention_weights = self._extract_attention() def continue_training( self, - training_trackers: List[DialogueStateTracker], - domain: Domain, + training_trackers: List["DialogueStateTracker"], + domain: "Domain", **kwargs: Any ) -> None: """Continue training an already trained policy.""" @@ -1299,43 +1213,42 @@ def continue_training( batch_size = kwargs.get("batch_size", 5) epochs = kwargs.get("epochs", 50) - for _ in range(epochs): - training_data = self._training_data_for_continue_training( - batch_size, training_trackers, domain - ) + with self.graph.as_default(): + for _ in range(epochs): + training_data = self._training_data_for_continue_training( + batch_size, training_trackers, domain + ) - session_data = self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + session_data = self._create_session_data( + training_data.X, training_data.y + ) + train_dataset = self._create_tf_dataset(session_data, batch_size) + train_init_op = self._iterator.make_initializer(train_dataset) + self.session.run(train_init_op) - b = self._create_batch_b(session_data.Y, session_data.actions_for_Y) + # fit to one extra example using updated trackers + while True: + try: + self.session.run( + self._train_op, feed_dict={self._is_training: True} + ) - batch_loss_scales = self._scale_loss_by_count_actions( - session_data.X, - session_data.slots, - session_data.previous_actions, - session_data.actions_for_Y, - ) + except tf.errors.OutOfRangeError: + break - # fit to one extra example using updated trackers - self.session.run( - self._train_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: b, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - self._is_training: True, - self._loss_scales: batch_loss_scales, - }, - ) + def tf_feed_dict_for_prediction( + self, tracker: "DialogueStateTracker", domain: "Domain" + ) -> Dict["tf.Tensor", "np.ndarray"]: + """Create feed dictionary for tf session.""" + + # noinspection PyPep8Naming + data_X = self.featurizer.create_X([tracker], domain) + session_data = self._create_session_data(data_X) + + return {self.a_in: session_data.X} def predict_action_probabilities( - self, tracker: DialogueStateTracker, domain: Domain + self, tracker: "DialogueStateTracker", domain: "Domain" ) -> List[float]: """Predict the next action the bot should take. 
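At prediction time the similarities between the dialogue embedding and every action embedding are turned into confidences: clipped at zero when cosine similarity is used, normalized with softmax otherwise (see `_build_tf_pred_graph`). A NumPy sketch of that post-processing, with an illustrative function name:

import numpy as np

def confidences_from_similarities(sim_all, similarity_type="inner"):
    # sim_all: similarities of the current dialogue state to every action
    if similarity_type == "cosine":
        # cosine similarities are clipped at zero and used as-is
        return np.maximum(sim_all, 0.0)
    # otherwise normalize to a probability-like distribution with softmax
    exp = np.exp(sim_all - sim_all.max())
    return exp / exp.sum()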
@@ -1350,40 +1263,15 @@ def predict_action_probabilities( ) return [0.0] * domain.num_actions - # noinspection PyPep8Naming - data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_tf_session_data(domain, data_X) - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X.shape[0])] - ) + tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - _sim = self.session.run( - self.sim_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: all_Y_d_x, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - }, - ) + confidence = self.session.run(self.pred_confidence, feed_dict=tf_feed_dict) - result = _sim[0, -1, :] - if self.similarity_type == "cosine": - # clip negative values to zero - result[result < 0] = 0 - elif self.similarity_type == "inner": - # normalize result to [0, 1] with softmax - result = np.exp(result) - result /= np.sum(result) + return confidence[0, -1, :].tolist() - return result.tolist() + def _persist_tensor(self, name: Text, tensor: "tf.Tensor") -> None: + """Add tensor to collection if it is not None""" - def _persist_tensor(self, name: Text, tensor: tf.Tensor) -> None: if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1411,45 +1299,30 @@ def persist(self, path: Text) -> None: rasa.utils.io.create_directory_for_file(checkpoint) with self.graph.as_default(): - self._persist_tensor("intent_placeholder", self.a_in) - self._persist_tensor("action_placeholder", self.b_in) - self._persist_tensor("slots_placeholder", self.c_in) - self._persist_tensor("prev_act_placeholder", self.b_prev_in) - self._persist_tensor("dialogue_len", self._dialogue_len) - self._persist_tensor("x_for_no_intent", self._x_for_no_intent_in) - self._persist_tensor("y_for_no_action", self._y_for_no_action_in) - self._persist_tensor("y_for_action_listen", self._y_for_action_listen_in) - - self._persist_tensor("similarity_op", self.sim_op) + self._persist_tensor("user_placeholder", self.a_in) + self._persist_tensor("bot_placeholder", self.b_in) - self._persist_tensor("alignment_history", self.alignment_history) + self._persist_tensor("similarity_all", self.sim_all) + self._persist_tensor("pred_confidence", self.pred_confidence) + self._persist_tensor("similarity", self.sim) - self._persist_tensor("user_embed", self.user_embed) - self._persist_tensor("bot_embed", self.bot_embed) - self._persist_tensor("slot_embed", self.slot_embed) self._persist_tensor("dial_embed", self.dial_embed) + self._persist_tensor("bot_embed", self.bot_embed) + self._persist_tensor("all_bot_embed", self.all_bot_embed) - self._persist_tensor("rnn_embed", self.rnn_embed) - self._persist_tensor("attn_embed", self.attn_embed) - self._persist_tensor("copy_attn_debug", self.copy_attn_debug) - - self._persist_tensor("all_time_masks", self.all_time_masks) + self._persist_tensor("attention_weights", self.attention_weights) saver = tf.train.Saver() saver.save(self.session, checkpoint) - encoded_actions_file = os.path.join( - path, file_name + ".encoded_all_actions.pkl" - ) - with open(encoded_actions_file, "wb") as f: - pickle.dump(self.encoded_all_actions, f) - tf_config_file = os.path.join(path, file_name + ".tf_config.pkl") with open(tf_config_file, 
"wb") as f: pickle.dump(self._tf_config, f) @staticmethod - def load_tensor(name: Text) -> Optional[tf.Tensor]: + def load_tensor(name: Text) -> Optional["tf.Tensor"]: + """Load tensor or set it to None""" + tensor_list = tf.get_collection(name) return tensor_list[0] if tensor_list else None @@ -1457,11 +1330,12 @@ def load_tensor(name: Text) -> Optional[tf.Tensor]: def load(cls, path: Text) -> "EmbeddingPolicy": """Loads a policy from the storage. - **Needs to load its featurizer**""" + **Needs to load its featurizer** + """ if not os.path.exists(path): raise Exception( - "Failed to load dialogue model. Path {} " + "Failed to load dialogue model. Path '{}' " "doesn't exist".format(os.path.abspath(path)) ) @@ -1483,64 +1357,36 @@ def load(cls, path: Text) -> "EmbeddingPolicy": graph = tf.Graph() with graph.as_default(): - sess = tf.Session(config=_tf_config) + session = tf.Session(config=_tf_config) saver = tf.train.import_meta_graph(checkpoint + ".meta") - saver.restore(sess, checkpoint) + saver.restore(session, checkpoint) - a_in = cls.load_tensor("intent_placeholder") - b_in = cls.load_tensor("action_placeholder") - c_in = cls.load_tensor("slots_placeholder") - b_prev_in = cls.load_tensor("prev_act_placeholder") - dialogue_len = cls.load_tensor("dialogue_len") - x_for_no_intent = cls.load_tensor("x_for_no_intent") - y_for_no_action = cls.load_tensor("y_for_no_action") - y_for_action_listen = cls.load_tensor("y_for_action_listen") + a_in = cls.load_tensor("user_placeholder") + b_in = cls.load_tensor("bot_placeholder") - sim_op = cls.load_tensor("similarity_op") + sim_all = cls.load_tensor("similarity_all") + pred_confidence = cls.load_tensor("pred_confidence") + sim = cls.load_tensor("similarity") - alignment_history = cls.load_tensor("alignment_history") - - user_embed = cls.load_tensor("user_embed") - bot_embed = cls.load_tensor("bot_embed") - slot_embed = cls.load_tensor("slot_embed") dial_embed = cls.load_tensor("dial_embed") + bot_embed = cls.load_tensor("bot_embed") + all_bot_embed = cls.load_tensor("all_bot_embed") - rnn_embed = cls.load_tensor("rnn_embed") - attn_embed = cls.load_tensor("attn_embed") - copy_attn_debug = cls.load_tensor("copy_attn_debug") - - all_time_masks = cls.load_tensor("all_time_masks") - - encoded_actions_file = os.path.join( - path, "{}.encoded_all_actions.pkl".format(file_name) - ) - - with open(encoded_actions_file, "rb") as f: - encoded_all_actions = pickle.load(f) + attention_weights = cls.load_tensor("attention_weights") return cls( featurizer=featurizer, priority=meta["priority"], - encoded_all_actions=encoded_all_actions, graph=graph, - session=sess, - intent_placeholder=a_in, - action_placeholder=b_in, - slots_placeholder=c_in, - prev_act_placeholder=b_prev_in, - dialogue_len=dialogue_len, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, - similarity_op=sim_op, - alignment_history=alignment_history, - user_embed=user_embed, - bot_embed=bot_embed, - slot_embed=slot_embed, + session=session, + user_placeholder=a_in, + bot_placeholder=b_in, + similarity_all=sim_all, + pred_confidence=pred_confidence, + similarity=sim, dial_embed=dial_embed, - rnn_embed=rnn_embed, - attn_embed=attn_embed, - copy_attn_debug=copy_attn_debug, - all_time_masks=all_time_masks, + bot_embed=bot_embed, + all_bot_embed=all_bot_embed, + attention_weights=attention_weights, ) diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 17b179ff15e7..239a73aa4ae6 100644 --- 
a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -27,6 +27,10 @@ class MappingPolicy(Policy): executed whenever the intent is detected. This policy takes precedence over any other policy.""" + @staticmethod + def _standard_featurizer(): + return None + def __init__(self, priority: int = 3) -> None: """Create a new Mapping policy.""" diff --git a/rasa/core/policies/tf_utils.py b/rasa/core/policies/tf_utils.py deleted file mode 100644 index 2cfddda81bdd..000000000000 --- a/rasa/core/policies/tf_utils.py +++ /dev/null @@ -1,957 +0,0 @@ -from collections import namedtuple -import tensorflow as tf - -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 - - -class TimedNTM(object): - """Timed Neural Turing Machine - - Inspired by paper: - https://arxiv.org/pdf/1410.5401.pdf - Implementation inspired by: - https://github.com/carpedm20/NTM-tensorflow/blob/master/ntm_cell.py - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__(self, attn_shift_range, sparse_attention, name): - """Construct the `TimedNTM`. - - Args: - attn_shift_range: Python int. - A time range within which to attend to the memory by location - sparse_attention: Python bool. - If `True` use sparsemax instead of softmax for probs - name: Name to use when creating ops. - """ - - # interpolation gate - self.name = "timed_ntm_" + name - - self._inter_gate = tf.layers.Dense( - units=1, activation=tf.sigmoid, name=self.name + "/inter_gate" - ) - # if use sparsemax instead of softmax for probs - self._sparse_attention = sparse_attention - - if sparse_attention: - # sparsemax doesn't support inf - self._inf = float(5000) - else: - self._inf = float("inf") - - # shift weighting if range is provided - if attn_shift_range: - self._shift_weight = tf.layers.Dense( - units=2 * attn_shift_range + 1, - activation=tf.nn.softmax, - name=self.name + "/shift_weight", - ) - else: - self._shift_weight = None - - # sharpening parameter - self._gamma_sharp = tf.layers.Dense( - units=1, - activation=lambda a: tf.nn.softplus(a) + 1, - bias_initializer=tf.constant_initializer(1), - name=self.name + "/gamma_sharp", - ) - - def __call__(self, attn_inputs, scores, scores_state, mask): - # apply exponential moving average with interpolation gate weight - # to scores from previous time which are equal to probs at this point - # different from original NTM where it is applied after softmax - i_g = self._inter_gate(attn_inputs) - - # scores limited by time - scores = tf.concat( - [i_g * scores[:, :-1] + (1 - i_g) * scores_state, scores[:, -1:]], 1 - ) - next_scores_state = scores - - if mask is not None: - # apply mask to scores - if self._shift_weight is not None: - # rearrange scores to make them continuous for convolution - scores = tf.map_fn( - self._rearrange_fn, [scores, mask], dtype=scores.dtype - ) - else: - scores = tf.where(mask > 0, scores, -self._inf * tf.ones_like(scores)) - - # create probabilities for attention - if self._sparse_attention: - probs = tf.contrib.sparsemax.sparsemax(scores) - else: - probs = tf.nn.softmax(scores) - - if self._shift_weight is not None: - s_w = self._shift_weight(attn_inputs) - - # we want to go back in time during convolution - conv_probs = tf.reverse(probs, axis=[1]) - - # preare probs for tf.nn.depthwise_conv2d - # [in_width, in_channels=batch] - conv_probs = tf.transpose(conv_probs, [1, 0]) - # [batch=1, in_height=1, in_width=time+1, in_channels=batch] - conv_probs = conv_probs[tf.newaxis, tf.newaxis, :, :] - - # 
[filter_height=1, filter_width=2*attn_shift_range+1, - # in_channels=batch, channel_multiplier=1] - conv_s_w = tf.transpose(s_w, [1, 0]) - conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] - - # perform 1d convolution - # [batch=1, out_height=1, out_width=time+1, out_channels=batch] - conv_probs = tf.nn.depthwise_conv2d_native( - conv_probs, conv_s_w, [1, 1, 1, 1], "SAME" - ) - conv_probs = conv_probs[0, 0, :, :] - conv_probs = tf.transpose(conv_probs, [1, 0]) - - probs = tf.reverse(conv_probs, axis=[1]) - - if mask is not None: - # arrange probs back to their original time order - probs = tf.map_fn( - self._arrange_back_fn, [probs, mask], dtype=probs.dtype - ) - - # sharpening - g_sh = self._gamma_sharp(attn_inputs) - - powed_probs = tf.pow(probs, g_sh) - probs = powed_probs / (tf.reduce_sum(powed_probs, 1, keepdims=True) + 1e-32) - - return probs, next_scores_state - - def _rearrange_fn(self, list_tensor_1d_mask_1d): - """Rearranges tensor_1d to put all the values - where mask_1d=1 to the right and - where mask_1d=0 to the left and sets them to -infinity""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) - partitioned_tensor[0] = -self._inf * tf.ones_like(partitioned_tensor[0]) - - return tf.concat(partitioned_tensor, 0) - - @staticmethod - def _arrange_back_fn(list_tensor_1d_mask_1d): - """Arranges back tensor_1d to restore original order - modified by `_rearrange_fn` according to mask_1d: - - number of 0s in mask_1d values on the left are set to - their corresponding places where mask_1d=0, - - number of 1s in mask_1d values on the right are set to - their corresponding places where mask_1d=1""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - mask_indices = tf.dynamic_partition( - tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 - ) - - mask_sum = tf.reduce_sum(mask_1d, axis=0) - partitioned_tensor = [ - tf.zeros_like(tensor_1d[:-mask_sum]), - tensor_1d[-mask_sum:], - ] - - return tf.dynamic_stitch(mask_indices, partitioned_tensor) - - -def _compute_time_attention( - attention_mechanism, - attn_inputs, - attention_state, - # time is added to calculate time attention - time, - timed_ntm, - time_mask, - ignore_mask, - attention_layer, -): - """Computes the attention and alignments limited by time - for a given attention_mechanism. - - Modified helper method from tensorflow.""" - - scores, _ = attention_mechanism(attn_inputs, state=attention_state) - - # take only scores from current and past times - timed_scores = scores[:, : time + 1] - timed_scores_state = attention_state[:, :time] - - # get mask for past times - timed_time_mask = time_mask[:, :time] - if ignore_mask is not None: - timed_time_mask *= 1 - ignore_mask[:, :time] - - # set mask for current time to 1 - timed_time_mask = tf.concat([timed_time_mask, tf.ones_like(time_mask[:, :1])], 1) - - # pass these scores to NTM - probs, next_scores_state = timed_ntm( - attn_inputs, timed_scores, timed_scores_state, timed_time_mask - ) - - # concatenate probs with zeros to get new alignments - zeros = tf.zeros_like(scores) - # remove current time from attention - alignments = tf.concat([probs[:, :-1], zeros[:, time:]], 1) - - # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] - expanded_alignments = tf.expand_dims(alignments, 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. 
- # alignments shape is - # [batch_size, 1, memory_time] - # attention_mechanism.values shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - context = tf.matmul(expanded_alignments, attention_mechanism.values) - context = tf.squeeze(context, [1]) - - if attention_layer is not None: - attention = attention_layer(tf.concat([attn_inputs, context], 1)) - else: - attention = context - - # return current time to attention - alignments = tf.concat([probs, zeros[:, time + 1 :]], 1) - next_attention_state = tf.concat([next_scores_state, zeros[:, time + 1 :]], 1) - return attention, alignments, next_attention_state - - -# noinspection PyProtectedMember -class TimeAttentionWrapperState( - namedtuple( - "TimeAttentionWrapperState", - tf.contrib.seq2seq.AttentionWrapperState._fields - + ("all_time_masks", "all_cell_states"), - ) -): # added - """Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters - - Additional fields: - - `all_time_masks`: A mask applied to a memory - that filters certain time steps - - `all_cell_states`: All states of the wrapped `RNNCell` - at all the previous time steps. - """ - - def clone(self, **kwargs): - """Copied from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters""" - - def with_same_shape(old, new): - """Check and set new tensor's shape.""" - if isinstance(old, tf.Tensor) and isinstance(new, tf.Tensor): - return tf.contrib.framework.with_same_shape(old, new) - return new - - return tf.contrib.framework.nest.map_structure( - with_same_shape, - self, - super(TimeAttentionWrapperState, self)._replace(**kwargs), - ) - - -class TimeAttentionWrapper(tf.contrib.seq2seq.AttentionWrapper): - """Custom AttentionWrapper that takes into account time - when calculating attention. - Attention is calculated before calling rnn cell. - - Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapper. - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__( - self, - cell, - attention_mechanism, - sequence_len, - attn_shift_range=0, - sparse_attention=False, - attention_layer_size=None, - alignment_history=False, - rnn_and_attn_inputs_fn=None, - ignore_mask=None, - cell_input_fn=None, - index_of_attn_to_copy=None, - likelihood_fn=None, - tensor_not_to_copy=None, - output_attention=False, - initial_cell_state=None, - name=None, - attention_layer=None, - ): - """Construct the `TimeAttentionWrapper`. - See the super class for the original arguments description. - - Additional args: - sequence_len: Python integer. - Maximum length of the sequence, used to create - appropriate TensorArray for all cell states - in TimeAttentionWrapperState - attn_shift_range: Python integer (`0` by default). - A time range within which to attend to the memory - by location in Neural Turing Machine. - sparse_attention: Python bool. - A flag to use sparsemax (if `True`) instead of - softmax (if `False`, default) for probabilities - inputs_and_attn_inputs_fn: (optional) A `callable`. - A function that creates inputs and attention inputs tensors. - ignore_mask: (optional) Boolean Tensor. - Determines which time steps to ignore in attention - index_of_attn_to_copy: (optional) Python integer. 
- An index of attention mechanism that picks - which part of attention tensor to use for copying to output, - the default is `None`, which turns off copying mechanism. - Copy inspired by: https://arxiv.org/pdf/1603.06393.pdf - likelihood_fn: (optional) A `callable`. - A method to perform likelihood calculation to - filter time step in copy mechanism. - Returns a tuple of binary likelihood and likelihood - tensor_not_to_copy: (optional) A Tensor. - A tensor, which shouldn't be copied from previous time steps - - Modified args: - output_attention: Python bool. If `True`, the output at each - time step is the concatenated cell outputs, - attention values and additional values described in - `additional_output_size()`, used in copy mechanism. - """ - super(TimeAttentionWrapper, self).__init__( - cell, - attention_mechanism, - attention_layer_size, - alignment_history, - cell_input_fn, - output_attention, - initial_cell_state, - name, - attention_layer, - ) - self._sequence_len = sequence_len - - if not isinstance(attn_shift_range, list): - # attn_shift_range might not be a list - attn_shift_range = [attn_shift_range] - self._timed_ntms = [TimedNTM(attn_shift_range[0], sparse_attention, name="0")] - if self._is_multi: - # if there are several attention mechanisms, - # create additional TimedNTMs for them - if len(attn_shift_range) == 1: - # original attn_shift_range might not be a list - attn_shift_range *= len(attention_mechanism) - elif len(attn_shift_range) != len(attention_mechanism): - raise ValueError( - "If provided, `attn_shift_range` must contain exactly one " - "integer per attention_mechanism, saw: {} vs {}" - "".format(len(attn_shift_range), len(attention_mechanism)) - ) - for i in range(1, len(attention_mechanism)): - self._timed_ntms.append( - TimedNTM(attn_shift_range[i], sparse_attention, name=str(i)) - ) - - if rnn_and_attn_inputs_fn is None: - rnn_and_attn_inputs_fn = self._default_rnn_and_attn_inputs_fn - else: - if not callable(rnn_and_attn_inputs_fn): - raise TypeError( - "`rnn_and_attn_inputs_fn` must be callable, saw type: {}" - "".format(type(rnn_and_attn_inputs_fn).__name__) - ) - self._rnn_and_attn_inputs_fn = rnn_and_attn_inputs_fn - - if not isinstance(ignore_mask, list): - self._ignore_mask = [tf.cast(ignore_mask, tf.int32)] - else: - self._ignore_mask = [tf.cast(i_m, tf.int32) for i_m in ignore_mask] - - self._index_of_attn_to_copy = index_of_attn_to_copy - - self._likelihood_fn = likelihood_fn - self._tensor_not_to_copy = tensor_not_to_copy - - @staticmethod - def _default_rnn_and_attn_inputs_fn(inputs, cell_state): - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - return inputs, tf.concat([inputs, cell_state.h], -1) - else: - return inputs, tf.concat([inputs, cell_state], -1) - - @staticmethod - def additional_output_size(): - """Number of additional outputs: - - likelihoods: - attn_likelihood, state_likelihood - debugging info: - current_time_prob, - bin_likelihood_not_to_copy, bin_likelihood_to_copy - - **Method should be static** - """ - return 2 + 3 - - @property - def output_size(self): - if self._output_attention: - if self._index_of_attn_to_copy is not None: - # output both raw rnn cell_output and - # cell_output with copied attention - # together with attention vector itself - # and additional output - return ( - 2 * self._cell.output_size - + self._attention_layer_size - + self.additional_output_size() - ) - else: - return self._cell.output_size + self._attention_layer_size - else: - return self._cell.output_size - - @property - def 
state_size(self): - """The `state_size` property of `TimeAttentionWrapper`. - Returns: - A `TimeAttentionWrapperState` tuple containing shapes - used by this object. - """ - - # use AttentionWrapperState from superclass - state_size = super(TimeAttentionWrapper, self).state_size - - all_cell_states = self._cell.state_size - - return TimeAttentionWrapperState( - cell_state=state_size.cell_state, - time=state_size.time, - attention=state_size.attention, - alignments=state_size.alignments, - attention_state=state_size.attention_state, - alignment_history=state_size.alignment_history, - all_time_masks=self._sequence_len, - all_cell_states=all_cell_states, - ) - - def zero_state(self, batch_size, dtype): - """Modified from tensorflow's zero_state - see there for description of the parameters""" - - # use AttentionWrapperState from superclass - zero_state = super(TimeAttentionWrapper, self).zero_state(batch_size, dtype) - - with tf.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): - # store time masks - all_time_masks = tf.TensorArray( - tf.int32, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, tf.zeros([batch_size, self.state_size.all_time_masks], tf.int32)) - - # store all cell states into a tensor array to allow - # copy mechanism to go back in time - if isinstance(self._cell.state_size, tf.contrib.rnn.LSTMStateTuple): - all_cell_states = tf.contrib.rnn.LSTMStateTuple( - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.c), - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.h), - ) - else: - all_cell_states = tf.TensorArray( - dtype, size=0, dynamic_size=False, clear_after_read=False - ).write(0, zero_state.cell_state) - - return TimeAttentionWrapperState( - cell_state=zero_state.cell_state, - time=zero_state.time, - attention=zero_state.attention, - alignments=zero_state.alignments, - attention_state=zero_state.attention_state, - alignment_history=zero_state.alignment_history, - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - - def call(self, inputs, state): - """Perform a step of attention-wrapped RNN. - - The order has changed: - - Step 1: Calculate attention inputs based on the previous cell state - and current inputs - - Step 2: Score the output with `attention_mechanism`. - - Step 3: Calculate the alignments by passing the score through the - `normalizer` and limit them by time. - - Step 4: Calculate the context vector as the inner product between the - alignments and the attention_mechanism's values (memory). - - Step 5: Calculate the attention output by concatenating - the cell output and context through the attention layer - (a linear layer with `attention_layer_size` outputs). - - Step 6: Mix the `inputs` and `attention` output via - `cell_input_fn` to get cell inputs. - - Step 7: Call the wrapped `cell` with these cell inputs and - its previous state. - - Step 8: (optional) Maybe copy output and cell state from history - - Args: - inputs: (Possibly nested tuple of) Tensor, - the input at this time step. - state: An instance of `TimeAttentionWrapperState` - containing tensors from the previous time step. - - Returns: - A tuple `(attention_or_cell_output, next_state)`, where: - - - `attention_or_cell_output` depending on `output_attention`. 
- - `next_state` is an instance of `TimeAttentionWrapperState` - containing the state calculated at this time step. - - Raises: - TypeError: If `state` is not an instance of - `TimeAttentionWrapperState`. - """ - if not isinstance(state, TimeAttentionWrapperState): - raise TypeError( - "Expected state to be instance of " - "TimeAttentionWrapperState. " - "Received type {} instead.".format(type(state)) - ) - - # Step 1: Calculate attention based on - # the previous output and current input - cell_state = state.cell_state - - rnn_inputs, attn_inputs = self._rnn_and_attn_inputs_fn(inputs, cell_state) - - cell_batch_size = attn_inputs.shape[0].value or tf.shape(attn_inputs)[0] - error_message = ( - "When applying AttentionWrapper %s: " % self.name - + "Non-matching batch sizes between the memory " - "(encoder output) and the query (decoder output). " - "Are you using " - "the BeamSearchDecoder? " - "You may need to tile your memory input via " - "the tf.contrib.seq2seq.tile_batch function with argument " - "multiple=beam_width." - ) - with tf.control_dependencies( - self._batch_size_checks(cell_batch_size, error_message) - ): - attn_inputs = tf.identity(attn_inputs, name="checked_attn_inputs") - - if self._is_multi: - previous_attention_state = state.attention_state - previous_alignment_history = state.alignment_history - else: - previous_attention_state = [state.attention_state] - previous_alignment_history = [state.alignment_history] - - all_alignments = [] - all_attentions = [] - all_attention_states = [] - maybe_all_histories = [] - - prev_time_masks = self._read_from_tensor_array(state.all_time_masks, state.time) - prev_time_mask = prev_time_masks[:, -1, :] - - for i, attention_mechanism in enumerate(self._attention_mechanisms): - # Steps 2 - 5 are performed inside `_compute_time_attention` - (attention, alignments, next_attention_state) = _compute_time_attention( - attention_mechanism, - attn_inputs, - previous_attention_state[i], - # time is added to calculate time attention - state.time, - self._timed_ntms[i], - # provide boolean masks, to ignore some time steps - prev_time_mask, - self._ignore_mask[i], - self._attention_layers[i] if self._attention_layers else None, - ) - - alignment_history = ( - previous_alignment_history[i].write(state.time, alignments) - if self._alignment_history - else () - ) - - all_attention_states.append(next_attention_state) - all_alignments.append(alignments) - all_attentions.append(attention) - maybe_all_histories.append(alignment_history) - - attention = tf.concat(all_attentions, 1) - - # Step 6: Mix the `inputs` and `attention` output via - # `cell_input_fn` to get cell inputs. - cell_inputs = self._cell_input_fn(rnn_inputs, attention) - - # Step 7: Call the wrapped `cell` with these cell inputs and - # its previous state. 
- cell_output, next_cell_state = self._cell(cell_inputs, cell_state) - - prev_all_cell_states = state.all_cell_states - - time_mask = tf.concat( - [ - prev_time_mask[:, : state.time], - tf.ones_like(prev_time_mask[:, :1]), - prev_time_mask[:, state.time + 1 :], - ], - 1, - ) - - if self._index_of_attn_to_copy is not None: - # Step 8: Maybe copy output and cell state from history - - # get relevant previous outputs from history - attn_to_copy = all_attentions[self._index_of_attn_to_copy] - # copy them to current output - cell_output_with_attn = cell_output + attn_to_copy - - memory_probs = self._get_memory_probs(all_alignments, state.time) - - # check that we do not pay attention to `tensor_not_to_copy` - bin_likelihood_not_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, self._tensor_not_to_copy - ) - # recalculate probs - memory_probs *= 1 - bin_likelihood_not_to_copy - - history_alignments = self._history_alignments(memory_probs) - - # get previous output from the history - prev_output = self._prev_output( - cell_output_with_attn, history_alignments, state.time - ) - - # check that current output is close to - # the one in the history to which we pay attention to - bin_likelihood_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, prev_output - ) - # recalculate probs - memory_probs *= bin_likelihood_to_copy - - history_alignments = self._history_alignments(memory_probs) - current_time_prob = history_alignments[:, -1:] - - # create additional likelihoods to maximize - attn_likelihood = self._additional_likelihood( - attn_to_copy, prev_output, current_time_prob - ) - state_likelihood = self._additional_likelihood( - cell_output + tf.stop_gradient(attn_to_copy), - prev_output, - current_time_prob, - ) - - # recalculate time_mask - time_mask = self._apply_alignments_to_history( - tf.cast(history_alignments, time_mask.dtype), - prev_time_masks[:, :-1, :], - time_mask, - ) - - # recalculate new next_cell_state based on history_alignments - next_cell_state = self._new_next_cell_state( - prev_all_cell_states, - next_cell_state, - cell_output_with_attn, - history_alignments, - state.time, - ) - - all_cell_states = self._all_cell_states( - prev_all_cell_states, next_cell_state, state.time - ) - - if self._output_attention: - # concatenate cell outputs, attention, additional likelihoods - # and copy_attn_debug - output = tf.concat( - [ - cell_output_with_attn, - cell_output, - attention, - # additional likelihoods - attn_likelihood, - state_likelihood, - # copy_attn_debug - bin_likelihood_not_to_copy, - bin_likelihood_to_copy, - current_time_prob, - ], - 1, - ) - else: - output = cell_output_with_attn - - else: - # do not waste resources on storing history - all_cell_states = prev_all_cell_states - - if self._output_attention: - output = tf.concat([cell_output, attention], 1) - else: - output = cell_output - - all_time_masks = state.all_time_masks.write(state.time + 1, time_mask) - - next_state = TimeAttentionWrapperState( - time=state.time + 1, - cell_state=next_cell_state, - attention=attention, - attention_state=self._item_or_tuple(all_attention_states), - alignments=self._item_or_tuple(all_alignments), - alignment_history=self._item_or_tuple(maybe_all_histories), - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - return output, next_state - - # helper for TensorArray - @staticmethod - def _read_from_tensor_array(tensor_array, time): - """TensorArray time reader""" - return tf.transpose(tensor_array.gather(tf.range(0, time + 1)), [1, 0, 2]) - - # helper 
methods for copy mechanism - def _get_memory_probs(self, all_alignments, time): - """Helper method to get memory_probs from all_alignments""" - - memory_probs = tf.stop_gradient( - all_alignments[self._index_of_attn_to_copy][:, :time] - ) - - # binarize memory_probs only if max value is larger than margin=0.1 - memory_probs_max = tf.reduce_max(memory_probs, axis=1, keepdims=True) - memory_probs_max = tf.where( - memory_probs_max > 0.1, memory_probs_max, -memory_probs_max - ) - - return tf.where( - tf.equal(memory_probs, memory_probs_max), - tf.ones_like(memory_probs), - tf.zeros_like(memory_probs), - ) - - @staticmethod - def _history_alignments(memory_probs): - """Helper method to apply binary mask to memory_probs""" - - current_time_prob = 1 - tf.reduce_sum(memory_probs, 1, keepdims=True) - return tf.concat([memory_probs, current_time_prob], 1) - - @staticmethod - def _apply_alignments_to_history(alignments, history_states, state): - """Helper method to apply attention probabilities to rnn history - - copied from tf's `_compute_attention(...)`""" - - expanded_alignments = tf.stop_gradient(tf.expand_dims(alignments, 1)) - - history_states = tf.concat([history_states, tf.expand_dims(state, 1)], 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. - # expanded_alignments shape is - # [batch_size, 1, memory_time] - # history_states shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - - return tf.squeeze(tf.matmul(expanded_alignments, history_states), [1]) - - def _prev_output(self, state, alignments, time): - """Helper method to get previous output from memory""" - - # get all previous outputs from appropriate - # attention mechanism's memory limited by current time - prev_outputs = tf.stop_gradient( - self._attention_mechanisms[self._index_of_attn_to_copy].values[:, :time, :] - ) - - # multiply by alignments to get one vector from one time step - return self._apply_alignments_to_history(alignments, prev_outputs, state) - - def _additional_likelihood(self, output, prev_output, current_time_prob): - """Helper method to create additional likelihood to maximize""" - - _, likelihood = self._likelihood_fn(output, tf.stop_gradient(prev_output)) - return tf.where(current_time_prob < 0.5, likelihood, tf.ones_like(likelihood)) - - def _new_hidden_state(self, prev_all_cell_states, new_state, alignments, time): - """Helper method to look into rnn history""" - - # reshape to (batch, time, memory_time) and - # do not include current time because - # we do not want to pay attention to it, - # but we need to read it instead of - # adding conditional flow if time == 0 - prev_cell_states = self._read_from_tensor_array(prev_all_cell_states, time)[ - :, :-1, : - ] - - return self._apply_alignments_to_history( - alignments, prev_cell_states, new_state - ) - - def _new_next_cell_state( - self, prev_all_cell_states, next_cell_state, new_cell_output, alignments, time - ): - """Helper method to recalculate new next_cell_state""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - next_cell_state_c = self._new_hidden_state( - prev_all_cell_states.c, next_cell_state.c, alignments, time - ) - next_cell_state_h = self._new_hidden_state( - prev_all_cell_states.h, new_cell_output, alignments, time - ) - return tf.contrib.rnn.LSTMStateTuple(next_cell_state_c, next_cell_state_h) - else: - return 
self._new_hidden_state( - prev_all_cell_states, alignments, new_cell_output, time - ) - - @staticmethod - def _all_cell_states(prev_all_cell_states, next_cell_state, time): - """Helper method to recalculate all_cell_states tensor array""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - return tf.contrib.rnn.LSTMStateTuple( - prev_all_cell_states.c.write(time + 1, next_cell_state.c), - prev_all_cell_states.h.write(time + 1, next_cell_state.h), - ) - else: - return prev_all_cell_states.write(time + 1, next_cell_state) - - -class ChronoBiasLayerNormBasicLSTMCell(tf.contrib.rnn.LayerNormBasicLSTMCell): - """Custom LayerNormBasicLSTMCell that allows chrono initialization - of gate biases. - - See super class for description. - - See https://arxiv.org/abs/1804.11188 - for details about chrono initialization - """ - - def __init__( - self, - num_units, - forget_bias=1.0, - input_bias=0.0, - activation=tf.tanh, - layer_norm=True, - norm_gain=1.0, - norm_shift=0.0, - dropout_keep_prob=1.0, - dropout_prob_seed=None, - out_layer_size=None, - reuse=None, - ): - """Initializes the basic LSTM cell - - Additional args: - input_bias: float, The bias added to input gates. - out_layer_size: (optional) integer, The number of units in - the optional additional output layer. - """ - super(ChronoBiasLayerNormBasicLSTMCell, self).__init__( - num_units, - forget_bias=forget_bias, - activation=activation, - layer_norm=layer_norm, - norm_gain=norm_gain, - norm_shift=norm_shift, - dropout_keep_prob=dropout_keep_prob, - dropout_prob_seed=dropout_prob_seed, - reuse=reuse, - ) - self._input_bias = input_bias - self._out_layer_size = out_layer_size - - @property - def output_size(self): - return self._out_layer_size or self._num_units - - @property - def state_size(self): - return tf.contrib.rnn.LSTMStateTuple(self._num_units, self.output_size) - - @staticmethod - def _dense_layer(args, layer_size): - """Optional out projection layer""" - proj_size = args.get_shape()[-1] - dtype = args.dtype - weights = tf.get_variable("kernel", [proj_size, layer_size], dtype=dtype) - bias = tf.get_variable("bias", [layer_size], dtype=dtype) - out = tf.nn.bias_add(tf.matmul(args, weights), bias) - return out - - def call(self, inputs, state): - """LSTM cell with layer normalization and recurrent dropout.""" - c, h = state - args = tf.concat([inputs, h], 1) - concat = self._linear(args) - dtype = args.dtype - - i, j, f, o = tf.split(value=concat, num_or_size_splits=4, axis=1) - if self._layer_norm: - i = self._norm(i, "input", dtype=dtype) - j = self._norm(j, "transform", dtype=dtype) - f = self._norm(f, "forget", dtype=dtype) - o = self._norm(o, "output", dtype=dtype) - - g = self._activation(j) - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - g = tf.nn.dropout(g, self._keep_prob, seed=self._seed) - - new_c = c * tf.sigmoid(f + self._forget_bias) + g * tf.sigmoid( - i + self._input_bias - ) # added input_bias - - # do not do layer normalization on the new c, - # because there are no trainable weights - # if self._layer_norm: - # new_c = self._norm(new_c, "state", dtype=dtype) - - new_h = self._activation(new_c) * tf.sigmoid(o) - - # added dropout to the hidden state h - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - new_h = tf.nn.dropout(new_h, self._keep_prob, seed=self._seed) - - # add postprocessing of the output - if self._out_layer_size is not None: - with tf.variable_scope("out_layer"): - new_h = self._dense_layer(new_h, self._out_layer_size) - - new_state = 
tf.contrib.rnn.LSTMStateTuple(new_c, new_h) - return new_h, new_state diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index a613321cb6ba..3fa504c3c79d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -11,6 +11,11 @@ from rasa.nlu.components import Component from rasa.utils.common import is_logging_disabled +import tensorflow as tf + +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None + logger = logging.getLogger(__name__) if typing.TYPE_CHECKING: @@ -20,14 +25,6 @@ from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message -try: - import tensorflow as tf - - # avoid warning println on contrib import - remove for tf 2 - tf.contrib._warning = None -except ImportError: - tf = None - class EmbeddingIntentClassifier(Component): """Intent classifier using supervised embeddings. @@ -120,7 +117,6 @@ def __init__( ) -> None: """Declare instant variables with default values""" - self._check_tensorflow() super(EmbeddingIntentClassifier, self).__init__(component_config) self._load_params() @@ -195,15 +191,6 @@ def _load_params(self) -> None: def required_packages(cls) -> List[Text]: return ["tensorflow"] - @staticmethod - def _check_tensorflow(): - if tf is None: - raise ImportError( - "Failed to import `tensorflow`. " - "Please install `tensorflow`. " - "For example with `pip install tensorflow`." - ) - # training data helpers: @staticmethod def _create_intent_dict(training_data: "TrainingData") -> Dict[Text, int]: diff --git a/requirements.txt b/requirements.txt index 8d5b5ab60992..66dce9b78614 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,9 @@ fakeredis==1.0.3 pymongo==3.8.0 numpy==1.16.3 scipy==1.2.1 -tensorflow==1.13.1 +tensorflow==1.14.0 +tensorflow-probability==0.7.0 +tensor2tensor==1.13.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.3 @@ -47,3 +49,4 @@ SQLAlchemy~=1.3.3 kafka-python==1.4.6 sklearn-crfsuite==0.3.6 psycopg2-binary==2.8.2 +setuptools==41.0.1 diff --git a/setup.py b/setup.py index 85d291abc696..46f18776d41f 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,9 @@ "pymongo~=3.8", "numpy~=1.16", "scipy~=1.2", - "tensorflow~=1.13.0", + "tensorflow~=1.14.0", + "tensorflow-probability~=0.7.0", + "tensor2tensor~=1.13.4", "apscheduler~=3.0", "tqdm~=4.0", "networkx~=2.3", @@ -74,6 +76,7 @@ "SQLAlchemy~=1.3.0", "kafka-python~=1.4", "sklearn-crfsuite~=0.3.6", + "setuptools~=41.0.1", ] extras_requires = { diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index c496d2682ca3..4f8692ff23c3 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -21,7 +21,9 @@ from rasa.core.events import ActionExecuted from rasa.core.featurizers import ( BinarySingleStateFeaturizer, + LabelTokenizerSingleStateFeaturizer, MaxHistoryTrackerFeaturizer, + FullDialogueTrackerFeaturizer, ) from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy from rasa.core.policies.embedding_policy import EmbeddingPolicy @@ -120,6 +122,26 @@ async def trained_policy(self, featurizer, priority): policy.train(training_trackers, default_domain) return policy + def test_featurizer(self, trained_policy, tmpdir): + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history + assert isinstance( + trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer + ) + 
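# Round-trip the policy through persist/load and verify that the featurizer + # configuration (featurizer type, max_history and state featurizer) survives + # serialization. + 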
trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history + assert isinstance( + loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer + ) + + async def test_continue_training(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + trained_policy.continue_training( + training_trackers, default_domain, **{"epochs": 1} + ) + async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) @@ -156,12 +178,14 @@ def test_persist_and_load_empty_policy(self, tmpdir): def test_tf_config(self, trained_policy, tmpdir): if hasattr(trained_policy, "session"): + import tensorflow as tf + # noinspection PyProtectedMember - assert trained_policy.session._config is None + assert trained_policy.session._config == tf.Session()._config trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) # noinspection PyProtectedMember - assert loaded.session._config is None + assert loaded.session._config == tf.Session()._config @staticmethod def _get_next_action(policy, events, domain): @@ -192,154 +216,6 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFallbackPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = FallbackPolicy(priority=priority) - return p - - @pytest.mark.parametrize( - "nlu_confidence, last_action_name, should_nlu_fallback", - [ - (0.1, "some_action", False), - (0.1, "action_listen", True), - (0.9, "some_action", False), - (0.9, "action_listen", False), - ], - ) - def test_should_nlu_fallback( - self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback - ): - assert ( - trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) - is should_nlu_fallback - ) - - -class TestMappingPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = MappingPolicy() - return p - - @pytest.fixture(scope="module") - def domain_with_mapping(self): - return Domain.load(DEFAULT_DOMAIN_PATH_WITH_MAPPING) - - @pytest.fixture - def tracker(self, domain_with_mapping): - return DialogueStateTracker( - UserMessage.DEFAULT_SENDER_ID, domain_with_mapping.slots - ) - - @pytest.fixture( - params=[ - ("default", "utter_default"), - ("greet", "utter_greet"), - (USER_INTENT_RESTART, ACTION_RESTART_NAME), - (USER_INTENT_BACK, ACTION_BACK_NAME), - ] - ) - def intent_mapping(self, request): - return request.param - - def test_predict_mapped_action(self, priority, domain_with_mapping, intent_mapping): - policy = self.create_policy(None, priority) - events = [ - ActionExecuted(ACTION_LISTEN_NAME), - user_uttered(intent_mapping[0], 1), - ] - - assert ( - self._get_next_action(policy, events, domain_with_mapping) - == intent_mapping[1] - ) - - def test_predict_action_listen(self, priority, domain_with_mapping, intent_mapping): - policy = self.create_policy(None, priority) - events = [ - ActionExecuted(ACTION_LISTEN_NAME), - user_uttered(intent_mapping[0], 1), - ActionExecuted(intent_mapping[1], policy="policy_0_MappingPolicy"), - ] - tracker = get_tracker(events) - scores = policy.predict_action_probabilities(tracker, domain_with_mapping) - index = scores.index(max(scores)) - action_planned = 
domain_with_mapping.action_names[index] - assert action_planned == ACTION_LISTEN_NAME - assert scores != [0] * domain_with_mapping.num_actions - - def test_do_not_follow_other_policy( - self, priority, domain_with_mapping, intent_mapping - ): - policy = self.create_policy(None, priority) - events = [ - ActionExecuted(ACTION_LISTEN_NAME), - user_uttered(intent_mapping[0], 1), - ActionExecuted(intent_mapping[1], policy="other_policy"), - ] - tracker = get_tracker(events) - scores = policy.predict_action_probabilities(tracker, domain_with_mapping) - assert scores == [0] * domain_with_mapping.num_actions - - -class TestMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = MemoizationPolicy(priority=priority, max_history=max_history) - return p - - async def test_memorise(self, trained_policy, default_domain): - trackers = await train_trackers(default_domain, augmentation_factor=20) - trained_policy.train(trackers, default_domain) - lookup_with_augmentation = trained_policy.lookup - - trackers = [t for t in trackers if not hasattr(t, "is_augmented")] - - all_states, all_actions = trained_policy.featurizer.training_states_and_actions( - trackers, default_domain - ) - - for tracker, states, actions in zip(trackers, all_states, all_actions): - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled == default_domain.index_for_action(actions[0]) - - nums = np.random.randn(default_domain.num_states) - random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] - assert trained_policy._recall_states(random_states) is None - - # compare augmentation for augmentation_factor of 0 and 20: - trackers_no_augmentation = await train_trackers( - default_domain, augmentation_factor=0 - ) - trained_policy.train(trackers_no_augmentation, default_domain) - lookup_no_augmentation = trained_policy.lookup - - assert lookup_no_augmentation == lookup_with_augmentation - - def test_memorise_with_nlu(self, trained_policy, default_domain): - filename = "data/test_dialogues/default.json" - dialogue = read_dialogue_file(filename) - - tracker = DialogueStateTracker(dialogue.name, default_domain.slots) - tracker.recreate_from_dialogue(dialogue) - states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ - 0 - ] - - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled is not None - - -class TestAugmentedMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) - return p - - class TestSklearnPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority, **kwargs): p = SklearnPolicy(featurizer, priority, **kwargs) @@ -463,51 +339,116 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicyNoAttention(PolicyTestCollection): +class TestEmbeddingPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=False - ) + p = EmbeddingPolicy(featurizer=featurizer, priority=priority) return p + 
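As an illustrative sketch only (not part of this diff), assuming the featurizer classes imported at the top of this test module and hypothetical max_history/priority values: the reworked EmbeddingPolicy takes its featurizer explicitly, which is what the featurizer tests further down assert on the trained policy.

    # Minimal sketch; constructor names mirror the imports and calls in this test
    # module, while max_history=5 and priority=1 are assumed example values.
    from rasa.core.featurizers import (
        LabelTokenizerSingleStateFeaturizer,
        MaxHistoryTrackerFeaturizer,
    )
    from rasa.core.policies.embedding_policy import EmbeddingPolicy

    featurizer = MaxHistoryTrackerFeaturizer(
        LabelTokenizerSingleStateFeaturizer(), max_history=5
    )
    policy = EmbeddingPolicy(featurizer=featurizer, priority=1)
    # policy.train(training_trackers, domain) would then featurize each tracker
    # using only the last 5 dialogue states.
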
def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "inner" + + async def test_gen_batch(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + training_data = trained_policy.featurize_for_training( + training_trackers, default_domain + ) + session_data = trained_policy._create_session_data( + training_data.X, training_data.y + ) + batch_size = 2 + batch_x, batch_y = next( + trained_policy._gen_batch(session_data=session_data, batch_size=batch_size) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape + ) + batch_x, batch_y = next( + trained_policy._gen_batch( + session_data=session_data, + batch_size=batch_size, + batch_strategy="balanced", + shuffle=True, + ) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape + ) + -class TestEmbeddingPolicyAttentionBeforeRNN(PolicyTestCollection): +class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=False + featurizer=featurizer, priority=priority, **{"loss_type": "margin"} ) return p + def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "cosine" + -class TestEmbeddingPolicyAttentionAfterRNN(PolicyTestCollection): +class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=True + featurizer=featurizer, + priority=priority, + **{"scale_loss": False, "evaluate_on_num_examples": 4} ) return p -class TestEmbeddingPolicyAttentionBoth(PolicyTestCollection): +class TestEmbeddingPolicyWithFullDialogue(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=True - ) + # if max_history is not specified + p = EmbeddingPolicy(priority=priority) return p + def test_featurizer(self, trained_policy, tmpdir): + assert isinstance(trained_policy.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert isinstance(loaded.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer + ) -class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): + +class TestEmbeddingPolicyWithMaxHistory(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy(priority=priority, **tf_defaults()) + # since it is using MaxHistoryTrackerFeaturizer + # if max_history is specified + p = EmbeddingPolicy(priority=priority, 
max_history=self.max_history) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer + ) + + +class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) return p def test_tf_config(self, trained_policy, tmpdir): @@ -519,12 +460,84 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFormPolicy(PolicyTestCollection): +class TestMemoizationPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = MemoizationPolicy(priority=priority, max_history=max_history) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.state_featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.state_featurizer is None + + async def test_memorise(self, trained_policy, default_domain): + trackers = await train_trackers(default_domain, augmentation_factor=20) + trained_policy.train(trackers, default_domain) + lookup_with_augmentation = trained_policy.lookup + + trackers = [ + t for t in trackers if not hasattr(t, "is_augmented") or not t.is_augmented + ] + + ( + all_states, + all_actions, + ) = trained_policy.featurizer.training_states_and_actions( + trackers, default_domain + ) + + for tracker, states, actions in zip(trackers, all_states, all_actions): + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled == default_domain.index_for_action(actions[0]) + + nums = np.random.randn(default_domain.num_states) + random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] + assert trained_policy._recall_states(random_states) is None + + # compare augmentation for augmentation_factor of 0 and 20: + trackers_no_augmentation = await train_trackers( + default_domain, augmentation_factor=0 + ) + trained_policy.train(trackers_no_augmentation, default_domain) + lookup_no_augmentation = trained_policy.lookup + + assert lookup_no_augmentation == lookup_with_augmentation + + def test_memorise_with_nlu(self, trained_policy, default_domain): + filename = "data/test_dialogues/default.json" + dialogue = read_dialogue_file(filename) + + tracker = DialogueStateTracker(dialogue.name, default_domain.slots) + tracker.recreate_from_dialogue(dialogue) + states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ + 0 + ] + + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled is not None + + +class TestAugmentedMemoizationPolicy(TestMemoizationPolicy): + def 
create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) + return p + + +class TestFormPolicy(TestMemoizationPolicy): def create_policy(self, featurizer, priority): p = FormPolicy(priority=priority) return p - async def test_memorise(self, trained_policy): + async def test_memorise(self, trained_policy, default_domain): domain = Domain.load("data/test_domains/form.yml") trackers = await training.load_data("data/test_stories/stories_form.md", domain) trained_policy.train(trackers, domain) @@ -582,8 +595,112 @@ async def test_memorise(self, trained_policy): random_states = [{f: num for f, num in zip(domain.input_states, nums)}] assert trained_policy.recall(random_states, None, domain) is None + def test_memorise_with_nlu(self, trained_policy, default_domain): + pass + + +class TestMappingPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = MappingPolicy() + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + @pytest.fixture(scope="module") + def domain_with_mapping(self): + return Domain.load(DEFAULT_DOMAIN_PATH_WITH_MAPPING) + + @pytest.fixture + def tracker(self, domain_with_mapping): + return DialogueStateTracker( + UserMessage.DEFAULT_SENDER_ID, domain_with_mapping.slots + ) + + @pytest.fixture( + params=[ + ("default", "utter_default"), + ("greet", "utter_greet"), + (USER_INTENT_RESTART, ACTION_RESTART_NAME), + (USER_INTENT_BACK, ACTION_BACK_NAME), + ] + ) + def intent_mapping(self, request): + return request.param + + def test_predict_mapped_action(self, priority, domain_with_mapping, intent_mapping): + policy = self.create_policy(None, priority) + events = [ + ActionExecuted(ACTION_LISTEN_NAME), + user_uttered(intent_mapping[0], 1), + ] + + assert ( + self._get_next_action(policy, events, domain_with_mapping) + == intent_mapping[1] + ) + + def test_predict_action_listen(self, priority, domain_with_mapping, intent_mapping): + policy = self.create_policy(None, priority) + events = [ + ActionExecuted(ACTION_LISTEN_NAME), + user_uttered(intent_mapping[0], 1), + ActionExecuted(intent_mapping[1], policy="policy_0_MappingPolicy"), + ] + tracker = get_tracker(events) + scores = policy.predict_action_probabilities(tracker, domain_with_mapping) + index = scores.index(max(scores)) + action_planned = domain_with_mapping.action_names[index] + assert action_planned == ACTION_LISTEN_NAME + assert scores != [0] * domain_with_mapping.num_actions + + def test_do_not_follow_other_policy( + self, priority, domain_with_mapping, intent_mapping + ): + policy = self.create_policy(None, priority) + events = [ + ActionExecuted(ACTION_LISTEN_NAME), + user_uttered(intent_mapping[0], 1), + ActionExecuted(intent_mapping[1], policy="other_policy"), + ] + tracker = get_tracker(events) + scores = policy.predict_action_probabilities(tracker, domain_with_mapping) + assert scores == [0] * domain_with_mapping.num_actions + + +class TestFallbackPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = FallbackPolicy(priority=priority) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = 
trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + @pytest.mark.parametrize( + "nlu_confidence, last_action_name, should_nlu_fallback", + [ + (0.1, "some_action", False), + (0.1, "action_listen", True), + (0.9, "some_action", False), + (0.9, "action_listen", False), + ], + ) + def test_should_nlu_fallback( + self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback + ): + assert ( + trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) + is should_nlu_fallback + ) + -class TestTwoStageFallbackPolicy(PolicyTestCollection): +class TestTwoStageFallbackPolicy(TestFallbackPolicy): def create_policy(self, featurizer, priority): p = TwoStageFallbackPolicy( priority=priority, deny_suggestion_intent_name="deny"