From 0322eebdd3819ac3b37c448c94dd36346294e162 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 5 Jul 2019 16:41:57 +0200 Subject: [PATCH 01/50] copy files from nlu_lstm --- rasa/core/featurizers.py | 12 +- rasa/core/policies/embedding_policy.py | 1336 +++++++++++++++++------- 2 files changed, 961 insertions(+), 387 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 76634cac1517..737b4e22a80e 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -177,7 +177,7 @@ def prepare_from_domain(self, domain: Domain) -> None: """Creates internal vocabularies for user intents and bot actions to use for featurization""" self.user_labels = domain.intent_states + domain.entity_states - self.slot_labels = domain.slot_states + self.slot_labels = domain.slot_states + domain.form_states self.bot_labels = domain.action_names if self.use_shared_vocab: @@ -249,7 +249,7 @@ def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: """Create matrix with all actions from domain encoded in rows as bag of words.""" encoded_all_actions = np.zeros( - (domain.num_actions, len(self.bot_vocab)), dtype=int + (domain.num_actions, len(self.bot_vocab)), dtype=np.int32 ) for idx, name in enumerate(domain.action_names): for t in name.split(self.split_symbol): @@ -361,8 +361,10 @@ def _featurize_labels( labels.append(story_labels) + y = np.array(labels) # if it is MaxHistoryFeaturizer, squeeze out time axis - y = np.array(labels).squeeze() + if y.shape[1] == 1 and isinstance(self, MaxHistoryTrackerFeaturizer): + y = y[:, 0, :] return y @@ -410,7 +412,7 @@ def create_X( def persist(self, path): featurizer_file = os.path.join(path, "featurizer.json") - rasa.utils.io.create_directory_for_file(featurizer_file) + utils.create_dir_for_file(featurizer_file) with open(featurizer_file, "w", encoding="utf-8") as f: # noinspection PyTypeChecker f.write(str(jsonpickle.encode(self))) @@ -566,7 +568,7 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: + ) -> Tuple[List[List[Dict]], List[List[Text]]]: trackers_as_states = [] trackers_as_actions = [] diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index d1e5e1864cf1..bd6f9bcd9fb6 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -18,10 +18,17 @@ TrackerFeaturizer, FullDialogueTrackerFeaturizer, LabelTokenizerSingleStateFeaturizer, + MaxHistoryTrackerFeaturizer ) from rasa.core.policies.policy import Policy import tensorflow as tf +from tensorflow.python.ops import gen_array_ops +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder +from tensor2tensor.models.evolved_transformer import evolved_transformer_encoder + from rasa.core.policies.tf_utils import ( TimeAttentionWrapper, ChronoBiasLayerNormBasicLSTMCell, @@ -33,11 +40,10 @@ from rasa.core.policies.tf_utils import TimeAttentionWrapperState try: - import cPickle as pickle # pytype: disable=import-error + import cPickle as pickle except ImportError: import pickle -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 logger = logging.getLogger(__name__) @@ -75,8 +81,17 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed 
layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_b": [], + + "transformer": False, + "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} + # introduce phase shift in time encodings between transformers + # 0.5 - 0.8 works on small dataset + "pos_max_timescale": 1.0e1, + "max_seq_length": 256, + "num_heads": 4, # number of units in rnn cell - "rnn_size": 64, + "rnn_size": 128, + "num_rnn_layers": 1, # training parameters # flag if to turn on layer normalization for lstm cell "layer_norm": True, @@ -136,44 +151,51 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) - @classmethod - def _standard_featurizer(cls): - return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + @staticmethod + def _standard_featurizer(max_history=None): + if max_history is None: + return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + else: + return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) def __init__( self, - featurizer: Optional[FullDialogueTrackerFeaturizer] = None, + featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, priority: int = 1, - encoded_all_actions: Optional[np.ndarray] = None, - graph: Optional[tf.Graph] = None, - session: Optional[tf.Session] = None, - intent_placeholder: Optional[tf.Tensor] = None, - action_placeholder: Optional[tf.Tensor] = None, - slots_placeholder: Optional[tf.Tensor] = None, - prev_act_placeholder: Optional[tf.Tensor] = None, - dialogue_len: Optional[tf.Tensor] = None, - x_for_no_intent: Optional[tf.Tensor] = None, - y_for_no_action: Optional[tf.Tensor] = None, - y_for_action_listen: Optional[tf.Tensor] = None, - similarity_op: Optional[tf.Tensor] = None, - alignment_history: Optional[tf.Tensor] = None, - user_embed: Optional[tf.Tensor] = None, - bot_embed: Optional[tf.Tensor] = None, - slot_embed: Optional[tf.Tensor] = None, - dial_embed: Optional[tf.Tensor] = None, - rnn_embed: Optional[tf.Tensor] = None, - attn_embed: Optional[tf.Tensor] = None, - copy_attn_debug: Optional[tf.Tensor] = None, - all_time_masks: Optional[tf.Tensor] = None, + encoded_all_actions: Optional['np.ndarray'] = None, + graph: Optional['tf.Graph'] = None, + session: Optional['tf.Session'] = None, + intent_placeholder: Optional['tf.Tensor'] = None, + action_placeholder: Optional['tf.Tensor'] = None, + slots_placeholder: Optional['tf.Tensor'] = None, + prev_act_placeholder: Optional['tf.Tensor'] = None, + dialogue_len: Optional['tf.Tensor'] = None, + x_for_no_intent: Optional['tf.Tensor'] = None, + y_for_no_action: Optional['tf.Tensor'] = None, + y_for_action_listen: Optional['tf.Tensor'] = None, + similarity_op: Optional['tf.Tensor'] = None, + alignment_history: Optional['tf.Tensor'] = None, + user_embed: Optional['tf.Tensor'] = None, + bot_embed: Optional['tf.Tensor'] = None, + slot_embed: Optional['tf.Tensor'] = None, + dial_embed: Optional['tf.Tensor'] = None, + rnn_embed: Optional['tf.Tensor'] = None, + attn_embed: Optional['tf.Tensor'] = None, + copy_attn_debug: Optional['tf.Tensor'] = None, + all_time_masks: Optional['tf.Tensor'] = None, + attention_weights=None, + max_history: Optional[int] = None, **kwargs: Any ) -> None: - if featurizer: - if not isinstance(featurizer, FullDialogueTrackerFeaturizer): - raise TypeError( - "Passed tracker featurizer of type {}, " - "should be FullDialogueTrackerFeaturizer." 
- "".format(type(featurizer).__name__) - ) + # if featurizer: + # if not isinstance(featurizer, FullDialogueTrackerFeaturizer): + # raise TypeError( + # "Passed tracker featurizer of type {}, " + # "should be FullDialogueTrackerFeaturizer." + # "".format(type(featurizer).__name__) + # ) + if not featurizer: + featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) # flag if to use the same embeddings for user and bot @@ -219,7 +241,7 @@ def __init__( self.copy_attn_debug = copy_attn_debug self.all_time_masks = all_time_masks - + self.attention_weights = attention_weights # internal tf instances self._train_op = None self._is_training = None @@ -244,8 +266,14 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] ) ) + self.transformer = config['transformer'] + self.pos_encoding = config['pos_encoding'] + self.pos_max_timescale = config['pos_max_timescale'] + self.max_seq_length = config['max_seq_length'] + self.num_heads = config['num_heads'] self.rnn_size = config["rnn_size"] + self.num_rnn_layers = config["num_rnn_layers"] self.layer_norm = config["layer_norm"] self.batch_size = config["batch_size"] @@ -329,14 +357,21 @@ def _actions_for_Y(data_Y: np.ndarray) -> np.ndarray: def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: """Prepare Y data for training: features for action labels.""" - return np.stack( - [ - np.stack( - [self.encoded_all_actions[action_idx] for action_idx in action_ids] - ) - for action_ids in actions_for_Y - ] - ) + if len(actions_for_Y.shape) == 2: + return np.stack( + [ + np.stack( + [self.encoded_all_actions[action_idx] for action_idx in action_ids] + ) + for action_ids in actions_for_Y + ] + ) + else: + return np.stack( + [ + self.encoded_all_actions[action_idx] for action_idx in actions_for_Y + ] + ) # noinspection PyPep8Naming @staticmethod @@ -382,7 +417,11 @@ def _create_tf_session_data( y_for_action_listen = self._create_y_for_action_listen(domain) # is needed to calculate train accuracy - all_Y_d = self._create_all_Y_d(X.shape[1]) + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + dial_len = X.shape[1] + else: + dial_len = 1 + all_Y_d = self._create_all_Y_d(dial_len) return SessionData( X=X, @@ -396,15 +435,14 @@ def _create_tf_session_data( all_Y_d=all_Y_d, ) - # tf helpers: - + # tf helpers: def _create_tf_nn( self, - x_in: tf.Tensor, + x_in: 'tf.Tensor', layer_sizes: List, droprate: float, layer_name_suffix: Text, - ) -> tf.Tensor: + ) -> 'tf.Tensor': """Create nn with hidden layers and name suffix.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -421,7 +459,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: + def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -435,7 +473,7 @@ def _create_embed(self, x: tf.Tensor, layer_name_suffix: Text) -> tf.Tensor: ) return embed_x - def _create_tf_user_embed(self, a_in: tf.Tensor) -> tf.Tensor: + def _create_tf_user_embed(self, a_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding user vector.""" layer_name_suffix = "a_and_b" if self.share_embedding else "a" @@ -448,7 +486,7 @@ def _create_tf_user_embed(self, a_in: tf.Tensor) -> tf.Tensor: ) return self._create_embed(a, 
layer_name_suffix=layer_name_suffix) - def _create_tf_bot_embed(self, b_in: tf.Tensor) -> tf.Tensor: + def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" layer_name_suffix = "a_and_b" if self.share_embedding else "b" @@ -461,7 +499,7 @@ def _create_tf_bot_embed(self, b_in: tf.Tensor) -> tf.Tensor: ) return self._create_embed(b, layer_name_suffix=layer_name_suffix) - def _create_tf_no_intent_embed(self, x_for_no_intent_i: tf.Tensor) -> tf.Tensor: + def _create_tf_no_intent_embed(self, x_for_no_intent_i: 'tf.Tensor') -> 'tf.Tensor': """Create embedding user vector for empty intent.""" layer_name_suffix = "a_and_b" if self.share_embedding else "a" @@ -476,7 +514,7 @@ def _create_tf_no_intent_embed(self, x_for_no_intent_i: tf.Tensor) -> tf.Tensor: self._create_embed(x_for_no_intent, layer_name_suffix=layer_name_suffix) ) - def _create_tf_no_action_embed(self, y_for_no_action_in: tf.Tensor) -> tf.Tensor: + def _create_tf_no_action_embed(self, y_for_no_action_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector for empty action and action_listen.""" layer_name_suffix = "a_and_b" if self.share_embedding else "b" @@ -491,7 +529,8 @@ def _create_tf_no_action_embed(self, y_for_no_action_in: tf.Tensor) -> tf.Tensor self._create_embed(y_for_no_action, layer_name_suffix=layer_name_suffix) ) - def _create_rnn_cell(self) -> tf.contrib.rnn.RNNCell: + def _create_rnn_cell(self): + # type: () -> tf.contrib.rnn.RNNCell """Create one rnn cell.""" # chrono initialization for forget bias @@ -524,11 +563,11 @@ def _create_rnn_cell(self) -> tf.contrib.rnn.RNNCell: ) @staticmethod - def _num_units(memory: tf.Tensor) -> int: + def _num_units(memory: 'tf.Tensor') -> int: return memory.shape[-1].value def _create_attn_mech( - self, memory: tf.Tensor, real_length: tf.Tensor + self, memory: 'tf.Tensor', real_length: 'tf.Tensor' ) -> tf.contrib.seq2seq.AttentionMechanism: return tf.contrib.seq2seq.BahdanauAttention( @@ -545,10 +584,10 @@ def _create_attn_mech( def cell_input_fn( self, - rnn_inputs: tf.Tensor, - attention: tf.Tensor, + rnn_inputs: 'tf.Tensor', + attention: 'tf.Tensor', num_cell_input_memory_units: int, - ) -> tf.Tensor: + ) -> 'tf.Tensor': """Combine rnn inputs and attention into cell input. Args: @@ -594,8 +633,8 @@ def cell_input_fn( return rnn_inputs def rnn_and_attn_inputs_fn( - self, inputs: tf.Tensor, cell_state: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: + self, inputs: 'tf.Tensor', cell_state: 'tf.Tensor' + ) -> Tuple['tf.Tensor', 'tf.Tensor']: """Construct rnn input and attention mechanism input. 
Args: @@ -626,19 +665,20 @@ def rnn_and_attn_inputs_fn( def _create_attn_cell( self, cell: tf.contrib.rnn.RNNCell, - embed_utter: tf.Tensor, - embed_prev_action: tf.Tensor, - real_length: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, + embed_utter: 'tf.Tensor', + embed_prev_action: 'tf.Tensor', + real_length: 'tf.Tensor', + embed_for_no_intent: 'tf.Tensor', + embed_for_no_action: 'tf.Tensor', + embed_for_action_listen: 'tf.Tensor', ) -> tf.contrib.rnn.RNNCell: """Wrap cell in attention wrapper with given memory.""" if self.attn_before_rnn: # create attention over previous user input num_memory_units_before_rnn = self._num_units(embed_utter) - attn_mech = self._create_attn_mech(embed_utter, real_length) + with tf.variable_scope('before', reuse=tf.AUTO_REUSE): + attn_mech = self._create_attn_mech(embed_utter, real_length) # create mask for empty user input not to pay attention to it ignore_mask = tf.reduce_all( @@ -655,7 +695,8 @@ def _create_attn_cell( if self.attn_after_rnn: # create attention over previous bot actions - attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) + with tf.variable_scope('after', reuse=tf.AUTO_REUSE): + attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) # create mask for empty bot action or action_listen # not to pay attention to them @@ -713,14 +754,14 @@ def _create_attn_cell( def _create_tf_dial_embed( self, - embed_utter: tf.Tensor, - embed_slots: tf.Tensor, - embed_prev_action: tf.Tensor, - mask: tf.Tensor, - embed_for_no_intent: tf.Tensor, - embed_for_no_action: tf.Tensor, - embed_for_action_listen: tf.Tensor, - ) -> Tuple[tf.Tensor, Union[tf.Tensor, "TimeAttentionWrapperState"]]: + embed_utter: 'tf.Tensor', + embed_slots: 'tf.Tensor', + embed_prev_action: 'tf.Tensor', + mask: 'tf.Tensor', + embed_for_no_intent: 'tf.Tensor', + embed_for_no_action: 'tf.Tensor', + embed_for_action_listen: 'tf.Tensor', + ) -> Tuple['tf.Tensor', Union['tf.Tensor', "TimeAttentionWrapperState"]]: """Create rnn for dialogue level embedding.""" cell_input = tf.concat([embed_utter, embed_slots, embed_prev_action], -1) @@ -740,16 +781,266 @@ def _create_tf_dial_embed( embed_for_action_listen, ) - return tf.nn.dynamic_rnn( - cell, - cell_input, - dtype=tf.float32, - sequence_length=real_length, - scope="rnn_decoder", + with tf.variable_scope('rnn_decoder', reuse=tf.AUTO_REUSE): + return tf.nn.dynamic_rnn( + cell, + cell_input, + dtype=tf.float32, + sequence_length=real_length, + ) + + def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + x_in = tf.concat([a_in, b_prev_in], -1) + # print(x_in.shape[-1]) + # exit() + + # x = x_in + hparams = transformer_base() + + hparams.num_hidden_layers = self.num_rnn_layers + hparams.hidden_size = self.rnn_size + # it seems to be factor of 4 for transformer architectures in t2t + hparams.filter_size = hparams.hidden_size * 4 + hparams.num_heads = self.num_heads + hparams.relu_dropout = self.droprate["rnn"] + hparams.pos = self.pos_encoding + + hparams.max_length = self.max_seq_length + + hparams.unidirectional_encoder = True + + hparams.self_attention_type = "dot_product_relative_v2" + hparams.max_relative_position = 5 + hparams.add_relative_to_values = True + + # hparams.proximity_bias = True + + # When not in training mode, set all forms of dropout to zero. 
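# [Editor's sketch - not part of the patch] The loop below rescales every dropout-style
# hparam by tf.cast(self._is_training, tf.float32), so a single graph serves both
# training and prediction: the `_is_training` placeholder defaults to False, which
# collapses every dropout rate to 0.0. A minimal standalone TF 1.x illustration of the
# same trick (variable names here are made up for the example):
import tensorflow as tf

is_training = tf.placeholder_with_default(False, shape=())
droprate = 0.1 * tf.cast(is_training, tf.float32)  # 0.0 unless is_training is fed as True

x = tf.placeholder(tf.float32, shape=(None, 16))
y = tf.nn.dropout(x, keep_prob=1.0 - droprate)     # keeps everything when droprate == 0.0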
+ for key, value in hparams.values().items(): + if key.endswith("dropout") or key == "label_smoothing": + setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) + reg = tf.contrib.layers.l2_regularizer(self.C2) + + x = tf.layers.dense(inputs=x_in, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer', + reuse=tf.AUTO_REUSE) + # a = tf.layers.dense(inputs=a_in, + # units=hparams.hidden_size/3, + # use_bias=False, + # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='transformer_embed_layer_a', + # reuse=tf.AUTO_REUSE) + # + c = tf.layers.dense(inputs=c_in, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer_c', + reuse=tf.AUTO_REUSE) + # + # b = tf.layers.dense(inputs=b_prev_in, + # units=hparams.hidden_size/3, + # use_bias=False, + # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='transformer_embed_layer_b', + # reuse=tf.AUTO_REUSE) + + # x = tf.concat([a, c, b], -1) + + x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) + + if hparams.multiply_embedding_mode == "sqrt_depth": + x *= hparams.hidden_size ** 0.5 + c *= hparams.hidden_size ** 0.5 + + x *= tf.expand_dims(mask, -1) + + with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE): + (x, + self_attention_bias, + encoder_decoder_attention_bias + ) = transformer_prepare_encoder(x, None, hparams) + + if hparams.pos == 'custom_timing': + x = common_attention.add_timing_signal_1d(x, max_timescale=self.pos_max_timescale) + + x *= tf.expand_dims(mask, -1) + + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + + attn_bias_for_padding = None + # Otherwise the encoder will just use encoder_self_attention_bias. 
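# [Editor's sketch - not part of the patch] With hparams.unidirectional_encoder = True
# the self-attention is causal: each dialogue turn can only attend to itself and to
# earlier turns. tensor2tensor builds the corresponding bias internally; the snippet
# below is only an equivalent standalone TF 1.x illustration of such a bias, which is
# added to the (query_time x memory_time) attention logits before the softmax:
import tensorflow as tf

seq_len = 5
# 1.0 on and below the diagonal, 0.0 above it
lower_triangle = tf.matrix_band_part(tf.ones((seq_len, seq_len)), -1, 0)
# large negative bias for future positions, 0.0 for allowed ones
causal_bias = -1e9 * (1.0 - lower_triangle)
# reshape so it broadcasts over [batch, num_heads, query_time, memory_time] logits
causal_bias = causal_bias[tf.newaxis, tf.newaxis, :, :]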
+ if hparams.unidirectional_encoder: + attn_bias_for_padding = encoder_decoder_attention_bias + + x = transformer_encoder( + x, + self_attention_bias, + hparams, + nonpadding=mask, + save_weights_to=attention_weights, + attn_bias_for_padding=attn_bias_for_padding, + ) + + # x = tf.concat([x, c_in], -1) + # c_gate = tf.layers.dense(inputs=x, + # # units=hparams.hidden_size, + # # activation=tf.nn.softmax, + # units=1, + # activation=tf.math.sigmoid, + # bias_initializer=tf.constant_initializer(-1), + # # use_bias=False, + # # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), + # kernel_regularizer=reg, + # name='slots_gate_layer_c', + # reuse=tf.AUTO_REUSE) + x += c #* c_gate + # x = common_layers.layer_postprocess(x, c, hparams) + x *= tf.expand_dims(mask, -1) + + return tf.nn.relu(x), self_attention_bias, x_in + + @staticmethod + def _rearrange_fn(list_tensor_1d_mask_1d): + """Rearranges tensor_1d to put all the values + where mask_1d=1 to the right and + where mask_1d=0 to the left""" + tensor_1d, mask_1d = list_tensor_1d_mask_1d + + partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) + + return tf.concat(partitioned_tensor, 0) + + @staticmethod + def _arrange_back_fn(list_tensor_1d_mask_1d): + """Arranges back tensor_1d to restore original order + modified by `_rearrange_fn` according to mask_1d: + - number of 0s in mask_1d values on the left are set to + their corresponding places where mask_1d=0, + - number of 1s in mask_1d values on the right are set to + their corresponding places where mask_1d=1""" + tensor_1d, mask_1d = list_tensor_1d_mask_1d + + mask_indices = tf.dynamic_partition( + tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 ) + mask_sum = tf.reduce_sum(mask_1d, axis=0) + partitioned_tensor = [ + tf.zeros_like(tensor_1d[:-mask_sum]), + tensor_1d[-mask_sum:], + ] + + return tf.dynamic_stitch(mask_indices, partitioned_tensor) + + def _action_to_copy(self, x_in, x, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action): + with tf.variable_scope('copy', reuse=tf.AUTO_REUSE): + ignore_mask_listen = tf.to_float(tf.logical_or( + tf.reduce_all( + tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), + -1, + ), + tf.reduce_all( + tf.equal(tf.expand_dims(embed_for_action_listen, 0), embed_prev_action), + -1, + ), + )) + + triag_mask = tf.expand_dims( + common_attention.attention_bias_to_padding(self_attention_bias[0, 0, :, tf.newaxis, tf.newaxis, :]), 0) + diag_mask = 1 - (1 - triag_mask) * tf.cumprod(triag_mask, axis=-1, exclusive=True, reverse=True) + + bias = self_attention_bias + common_attention.attention_bias_ignore_padding(ignore_mask_listen) * tf.expand_dims(diag_mask, 1) + + copy_weights = {} + common_attention.multihead_attention(x_in, + embed_prev_action, + bias, + self.rnn_size, + self.embed_dim, + self.embed_dim, + 1, + 0, + save_weights_to=copy_weights) + + copy_weights = copy_weights['copy/multihead_attention/dot_product_attention'][:, 0, :, :] + bias = bias[:, 0, :, :] + shape = tf.shape(copy_weights) + copy_weights = tf.reshape(copy_weights, (-1, shape[-1])) + x_flat = tf.reshape(x_in, (-1, x_in.shape[-1])) + bias = tf.reshape(bias, (-1, shape[-1])) + ignore_mask = common_attention.attention_bias_to_padding(bias[:, tf.newaxis, tf.newaxis, :], tf.to_int32) + + s_w = tf.layers.dense( + inputs=x_flat, + units=2 * self.attn_shift_range + 1, + activation=tf.nn.softmax, + name="shift_weight", + reuse=tf.AUTO_REUSE + ) + mask = 1 - ignore_mask + conv_weights = tf.map_fn( + 
self._rearrange_fn, [copy_weights, mask], dtype=copy_weights.dtype + ) + + conv_weights = tf.reverse(conv_weights, axis=[1]) + + # preare probs for tf.nn.depthwise_conv2d + # [in_width, in_channels=batch] + conv_weights = tf.transpose(conv_weights, [1, 0]) + # [batch=1, in_height=1, in_width=time+1, in_channels=batch] + conv_weights = conv_weights[tf.newaxis, tf.newaxis, :, :] + + # [filter_height=1, filter_width=2*attn_shift_range+1, + # in_channels=batch, channel_multiplier=1] + conv_s_w = tf.transpose(s_w, [1, 0]) + conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] + + # perform 1d convolution + # [batch=1, out_height=1, out_width=time+1, out_channels=batch] + conv_weights = tf.nn.depthwise_conv2d_native( + conv_weights, conv_s_w, [1, 1, 1, 1], "SAME" + ) + conv_weights = conv_weights[0, 0, :, :] + conv_weights = tf.transpose(conv_weights, [1, 0]) + + conv_weights = tf.reverse(conv_weights, axis=[1]) + + # arrange probs back to their original time order + copy_weights = tf.map_fn( + self._arrange_back_fn, [conv_weights, mask], dtype=conv_weights.dtype + ) + + # sharpening parameter + g_sh = tf.layers.dense( + inputs=x_flat, + units=1, + activation=lambda a: tf.nn.softplus(a) + 1, + bias_initializer=tf.constant_initializer(1), + name="gamma_sharp", + reuse=tf.AUTO_REUSE + ) + + powed_weights = tf.pow(copy_weights, g_sh) + copy_weights = powed_weights / (tf.reduce_sum(powed_weights, 1, keepdims=True) + 1e-32) + + copy_weights = tf.reshape(copy_weights, shape) + + # remove current time + copy_prev = copy_weights * diag_mask + keep_current = copy_weights * (1 - diag_mask) + dial_embed = self._create_embed(x, layer_name_suffix="out") + return tf.matmul(copy_prev, embed_prev_action) + tf.matmul(keep_current, dial_embed), copy_weights + @staticmethod - def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: + def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': """Extract alignments history form final rnn cell state.""" alignments_from_state = final_state.alignment_history @@ -764,14 +1055,14 @@ def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> tf.Ten return tf.concat(alignment_history, -1) @staticmethod - def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> tf.Tensor: + def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': """Extract all time masks form final rnn cell state.""" # reshape to (batch, time, memory_time) and ignore last time # because time_mask is created for the next time step return tf.transpose(final_state.all_time_masks.stack(), [1, 0, 2])[:, :-1, :] - def _sims_rnn_to_max_from(self, cell_output: tf.Tensor) -> List[tf.Tensor]: + def _sims_rnn_to_max_from(self, cell_output: 'tf.Tensor') -> List['tf.Tensor']: """Save intermediate tensors for debug purposes.""" if self.attn_after_rnn: @@ -786,7 +1077,7 @@ def _sims_rnn_to_max_from(self, cell_output: tf.Tensor) -> List[tf.Tensor]: else: return [] - def _embed_dialogue_from(self, cell_output: tf.Tensor) -> tf.Tensor: + def _embed_dialogue_from(self, cell_output: 'tf.Tensor') -> 'tf.Tensor': """Extract or calculate dialogue level embedding from cell_output.""" if self.attn_after_rnn: @@ -812,12 +1103,65 @@ def _embed_dialogue_from(self, cell_output: tf.Tensor) -> tf.Tensor: return embed_dialogue + def _tf_sample_neg(self, + pos_b, + neg_bs=None, + neg_ids=None, + batch_size=None, + first_only=False + ) -> 'tf.Tensor': + + all_b = pos_b[tf.newaxis, :, :] + if batch_size is None: + batch_size = 
tf.shape(pos_b)[0] + all_b = tf.tile(all_b, [batch_size, 1, 1]) + if neg_bs is None and neg_ids is None: + return all_b + + def sample_neg_b(): + if neg_bs is not None: + _neg_bs = neg_bs + elif neg_ids is not None: + _neg_bs = tf.batch_gather(all_b, neg_ids) + else: + raise + return tf.concat([pos_b[:, tf.newaxis, :], _neg_bs], 1) + + if first_only: + out_b = pos_b[:, tf.newaxis, :] + else: + out_b = all_b + + if neg_bs is not None: + cond = tf.logical_and(self._is_training, tf.shape(neg_bs)[0] > 1) + elif neg_ids is not None: + cond = tf.logical_and(self._is_training, tf.shape(neg_ids)[0] > 1) + else: + raise + + return tf.cond(cond, sample_neg_b, lambda: out_b) + + def _tf_calc_iou(self, + b_raw, + neg_bs=None, + neg_ids=None + ) -> 'tf.Tensor': + + tiled_intent_raw = self._tf_sample_neg(b_raw, neg_bs=neg_bs, neg_ids=neg_ids) + pos_b_raw = tiled_intent_raw[:, :1, :] + neg_b_raw = tiled_intent_raw[:, 1:, :] + intersection_b_raw = tf.minimum(neg_b_raw, pos_b_raw) + union_b_raw = tf.maximum(neg_b_raw, pos_b_raw) + + return tf.reduce_sum(intersection_b_raw, -1) / tf.reduce_sum(union_b_raw, -1) + def _tf_sim( self, - embed_dialogue: tf.Tensor, - embed_action: tf.Tensor, - mask: Optional[tf.Tensor], - ) -> Tuple[tf.Tensor, tf.Tensor]: + embed_dialogue: 'tf.Tensor', + embed_action: 'tf.Tensor', + mask: Optional['tf.Tensor'], + ) -> Union[Tuple['tf.Tensor', 'tf.Tensor'], + Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor']]: """Define similarity. This method has two roles: @@ -832,93 +1176,183 @@ def _tf_sim( because it is necessary for them to be mathematically identical. """ - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity + if self.similarity_type not in {"cosine", "inner"}: + raise ValueError( + "Wrong similarity type {}, " + "should be 'cosine' or 'inner'" + "".format(self.similarity_type) + ) + + if len(embed_dialogue.shape) == 2 and len(embed_action.shape) == 2: + # calculate similarity between + # two embedding vectors of the same size + + # always use cosine sim for copy mech embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) embed_action = tf.nn.l2_normalize(embed_action, -1) - if self.similarity_type in {"cosine", "inner"}: + cos_sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - if len(embed_dialogue.shape) == len(embed_action.shape): - # calculate similarity between - # two embedding vectors of the same size - sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - bin_sim = tf.where( - sim > (self.mu_pos - self.mu_neg) / 2.0, - tf.ones_like(sim), - tf.zeros_like(sim), - ) + bin_sim = tf.where( + cos_sim > (self.mu_pos - self.mu_neg) / 2.0, + tf.ones_like(cos_sim), + tf.zeros_like(cos_sim), + ) + + # output binary mask and similarity + return bin_sim, cos_sim - # output binary mask and similarity - return bin_sim, sim + else: + # calculate similarity with several + # embedded actions for the loss + if self.similarity_type == "cosine": + # normalize embedding vectors for cosine similarity + embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) + embed_action = tf.nn.l2_normalize(embed_action, -1) + + if len(embed_dialogue.shape) == 4: + embed_dialogue_pos = embed_dialogue[:, :, :1, :] else: - # calculate similarity with several - # embedded actions for the loss - sim = tf.reduce_sum( - tf.expand_dims(embed_dialogue, -2) * embed_action, -1 - ) - sim *= tf.expand_dims(mask, 2) + embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) - sim_act = tf.reduce_sum( - embed_action[:, :, :1, :] * 
embed_action[:, :, 1:, :], -1 - ) - sim_act *= tf.expand_dims(mask, 2) + sim = tf.reduce_sum( + embed_dialogue_pos * embed_action, -1 + ) * tf.expand_dims(mask, 2) + + sim_bot_emb = tf.reduce_sum( + embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) - # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_act + if len(embed_dialogue.shape) == 4: + sim_dial_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) + else: + sim_dial_emb = None + if len(embed_dialogue.shape) == 4: + sim_dial_bot_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) + else: + sim_dial_bot_emb = None + + # output similarities between user input and bot actions + # and similarities between bot actions + return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb + + # noinspection PyPep8Naming + def _scale_loss_by_count_actions( + self, + X, + Y, + slots, + previous_actions, + ) -> Union[np.ndarray, List[List]]: + """Calculate inverse proportionality of repeated actions.""" + + if self.scale_loss_by_action_counts: + # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # full = tf.concat([X, slots, previous_actions, Y], -1) + # else: + full = Y + + flat = tf.reshape(full, (-1, full.shape[-1])) + _, i, c = gen_array_ops.unique_with_counts_v2(flat, axis=[0]) + c = tf.cast(c, tf.float32) + + counts = tf.reshape(tf.gather(c, i), (tf.shape(Y)[0], tf.shape(Y)[1])) + + # do not include [-1 -1 ... -1 0] in averaging + # and smooth it by taking sqrt + + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # action_listen is the top one by an order + max_c = tf.math.top_k(c, 2)[0][1] + else: + max_c = tf.reduce_max(c) + # max_c = tf.math.top_k(c, 2)[0][1] + # max_c = tf.cond(tf.shape(c)[0] > 1, lambda: tf.math.top_k(c, 2)[0][1], lambda: tf.reduce_max(c)) + # max_c = tf.reduce_max(c) + + return tf.maximum(max_c / counts, 1) + # return tf.maximum(tf.square(max_c / counts), 1) + + # exit() + # full_X = tf.concat( + # [X, slots, previous_actions, Y], -1 + # ) + # full_X = tf.reshape(full_X, (-1, full_X.shape[-1])) + # # include [-1 -1 ... -1 0] as first + # # full_X = tf.concat([full_X[-1:], full_X], 0) + # + # _, i, c = gen_array_ops.unique_with_counts_v2(full_X, axis=[0]) + # c = tf.cast(c, tf.float32) + # + # counts = tf.reshape(tf.gather(c, i), (tf.shape(X)[0], tf.shape(X)[1])) + # + # # do not include [-1 -1 ... 
-1 0] in averaging + # # and smooth it by taking sqrt + # return tf.maximum(tf.sqrt(tf.reduce_mean(c) / counts), 1) else: - raise ValueError( - "Wrong similarity type {}, " - "should be 'cosine' or 'inner'" - "".format(self.similarity_type) - ) + return [[None]] - def _regularization_loss(self) -> Union[tf.Tensor, int]: + def _regularization_loss(self): + # type: () -> Union['tf.Tensor', int] """Add regularization to the embed layer inside rnn cell.""" if self.attn_after_rnn: - return self.C2 * tf.add_n( - [ - tf.nn.l2_loss(tf_var) - for tf_var in tf.trainable_variables() - if "cell/out_layer/kernel" in tf_var.name - ] - ) - else: - return 0 + vars_to_reg = [ + tf.nn.l2_loss(tf_var) + for tf_var in tf.trainable_variables() + if "cell/out_layer/kernel" in tf_var.name + ] + if vars_to_reg: + return self.C2 * tf.add_n(vars_to_reg) + + return 0 def _tf_loss( self, - sim: tf.Tensor, - sim_act: tf.Tensor, - sims_rnn_to_max: List[tf.Tensor], - mask: tf.Tensor, - ) -> tf.Tensor: + sim: 'tf.Tensor', + sim_bot_emb: 'tf.Tensor', + sim_dial_emb: 'tf.Tensor', + sims_rnn_to_max: List['tf.Tensor'], + bad_negs, + mask: 'tf.Tensor', + batch_bad_negs + ) -> 'tf.Tensor': """Define loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0.0, self.mu_pos - sim[:, :, 0]) + loss = tf.maximum(0., self.mu_pos - sim[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions + sim_neg = sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions - max_sim_neg = tf.reduce_max(sim[:, :, 1:], -1) - loss += tf.maximum(0.0, self.mu_neg + max_sim_neg) + max_sim_neg = tf.reduce_max(sim_neg, -1) + loss += tf.maximum(0., self.mu_neg + max_sim_neg) else: # minimize all similarities with incorrect actions - max_margin = tf.maximum(0.0, self.mu_neg + sim[:, :, 1:]) + max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - if self.scale_loss_by_action_counts: + if isinstance(self.featurizer, FullDialogueTrackerFeaturizer) and self.scale_loss_by_action_counts: # scale loss inverse proportionally to number of action counts loss *= self._loss_scales - # penalize max similarity between intent embeddings - loss_act = tf.maximum(0.0, tf.reduce_max(sim_act, -1)) - loss += loss_act * self.C_emb + # penalize max similarity between bot embeddings + sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs + max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) + loss += max_sim_bot_emb * self.C_emb + + # penalize max similarity between dial embeddings + if sim_dial_emb is not None: + sim_dial_emb += common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs + max_sim_input_emb = tf.maximum(0., tf.reduce_max(sim_dial_emb, -1)) + loss += max_sim_input_emb * self.C_emb # maximize similarity returned by time attention wrapper for sim_to_add in sims_rnn_to_max: @@ -938,12 +1372,80 @@ def _tf_loss( ) return loss - # training methods + def _tf_loss_2( + self, + sim: 'tf.Tensor', + sim_bot_emb: 'tf.Tensor', + sim_dial_emb: 'tf.Tensor', + sim_dial_bot_emb, + sims_rnn_to_max: List['tf.Tensor'], + bad_negs, + mask: 'tf.Tensor', + batch_bad_negs=None, + ) -> 'tf.Tensor': + """Define loss.""" + + all_sim = [sim[:, :, :1], + sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, + sim_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, + ] + if 
sim_dial_emb is not None: + all_sim.append(sim_dial_emb + common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs) + + if sim_dial_bot_emb is not None: + all_sim.append(sim_dial_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs) + + logits = tf.concat(all_sim, -1) + pos_labels = tf.ones_like(logits[:, :, :1]) + neg_labels = tf.zeros_like(logits[:, :, 1:]) + labels = tf.concat([pos_labels, neg_labels], -1) + + pred = tf.nn.softmax(logits) + # fake_logits = tf.concat([logits[:, :, :1] - common_attention.large_compatible_negative(logits.dtype), + # logits[:, :, 1:] + common_attention.large_compatible_negative(logits.dtype)], -1) + + # ones = tf.ones_like(pred[:, :, 0]) + # zeros = tf.zeros_like(pred[:, :, 0]) + + # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) + already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) + + # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): + # if self.scale_loss_by_action_counts: + # scale_mask = self._loss_scales * mask + # else: + scale_mask = mask + # else: + # scale_mask = 1.0 + + loss = tf.losses.softmax_cross_entropy(labels, + logits, + scale_mask * already_learned) + # add regularization losses + loss += self._regularization_loss() + tf.losses.get_regularization_loss() + + # maximize similarity returned by time attention wrapper + add_loss = [] + for sim_to_add in sims_rnn_to_max: + add_loss.append(tf.maximum(0.0, 1.0 - sim_to_add)) + + if add_loss: + # mask loss for different length sequences + add_loss = sum(add_loss) * mask + # average the loss over sequence length + add_loss = tf.reduce_sum(add_loss, -1) / tf.reduce_sum(mask, 1) + # average the loss over the batch + add_loss = tf.reduce_mean(add_loss) + loss += add_loss + + return loss + + # training methods def train( self, - training_trackers: List[DialogueStateTracker], - domain: Domain, + training_trackers: List['DialogueStateTracker'], + domain: 'Domain', **kwargs: Any ) -> None: """Train the policy on given training trackers.""" @@ -985,61 +1487,70 @@ def train( # set random seed in tf tf.set_random_seed(self.random_seed) - dialogue_len = None # use dynamic time for rnn - # create placeholders - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), - name="a", - ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), - name="b", - ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) - self._dialogue_len = tf.placeholder( - dtype=tf.int32, shape=(), name="dialogue_len" - ) - self._x_for_no_intent_in = tf.placeholder( + batch_size_in = tf.placeholder(tf.int64) + train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, + session_data.Y, + session_data.slots, + session_data.previous_actions)) + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + train_dataset = train_dataset.batch(batch_size_in) + + if self.evaluate_on_num_examples: + ids = np.random.permutation(len(session_data.X))[:self.evaluate_on_num_examples] + + val_dataset = tf.data.Dataset.from_tensor_slices((session_data.X[ids], + session_data.Y[ids], + session_data.slots[ids], + session_data.previous_actions[ids]) + ).batch(self.evaluate_on_num_examples) + else: + val_dataset = None + + iterator = 
tf.data.Iterator.from_structure(train_dataset.output_types, + train_dataset.output_shapes, + output_classes=train_dataset.output_classes) + + self.a_in, self.b_in, self.c_in, self.b_prev_in = iterator.get_next() + + self.a_in = tf.cast(self.a_in, tf.float32) + self.b_in = tf.cast(self.b_in, tf.float32) + self.c_in = tf.cast(self.c_in, tf.float32) + self.b_prev_in = tf.cast(self.b_prev_in, tf.float32) + + # they don't change + self._x_for_no_intent_in = tf.constant( + session_data.x_for_no_intent, dtype=tf.float32, - shape=(1, session_data.X.shape[-1]), name="x_for_no_intent", ) - self._y_for_no_action_in = tf.placeholder( + self._y_for_no_action_in = tf.constant( + session_data.y_for_no_action, dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), name="y_for_no_action", ) - self._y_for_action_listen_in = tf.placeholder( + self._y_for_action_listen_in = tf.constant( + session_data.y_for_action_listen, dtype=tf.float32, - shape=(1, session_data.Y.shape[-1]), name="y_for_action_listen", ) - self._is_training = tf.placeholder_with_default(False, shape=()) + all_actions = tf.constant(self.encoded_all_actions, + dtype=tf.float32, + name="all_actions") - self._loss_scales = tf.placeholder( - dtype=tf.float32, shape=(None, dialogue_len) + # dynamic variables + self._is_training = tf.placeholder_with_default(False, shape=()) + self._dialogue_len = tf.placeholder( + dtype=tf.int32, shape=(), name="dialogue_len" ) - # create embedding vectors - self.user_embed = self._create_tf_user_embed(self.a_in) + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") + all_actions_embed = self._create_tf_bot_embed(all_actions) embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - embed_for_no_intent = self._create_tf_no_intent_embed( - self._x_for_no_intent_in - ) embed_for_no_action = self._create_tf_no_action_embed( self._y_for_no_action_in ) @@ -1047,42 +1558,185 @@ def train( self._y_for_action_listen_in ) - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + if self.transformer: + self.attention_weights = {} + tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action) + self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") #+ self._create_embed(self.c_in, layer_name_suffix="slots") + sims_rnn_to_max = [] + else: + # create embedding vectors + self.user_embed = self._create_tf_user_embed(self.a_in) + self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) + embed_for_no_intent = self._create_tf_no_intent_embed( + self._x_for_no_intent_in + ) - self.all_time_masks = self._all_time_masks_from(final_state) + # get rnn output + cell_output, final_state = self._create_tf_dial_embed( + 
self.user_embed, + self.slot_embed, + embed_prev_action, + mask, + embed_for_no_intent, + embed_for_no_action, + embed_for_action_listen, + ) + # process rnn output + if self.is_using_attention(): + self.alignment_history = self._alignments_history_from(final_state) - sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) - self.dial_embed = self._embed_dialogue_from(cell_output) + self.all_time_masks = self._all_time_masks_from(final_state) + + sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) + self.dial_embed = self._embed_dialogue_from(cell_output) # calculate similarities - self.sim_op, sim_act = self._tf_sim(self.dial_embed, self.bot_embed, mask) + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + self.b_in = tf.expand_dims(self.b_in, 1) + self.bot_embed = tf.expand_dims(self.bot_embed, 1) + self.dial_embed = self.dial_embed[:, -1:, :] + mask = mask[:, -1:] + + b_raw = tf.reshape(self.b_in, (-1, self.b_in.shape[-1])) + + _, i, c = gen_array_ops.unique_with_counts_v2(b_raw, axis=[0]) + counts = tf.expand_dims(tf.reshape(tf.gather(tf.cast(c, tf.float32), i), (tf.shape(b_raw)[0],)), 0) + batch_neg_ids = tf.random.categorical(tf.log((1. - tf.eye(tf.shape(b_raw)[0])/counts)), self.num_neg) + + batch_iou_bot = self._tf_calc_iou(b_raw, neg_ids=batch_neg_ids) + batch_bad_negs = 1. - tf.nn.relu(tf.sign(1. - batch_iou_bot)) + batch_bad_negs = tf.reshape(batch_bad_negs, (tf.shape(self.dial_embed)[0], + tf.shape(self.dial_embed)[1], + -1)) + + neg_ids = tf.random.categorical(tf.log(tf.ones((tf.shape(b_raw)[0], tf.shape(all_actions)[0]))), self.num_neg) + + tiled_all_actions = tf.tile(tf.expand_dims(all_actions, 0), (tf.shape(b_raw)[0], 1, 1)) + neg_bs = tf.batch_gather(tiled_all_actions, neg_ids) + iou_bot = self._tf_calc_iou(b_raw, neg_bs) + bad_negs = 1. - tf.nn.relu(tf.sign(1. 
- iou_bot)) + bad_negs = tf.reshape(bad_negs, (tf.shape(self.bot_embed)[0], + tf.shape(self.bot_embed)[1], + -1)) + + dial_embed_flat = tf.reshape(self.dial_embed, (-1, self.dial_embed.shape[-1])) + + tiled_dial_embed = self._tf_sample_neg(dial_embed_flat, neg_ids=batch_neg_ids, first_only=True) + tiled_dial_embed = tf.reshape(tiled_dial_embed, (tf.shape(self.dial_embed)[0], + tf.shape(self.dial_embed)[1], + -1, + self.dial_embed.shape[-1])) + + bot_embed_flat = tf.reshape(self.bot_embed, (-1, self.bot_embed.shape[-1])) + tiled_all_actions_embed = tf.tile(tf.expand_dims(all_actions_embed, 0), (tf.shape(b_raw)[0], 1, 1)) + neg_embs = tf.batch_gather(tiled_all_actions_embed, neg_ids) + tiled_bot_embed = self._tf_sample_neg(bot_embed_flat, neg_bs=neg_embs) + tiled_bot_embed = tf.reshape(tiled_bot_embed, (tf.shape(self.bot_embed)[0], + tf.shape(self.bot_embed)[1], + -1, + self.bot_embed.shape[-1])) + + # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) + self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) + # construct loss - loss = self._tf_loss(self.sim_op, sim_act, sims_rnn_to_max, mask) + if self.scale_loss_by_action_counts: + self._loss_scales = self._scale_loss_by_count_actions(self.a_in, self.b_in, self.c_in, self.b_prev_in) + else: + self._loss_scales = None + # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) + loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) # define which optimizer to use self._train_op = tf.train.AdamOptimizer( - learning_rate=0.001, epsilon=1e-16 + # learning_rate=0.001, epsilon=1e-16 ).minimize(loss) + + train_init_op = iterator.make_initializer(train_dataset) + if self.evaluate_on_num_examples: + val_init_op = iterator.make_initializer(val_dataset) + else: + val_init_op = None + # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf(session_data, loss, mask) + # self._train_tf(session_data, loss, mask) + self._train_tf_dataset(train_init_op, val_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + + dialogue_len = None # use dynamic time for rnn + # create placeholders + self.a_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.X.shape[-1]), + name="a", + ) + self.b_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + name="b", + ) + self.c_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.slots.shape[-1]), + name="slt", + ) + self.b_prev_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.Y.shape[-1]), + name="b_prev", + ) + + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) + + if self.transformer: + self.attention_weights = {} + tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, + self.attention_weights) + # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, + # embed_prev_action, + # embed_for_action_listen, + # embed_for_no_action) + self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") + + else: + 
self.user_embed = self._create_tf_user_embed(self.a_in) + self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") + + # get rnn output + cell_output, final_state = self._create_tf_dial_embed( + self.user_embed, + self.slot_embed, + embed_prev_action, + mask, + embed_for_no_intent, + embed_for_no_action, + embed_for_action_listen, + ) + # process rnn output + if self.is_using_attention(): + self.alignment_history = self._alignments_history_from(final_state) + + self.all_time_masks = self._all_time_masks_from(final_state) + + self.dial_embed = self._embed_dialogue_from(cell_output) + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + self.dial_embed = self.dial_embed[:, -1:, :] + + self.sim_op, _, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) + + # if self.attention_weights.items(): + # self.attention_weights = tf.concat([tf.expand_dims(t, 0) + # for name, t in self.attention_weights.items() + # if name.endswith('multihead_attention/dot_product_attention')], 0) # training helpers def _linearly_increasing_batch_size(self, epoch: int) -> int: @@ -1102,73 +1756,15 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: else: return int(self.batch_size[0]) - def _create_batch_b( - self, batch_pos_b: np.ndarray, intent_ids: np.ndarray - ) -> np.ndarray: - """Create batch of actions. - - The first is correct action - and the rest are wrong actions sampled randomly. - """ - - batch_pos_b = batch_pos_b[:, :, np.newaxis, :] - - # sample negatives - batch_neg_b = np.zeros( - ( - batch_pos_b.shape[0], - batch_pos_b.shape[1], - self.num_neg, - batch_pos_b.shape[-1], - ), - dtype=int, - ) - for b in range(batch_pos_b.shape[0]): - for h in range(batch_pos_b.shape[1]): - # create negative indexes out of possible ones - # except for correct index of b - negative_indexes = [ - i - for i in range(self.encoded_all_actions.shape[0]) - if i != intent_ids[b, h] - ] - - negs = np.random.choice(negative_indexes, size=self.num_neg) - - batch_neg_b[b, h] = self.encoded_all_actions[negs] - - return np.concatenate([batch_pos_b, batch_neg_b], -2) - - # noinspection PyPep8Naming - def _scale_loss_by_count_actions( - self, - X: np.ndarray, - slots: np.ndarray, - previous_actions: np.ndarray, - actions_for_Y: np.ndarray, - ) -> Union[np.ndarray, List[List]]: - """Calculate inverse proportionality of repeated actions.""" - - if self.scale_loss_by_action_counts: - full_X = np.concatenate( - [X, slots, previous_actions, actions_for_Y[:, :, np.newaxis]], -1 - ) - full_X = full_X.reshape((-1, full_X.shape[-1])) - - _, i, c = np.unique(full_X, return_inverse=True, return_counts=True, axis=0) - - counts = c[i].reshape((X.shape[0], X.shape[1])) - - # do not include [-1 -1 ... 
-1 0] in averaging - # and smooth it by taking sqrt - return np.maximum(np.sqrt(np.mean(c[1:]) / counts), 1) - else: - return [[None]] - - def _train_tf( - self, session_data: SessionData, loss: tf.Tensor, mask: tf.Tensor - ) -> None: - """Train tf graph.""" + def _train_tf_dataset(self, + train_init_op, + val_init_op, + batch_size_in, + loss: 'tf.Tensor', + mask, + dialogue_len, + ) -> None: + """Train tf graph""" self.session.run(tf.global_variables_initializer()) @@ -1178,115 +1774,66 @@ def _train_tf( "".format(self.evaluate_every_num_epochs) ) pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) + train_acc = 0 last_loss = 0 for ep in pbar: - # randomize training data for the current epoch - ids = np.random.permutation(session_data.X.shape[0]) - # calculate batch size for the current epoch batch_size = self._linearly_increasing_batch_size(ep) - # calculate number of batches in the current epoch - batches_per_epoch = session_data.X.shape[0] // batch_size + int( - session_data.X.shape[0] % batch_size > 0 - ) - # collect average loss over the batches - ep_loss = 0 - for i in range(batches_per_epoch): - start_idx = i * batch_size - end_idx = (i + 1) * batch_size - batch_ids = ids[start_idx:end_idx] - - # get randomized data for current batch - batch_a = session_data.X[batch_ids] - batch_pos_b = session_data.Y[batch_ids] - actions_for_b = session_data.actions_for_Y[batch_ids] - - # add negatives - incorrect bot actions predictions - batch_b = self._create_batch_b(batch_pos_b, actions_for_b) - - batch_c = session_data.slots[batch_ids] - batch_b_prev = session_data.previous_actions[batch_ids] - - # calculate how much the loss from each action - # should be scaled based on action rarity - batch_loss_scales = self._scale_loss_by_count_actions( - batch_a, batch_c, batch_b_prev, actions_for_b - ) + self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - # minimize and calculate loss - _loss, _ = self.session.run( - [loss, self._train_op], - feed_dict={ - self.a_in: batch_a, - self.b_in: batch_b, - self.c_in: batch_c, - self.b_prev_in: batch_b_prev, - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - self._is_training: True, - self._loss_scales: batch_loss_scales, - }, - ) - # collect average loss over the batches - ep_loss += _loss / batches_per_epoch - - # calculate train accuracy - if self.evaluate_on_num_examples: - if ( - (ep + 1) == 1 - or (ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs - ): - train_acc = self._calc_train_acc(session_data, mask) + ep_loss = 0 + batches_per_epoch = 0 + while True: + try: + _, batch_loss = self.session.run((self._train_op, loss), + feed_dict={self._is_training: True, + self._dialogue_len: dialogue_len}) + + except tf.errors.OutOfRangeError: + break + + batches_per_epoch += 1 + ep_loss += batch_loss + + ep_loss /= batches_per_epoch + + if self.evaluate_on_num_examples and val_init_op is not None: + if (ep == 0 or + (ep + 1) % self.evaluate_every_num_epochs == 0 or + (ep + 1) == self.epochs): + train_acc = self._output_training_stat_dataset(val_init_op, mask, dialogue_len) last_loss = ep_loss - pbar.set_postfix( - { - "loss": "{:.3f}".format(ep_loss), - "acc": "{:.3f}".format(train_acc), - } - ) + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_loss), + "acc": "{:.3f}".format(train_acc) + }) else: - 
pbar.set_postfix({"loss": "{:.3f}".format(ep_loss)}) + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_loss) + }) if self.evaluate_on_num_examples: - logger.info( - "Finished training embedding policy, " - "loss={:.3f}, train accuracy={:.3f}" - "".format(last_loss, train_acc) - ) + logger.info("Finished training embedding classifier, " + "loss={:.3f}, train accuracy={:.3f}" + "".format(last_loss, train_acc)) - def _calc_train_acc(self, session_data: SessionData, mask: tf.Tensor) -> np.float32: - """Calculate training accuracy.""" + def _output_training_stat_dataset(self, val_init_op, mask, dialogue_len) -> np.ndarray: + """Output training statistics""" - # choose n examples to calculate train accuracy - n = self.evaluate_on_num_examples - ids = np.random.permutation(len(session_data.X))[:n] - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X[ids].shape[0])] - ) + self.session.run(val_init_op) - _sim, _mask = self.session.run( - [self.sim_op, mask], - feed_dict={ - self.a_in: session_data.X[ids], - self.b_in: all_Y_d_x, - self.c_in: session_data.slots[ids], - self.b_prev_in: session_data.previous_actions[ids], - self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, - }, - ) - return np.sum( - (np.argmax(_sim, -1) == session_data.actions_for_Y[ids]) * _mask - ) / np.sum(_mask) + sim_, mask_ = self.session.run([self.sim_op, mask], + feed_dict={self._is_training: False, + self._dialogue_len: dialogue_len}) + sim_ = sim_.reshape((-1, sim_.shape[-1])) + mask_ = mask_.reshape((-1,)) + + train_acc = np.sum((np.max(sim_, -1) == sim_.diagonal()) * mask_) / np.sum(mask_) + + return train_acc def continue_training( self, @@ -1334,6 +1881,22 @@ def continue_training( }, ) + def tf_feed_dict_for_prediction(self, + tracker: DialogueStateTracker, + domain: Domain) -> Dict: + # noinspection PyPep8Naming + data_X = self.featurizer.create_X([tracker], domain) + session_data = self._create_tf_session_data(domain, data_X) + # noinspection PyPep8Naming + all_Y_d_x = np.stack([session_data.all_Y_d + for _ in range(session_data.X.shape[0])]) + + return {self.a_in: session_data.X, + self.b_in: all_Y_d_x, + self.c_in: session_data.slots, + self.b_prev_in: session_data.previous_actions, + self._dialogue_len: session_data.X.shape[1]} + def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain ) -> List[float]: @@ -1357,7 +1920,9 @@ def predict_action_probabilities( all_Y_d_x = np.stack( [session_data.all_Y_d for _ in range(session_data.X.shape[0])] ) - + # self.similarity_type = 'cosine' + # mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + # self.sim_op, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) _sim = self.session.run( self.sim_op, feed_dict={ @@ -1366,24 +1931,26 @@ def predict_action_probabilities( self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, }, ) + # TODO assume we used inner: + self.similarity_type = "inner" + result = _sim[0, -1, :] if self.similarity_type == "cosine": # clip negative values to zero result[result < 0] = 0 elif self.similarity_type == "inner": - # normalize result to 
[0, 1] with softmax + # normalize result to [0, 1] with softmax but only over 3*num_neg+1 values + low_ids = result.argsort()[::-1][4*self.num_neg+1:] + result[low_ids] += -np.inf result = np.exp(result) result /= np.sum(result) return result.tolist() - def _persist_tensor(self, name: Text, tensor: tf.Tensor) -> None: + def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1408,7 +1975,7 @@ def persist(self, path: Text) -> None: file_name = "tensorflow_embedding.ckpt" checkpoint = os.path.join(path, file_name) - rasa.utils.io.create_directory_for_file(checkpoint) + utils.create_dir_for_file(checkpoint) with self.graph.as_default(): self._persist_tensor("intent_placeholder", self.a_in) @@ -1435,6 +2002,8 @@ def persist(self, path: Text) -> None: self._persist_tensor("all_time_masks", self.all_time_masks) + self._persist_tensor("attention_weights", self.attention_weights) + saver = tf.train.Saver() saver.save(self.session, checkpoint) @@ -1449,7 +2018,7 @@ def persist(self, path: Text) -> None: pickle.dump(self._tf_config, f) @staticmethod - def load_tensor(name: Text) -> Optional[tf.Tensor]: + def load_tensor(name: Text) -> Optional['tf.Tensor']: tensor_list = tf.get_collection(name) return tensor_list[0] if tensor_list else None @@ -1512,6 +2081,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": all_time_masks = cls.load_tensor("all_time_masks") + attention_weights = cls.load_tensor("attention_weights") + encoded_actions_file = os.path.join( path, "{}.encoded_all_actions.pkl".format(file_name) ) @@ -1543,4 +2114,5 @@ def load(cls, path: Text) -> "EmbeddingPolicy": attn_embed=attn_embed, copy_attn_debug=copy_attn_debug, all_time_masks=all_time_masks, + attention_weights=attention_weights ) From 92d24f07b28999e3f4869f9ba35a7011b4426234 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 9 Jul 2019 18:16:41 +0200 Subject: [PATCH 02/50] remove trash --- rasa/core/featurizers.py | 4 +- rasa/core/policies/embedding_policy.py | 1011 +++--------------------- 2 files changed, 118 insertions(+), 897 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 737b4e22a80e..f0e722975078 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -412,7 +412,7 @@ def create_X( def persist(self, path): featurizer_file = os.path.join(path, "featurizer.json") - utils.create_dir_for_file(featurizer_file) + rasa.utils.io.create_directory_for_file(featurizer_file) with open(featurizer_file, "w", encoding="utf-8") as f: # noinspection PyTypeChecker f.write(str(jsonpickle.encode(self))) @@ -568,7 +568,7 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[Dict]], List[List[Text]]]: + ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: trackers_as_states = [] trackers_as_actions = [] diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index bd6f9bcd9fb6..68bc2808a67d 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -12,7 +12,6 @@ import rasa.utils.io from rasa.core import utils -from rasa.core.actions.action import ACTION_LISTEN_NAME from rasa.core.domain import Domain from rasa.core.featurizers import ( TrackerFeaturizer, @@ -21,23 +20,20 @@ MaxHistoryTrackerFeaturizer ) from rasa.core.policies.policy import Policy +from rasa.core.trackers import 
DialogueStateTracker +from rasa.utils.common import is_logging_disabled import tensorflow as tf from tensorflow.python.ops import gen_array_ops -from tensor2tensor.layers import common_attention -from tensor2tensor.layers import common_layers -from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder -from tensor2tensor.models.evolved_transformer import evolved_transformer_encoder - -from rasa.core.policies.tf_utils import ( - TimeAttentionWrapper, - ChronoBiasLayerNormBasicLSTMCell, -) -from rasa.core.trackers import DialogueStateTracker -from rasa.utils.common import is_logging_disabled -if typing.TYPE_CHECKING: - from rasa.core.policies.tf_utils import TimeAttentionWrapperState +try: + from tensor2tensor.layers import common_attention + from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder +except ImportError: + common_attention = None + transformer_base = None + transformer_prepare_encoder = None + transformer_encoder = None try: import cPickle as pickle @@ -56,9 +52,6 @@ "slots", "previous_actions", "actions_for_Y", - "x_for_no_intent", - "y_for_no_action", - "y_for_action_listen", "all_Y_d", ), ) @@ -82,7 +75,6 @@ class EmbeddingPolicy(Policy): # number of hidden layers is equal to the length of this list "hidden_layers_sizes_b": [], - "transformer": False, "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} # introduce phase shift in time encodings between transformers # 0.5 - 0.8 works on small dataset @@ -123,8 +115,6 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # scale loss with inverse frequency of bot actions - "scale_loss_by_action_counts": True, # dropout rate for user nn "droprate_a": 0.0, # dropout rate for bot nn @@ -158,6 +148,11 @@ def _standard_featurizer(max_history=None): else: return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) + @staticmethod + def _check_t2t(): + if common_attention is None: + raise ImportError("Please install tensor2tensor") + def __init__( self, featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, @@ -170,9 +165,6 @@ def __init__( slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, dialogue_len: Optional['tf.Tensor'] = None, - x_for_no_intent: Optional['tf.Tensor'] = None, - y_for_no_action: Optional['tf.Tensor'] = None, - y_for_action_listen: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, alignment_history: Optional['tf.Tensor'] = None, user_embed: Optional['tf.Tensor'] = None, @@ -187,13 +179,9 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: - # if featurizer: - # if not isinstance(featurizer, FullDialogueTrackerFeaturizer): - # raise TypeError( - # "Passed tracker featurizer of type {}, " - # "should be FullDialogueTrackerFeaturizer." 
- # "".format(type(featurizer).__name__) - # ) + # check if t2t is installed + self._check_t2t() + if not featurizer: featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) @@ -221,9 +209,6 @@ def __init__( self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder self._dialogue_len = dialogue_len - self._x_for_no_intent_in = x_for_no_intent - self._y_for_no_action_in = y_for_no_action - self._y_for_action_listen_in = y_for_action_listen self.sim_op = similarity_op # store attention probability distribution as @@ -245,7 +230,6 @@ def __init__( # internal tf instances self._train_op = None self._is_training = None - self._loss_scales = None # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: @@ -266,7 +250,6 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] ) ) - self.transformer = config['transformer'] self.pos_encoding = config['pos_encoding'] self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] @@ -293,7 +276,6 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.scale_loss_by_action_counts = config["scale_loss_by_action_counts"] self.droprate = { "a": config["droprate_a"], "b": config["droprate_b"], @@ -373,18 +355,6 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: ] ) - # noinspection PyPep8Naming - @staticmethod - def _create_zero_vector(X: np.ndarray) -> np.ndarray: - """Create zero vector of shape (1, X.shape[-1]).""" - - return np.zeros((1, X.shape[-1]), X.dtype) - - def _create_y_for_action_listen(self, domain: "Domain") -> np.ndarray: - """Extract feature vector for action_listen""" - action_listen_idx = domain.index_for_action(ACTION_LISTEN_NAME) - return self.encoded_all_actions[action_listen_idx : action_listen_idx + 1] - # noinspection PyPep8Naming def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: """Stack encoded_all_intents on top of each other @@ -396,8 +366,8 @@ def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: return np.stack([self.encoded_all_actions] * dialogue_len) # noinspection PyPep8Naming - def _create_tf_session_data( - self, domain: "Domain", data_X: np.ndarray, data_Y: Optional[np.ndarray] = None + def _create_session_data( + self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None ) -> SessionData: """Combine all tf session related data into a named tuple""" @@ -412,10 +382,6 @@ def _create_tf_session_data( actions_for_Y = None Y = None - x_for_no_intent = self._create_zero_vector(X) - y_for_no_action = self._create_zero_vector(previous_actions) - y_for_action_listen = self._create_y_for_action_listen(domain) - # is needed to calculate train accuracy if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): dial_len = X.shape[1] @@ -429,13 +395,34 @@ def _create_tf_session_data( slots=slots, previous_actions=previous_actions, actions_for_Y=actions_for_Y, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, all_Y_d=all_Y_d, ) + @staticmethod + def _sample_session_data(session_data: 'SessionData', + num_samples: int) -> 'SessionData': + ids = np.random.permutation(len(session_data.X))[:num_samples] + return SessionData( + X=session_data.X[ids], + Y=session_data.Y[ids], 
+ slots=session_data.slots[ids], + previous_actions=session_data.previous_actions[ids], + actions_for_Y=session_data.actions_for_Y[ids], + all_Y_d=session_data.all_Y_d, + ) + # tf helpers: + @staticmethod + def _create_tf_dataset(session_data: 'SessionData', + batch_size: Union['tf.Tensor', int]) -> 'tf.data.Dataset': + train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, + session_data.Y, + session_data.slots, + session_data.previous_actions)) + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + train_dataset = train_dataset.batch(batch_size) + return train_dataset + def _create_tf_nn( self, x_in: 'tf.Tensor', @@ -473,19 +460,6 @@ def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': ) return embed_x - def _create_tf_user_embed(self, a_in: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding user vector.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "a" - - a = self._create_tf_nn( - a_in, - self.hidden_layer_sizes["a"], - self.droprate["a"], - layer_name_suffix=layer_name_suffix, - ) - return self._create_embed(a, layer_name_suffix=layer_name_suffix) - def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" @@ -499,302 +473,7 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_embed(b, layer_name_suffix=layer_name_suffix) - def _create_tf_no_intent_embed(self, x_for_no_intent_i: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding user vector for empty intent.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "a" - - x_for_no_intent = self._create_tf_nn( - x_for_no_intent_i, - self.hidden_layer_sizes["a"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(x_for_no_intent, layer_name_suffix=layer_name_suffix) - ) - - def _create_tf_no_action_embed(self, y_for_no_action_in: 'tf.Tensor') -> 'tf.Tensor': - """Create embedding bot vector for empty action and action_listen.""" - - layer_name_suffix = "a_and_b" if self.share_embedding else "b" - - y_for_no_action = self._create_tf_nn( - y_for_no_action_in, - self.hidden_layer_sizes["b"], - droprate=0, - layer_name_suffix=layer_name_suffix, - ) - return tf.stop_gradient( - self._create_embed(y_for_no_action, layer_name_suffix=layer_name_suffix) - ) - - def _create_rnn_cell(self): - # type: () -> tf.contrib.rnn.RNNCell - """Create one rnn cell.""" - - # chrono initialization for forget bias - # assuming that characteristic time is max dialogue length - # left border that initializes forget gate close to 0 - bias_0 = -1.0 - - # right border that initializes forget gate close to 1 - bias_1 = np.log(self.characteristic_time - 1.0) - fbias = (bias_1 - bias_0) * np.random.random(self.rnn_size) + bias_0 - - if self.attn_after_rnn: - # since attention is copied to rnn output, - # embedding should be performed inside the cell - embed_layer_size = self.embed_dim - else: - embed_layer_size = None - - keep_prob = 1.0 - ( - self.droprate["rnn"] * tf.cast(self._is_training, tf.float32) - ) - - return ChronoBiasLayerNormBasicLSTMCell( - num_units=self.rnn_size, - layer_norm=self.layer_norm, - forget_bias=fbias, - input_bias=-fbias, - dropout_keep_prob=keep_prob, - out_layer_size=embed_layer_size, - ) - - @staticmethod - def _num_units(memory: 'tf.Tensor') -> int: - return memory.shape[-1].value - - def _create_attn_mech( - self, memory: 'tf.Tensor', real_length: 'tf.Tensor' - ) -> tf.contrib.seq2seq.AttentionMechanism: - - return 
tf.contrib.seq2seq.BahdanauAttention( - num_units=self._num_units(memory), - memory=memory, - memory_sequence_length=real_length, - normalize=True, - probability_fn=tf.identity, - # we only attend to memory up to a current time step - # it does not affect alignments, but - # is important for interpolation gate - score_mask_value=0, - ) - - def cell_input_fn( - self, - rnn_inputs: 'tf.Tensor', - attention: 'tf.Tensor', - num_cell_input_memory_units: int, - ) -> 'tf.Tensor': - """Combine rnn inputs and attention into cell input. - - Args: - rnn_inputs: Tensor, first output from `rnn_and_attn_inputs_fn`. - - attention: Tensor, concatenated all attentions for one time step. - - num_cell_input_memory_units: int, number of the first units in - `attention` that are responsible for - enhancing cell input. - - Returns: - A Tensor `cell_inputs` to feed to an rnn cell. - """ - - if num_cell_input_memory_units: - if num_cell_input_memory_units == self.embed_dim: - # since attention can contain additional - # attention mechanisms, only attention - # from previous user input is used as an input - # for rnn cell and only if memory before rnn - # is the same size as embed_utter - return tf.concat( - [ - rnn_inputs[:, : self.embed_dim] - + attention[:, :num_cell_input_memory_units], - rnn_inputs[:, self.embed_dim :], - ], - -1, - ) - else: - # in current implementation it cannot fall here, - # but this Exception exists in case - # attention before rnn is changed - raise ValueError( - "Number of memory units {} is not " - "equal to number of utter units {}. " - "Please modify cell input function " - "accordingly." - "".format(num_cell_input_memory_units, self.embed_dim) - ) - else: - return rnn_inputs - - def rnn_and_attn_inputs_fn( - self, inputs: 'tf.Tensor', cell_state: 'tf.Tensor' - ) -> Tuple['tf.Tensor', 'tf.Tensor']: - """Construct rnn input and attention mechanism input. - - Args: - inputs: Tensor, concatenated all embeddings for one time step: - [embed_utter, embed_slots, embed_prev_action]. - - cell_state: Tensor, state of an rnn cell. - - Returns: - Tuple of Tensors `rnn_inputs, attn_inputs` to feed to - rnn and attention mechanisms. 
- """ - - # the hidden state c and slots are not included, - # in hope that algorithm would learn correct attention - # regardless of the hidden state c of an lstm and slots - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state.h], -1) - else: - attn_inputs = tf.concat([inputs[:, : self.embed_dim], cell_state], -1) - - # include slots in inputs but exclude previous action, since - # rnn should get previous action from its hidden state - rnn_inputs = inputs[:, : (self.embed_dim + self.embed_dim)] - - return rnn_inputs, attn_inputs - - def _create_attn_cell( - self, - cell: tf.contrib.rnn.RNNCell, - embed_utter: 'tf.Tensor', - embed_prev_action: 'tf.Tensor', - real_length: 'tf.Tensor', - embed_for_no_intent: 'tf.Tensor', - embed_for_no_action: 'tf.Tensor', - embed_for_action_listen: 'tf.Tensor', - ) -> tf.contrib.rnn.RNNCell: - """Wrap cell in attention wrapper with given memory.""" - - if self.attn_before_rnn: - # create attention over previous user input - num_memory_units_before_rnn = self._num_units(embed_utter) - with tf.variable_scope('before', reuse=tf.AUTO_REUSE): - attn_mech = self._create_attn_mech(embed_utter, real_length) - - # create mask for empty user input not to pay attention to it - ignore_mask = tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_intent, 0), embed_utter), -1 - ) - - # do not use attention by location before rnn - attn_shift_range = 0 - else: - attn_mech = None - ignore_mask = None - num_memory_units_before_rnn = None - attn_shift_range = None - - if self.attn_after_rnn: - # create attention over previous bot actions - with tf.variable_scope('after', reuse=tf.AUTO_REUSE): - attn_mech_after_rnn = self._create_attn_mech(embed_prev_action, real_length) - - # create mask for empty bot action or action_listen - # not to pay attention to them - ignore_mask_listen = tf.logical_or( - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), - -1, - ), - tf.reduce_all( - tf.equal( - tf.expand_dims(embed_for_action_listen, 0), embed_prev_action - ), - -1, - ), - ) - - if attn_mech is not None: - # if there is another attention mechanism, - # create a list of attention mechanisms - attn_mech = [attn_mech, attn_mech_after_rnn] - ignore_mask = [ignore_mask, ignore_mask_listen] - attn_shift_range = [attn_shift_range, self.attn_shift_range] - else: - attn_mech = attn_mech_after_rnn - ignore_mask = ignore_mask_listen - attn_shift_range = self.attn_shift_range - - # this particular attention mechanism is unusual - # in the sense that its calculated attention vector is directly - # added to cell output, therefore enabling copy mechanism - - # `index_of_attn_to_copy` is used by `TimeAttentionWrapper`, - # to know which attention to copy - index_of_attn_to_copy = -1 - else: - index_of_attn_to_copy = None - - return TimeAttentionWrapper( - cell=cell, - attention_mechanism=attn_mech, - sequence_len=self._dialogue_len, - attn_shift_range=attn_shift_range, - sparse_attention=self.sparse_attention, - rnn_and_attn_inputs_fn=self.rnn_and_attn_inputs_fn, - ignore_mask=ignore_mask, - cell_input_fn=lambda inputs, attention: ( - self.cell_input_fn(inputs, attention, num_memory_units_before_rnn) - ), - index_of_attn_to_copy=index_of_attn_to_copy, - likelihood_fn=lambda emb_1, emb_2: (self._tf_sim(emb_1, emb_2, None)), - tensor_not_to_copy=embed_for_action_listen, - output_attention=True, - alignment_history=True, - ) - - def _create_tf_dial_embed( - self, - embed_utter: 'tf.Tensor', - 
embed_slots: 'tf.Tensor', - embed_prev_action: 'tf.Tensor', - mask: 'tf.Tensor', - embed_for_no_intent: 'tf.Tensor', - embed_for_no_action: 'tf.Tensor', - embed_for_action_listen: 'tf.Tensor', - ) -> Tuple['tf.Tensor', Union['tf.Tensor', "TimeAttentionWrapperState"]]: - """Create rnn for dialogue level embedding.""" - - cell_input = tf.concat([embed_utter, embed_slots, embed_prev_action], -1) - - cell = self._create_rnn_cell() - - real_length = tf.cast(tf.reduce_sum(mask, 1), tf.int32) - - if self.is_using_attention(): - cell = self._create_attn_cell( - cell, - embed_utter, - embed_prev_action, - real_length, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - - with tf.variable_scope('rnn_decoder', reuse=tf.AUTO_REUSE): - return tf.nn.dynamic_rnn( - cell, - cell_input, - dtype=tf.float32, - sequence_length=real_length, - ) - - def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): - x_in = tf.concat([a_in, b_prev_in], -1) - # print(x_in.shape[-1]) - # exit() - - # x = x_in + def _create_hparams(self): hparams = transformer_base() hparams.num_hidden_layers = self.num_rnn_layers @@ -812,8 +491,12 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei hparams.self_attention_type = "dot_product_relative_v2" hparams.max_relative_position = 5 hparams.add_relative_to_values = True + return hparams - # hparams.proximity_bias = True + def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + hparams = self._create_hparams() + + x_in = tf.concat([a_in, b_prev_in, c_in], -1) # When not in training mode, set all forms of dropout to zero. for key, value in hparams.values().items(): @@ -828,37 +511,11 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei kernel_regularizer=reg, name='transformer_embed_layer', reuse=tf.AUTO_REUSE) - # a = tf.layers.dense(inputs=a_in, - # units=hparams.hidden_size/3, - # use_bias=False, - # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='transformer_embed_layer_a', - # reuse=tf.AUTO_REUSE) - # - c = tf.layers.dense(inputs=c_in, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - kernel_regularizer=reg, - name='transformer_embed_layer_c', - reuse=tf.AUTO_REUSE) - # - # b = tf.layers.dense(inputs=b_prev_in, - # units=hparams.hidden_size/3, - # use_bias=False, - # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='transformer_embed_layer_b', - # reuse=tf.AUTO_REUSE) - - # x = tf.concat([a, c, b], -1) x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 - c *= hparams.hidden_size ** 0.5 x *= tf.expand_dims(mask, -1) @@ -889,219 +546,9 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei attn_bias_for_padding=attn_bias_for_padding, ) - # x = tf.concat([x, c_in], -1) - # c_gate = tf.layers.dense(inputs=x, - # # units=hparams.hidden_size, - # # activation=tf.nn.softmax, - # units=1, - # activation=tf.math.sigmoid, - # bias_initializer=tf.constant_initializer(-1), - # # use_bias=False, - # # kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - # kernel_regularizer=reg, - # name='slots_gate_layer_c', - # reuse=tf.AUTO_REUSE) - 
x += c #* c_gate - # x = common_layers.layer_postprocess(x, c, hparams) x *= tf.expand_dims(mask, -1) - return tf.nn.relu(x), self_attention_bias, x_in - - @staticmethod - def _rearrange_fn(list_tensor_1d_mask_1d): - """Rearranges tensor_1d to put all the values - where mask_1d=1 to the right and - where mask_1d=0 to the left""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) - - return tf.concat(partitioned_tensor, 0) - - @staticmethod - def _arrange_back_fn(list_tensor_1d_mask_1d): - """Arranges back tensor_1d to restore original order - modified by `_rearrange_fn` according to mask_1d: - - number of 0s in mask_1d values on the left are set to - their corresponding places where mask_1d=0, - - number of 1s in mask_1d values on the right are set to - their corresponding places where mask_1d=1""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - mask_indices = tf.dynamic_partition( - tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 - ) - - mask_sum = tf.reduce_sum(mask_1d, axis=0) - partitioned_tensor = [ - tf.zeros_like(tensor_1d[:-mask_sum]), - tensor_1d[-mask_sum:], - ] - - return tf.dynamic_stitch(mask_indices, partitioned_tensor) - - def _action_to_copy(self, x_in, x, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action): - with tf.variable_scope('copy', reuse=tf.AUTO_REUSE): - ignore_mask_listen = tf.to_float(tf.logical_or( - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_no_action, 0), embed_prev_action), - -1, - ), - tf.reduce_all( - tf.equal(tf.expand_dims(embed_for_action_listen, 0), embed_prev_action), - -1, - ), - )) - - triag_mask = tf.expand_dims( - common_attention.attention_bias_to_padding(self_attention_bias[0, 0, :, tf.newaxis, tf.newaxis, :]), 0) - diag_mask = 1 - (1 - triag_mask) * tf.cumprod(triag_mask, axis=-1, exclusive=True, reverse=True) - - bias = self_attention_bias + common_attention.attention_bias_ignore_padding(ignore_mask_listen) * tf.expand_dims(diag_mask, 1) - - copy_weights = {} - common_attention.multihead_attention(x_in, - embed_prev_action, - bias, - self.rnn_size, - self.embed_dim, - self.embed_dim, - 1, - 0, - save_weights_to=copy_weights) - - copy_weights = copy_weights['copy/multihead_attention/dot_product_attention'][:, 0, :, :] - bias = bias[:, 0, :, :] - shape = tf.shape(copy_weights) - copy_weights = tf.reshape(copy_weights, (-1, shape[-1])) - x_flat = tf.reshape(x_in, (-1, x_in.shape[-1])) - bias = tf.reshape(bias, (-1, shape[-1])) - ignore_mask = common_attention.attention_bias_to_padding(bias[:, tf.newaxis, tf.newaxis, :], tf.to_int32) - - s_w = tf.layers.dense( - inputs=x_flat, - units=2 * self.attn_shift_range + 1, - activation=tf.nn.softmax, - name="shift_weight", - reuse=tf.AUTO_REUSE - ) - mask = 1 - ignore_mask - conv_weights = tf.map_fn( - self._rearrange_fn, [copy_weights, mask], dtype=copy_weights.dtype - ) - - conv_weights = tf.reverse(conv_weights, axis=[1]) - - # preare probs for tf.nn.depthwise_conv2d - # [in_width, in_channels=batch] - conv_weights = tf.transpose(conv_weights, [1, 0]) - # [batch=1, in_height=1, in_width=time+1, in_channels=batch] - conv_weights = conv_weights[tf.newaxis, tf.newaxis, :, :] - - # [filter_height=1, filter_width=2*attn_shift_range+1, - # in_channels=batch, channel_multiplier=1] - conv_s_w = tf.transpose(s_w, [1, 0]) - conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] - - # perform 1d convolution - # [batch=1, out_height=1, out_width=time+1, out_channels=batch] - conv_weights = 
tf.nn.depthwise_conv2d_native( - conv_weights, conv_s_w, [1, 1, 1, 1], "SAME" - ) - conv_weights = conv_weights[0, 0, :, :] - conv_weights = tf.transpose(conv_weights, [1, 0]) - - conv_weights = tf.reverse(conv_weights, axis=[1]) - - # arrange probs back to their original time order - copy_weights = tf.map_fn( - self._arrange_back_fn, [conv_weights, mask], dtype=conv_weights.dtype - ) - - # sharpening parameter - g_sh = tf.layers.dense( - inputs=x_flat, - units=1, - activation=lambda a: tf.nn.softplus(a) + 1, - bias_initializer=tf.constant_initializer(1), - name="gamma_sharp", - reuse=tf.AUTO_REUSE - ) - - powed_weights = tf.pow(copy_weights, g_sh) - copy_weights = powed_weights / (tf.reduce_sum(powed_weights, 1, keepdims=True) + 1e-32) - - copy_weights = tf.reshape(copy_weights, shape) - - # remove current time - copy_prev = copy_weights * diag_mask - keep_current = copy_weights * (1 - diag_mask) - dial_embed = self._create_embed(x, layer_name_suffix="out") - return tf.matmul(copy_prev, embed_prev_action) + tf.matmul(keep_current, dial_embed), copy_weights - - @staticmethod - def _alignments_history_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': - """Extract alignments history form final rnn cell state.""" - - alignments_from_state = final_state.alignment_history - if not isinstance(alignments_from_state, tuple): - alignments_from_state = [alignments_from_state] - - alignment_history = [] - for alignments in alignments_from_state: - # reshape to (batch, time, memory_time) - alignment_history.append(tf.transpose(alignments.stack(), [1, 0, 2])) - - return tf.concat(alignment_history, -1) - - @staticmethod - def _all_time_masks_from(final_state: "TimeAttentionWrapperState") -> 'tf.Tensor': - """Extract all time masks form final rnn cell state.""" - - # reshape to (batch, time, memory_time) and ignore last time - # because time_mask is created for the next time step - return tf.transpose(final_state.all_time_masks.stack(), [1, 0, 2])[:, :-1, :] - - def _sims_rnn_to_max_from(self, cell_output: 'tf.Tensor') -> List['tf.Tensor']: - """Save intermediate tensors for debug purposes.""" - - if self.attn_after_rnn: - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.copy_attn_debug = cell_output[:, :, -num_add:] - - # extract additional similarity to maximize - sim_attn_to_max = cell_output[:, :, -num_add] - sim_state_to_max = cell_output[:, :, -num_add + 1] - return [sim_attn_to_max, sim_state_to_max] - else: - return [] - - def _embed_dialogue_from(self, cell_output: 'tf.Tensor') -> 'tf.Tensor': - """Extract or calculate dialogue level embedding from cell_output.""" - - if self.attn_after_rnn: - # embedding layer is inside rnn cell - embed_dialogue = cell_output[:, :, : self.embed_dim] - - # extract additional debug tensors - num_add = TimeAttentionWrapper.additional_output_size() - self.rnn_embed = cell_output[ - :, :, self.embed_dim : (self.embed_dim + self.embed_dim) - ] - self.attn_embed = cell_output[ - :, :, (self.embed_dim + self.embed_dim) : -num_add - ] - else: - # add embedding layer to rnn cell output - embed_dialogue = self._create_embed( - cell_output[:, :, : self.rnn_size], layer_name_suffix="out" - ) - if self.attn_before_rnn: - # extract additional debug tensors - self.attn_embed = cell_output[:, :, self.rnn_size :] - - return embed_dialogue + return tf.nn.relu(x) def _tf_sample_neg(self, pos_b, @@ -1160,8 +607,7 @@ def _tf_sim( embed_dialogue: 'tf.Tensor', embed_action: 'tf.Tensor', mask: Optional['tf.Tensor'], - ) 
-> Union[Tuple['tf.Tensor', 'tf.Tensor'], - Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor']]: + ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: """Define similarity. This method has two roles: @@ -1183,135 +629,44 @@ def _tf_sim( "".format(self.similarity_type) ) - if len(embed_dialogue.shape) == 2 and len(embed_action.shape) == 2: - # calculate similarity between - # two embedding vectors of the same size + # calculate similarity with several + # embedded actions for the loss - # always use cosine sim for copy mech + if self.similarity_type == "cosine": + # normalize embedding vectors for cosine similarity embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) embed_action = tf.nn.l2_normalize(embed_action, -1) - cos_sim = tf.reduce_sum(embed_dialogue * embed_action, -1, keepdims=True) - - bin_sim = tf.where( - cos_sim > (self.mu_pos - self.mu_neg) / 2.0, - tf.ones_like(cos_sim), - tf.zeros_like(cos_sim), - ) - - # output binary mask and similarity - return bin_sim, cos_sim - + if len(embed_dialogue.shape) == 4: + embed_dialogue_pos = embed_dialogue[:, :, :1, :] else: - # calculate similarity with several - # embedded actions for the loss + embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity - embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) - embed_action = tf.nn.l2_normalize(embed_action, -1) + sim = tf.reduce_sum( + embed_dialogue_pos * embed_action, -1 + ) * tf.expand_dims(mask, 2) - if len(embed_dialogue.shape) == 4: - embed_dialogue_pos = embed_dialogue[:, :, :1, :] - else: - embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) + sim_bot_emb = tf.reduce_sum( + embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + ) * tf.expand_dims(mask, 2) - sim = tf.reduce_sum( - embed_dialogue_pos * embed_action, -1 + if len(embed_dialogue.shape) == 4: + sim_dial_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 ) * tf.expand_dims(mask, 2) + else: + sim_dial_emb = None - sim_bot_emb = tf.reduce_sum( - embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 + if len(embed_dialogue.shape) == 4: + sim_dial_bot_emb = tf.reduce_sum( + embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 ) * tf.expand_dims(mask, 2) - - if len(embed_dialogue.shape) == 4: - sim_dial_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_emb = None - - if len(embed_dialogue.shape) == 4: - sim_dial_bot_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_bot_emb = None - - # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb - - # noinspection PyPep8Naming - def _scale_loss_by_count_actions( - self, - X, - Y, - slots, - previous_actions, - ) -> Union[np.ndarray, List[List]]: - """Calculate inverse proportionality of repeated actions.""" - - if self.scale_loss_by_action_counts: - # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # full = tf.concat([X, slots, previous_actions, Y], -1) - # else: - full = Y - - flat = tf.reshape(full, (-1, full.shape[-1])) - _, i, c = gen_array_ops.unique_with_counts_v2(flat, axis=[0]) - c = tf.cast(c, tf.float32) - - counts = tf.reshape(tf.gather(c, i), (tf.shape(Y)[0], tf.shape(Y)[1])) - - # do not include [-1 -1 ... 
-1 0] in averaging - # and smooth it by taking sqrt - - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # action_listen is the top one by an order - max_c = tf.math.top_k(c, 2)[0][1] - else: - max_c = tf.reduce_max(c) - # max_c = tf.math.top_k(c, 2)[0][1] - # max_c = tf.cond(tf.shape(c)[0] > 1, lambda: tf.math.top_k(c, 2)[0][1], lambda: tf.reduce_max(c)) - # max_c = tf.reduce_max(c) - - return tf.maximum(max_c / counts, 1) - # return tf.maximum(tf.square(max_c / counts), 1) - - # exit() - # full_X = tf.concat( - # [X, slots, previous_actions, Y], -1 - # ) - # full_X = tf.reshape(full_X, (-1, full_X.shape[-1])) - # # include [-1 -1 ... -1 0] as first - # # full_X = tf.concat([full_X[-1:], full_X], 0) - # - # _, i, c = gen_array_ops.unique_with_counts_v2(full_X, axis=[0]) - # c = tf.cast(c, tf.float32) - # - # counts = tf.reshape(tf.gather(c, i), (tf.shape(X)[0], tf.shape(X)[1])) - # - # # do not include [-1 -1 ... -1 0] in averaging - # # and smooth it by taking sqrt - # return tf.maximum(tf.sqrt(tf.reduce_mean(c) / counts), 1) else: - return [[None]] - - def _regularization_loss(self): - # type: () -> Union['tf.Tensor', int] - """Add regularization to the embed layer inside rnn cell.""" - - if self.attn_after_rnn: - vars_to_reg = [ - tf.nn.l2_loss(tf_var) - for tf_var in tf.trainable_variables() - if "cell/out_layer/kernel" in tf_var.name - ] - if vars_to_reg: - return self.C2 * tf.add_n(vars_to_reg) + sim_dial_bot_emb = None - return 0 + # output similarities between user input and bot actions + # and similarities between bot actions + return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb def _tf_loss( self, @@ -1339,10 +694,6 @@ def _tf_loss( max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer) and self.scale_loss_by_action_counts: - # scale loss inverse proportionally to number of action counts - loss *= self._loss_scales - # penalize max similarity between bot embeddings sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) @@ -1410,19 +761,11 @@ def _tf_loss_2( # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - # if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - # if self.scale_loss_by_action_counts: - # scale_mask = self._loss_scales * mask - # else: - scale_mask = mask - # else: - # scale_mask = 1.0 - loss = tf.losses.softmax_cross_entropy(labels, logits, - scale_mask * already_learned) + mask * already_learned) # add regularization losses - loss += self._regularization_loss() + tf.losses.get_regularization_loss() + loss += tf.losses.get_regularization_loss() # maximize similarity returned by time attention wrapper add_loss = [] @@ -1457,10 +800,6 @@ def train( # dealing with training data training_data = self.featurize_for_training(training_trackers, domain, **kwargs) - # assume that characteristic time is the mean length of the dialogues - self.characteristic_time = np.mean(training_data.true_length) - if self.attn_shift_range is None: - self.attn_shift_range = int(self.characteristic_time / 2) # encode all actions with policies' featurizer self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( @@ -1477,9 +816,7 @@ def train( self.num_neg = min(self.num_neg, domain.num_actions - 1) # extract actual training data to feed to tf session - session_data = 
self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + session_data = self._create_session_data(training_data.X, training_data.y) self.graph = tf.Graph() @@ -1487,52 +824,26 @@ def train( # set random seed in tf tf.set_random_seed(self.random_seed) + # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, - session_data.Y, - session_data.slots, - session_data.previous_actions)) - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) - train_dataset = train_dataset.batch(batch_size_in) + train_dataset = self._create_tf_dataset(session_data, batch_size_in) if self.evaluate_on_num_examples: - ids = np.random.permutation(len(session_data.X))[:self.evaluate_on_num_examples] - - val_dataset = tf.data.Dataset.from_tensor_slices((session_data.X[ids], - session_data.Y[ids], - session_data.slots[ids], - session_data.previous_actions[ids]) - ).batch(self.evaluate_on_num_examples) + eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples) else: - val_dataset = None + eval_train_dataset = None iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes, output_classes=train_dataset.output_classes) - self.a_in, self.b_in, self.c_in, self.b_prev_in = iterator.get_next() - - self.a_in = tf.cast(self.a_in, tf.float32) - self.b_in = tf.cast(self.b_in, tf.float32) - self.c_in = tf.cast(self.c_in, tf.float32) - self.b_prev_in = tf.cast(self.b_prev_in, tf.float32) + # session data are int counts but we need a float tensors + (self.a_in, + self.b_in, + self.c_in, + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) - # they don't change - self._x_for_no_intent_in = tf.constant( - session_data.x_for_no_intent, - dtype=tf.float32, - name="x_for_no_intent", - ) - self._y_for_no_action_in = tf.constant( - session_data.y_for_no_action, - dtype=tf.float32, - name="y_for_no_action", - ) - self._y_for_action_listen_in = tf.constant( - session_data.y_for_action_listen, - dtype=tf.float32, - name="y_for_action_listen", - ) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, name="all_actions") @@ -1547,55 +858,20 @@ def train( # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + self.attention_weights = {} + transformer_out = self._create_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") + sims_rnn_to_max = [] + self.bot_embed = self._create_tf_bot_embed(self.b_in) all_actions_embed = self._create_tf_bot_embed(all_actions) - embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - embed_for_no_action = self._create_tf_no_action_embed( - self._y_for_no_action_in - ) - embed_for_action_listen = self._create_tf_no_action_embed( - self._y_for_action_listen_in - ) - - if self.transformer: - self.attention_weights = {} - tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, embed_prev_action, embed_for_action_listen, embed_for_no_action) - self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") #+ 
self._create_embed(self.c_in, layer_name_suffix="slots") - sims_rnn_to_max = [] - else: - # create embedding vectors - self.user_embed = self._create_tf_user_embed(self.a_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - - embed_for_no_intent = self._create_tf_no_intent_embed( - self._x_for_no_intent_in - ) - - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) - - self.all_time_masks = self._all_time_masks_from(final_state) - - sims_rnn_to_max = self._sims_rnn_to_max_from(cell_output) - self.dial_embed = self._embed_dialogue_from(cell_output) - # calculate similarities if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - self.b_in = tf.expand_dims(self.b_in, 1) - self.bot_embed = tf.expand_dims(self.bot_embed, 1) + # pick last action if max history is used + self.b_in = self.b_in[:, tf.newaxis, :] + self.bot_embed = self.bot_embed[:, tf.newaxis, :] self.dial_embed = self.dial_embed[:, -1:, :] mask = mask[:, -1:] @@ -1641,11 +917,6 @@ def train( # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) - # construct loss - if self.scale_loss_by_action_counts: - self._loss_scales = self._scale_loss_by_count_actions(self.a_in, self.b_in, self.c_in, self.b_prev_in) - else: - self._loss_scales = None # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) @@ -1656,15 +927,15 @@ def train( train_init_op = iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: - val_init_op = iterator.make_initializer(val_dataset) + eval_init_op = iterator.make_initializer(eval_train_dataset) else: - val_init_op = None + eval_init_op = None # train tensorflow graph self.session = tf.Session(config=self._tf_config) # self._train_tf(session_data, loss, mask) - self._train_tf_dataset(train_init_op, val_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) dialogue_len = None # use dynamic time for rnn # create placeholders @@ -1693,40 +964,12 @@ def train( # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - self.bot_embed = self._create_tf_bot_embed(self.b_in) - embed_prev_action = self._create_tf_bot_embed(self.b_prev_in) - - if self.transformer: - self.attention_weights = {} - tr_out, self_attention_bias, tr_in = self._create_transformer_encoder(self.a_in, self.c_in, self.b_prev_in, mask, - self.attention_weights) - # self.dial_embed, self.attention_weights = self._action_to_copy(tr_in, tr_out, self_attention_bias, - # embed_prev_action, - # embed_for_action_listen, - # embed_for_no_action) - self.dial_embed = self._create_embed(tr_out, layer_name_suffix="out") - - else: - self.user_embed = self._create_tf_user_embed(self.a_in) - self.slot_embed = self._create_embed(self.c_in, layer_name_suffix="slt") - - # get rnn output - cell_output, final_state = self._create_tf_dial_embed( - 
self.user_embed, - self.slot_embed, - embed_prev_action, - mask, - embed_for_no_intent, - embed_for_no_action, - embed_for_action_listen, - ) - # process rnn output - if self.is_using_attention(): - self.alignment_history = self._alignments_history_from(final_state) - - self.all_time_masks = self._all_time_masks_from(final_state) + self.attention_weights = {} + transformer_out = self._create_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - self.dial_embed = self._embed_dialogue_from(cell_output) + self.bot_embed = self._create_tf_bot_embed(self.b_in) if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): self.dial_embed = self.dial_embed[:, -1:, :] @@ -1758,7 +1001,7 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: def _train_tf_dataset(self, train_init_op, - val_init_op, + eval_init_op, batch_size_in, loss: 'tf.Tensor', mask, @@ -1799,11 +1042,11 @@ def _train_tf_dataset(self, ep_loss /= batches_per_epoch - if self.evaluate_on_num_examples and val_init_op is not None: + if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(val_init_op, mask, dialogue_len) + train_acc = self._output_training_stat_dataset(eval_init_op, mask, dialogue_len) last_loss = ep_loss pbar.set_postfix({ @@ -1820,10 +1063,10 @@ def _train_tf_dataset(self, "loss={:.3f}, train accuracy={:.3f}" "".format(last_loss, train_acc)) - def _output_training_stat_dataset(self, val_init_op, mask, dialogue_len) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, mask, dialogue_len) -> np.ndarray: """Output training statistics""" - self.session.run(val_init_op) + self.session.run(eval_init_op) sim_, mask_ = self.session.run([self.sim_op, mask], feed_dict={self._is_training: False, @@ -1851,19 +1094,10 @@ def continue_training( batch_size, training_trackers, domain ) - session_data = self._create_tf_session_data( - domain, training_data.X, training_data.y - ) + session_data = self._create_session_data(training_data.X, training_data.y) b = self._create_batch_b(session_data.Y, session_data.actions_for_Y) - batch_loss_scales = self._scale_loss_by_count_actions( - session_data.X, - session_data.slots, - session_data.previous_actions, - session_data.actions_for_Y, - ) - # fit to one extra example using updated trackers self.session.run( self._train_op, @@ -1873,11 +1107,7 @@ def continue_training( self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, self._dialogue_len: session_data.X.shape[1], - self._x_for_no_intent_in: session_data.x_for_no_intent, - self._y_for_no_action_in: session_data.y_for_no_action, - self._y_for_action_listen_in: session_data.y_for_action_listen, self._is_training: True, - self._loss_scales: batch_loss_scales, }, ) @@ -1886,7 +1116,7 @@ def tf_feed_dict_for_prediction(self, domain: Domain) -> Dict: # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_tf_session_data(domain, data_X) + session_data = self._create_session_data(data_X) # noinspection PyPep8Naming all_Y_d_x = np.stack([session_data.all_Y_d for _ in range(session_data.X.shape[0])]) @@ -1915,7 +1145,7 @@ def predict_action_probabilities( # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) - session_data = 
self._create_tf_session_data(domain, data_X) + session_data = self._create_session_data(data_X) # noinspection PyPep8Naming all_Y_d_x = np.stack( [session_data.all_Y_d for _ in range(session_data.X.shape[0])] @@ -1975,7 +1205,7 @@ def persist(self, path: Text) -> None: file_name = "tensorflow_embedding.ckpt" checkpoint = os.path.join(path, file_name) - utils.create_dir_for_file(checkpoint) + rasa.utils.io.create_directory_for_file(checkpoint) with self.graph.as_default(): self._persist_tensor("intent_placeholder", self.a_in) @@ -1983,9 +1213,6 @@ def persist(self, path: Text) -> None: self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", self.b_prev_in) self._persist_tensor("dialogue_len", self._dialogue_len) - self._persist_tensor("x_for_no_intent", self._x_for_no_intent_in) - self._persist_tensor("y_for_no_action", self._y_for_no_action_in) - self._persist_tensor("y_for_action_listen", self._y_for_action_listen_in) self._persist_tensor("similarity_op", self.sim_op) @@ -2062,9 +1289,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") dialogue_len = cls.load_tensor("dialogue_len") - x_for_no_intent = cls.load_tensor("x_for_no_intent") - y_for_no_action = cls.load_tensor("y_for_no_action") - y_for_action_listen = cls.load_tensor("y_for_action_listen") sim_op = cls.load_tensor("similarity_op") @@ -2101,9 +1325,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": slots_placeholder=c_in, prev_act_placeholder=b_prev_in, dialogue_len=dialogue_len, - x_for_no_intent=x_for_no_intent, - y_for_no_action=y_for_no_action, - y_for_action_listen=y_for_action_listen, similarity_op=sim_op, alignment_history=alignment_history, user_embed=user_embed, From a39e10c17d99b450bfee420b69f4c56954d4b365 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 16 Jul 2019 17:40:48 +0200 Subject: [PATCH 03/50] refactor neg sampling, sim, acc and loss --- rasa/core/policies/embedding_policy.py | 481 ++++++++++++------------- 1 file changed, 227 insertions(+), 254 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 68bc2808a67d..63a10afcfe3b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -60,7 +60,7 @@ class EmbeddingPolicy(Policy): """Recurrent Embedding Dialogue Policy (REDP) - The policy that is used in our paper https://arxiv.org/abs/1811.11707 + Transformer version of the policy used in our paper https://arxiv.org/abs/1811.11707 """ SUPPORTS_ONLINE_TRAINING = True @@ -103,7 +103,8 @@ class EmbeddingPolicy(Policy): # maximum negative similarity for incorrect actions "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' # the type of the similarity - "similarity_type": "cosine", # string 'cosine' or 'inner' + "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + "loss_type": 'softmax', # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "num_neg": 20, @@ -164,7 +165,6 @@ def __init__( action_placeholder: Optional['tf.Tensor'] = None, slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, - dialogue_len: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, alignment_history: Optional['tf.Tensor'] = None, user_embed: Optional['tf.Tensor'] = None, @@ -208,7 +208,6 @@ def __init__( self.b_in = action_placeholder self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder - self._dialogue_len = dialogue_len self.sim_op = similarity_op # store attention probability distribution as @@ -270,6 +269,13 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.mu_pos = config["mu_pos"] self.mu_neg = config["mu_neg"] self.similarity_type = config["similarity_type"] + self.loss_type = config['loss_type'] + if self.similarity_type == 'auto': + if self.loss_type == 'softmax': + self.similarity_type = 'inner' + elif self.loss_type == 'margin': + self.similarity_type = 'cosine' + self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] @@ -414,12 +420,14 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: @staticmethod def _create_tf_dataset(session_data: 'SessionData', - batch_size: Union['tf.Tensor', int]) -> 'tf.data.Dataset': + batch_size: Union['tf.Tensor', int], + shuffle: bool = True) -> 'tf.data.Dataset': train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, session_data.Y, session_data.slots, session_data.previous_actions)) - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) + if shuffle: + train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) train_dataset = train_dataset.batch(batch_size) return train_dataset @@ -550,77 +558,55 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei return tf.nn.relu(x) - def _tf_sample_neg(self, - pos_b, - neg_bs=None, - neg_ids=None, - batch_size=None, - first_only=False + @staticmethod + def _tf_make_flat(x): + return tf.reshape(x, (-1, x.shape[-1])) + + @staticmethod + def _tf_sample_neg(batch_size, + all_bs, + neg_ids, ) -> 'tf.Tensor': - all_b = pos_b[tf.newaxis, :, :] - if batch_size is None: - batch_size = tf.shape(pos_b)[0] - all_b = tf.tile(all_b, [batch_size, 1, 1]) - if neg_bs is None and neg_ids is None: - return all_b - - def sample_neg_b(): - if neg_bs is not None: - _neg_bs = neg_bs - elif neg_ids is not None: - _neg_bs = tf.batch_gather(all_b, neg_ids) - else: - raise - return tf.concat([pos_b[:, tf.newaxis, :], _neg_bs], 1) + tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) - if first_only: - out_b = pos_b[:, tf.newaxis, :] - else: - out_b = all_b + return tf.batch_gather(tiled_all_bs, neg_ids) - if neg_bs is not None: - cond = tf.logical_and(self._is_training, tf.shape(neg_bs)[0] > 1) - elif neg_ids is not None: - cond = tf.logical_and(self._is_training, tf.shape(neg_ids)[0] > 1) - else: - raise + def _tf_calc_iou_mask(self, + pos_b, + all_bs, + neg_ids, + ) -> 'tf.Tensor': - return tf.cond(cond, sample_neg_b, lambda: out_b) + pos_b_in_flat = pos_b[:, tf.newaxis, :] + neg_b_in_flat = 
self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - def _tf_calc_iou(self, - b_raw, - neg_bs=None, - neg_ids=None - ) -> 'tf.Tensor': + intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) + union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - tiled_intent_raw = self._tf_sample_neg(b_raw, neg_bs=neg_bs, neg_ids=neg_ids) - pos_b_raw = tiled_intent_raw[:, :1, :] - neg_b_raw = tiled_intent_raw[:, 1:, :] - intersection_b_raw = tf.minimum(neg_b_raw, pos_b_raw) - union_b_raw = tf.maximum(neg_b_raw, pos_b_raw) + iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) + return 1. - tf.nn.relu(tf.sign(1. - iou)) - return tf.reduce_sum(intersection_b_raw, -1) / tf.reduce_sum(union_b_raw, -1) + def _tf_get_negs(self, all_embed, all_raw, raw_pos): - def _tf_sim( - self, - embed_dialogue: 'tf.Tensor', - embed_action: 'tf.Tensor', - mask: Optional['tf.Tensor'], - ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: - """Define similarity. - - This method has two roles: - - calculate similarity between - two embedding vectors of the same size - and output binary mask and similarity; - - calculate similarity with several embedded actions for the loss - and output similarities between user input and bot actions - and similarities between bot actions. - - They are kept in the same helper method, - because it is necessary for them to be mathematically identical. - """ + batch_size = tf.shape(raw_pos)[0] + seq_length = tf.shape(raw_pos)[1] + raw_flat = self._tf_make_flat(raw_pos) + + neg_ids = tf.random.categorical(tf.log(tf.ones((batch_size * seq_length, + tf.shape(all_raw)[0]))), + self.num_neg) + + bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) + bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) + + neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, all_embed, neg_ids) + neg_embed = tf.reshape(neg_embed_flat, + (batch_size, seq_length, -1, all_embed.shape[-1])) + + return neg_embed, bad_negs + + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': if self.similarity_type not in {"cosine", "inner"}: raise ValueError( @@ -629,62 +615,72 @@ def _tf_sim( "".format(self.similarity_type) ) - # calculate similarity with several - # embedded actions for the loss - if self.similarity_type == "cosine": - # normalize embedding vectors for cosine similarity - embed_dialogue = tf.nn.l2_normalize(embed_dialogue, -1) - embed_action = tf.nn.l2_normalize(embed_action, -1) - - if len(embed_dialogue.shape) == 4: - embed_dialogue_pos = embed_dialogue[:, :, :1, :] + return tf.nn.l2_normalize(a, -1) else: - embed_dialogue_pos = tf.expand_dims(embed_dialogue, -2) + return a - sim = tf.reduce_sum( - embed_dialogue_pos * embed_action, -1 - ) * tf.expand_dims(mask, 2) + @staticmethod + def _tf_raw_sim( + a: 'tf.Tensor', + b: 'tf.Tensor', + mask: 'tf.Tensor', + ) -> 'tf.Tensor': - sim_bot_emb = tf.reduce_sum( - embed_action[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) + return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) - if len(embed_dialogue.shape) == 4: - sim_dial_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_dialogue[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_emb = None + def _tf_sim( + self, + pos_dial_embed: 'tf.Tensor', + pos_bot_embed: 'tf.Tensor', + neg_dial_embed: 'tf.Tensor', + neg_bot_embed: 'tf.Tensor', + dial_bad_negs: 'tf.Tensor', + bot_bad_negs: 'tf.Tensor', + mask: 'tf.Tensor', + ) -> Tuple['tf.Tensor', 'tf.Tensor', 
'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + """Define similarity.""" - if len(embed_dialogue.shape) == 4: - sim_dial_bot_emb = tf.reduce_sum( - embed_dialogue[:, :, :1, :] * embed_action[:, :, 1:, :], -1 - ) * tf.expand_dims(mask, 2) - else: - sim_dial_bot_emb = None + # calculate similarity with several + # embedded actions for the loss + neg_inf = common_attention.large_compatible_negative(pos_dial_embed.dtype) + + sim_pos = self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) + sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, + mask) + neg_inf * bot_bad_negs + sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, + mask) + neg_inf * bot_bad_negs + sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, + mask) + neg_inf * dial_bad_negs + sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, + mask) + neg_inf * dial_bad_negs # output similarities between user input and bot actions - # and similarities between bot actions - return sim, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb + # and similarities between bot actions and similarities between user inputs + return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial - def _tf_loss( + @staticmethod + def _tf_calc_accuracy(sim_pos, sim_neg): + + max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) + return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), + tf.float32)) + + def _tf_loss_margin( self, - sim: 'tf.Tensor', - sim_bot_emb: 'tf.Tensor', - sim_dial_emb: 'tf.Tensor', - sims_rnn_to_max: List['tf.Tensor'], - bad_negs, + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor', - batch_bad_negs ) -> 'tf.Tensor': - """Define loss.""" + """Define max margin loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0., self.mu_pos - sim[:, :, 0]) + loss = tf.maximum(0., self.mu_pos - sim_pos[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions - sim_neg = sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions max_sim_neg = tf.reduce_max(sim_neg, -1) @@ -694,71 +690,55 @@ def _tf_loss( max_margin = tf.maximum(0., self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) - # penalize max similarity between bot embeddings - sim_bot_emb += common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs - max_sim_bot_emb = tf.maximum(0., tf.reduce_max(sim_bot_emb, -1)) - loss += max_sim_bot_emb * self.C_emb + # penalize max similarity between pos bot and neg bot embeddings + max_sim_neg_bot = tf.maximum(0., tf.reduce_max(sim_neg_bot_bot, -1)) + loss += max_sim_neg_bot * self.C_emb - # penalize max similarity between dial embeddings - if sim_dial_emb is not None: - sim_dial_emb += common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs - max_sim_input_emb = tf.maximum(0., tf.reduce_max(sim_dial_emb, -1)) - loss += max_sim_input_emb * self.C_emb + # penalize max similarity between pos dial and neg dial embeddings + max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_dial_dial, -1)) + loss += max_sim_neg_dial * self.C_emb - # maximize similarity returned by time attention wrapper - for sim_to_add in sims_rnn_to_max: - loss += tf.maximum(0.0, 1.0 - sim_to_add) + # penalize max similarity between pos bot and neg dial embeddings + max_sim_neg_dial = 
tf.maximum(0., tf.reduce_max(sim_neg_bot_dial, -1)) + loss += max_sim_neg_dial * self.C_emb # mask loss for different length sequences loss *= mask # average the loss over sequence length loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1) - # average the loss over the batch - loss = ( - tf.reduce_mean(loss) - # add regularization losses - + self._regularization_loss() - + tf.losses.get_regularization_loss() - ) + loss = tf.reduce_mean(loss) + + # add regularization losses + loss += tf.losses.get_regularization_loss() + return loss - def _tf_loss_2( - self, - sim: 'tf.Tensor', - sim_bot_emb: 'tf.Tensor', - sim_dial_emb: 'tf.Tensor', - sim_dial_bot_emb, - sims_rnn_to_max: List['tf.Tensor'], - bad_negs, + @staticmethod + def _tf_loss_softmax( + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor', - batch_bad_negs=None, ) -> 'tf.Tensor': - """Define loss.""" + """Define softmax loss.""" - all_sim = [sim[:, :, :1], - sim[:, :, 1:] + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, - sim_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs, - ] - if sim_dial_emb is not None: - all_sim.append(sim_dial_emb + common_attention.large_compatible_negative(batch_bad_negs.dtype) * batch_bad_negs) + logits = tf.concat([sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial + ], -1) - if sim_dial_bot_emb is not None: - all_sim.append(sim_dial_bot_emb + common_attention.large_compatible_negative(bad_negs.dtype) * bad_negs) - - logits = tf.concat(all_sim, -1) + # create labels for softmax pos_labels = tf.ones_like(logits[:, :, :1]) neg_labels = tf.zeros_like(logits[:, :, 1:]) labels = tf.concat([pos_labels, neg_labels], -1) + # mask loss by prediction confidence pred = tf.nn.softmax(logits) - # fake_logits = tf.concat([logits[:, :, :1] - common_attention.large_compatible_negative(logits.dtype), - # logits[:, :, 1:] + common_attention.large_compatible_negative(logits.dtype)], -1) - - # ones = tf.ones_like(pred[:, :, 0]) - # zeros = tf.zeros_like(pred[:, :, 0]) - - # already_learned = tf.where(pred[:, :, 0] > 0.8, zeros, ones) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) loss = tf.losses.softmax_cross_entropy(labels, @@ -767,23 +747,35 @@ def _tf_loss_2( # add regularization losses loss += tf.losses.get_regularization_loss() - # maximize similarity returned by time attention wrapper - add_loss = [] - for sim_to_add in sims_rnn_to_max: - add_loss.append(tf.maximum(0.0, 1.0 - sim_to_add)) - - if add_loss: - # mask loss for different length sequences - add_loss = sum(add_loss) * mask - # average the loss over sequence length - add_loss = tf.reduce_sum(add_loss, -1) / tf.reduce_sum(mask, 1) - # average the loss over the batch - add_loss = tf.reduce_mean(add_loss) - - loss += add_loss - return loss + def _choose_loss(self, + sim_pos: 'tf.Tensor', + sim_neg: 'tf.Tensor', + sim_neg_bot_bot: 'tf.Tensor', + sim_neg_dial_dial: 'tf.Tensor', + sim_neg_bot_dial: 'tf.Tensor', + mask: 'tf.Tensor') -> 'tf.Tensor': + + if self.loss_type == 'margin': + return self._tf_loss_margin(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + elif self.loss_type == 'softmax': + return self._tf_loss_softmax(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + else: + raise ValueError( + "Wrong loss type {}, " + "should be 'margin' or 'softmax'" + "".format(self.loss_type) + ) 
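For orientation, the margin and softmax variants dispatched by _choose_loss above differ mainly in how the positive and negative similarities are combined. The following is a minimal NumPy sketch of the softmax branch's confidence-based example weighting (the already_learned term); the shapes, names and toy data are illustrative assumptions, and the actual graph feeds the concatenated similarity tensors to tf.losses.softmax_cross_entropy rather than this hand-rolled cross-entropy.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def softmax_loss_sketch(sim_pos, sim_neg, mask):
    # logits: positive similarity first, then the sampled negative similarities
    logits = np.concatenate([sim_pos, sim_neg], axis=-1)  # (batch, time, 1 + num_neg)
    pred = softmax(logits)
    # examples that are already predicted confidently contribute almost nothing:
    # ((1 - p_pos) / 0.5) ** 4 shrinks towards 0 as p_pos approaches 1
    already_learned = ((1.0 - pred[..., 0]) / 0.5) ** 4
    # cross-entropy against the one-hot label for the positive action
    ce = -np.log(pred[..., 0] + 1e-12)
    weighted = ce * already_learned * mask  # zero out padded time steps
    return weighted.sum() / mask.sum()

# toy usage: batch=2, time=3, num_neg=4
sim_pos = np.random.rand(2, 3, 1)
sim_neg = np.random.rand(2, 3, 4) - 1.0
mask = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 0.0]])
print(softmax_loss_sketch(sim_pos, sim_neg, mask))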
+ # training methods def train( self, @@ -830,7 +822,7 @@ def train( if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) else: eval_train_dataset = None @@ -850,9 +842,6 @@ def train( # dynamic variables self._is_training = tf.placeholder_with_default(False, shape=()) - self._dialogue_len = tf.placeholder( - dtype=tf.int32, shape=(), name="dialogue_len" - ) # mask different length sequences # if there is at least one `-1` it should be masked @@ -862,7 +851,6 @@ def train( transformer_out = self._create_transformer_encoder( self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - sims_rnn_to_max = [] self.bot_embed = self._create_tf_bot_embed(self.b_in) all_actions_embed = self._create_tf_bot_embed(all_actions) @@ -875,55 +863,46 @@ def train( self.dial_embed = self.dial_embed[:, -1:, :] mask = mask[:, -1:] - b_raw = tf.reshape(self.b_in, (-1, self.b_in.shape[-1])) - - _, i, c = gen_array_ops.unique_with_counts_v2(b_raw, axis=[0]) - counts = tf.expand_dims(tf.reshape(tf.gather(tf.cast(c, tf.float32), i), (tf.shape(b_raw)[0],)), 0) - batch_neg_ids = tf.random.categorical(tf.log((1. - tf.eye(tf.shape(b_raw)[0])/counts)), self.num_neg) - - batch_iou_bot = self._tf_calc_iou(b_raw, neg_ids=batch_neg_ids) - batch_bad_negs = 1. - tf.nn.relu(tf.sign(1. - batch_iou_bot)) - batch_bad_negs = tf.reshape(batch_bad_negs, (tf.shape(self.dial_embed)[0], - tf.shape(self.dial_embed)[1], - -1)) - - neg_ids = tf.random.categorical(tf.log(tf.ones((tf.shape(b_raw)[0], tf.shape(all_actions)[0]))), self.num_neg) - - tiled_all_actions = tf.tile(tf.expand_dims(all_actions, 0), (tf.shape(b_raw)[0], 1, 1)) - neg_bs = tf.batch_gather(tiled_all_actions, neg_ids) - iou_bot = self._tf_calc_iou(b_raw, neg_bs) - bad_negs = 1. - tf.nn.relu(tf.sign(1. 
- iou_bot)) - bad_negs = tf.reshape(bad_negs, (tf.shape(self.bot_embed)[0], - tf.shape(self.bot_embed)[1], - -1)) - - dial_embed_flat = tf.reshape(self.dial_embed, (-1, self.dial_embed.shape[-1])) - - tiled_dial_embed = self._tf_sample_neg(dial_embed_flat, neg_ids=batch_neg_ids, first_only=True) - tiled_dial_embed = tf.reshape(tiled_dial_embed, (tf.shape(self.dial_embed)[0], - tf.shape(self.dial_embed)[1], - -1, - self.dial_embed.shape[-1])) - - bot_embed_flat = tf.reshape(self.bot_embed, (-1, self.bot_embed.shape[-1])) - tiled_all_actions_embed = tf.tile(tf.expand_dims(all_actions_embed, 0), (tf.shape(b_raw)[0], 1, 1)) - neg_embs = tf.batch_gather(tiled_all_actions_embed, neg_ids) - tiled_bot_embed = self._tf_sample_neg(bot_embed_flat, neg_bs=neg_embs) - tiled_bot_embed = tf.reshape(tiled_bot_embed, (tf.shape(self.bot_embed)[0], - tf.shape(self.bot_embed)[1], - -1, - self.bot_embed.shape[-1])) - - # self.sim_op, sim_bot_emb, sim_dial_emb = self._tf_sim(self.dial_embed, tiled_bot_embed, mask) - self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb = self._tf_sim(tiled_dial_embed, tiled_bot_embed, mask) - - # loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sims_rnn_to_max, bad_negs, mask) - loss = self._tf_loss_2(self.sim_op, sim_bot_emb, sim_dial_emb, sim_dial_bot_emb, sims_rnn_to_max, bad_negs, mask, batch_bad_negs) + pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + neg_dial_embed, dial_bad_negs = self._tf_get_negs( + self._tf_make_flat(self.dial_embed), + self._tf_make_flat(self.b_in), + self.b_in + ) + pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + neg_bot_embed, bot_bad_negs = self._tf_get_negs( + all_actions_embed, + all_actions, + self.b_in + ) + # normalize embedding vectors for cosine similarity + pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) + pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) + neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) + neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) + + (sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask) + + acc = self._tf_calc_accuracy(sim_pos, sim_neg) + + loss = self._choose_loss(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) # define which optimizer to use - self._train_op = tf.train.AdamOptimizer( - # learning_rate=0.001, epsilon=1e-16 - ).minimize(loss) + self._train_op = tf.train.AdamOptimizer().minimize(loss) train_init_op = iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: @@ -934,8 +913,8 @@ def train( # train tensorflow graph self.session = tf.Session(config=self._tf_config) - # self._train_tf(session_data, loss, mask) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, mask, session_data.X.shape[1]) + self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, + loss, acc) dialogue_len = None # use dynamic time for rnn # create placeholders @@ -974,7 +953,10 @@ def train( if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): self.dial_embed = self.dial_embed[:, -1:, :] - self.sim_op, _, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) + self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + self.sim_op = self._tf_raw_sim(self.dial_embed[:, :, tf.newaxis, :], + self.bot_embed, mask) # if 
self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) @@ -1004,8 +986,7 @@ def _train_tf_dataset(self, eval_init_op, batch_size_in, loss: 'tf.Tensor', - mask, - dialogue_len, + acc, ) -> None: """Train tf graph""" @@ -1026,36 +1007,42 @@ def _train_tf_dataset(self, self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - ep_loss = 0 + ep_train_loss = 0 + ep_train_acc = 0 batches_per_epoch = 0 while True: try: - _, batch_loss = self.session.run((self._train_op, loss), - feed_dict={self._is_training: True, - self._dialogue_len: dialogue_len}) + _, batch_train_loss, batch_train_acc = self.session.run( + [self._train_op, loss, acc], + feed_dict={self._is_training: True} + ) except tf.errors.OutOfRangeError: break batches_per_epoch += 1 - ep_loss += batch_loss + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc - ep_loss /= batches_per_epoch + ep_train_loss /= batches_per_epoch + ep_train_acc /= batches_per_epoch if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(eval_init_op, mask, dialogue_len) - last_loss = ep_loss + train_acc = self._output_training_stat_dataset(eval_init_op, acc) + last_loss = ep_train_loss pbar.set_postfix({ - "loss": "{:.3f}".format(ep_loss), - "acc": "{:.3f}".format(train_acc) + "train_loss": "{:.3f}".format(ep_train_loss), + "train_acc": "{:.3f}".format(ep_train_acc), + "acc": "{:.3f}".format(train_acc), }) else: pbar.set_postfix({ - "loss": "{:.3f}".format(ep_loss) + "train_loss": "{:.3f}".format(ep_train_loss), + "train_acc": "{:.3f}".format(ep_train_acc) }) if self.evaluate_on_num_examples: @@ -1063,20 +1050,12 @@ def _train_tf_dataset(self, "loss={:.3f}, train accuracy={:.3f}" "".format(last_loss, train_acc)) - def _output_training_stat_dataset(self, eval_init_op, mask, dialogue_len) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, acc) -> np.ndarray: """Output training statistics""" self.session.run(eval_init_op) - sim_, mask_ = self.session.run([self.sim_op, mask], - feed_dict={self._is_training: False, - self._dialogue_len: dialogue_len}) - sim_ = sim_.reshape((-1, sim_.shape[-1])) - mask_ = mask_.reshape((-1,)) - - train_acc = np.sum((np.max(sim_, -1) == sim_.diagonal()) * mask_) / np.sum(mask_) - - return train_acc + return self.session.run(acc, feed_dict={self._is_training: False}) def continue_training( self, @@ -1106,7 +1085,6 @@ def continue_training( self.b_in: b, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], self._is_training: True, }, ) @@ -1124,8 +1102,7 @@ def tf_feed_dict_for_prediction(self, return {self.a_in: session_data.X, self.b_in: all_Y_d_x, self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1]} + self.b_prev_in: session_data.previous_actions} def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain @@ -1160,7 +1137,6 @@ def predict_action_probabilities( self.b_in: all_Y_d_x, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions, - self._dialogue_len: session_data.X.shape[1], }, ) @@ -1212,7 +1188,6 @@ def persist(self, path: Text) -> None: self._persist_tensor("action_placeholder", self.b_in) self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", 
self.b_prev_in) - self._persist_tensor("dialogue_len", self._dialogue_len) self._persist_tensor("similarity_op", self.sim_op) @@ -1288,7 +1263,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": b_in = cls.load_tensor("action_placeholder") c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") - dialogue_len = cls.load_tensor("dialogue_len") sim_op = cls.load_tensor("similarity_op") @@ -1324,7 +1298,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy": action_placeholder=b_in, slots_placeholder=c_in, prev_act_placeholder=b_prev_in, - dialogue_len=dialogue_len, similarity_op=sim_op, alignment_history=alignment_history, user_embed=user_embed, From c631646dee76298be777473fc40de19a76f82a0d Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 16:09:50 +0200 Subject: [PATCH 04/50] refactor train --- rasa/core/policies/embedding_policy.py | 428 +++++++++++-------------- 1 file changed, 186 insertions(+), 242 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 63a10afcfe3b..b4192758284c 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -24,7 +24,6 @@ from rasa.utils.common import is_logging_disabled import tensorflow as tf -from tensorflow.python.ops import gen_array_ops try: from tensor2tensor.layers import common_attention @@ -52,7 +51,6 @@ "slots", "previous_actions", "actions_for_Y", - "all_Y_d", ), ) @@ -166,15 +164,9 @@ def __init__( slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, similarity_op: Optional['tf.Tensor'] = None, - alignment_history: Optional['tf.Tensor'] = None, - user_embed: Optional['tf.Tensor'] = None, - bot_embed: Optional['tf.Tensor'] = None, - slot_embed: Optional['tf.Tensor'] = None, dial_embed: Optional['tf.Tensor'] = None, - rnn_embed: Optional['tf.Tensor'] = None, - attn_embed: Optional['tf.Tensor'] = None, - copy_attn_debug: Optional['tf.Tensor'] = None, - all_time_masks: Optional['tf.Tensor'] = None, + bot_embed: Optional['tf.Tensor'] = None, + all_bot_embed: Optional['tf.Tensor'] = None, attention_weights=None, max_history: Optional[int] = None, **kwargs: Any @@ -210,21 +202,11 @@ def __init__( self.b_prev_in = prev_act_placeholder self.sim_op = similarity_op - # store attention probability distribution as - # concatenated tensor of each attention types - self.alignment_history = alignment_history - # persisted embeddings - self.user_embed = user_embed - self.bot_embed = bot_embed - self.slot_embed = slot_embed self.dial_embed = dial_embed + self.bot_embed = bot_embed + self.all_bot_embed = all_bot_embed - self.rnn_embed = rnn_embed - self.attn_embed = attn_embed - self.copy_attn_debug = copy_attn_debug - - self.all_time_masks = all_time_masks self.attention_weights = attention_weights # internal tf instances self._train_op = None @@ -361,16 +343,6 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: ] ) - # noinspection PyPep8Naming - def _create_all_Y_d(self, dialogue_len: int) -> np.ndarray: - """Stack encoded_all_intents on top of each other - - to create candidates for training examples and - to calculate training accuracy. 
- """ - - return np.stack([self.encoded_all_actions] * dialogue_len) - # noinspection PyPep8Naming def _create_session_data( self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None @@ -393,7 +365,6 @@ def _create_session_data( dial_len = X.shape[1] else: dial_len = 1 - all_Y_d = self._create_all_Y_d(dial_len) return SessionData( X=X, @@ -401,7 +372,6 @@ def _create_session_data( slots=slots, previous_actions=previous_actions, actions_for_Y=actions_for_Y, - all_Y_d=all_Y_d, ) @staticmethod @@ -414,7 +384,6 @@ def _sample_session_data(session_data: 'SessionData', slots=session_data.slots[ids], previous_actions=session_data.previous_actions[ids], actions_for_Y=session_data.actions_for_Y[ids], - all_Y_d=session_data.all_Y_d, ) # tf helpers: @@ -422,15 +391,22 @@ def _sample_session_data(session_data: 'SessionData', def _create_tf_dataset(session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': - train_dataset = tf.data.Dataset.from_tensor_slices((session_data.X, - session_data.Y, - session_data.slots, - session_data.previous_actions)) + train_dataset = tf.data.Dataset.from_tensor_slices( + (session_data.X, session_data.Y, + session_data.slots, session_data.previous_actions) + ) if shuffle: train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) train_dataset = train_dataset.batch(batch_size) + return train_dataset + @staticmethod + def _create_tf_iterator(dataset): + return tf.data.Iterator.from_structure(dataset.output_types, + dataset.output_shapes, + output_classes=dataset.output_classes) + def _create_tf_nn( self, x_in: 'tf.Tensor', @@ -454,7 +430,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _create_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': + def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -479,7 +455,7 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': self.droprate["b"], layer_name_suffix=layer_name_suffix, ) - return self._create_embed(b, layer_name_suffix=layer_name_suffix) + return self._create_tf_embed(b, layer_name_suffix=layer_name_suffix) def _create_hparams(self): hparams = transformer_base() @@ -501,7 +477,7 @@ def _create_hparams(self): hparams.add_relative_to_values = True return hparams - def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): hparams = self._create_hparams() x_in = tf.concat([a_in, b_prev_in, c_in], -1) @@ -558,6 +534,24 @@ def _create_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_wei return tf.nn.relu(x) + def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + + self.attention_weights = {} + a = self._create_tf_transformer_encoder( + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) + + dial_embed = self._create_tf_embed(a, layer_name_suffix="out") + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # pick last action if max history featurizer is used + dial_embed = dial_embed[:, -1:, :] + mask = mask[:, -1:] + + return dial_embed, mask + @staticmethod def _tf_make_flat(x): return tf.reshape(x, (-1, x.shape[-1])) @@ -606,6 
+600,24 @@ def _tf_get_negs(self, all_embed, all_raw, raw_pos): return neg_embed, bad_negs + def _sample_negatives(self, all_actions): + + # sample negatives + pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + neg_dial_embed, dial_bad_negs = self._tf_get_negs( + self._tf_make_flat(self.dial_embed), + self._tf_make_flat(self.b_in), + self.b_in + ) + pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + neg_bot_embed, bot_bad_negs = self._tf_get_negs( + self.all_bot_embed, + all_actions, + self.b_in + ) + return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, + dial_bad_negs, bot_bad_negs) + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': if self.similarity_type not in {"cosine", "inner"}: @@ -776,6 +788,99 @@ def _choose_loss(self, "".format(self.loss_type) ) + def _build_tf_train_graph(self, iterator): + + # session data are int counts but we need a float tensors + (self.a_in, + self.b_in, + self.c_in, + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) + + all_actions = tf.constant(self.encoded_all_actions, + dtype=tf.float32, + name="all_actions") + + self.dial_embed, mask = self._create_tf_dial() + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + self.all_bot_embed = self._create_tf_bot_embed(all_actions) + + if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): + # add time dimension if max history featurizer is used + self.b_in = self.b_in[:, tf.newaxis, :] + self.bot_embed = self.bot_embed[:, tf.newaxis, :] + + (pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs) = self._sample_negatives(all_actions) + + # normalize embedding vectors for cosine similarity + pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) + pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) + neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) + neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) + + # calculate similarities + (sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask) + + acc = self._tf_calc_accuracy(sim_pos, sim_neg) + + loss = self._choose_loss(sim_pos, sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask) + return loss, acc + + def _create_tf_placeholders(self, session_data): + dialogue_len = None # use dynamic time for rnn + self.a_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.X.shape[-1]), + name="a", + ) + self.b_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + name="b", + ) + self.c_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.slots.shape[-1]), + name="slt", + ) + self.b_prev_in = tf.placeholder( + dtype=tf.float32, + shape=(None, dialogue_len, session_data.Y.shape[-1]), + name="b_prev", + ) + + def _build_tf_pred_graph(self): + self.dial_embed, mask = self._create_tf_dial() + self.bot_embed = self._create_tf_bot_embed(self.b_in) + + self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + + self.sim_op = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], + self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], + mask + ) + # training methods def train( self, @@ -820,144 +925,31 @@ def train( batch_size_in = tf.placeholder(tf.int64) train_dataset = 
self._create_tf_dataset(session_data, batch_size_in) + iterator = self._create_tf_iterator(train_dataset) + + train_init_op = iterator.make_initializer(train_dataset) + if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) + eval_init_op = iterator.make_initializer(eval_train_dataset) else: - eval_train_dataset = None - - iterator = tf.data.Iterator.from_structure(train_dataset.output_types, - train_dataset.output_shapes, - output_classes=train_dataset.output_classes) - - # session data are int counts but we need a float tensors - (self.a_in, - self.b_in, - self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) - - all_actions = tf.constant(self.encoded_all_actions, - dtype=tf.float32, - name="all_actions") + eval_init_op = None - # dynamic variables self._is_training = tf.placeholder_with_default(False, shape=()) + loss, acc = self._build_tf_train_graph(iterator) - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - self.attention_weights = {} - transformer_out = self._create_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - all_actions_embed = self._create_tf_bot_embed(all_actions) - - # calculate similarities - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - # pick last action if max history is used - self.b_in = self.b_in[:, tf.newaxis, :] - self.bot_embed = self.bot_embed[:, tf.newaxis, :] - self.dial_embed = self.dial_embed[:, -1:, :] - mask = mask[:, -1:] - - pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] - neg_dial_embed, dial_bad_negs = self._tf_get_negs( - self._tf_make_flat(self.dial_embed), - self._tf_make_flat(self.b_in), - self.b_in - ) - pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] - neg_bot_embed, bot_bad_negs = self._tf_get_negs( - all_actions_embed, - all_actions, - self.b_in - ) - - # normalize embedding vectors for cosine similarity - pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) - pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) - neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) - neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) - - (sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - mask) - - acc = self._tf_calc_accuracy(sim_pos, sim_neg) - - loss = self._choose_loss(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) # define which optimizer to use self._train_op = tf.train.AdamOptimizer().minimize(loss) - train_init_op = iterator.make_initializer(train_dataset) - if self.evaluate_on_num_examples: - eval_init_op = iterator.make_initializer(eval_train_dataset) - else: - eval_init_op = None - # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, loss, acc) - dialogue_len = None # use dynamic time for rnn - # create placeholders - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), - name="a", 
- ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), - name="b", - ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) - - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - self.attention_weights = {} - transformer_out = self._create_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - self.dial_embed = self._create_embed(transformer_out, layer_name_suffix="out") - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - self.dial_embed = self.dial_embed[:, -1:, :] - - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) - self.sim_op = self._tf_raw_sim(self.dial_embed[:, :, tf.newaxis, :], - self.bot_embed, mask) - + # rebuild the graph for prediction + self._create_tf_placeholders(session_data) + self._build_tf_pred_graph() # if self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) # for name, t in self.attention_weights.items() @@ -999,8 +991,8 @@ def _train_tf_dataset(self, ) pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - train_acc = 0 - last_loss = 0 + eval_acc = 0 + eval_loss = 0 for ep in pbar: batch_size = self._linearly_increasing_batch_size(ep) @@ -1016,46 +1008,42 @@ def _train_tf_dataset(self, [self._train_op, loss, acc], feed_dict={self._is_training: True} ) + batches_per_epoch += 1 + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc except tf.errors.OutOfRangeError: break - batches_per_epoch += 1 - ep_train_loss += batch_train_loss - ep_train_acc += batch_train_acc - ep_train_loss /= batches_per_epoch ep_train_acc /= batches_per_epoch + pbar.set_postfix({ + "loss": "{:.3f}".format(ep_train_loss), + "acc": "{:.3f}".format(ep_train_acc) + }) + if self.evaluate_on_num_examples and eval_init_op is not None: if (ep == 0 or (ep + 1) % self.evaluate_every_num_epochs == 0 or (ep + 1) == self.epochs): - train_acc = self._output_training_stat_dataset(eval_init_op, acc) - last_loss = ep_train_loss - - pbar.set_postfix({ - "train_loss": "{:.3f}".format(ep_train_loss), - "train_acc": "{:.3f}".format(ep_train_acc), - "acc": "{:.3f}".format(train_acc), - }) - else: - pbar.set_postfix({ - "train_loss": "{:.3f}".format(ep_train_loss), - "train_acc": "{:.3f}".format(ep_train_acc) - }) + eval_loss, eval_acc = self._output_training_stat_dataset( + eval_init_op, loss, acc + ) + logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" + "".format(eval_loss, eval_acc)) if self.evaluate_on_num_examples: logger.info("Finished training embedding classifier, " - "loss={:.3f}, train accuracy={:.3f}" - "".format(last_loss, train_acc)) + "loss={:.3f}, accuracy={:.3f}" + "".format(eval_loss, eval_acc)) - def _output_training_stat_dataset(self, eval_init_op, acc) -> np.ndarray: + def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, float]: """Output training statistics""" self.session.run(eval_init_op) - return self.session.run(acc, feed_dict={self._is_training: False}) + return self.session.run([loss, acc], feed_dict={self._is_training: False}) def 
continue_training( self, @@ -1095,12 +1083,8 @@ def tf_feed_dict_for_prediction(self, # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - # noinspection PyPep8Naming - all_Y_d_x = np.stack([session_data.all_Y_d - for _ in range(session_data.X.shape[0])]) return {self.a_in: session_data.X, - self.b_in: all_Y_d_x, self.c_in: session_data.slots, self.b_prev_in: session_data.previous_actions} @@ -1120,30 +1104,14 @@ def predict_action_probabilities( ) return [0.0] * domain.num_actions - # noinspection PyPep8Naming - data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_session_data(data_X) - # noinspection PyPep8Naming - all_Y_d_x = np.stack( - [session_data.all_Y_d for _ in range(session_data.X.shape[0])] - ) - # self.similarity_type = 'cosine' - # mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - # self.sim_op, _, _ = self._tf_sim(self.dial_embed, self.bot_embed, mask) - _sim = self.session.run( - self.sim_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: all_Y_d_x, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - }, - ) + tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) + + sim_ = self.session.run(self.sim_op, feed_dict=tf_feed_dict) # TODO assume we used inner: self.similarity_type = "inner" - result = _sim[0, -1, :] + result = sim_[0, -1, :] if self.similarity_type == "cosine": # clip negative values to zero result[result < 0] = 0 @@ -1191,18 +1159,9 @@ def persist(self, path: Text) -> None: self._persist_tensor("similarity_op", self.sim_op) - self._persist_tensor("alignment_history", self.alignment_history) - - self._persist_tensor("user_embed", self.user_embed) - self._persist_tensor("bot_embed", self.bot_embed) - self._persist_tensor("slot_embed", self.slot_embed) self._persist_tensor("dial_embed", self.dial_embed) - - self._persist_tensor("rnn_embed", self.rnn_embed) - self._persist_tensor("attn_embed", self.attn_embed) - self._persist_tensor("copy_attn_debug", self.copy_attn_debug) - - self._persist_tensor("all_time_masks", self.all_time_masks) + self._persist_tensor("bot_embed", self.bot_embed) + self._persist_tensor("all_bot_embed", self.all_bot_embed) self._persist_tensor("attention_weights", self.attention_weights) @@ -1266,18 +1225,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": sim_op = cls.load_tensor("similarity_op") - alignment_history = cls.load_tensor("alignment_history") - - user_embed = cls.load_tensor("user_embed") - bot_embed = cls.load_tensor("bot_embed") - slot_embed = cls.load_tensor("slot_embed") dial_embed = cls.load_tensor("dial_embed") - - rnn_embed = cls.load_tensor("rnn_embed") - attn_embed = cls.load_tensor("attn_embed") - copy_attn_debug = cls.load_tensor("copy_attn_debug") - - all_time_masks = cls.load_tensor("all_time_masks") + bot_embed = cls.load_tensor("bot_embed") + all_bot_embed = cls.load_tensor("all_bot_embed") attention_weights = cls.load_tensor("attention_weights") @@ -1299,14 +1249,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": slots_placeholder=c_in, prev_act_placeholder=b_prev_in, similarity_op=sim_op, - alignment_history=alignment_history, - user_embed=user_embed, - bot_embed=bot_embed, - slot_embed=slot_embed, dial_embed=dial_embed, - rnn_embed=rnn_embed, - attn_embed=attn_embed, - copy_attn_debug=copy_attn_debug, - all_time_masks=all_time_masks, + bot_embed=bot_embed, + all_bot_embed=all_bot_embed, attention_weights=attention_weights ) From 
e355baa17110fa7eb70e8adb1e764b2db5858e2c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:08:22 +0200 Subject: [PATCH 05/50] normilize confidence inside pred graph --- rasa/core/policies/embedding_policy.py | 76 +++++++++++++++----------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index b4192758284c..1d3bb8eb7318 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -163,7 +163,9 @@ def __init__( action_placeholder: Optional['tf.Tensor'] = None, slots_placeholder: Optional['tf.Tensor'] = None, prev_act_placeholder: Optional['tf.Tensor'] = None, - similarity_op: Optional['tf.Tensor'] = None, + similarity_all: Optional['tf.Tensor'] = None, + pred_confidence: Optional['tf.Tensor'] = None, + similarity: Optional['tf.Tensor'] = None, dial_embed: Optional['tf.Tensor'] = None, bot_embed: Optional['tf.Tensor'] = None, all_bot_embed: Optional['tf.Tensor'] = None, @@ -200,7 +202,9 @@ def __init__( self.b_in = action_placeholder self.c_in = slots_placeholder self.b_prev_in = prev_act_placeholder - self.sim_op = similarity_op + self.sim_all = similarity_all + self.pred_confidence = pred_confidence + self.sim = similarity # persisted embeddings self.dial_embed = dial_embed @@ -541,8 +545,8 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: self.attention_weights = {} a = self._create_tf_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights) - + self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights + ) dial_embed = self._create_tf_embed(a, layer_name_suffix="out") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): @@ -661,11 +665,11 @@ def _tf_sim( sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs + mask) + neg_inf * bot_bad_negs sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + mask) + neg_inf * dial_bad_negs sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + mask) + neg_inf * dial_bad_negs # output similarities between user input and bot actions # and similarities between bot actions and similarities between user inputs @@ -870,17 +874,32 @@ def _create_tf_placeholders(self, session_data): def _build_tf_pred_graph(self): self.dial_embed, mask = self._create_tf_dial() - self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) - self.sim_op = self._tf_raw_sim( + self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], mask ) + if self.similarity_type == "cosine": + # clip negative values to zero + confidence = tf.nn.relu(self.sim_all) + else: + # normalize result to [0, 1] with softmax + confidence = tf.nn.softmax(self.sim_all) + + self.bot_embed = self._create_tf_bot_embed(self.b_in) + self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) + + self.sim = self._tf_raw_sim( + self.dial_embed[:, :, tf.newaxis, :], + self.bot_embed, + mask + ) + + return confidence + # training methods def train( self, @@ -949,7 +968,8 @@ def train( # rebuild the graph for prediction self._create_tf_placeholders(session_data) - self._build_tf_pred_graph() + 
self.pred_confidence = self._build_tf_pred_graph() + # if self.attention_weights.items(): # self.attention_weights = tf.concat([tf.expand_dims(t, 0) # for name, t in self.attention_weights.items() @@ -1106,23 +1126,9 @@ def predict_action_probabilities( tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - sim_ = self.session.run(self.sim_op, feed_dict=tf_feed_dict) - - # TODO assume we used inner: - self.similarity_type = "inner" - - result = sim_[0, -1, :] - if self.similarity_type == "cosine": - # clip negative values to zero - result[result < 0] = 0 - elif self.similarity_type == "inner": - # normalize result to [0, 1] with softmax but only over 3*num_neg+1 values - low_ids = result.argsort()[::-1][4*self.num_neg+1:] - result[low_ids] += -np.inf - result = np.exp(result) - result /= np.sum(result) + sim_ = self.session.run(self.sim_all, feed_dict=tf_feed_dict) - return result.tolist() + return sim_[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: @@ -1157,7 +1163,9 @@ def persist(self, path: Text) -> None: self._persist_tensor("slots_placeholder", self.c_in) self._persist_tensor("prev_act_placeholder", self.b_prev_in) - self._persist_tensor("similarity_op", self.sim_op) + self._persist_tensor("similarity_all", self.sim_all) + self._persist_tensor("pred_confidence", self.pred_confidence) + self._persist_tensor("similarity", self.sim) self._persist_tensor("dial_embed", self.dial_embed) self._persist_tensor("bot_embed", self.bot_embed) @@ -1223,7 +1231,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": c_in = cls.load_tensor("slots_placeholder") b_prev_in = cls.load_tensor("prev_act_placeholder") - sim_op = cls.load_tensor("similarity_op") + sim_all = cls.load_tensor("similarity_all") + pred_confidence = cls.load_tensor("pred_confidence") + sim = cls.load_tensor("similarity") dial_embed = cls.load_tensor("dial_embed") bot_embed = cls.load_tensor("bot_embed") @@ -1248,9 +1258,11 @@ def load(cls, path: Text) -> "EmbeddingPolicy": action_placeholder=b_in, slots_placeholder=c_in, prev_act_placeholder=b_prev_in, - similarity_op=sim_op, + similarity_all=sim_all, + pred_confidence=pred_confidence, + similarity=sim, dial_embed=dial_embed, bot_embed=bot_embed, all_bot_embed=all_bot_embed, - attention_weights=attention_weights + attention_weights=attention_weights, ) From fdb15f6cad3f1ddb3bb519cf985cfa32c187b4d4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:09:23 +0200 Subject: [PATCH 06/50] use pred_confidence in predict --- rasa/core/policies/embedding_policy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 1d3bb8eb7318..cdfd5551618a 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -1126,9 +1126,9 @@ def predict_action_probabilities( tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - sim_ = self.session.run(self.sim_all, feed_dict=tf_feed_dict) + confidence = self.session.run(self.pred_confidence, feed_dict=tf_feed_dict) - return sim_[0, -1, :].tolist() + return confidence[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: if tensor is not None: From fe5a3adc35449871a6c42b73388fb148a608b62f Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 17:30:13 +0200 Subject: [PATCH 07/50] move normalization to embed helper --- rasa/core/policies/embedding_policy.py | 39 
+++++++++++--------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index cdfd5551618a..40f42816d199 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -434,6 +434,20 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x + def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': + + if self.similarity_type not in {"cosine", "inner"}: + raise ValueError( + "Wrong similarity type {}, " + "should be 'cosine' or 'inner'" + "".format(self.similarity_type) + ) + + if self.similarity_type == "cosine": + return tf.nn.l2_normalize(a, -1) + else: + return a + def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" @@ -446,7 +460,8 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso name="embed_layer_{}".format(layer_name_suffix), reuse=tf.AUTO_REUSE, ) - return embed_x + # normalize embedding vectors for cosine similarity + return self._tf_normalize_if_cosine(embed_x) def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" @@ -622,20 +637,6 @@ def _sample_negatives(self, all_actions): return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, dial_bad_negs, bot_bad_negs) - def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': - - if self.similarity_type not in {"cosine", "inner"}: - raise ValueError( - "Wrong similarity type {}, " - "should be 'cosine' or 'inner'" - "".format(self.similarity_type) - ) - - if self.similarity_type == "cosine": - return tf.nn.l2_normalize(a, -1) - else: - return a - @staticmethod def _tf_raw_sim( a: 'tf.Tensor', @@ -821,12 +822,6 @@ def _build_tf_train_graph(self, iterator): dial_bad_negs, bot_bad_negs) = self._sample_negatives(all_actions) - # normalize embedding vectors for cosine similarity - pos_dial_embed = self._tf_normalize_if_cosine(pos_dial_embed) - pos_bot_embed = self._tf_normalize_if_cosine(pos_bot_embed) - neg_dial_embed = self._tf_normalize_if_cosine(neg_dial_embed) - neg_bot_embed = self._tf_normalize_if_cosine(neg_bot_embed) - # calculate similarities (sim_pos, sim_neg, @@ -874,7 +869,6 @@ def _create_tf_placeholders(self, session_data): def _build_tf_pred_graph(self): self.dial_embed, mask = self._create_tf_dial() - self.dial_embed = self._tf_normalize_if_cosine(self.dial_embed) self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], @@ -890,7 +884,6 @@ def _build_tf_pred_graph(self): confidence = tf.nn.softmax(self.sim_all) self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.bot_embed = self._tf_normalize_if_cosine(self.bot_embed) self.sim = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], From 4cff3c07aa6200e72702298eddab1da105cedd5c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 18:11:04 +0200 Subject: [PATCH 08/50] maybe fix continue_training --- rasa/core/policies/embedding_policy.py | 57 ++++++++++++++------------ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 40f42816d199..695db9a094c4 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -213,6 +213,7 @@ def __init__( self.attention_weights = attention_weights # internal tf instances + self._iterator = None self._train_op = None 
self._is_training = None @@ -507,15 +508,19 @@ def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_ setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) reg = tf.contrib.layers.l2_regularizer(self.C2) - x = tf.layers.dense(inputs=x_in, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer(0.0, hparams.hidden_size ** -0.5), - kernel_regularizer=reg, - name='transformer_embed_layer', - reuse=tf.AUTO_REUSE) - - x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training) + x = tf.nn.relu(x_in) + x = tf.layers.dense( + inputs=x, + units=hparams.hidden_size, + use_bias=False, + kernel_initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size ** -0.5), + kernel_regularizer=reg, + name='transformer_embed_layer', + reuse=tf.AUTO_REUSE + ) + x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, + training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 @@ -793,13 +798,13 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self, iterator): + def _build_tf_train_graph(self): # session data are int counts but we need a float tensors (self.a_in, self.b_in, self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in iterator.get_next()) + self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in self._iterator.get_next()) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -937,19 +942,19 @@ def train( batch_size_in = tf.placeholder(tf.int64) train_dataset = self._create_tf_dataset(session_data, batch_size_in) - iterator = self._create_tf_iterator(train_dataset) + self._iterator = self._create_tf_iterator(train_dataset) - train_init_op = iterator.make_initializer(train_dataset) + train_init_op = self._iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) - eval_init_op = iterator.make_initializer(eval_train_dataset) + eval_init_op = self._iterator.make_initializer(eval_train_dataset) else: eval_init_op = None self._is_training = tf.placeholder_with_default(False, shape=()) - loss, acc = self._build_tf_train_graph(iterator) + loss, acc = self._build_tf_train_graph() # define which optimizer to use self._train_op = tf.train.AdamOptimizer().minimize(loss) @@ -1075,20 +1080,18 @@ def continue_training( ) session_data = self._create_session_data(training_data.X, training_data.y) - - b = self._create_batch_b(session_data.Y, session_data.actions_for_Y) + train_dataset = self._create_tf_dataset(session_data, batch_size) + train_init_op = self._iterator.make_initializer(train_dataset) + self.session.run(train_init_op) # fit to one extra example using updated trackers - self.session.run( - self._train_op, - feed_dict={ - self.a_in: session_data.X, - self.b_in: b, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions, - self._is_training: True, - }, - ) + while True: + try: + self.session.run(self._train_op, + feed_dict={self._is_training: True}) + + except tf.errors.OutOfRangeError: + break def tf_feed_dict_for_prediction(self, tracker: DialogueStateTracker, From 2b1843cab3b77a4c6c3e0694f15eaefa574dbccf Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 18:21:11 +0200 Subject: [PATCH 09/50] fix 
extract attention --- rasa/core/policies/embedding_policy.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 695db9a094c4..8c6b1ed1105e 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -365,12 +365,6 @@ def _create_session_data( actions_for_Y = None Y = None - # is needed to calculate train accuracy - if isinstance(self.featurizer, FullDialogueTrackerFeaturizer): - dial_len = X.shape[1] - else: - dial_len = 1 - return SessionData( X=X, Y=Y, @@ -898,6 +892,16 @@ def _build_tf_pred_graph(self): return confidence + def _extract_attention(self): + attention = [tf.expand_dims(t, 0) + for name, t in self.attention_weights.items() + if name.endswith('multihead_attention/dot_product_attention')] + + if attention: + return tf.concat(attention, 0) + else: + return + # training methods def train( self, @@ -968,10 +972,7 @@ def train( self._create_tf_placeholders(session_data) self.pred_confidence = self._build_tf_pred_graph() - # if self.attention_weights.items(): - # self.attention_weights = tf.concat([tf.expand_dims(t, 0) - # for name, t in self.attention_weights.items() - # if name.endswith('multihead_attention/dot_product_attention')], 0) + self.attention_weights = self._extract_attention() # training helpers def _linearly_increasing_batch_size(self, epoch: int) -> int: From d1fae48b5e5cb66e2f4a785ca74aad00b75f61c5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 20:46:13 +0200 Subject: [PATCH 10/50] add types --- rasa/core/policies/embedding_policy.py | 213 ++++++++++--------------- 1 file changed, 88 insertions(+), 125 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 8c6b1ed1105e..caf653be37b9 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -39,7 +39,11 @@ except ImportError: import pickle +if typing.TYPE_CHECKING: + from tensor2tensor.utils.hparam import HParams +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None logger = logging.getLogger(__name__) # namedtuple for all tf session related data @@ -66,12 +70,9 @@ class EmbeddingPolicy(Policy): # default properties (DOC MARKER - don't remove) defaults = { # nn architecture - # a list of hidden layers sizes before user embed layer - # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_a": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_b": [], + "hidden_layers_sizes_bot": [], "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} # introduce phase shift in time encodings between transformers @@ -79,12 +80,10 @@ class EmbeddingPolicy(Policy): "pos_max_timescale": 1.0e1, "max_seq_length": 256, "num_heads": 4, - # number of units in rnn cell - "rnn_size": 128, - "num_rnn_layers": 1, + # number of units in transformer + "transformer_size": 128, + "num_transformer_layers": 1, # training parameters - # flag if to turn on layer normalization for lstm cell - "layer_norm": True, # initial and final batch sizes - batch size will be # linearly increased for each epoch "batch_size": [8, 32], @@ -114,23 +113,10 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # dropout rate for 
user nn - "droprate_a": 0.0, # dropout rate for bot nn - "droprate_b": 0.0, - # dropout rate for rnn - "droprate_rnn": 0.1, - # attention parameters - # flag to use attention over user input - # as an input to rnn - "attn_before_rnn": True, - # flag to use attention over prev bot actions - # and copy it to output bypassing rnn - "attn_after_rnn": True, - # flag to use `sparsemax` instead of `softmax` for attention - "sparse_attention": False, # flag to use sparsemax for probs - # the range of allowed location-based attention shifts - "attn_shift_range": None, # if None, set to mean dialogue length / 2 + "droprate_bot": 0.0, + # dropout rate for dial nn + "droprate_dial": 0.1, # visualization of accuracy # how often calculate train accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance @@ -141,20 +127,20 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) @staticmethod - def _standard_featurizer(max_history=None): + def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturizer': if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) @staticmethod - def _check_t2t(): + def _check_t2t() -> None: if common_attention is None: raise ImportError("Please install tensor2tensor") def __init__( self, - featurizer: Optional['FullDialogueTrackerFeaturizer'] = None, + featurizer: Optional['TrackerFeaturizer'] = None, priority: int = 1, encoded_all_actions: Optional['np.ndarray'] = None, graph: Optional['tf.Graph'] = None, @@ -180,12 +166,6 @@ def __init__( featurizer = self._standard_featurizer(max_history) super(EmbeddingPolicy, self).__init__(featurizer, priority) - # flag if to use the same embeddings for user and bot - try: - self.share_embedding = self.featurizer.state_featurizer.use_shared_vocab - except AttributeError: - self.share_embedding = False - self._load_params(**kwargs) # chrono initialization for forget bias @@ -219,31 +199,15 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes = { - "a": config["hidden_layers_sizes_a"], - "b": config["hidden_layers_sizes_b"], - } + self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] - if self.share_embedding: - if self.hidden_layer_sizes["a"] != self.hidden_layer_sizes["b"]: - raise ValueError( - "Due to sharing vocabulary " - "in the featurizer, embedding weights " - "are shared as well. 
" - "So hidden_layers_sizes_a={} should be " - "equal to hidden_layers_sizes_b={}" - "".format( - self.hidden_layer_sizes["a"], self.hidden_layer_sizes["b"] - ) - ) self.pos_encoding = config['pos_encoding'] self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] self.num_heads = config['num_heads'] - self.rnn_size = config["rnn_size"] - self.num_rnn_layers = config["num_rnn_layers"] - self.layer_norm = config["layer_norm"] + self.transformer_size = config["transformer_size"] + self.num_transformer_layers = config["num_transformer_layers"] self.batch_size = config["batch_size"] @@ -270,20 +234,10 @@ def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] self.droprate = { - "a": config["droprate_a"], - "b": config["droprate_b"], - "rnn": config["droprate_rnn"], + "bot": config["droprate_bot"], + "dial": config["droprate_dial"], } - def _load_attn_params(self, config: Dict[Text, Any]) -> None: - self.sparse_attention = config["sparse_attention"] - self.attn_shift_range = config["attn_shift_range"] - self.attn_after_rnn = config["attn_after_rnn"] - self.attn_before_rnn = config["attn_before_rnn"] - - def is_using_attention(self): - return self.attn_after_rnn or self.attn_before_rnn - def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] if self.evaluate_every_num_epochs < 1: @@ -298,14 +252,13 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self._load_nn_architecture_params(config) self._load_embedding_params(config) self._load_regularization_params(config) - self._load_attn_params(config) self._load_visual_params(config) # data helpers # noinspection PyPep8Naming def _create_X_slots_previous_actions( - self, data_X: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + self, data_X: 'np.ndarray' + ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: """Extract feature vectors for user input (X), slots and @@ -324,12 +277,12 @@ def _create_X_slots_previous_actions( # noinspection PyPep8Naming @staticmethod - def _actions_for_Y(data_Y: np.ndarray) -> np.ndarray: + def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: + def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" if len(actions_for_Y.shape) == 2: @@ -350,8 +303,8 @@ def _action_features_for_Y(self, actions_for_Y: np.ndarray) -> np.ndarray: # noinspection PyPep8Naming def _create_session_data( - self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None - ) -> SessionData: + self, data_X: 'np.ndarray', data_Y: Optional['np.ndarray'] = None + ) -> 'SessionData': """Combine all tf session related data into a named tuple""" X, slots, previous_actions = self._create_X_slots_previous_actions(data_X) @@ -401,7 +354,7 @@ def _create_tf_dataset(session_data: 'SessionData', return train_dataset @staticmethod - def _create_tf_iterator(dataset): + def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': return tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes, output_classes=dataset.output_classes) @@ -409,7 +362,7 @@ def _create_tf_iterator(dataset): def _create_tf_nn( self, x_in: 'tf.Tensor', - 
layer_sizes: List, + layer_sizes: List[int], droprate: float, layer_name_suffix: Text, ) -> 'tf.Tensor': @@ -461,25 +414,23 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': """Create embedding bot vector.""" - layer_name_suffix = "a_and_b" if self.share_embedding else "b" - b = self._create_tf_nn( b_in, - self.hidden_layer_sizes["b"], - self.droprate["b"], - layer_name_suffix=layer_name_suffix, + self.hidden_layer_sizes_bot, + self.droprate["bot"], + layer_name_suffix="bot", ) - return self._create_tf_embed(b, layer_name_suffix=layer_name_suffix) + return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_hparams(self): + def _create_hparams(self) -> 'HParams': hparams = transformer_base() - hparams.num_hidden_layers = self.num_rnn_layers - hparams.hidden_size = self.rnn_size + hparams.num_hidden_layers = self.num_transformer_layers + hparams.hidden_size = self.transformer_size # it seems to be factor of 4 for transformer architectures in t2t hparams.filter_size = hparams.hidden_size * 4 hparams.num_heads = self.num_heads - hparams.relu_dropout = self.droprate["rnn"] + hparams.relu_dropout = self.droprate["dial"] hparams.pos = self.pos_encoding hparams.max_length = self.max_seq_length @@ -489,13 +440,17 @@ def _create_hparams(self): hparams.self_attention_type = "dot_product_relative_v2" hparams.max_relative_position = 5 hparams.add_relative_to_values = True + return hparams - def _create_tf_transformer_encoder(self, a_in, c_in, b_prev_in, mask, attention_weights): + # noinspection PyUnresolvedReferences + def _create_tf_transformer_encoder(self, + x_in: 'tf.Tensor', + mask: 'tf.Tensor', + attention_weights: Dict[Text, 'tf.Tensor'], + ) -> 'tf.Tensor': hparams = self._create_hparams() - x_in = tf.concat([a_in, b_prev_in, c_in], -1) - # When not in training mode, set all forms of dropout to zero. 
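As a side note on the masking used in `_create_tf_dial` above: padded dialogue turns are assumed to be all `-1` vectors, so the sequence mask can be recovered with a max-and-sign trick. Below is a minimal NumPy sketch of that idea with made-up feature values; it is illustrative only and not part of the patch.

import numpy as np

def sequence_mask_from_padding(a_in: np.ndarray) -> np.ndarray:
    """Return 1.0 for real dialogue turns and 0.0 for turns padded with -1."""
    # mirrors tf.sign(tf.reduce_max(a_in, -1) + 1)
    return np.sign(a_in.max(axis=-1) + 1).astype(np.float32)

# batch of 2 dialogues, 3 turns, 3 features; rows of -1 are padding
batch = np.array([[[1, 0, 0], [0, 1, 0], [-1, -1, -1]],
                  [[0, 0, 1], [-1, -1, -1], [-1, -1, -1]]], dtype=np.float32)
print(sequence_mask_from_padding(batch))  # -> [[1. 1. 0.] [1. 0. 0.]]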
for key, value in hparams.values().items(): if key.endswith("dropout") or key == "label_smoothing": @@ -557,11 +512,12 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) + x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) + self.attention_weights = {} - a = self._create_tf_transformer_encoder( - self.a_in, self.c_in, self.b_prev_in, mask, self.attention_weights - ) - dial_embed = self._create_tf_embed(a, layer_name_suffix="out") + x = self._create_tf_transformer_encoder(x_in, mask, self.attention_weights) + + dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # pick last action if max history featurizer is used @@ -571,24 +527,22 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return dial_embed, mask @staticmethod - def _tf_make_flat(x): + def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': return tf.reshape(x, (-1, x.shape[-1])) @staticmethod - def _tf_sample_neg(batch_size, - all_bs, - neg_ids, - ) -> 'tf.Tensor': + def _tf_sample_neg(batch_size: 'tf.Tensor', + all_bs: 'tf.Tensor', + neg_ids: 'tf.Tensor') -> 'tf.Tensor': tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) return tf.batch_gather(tiled_all_bs, neg_ids) def _tf_calc_iou_mask(self, - pos_b, - all_bs, - neg_ids, - ) -> 'tf.Tensor': + pos_b: 'tf.Tensor', + all_bs: 'tf.Tensor', + neg_ids: 'tf.Tensor') -> 'tf.Tensor': pos_b_in_flat = pos_b[:, tf.newaxis, :] neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) @@ -599,7 +553,10 @@ def _tf_calc_iou_mask(self, iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) return 1. - tf.nn.relu(tf.sign(1. 
- iou)) - def _tf_get_negs(self, all_embed, all_raw, raw_pos): + def _tf_get_negs(self, + all_embed: 'tf.Tensor', + all_raw: 'tf.Tensor', + raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: batch_size = tf.shape(raw_pos)[0] seq_length = tf.shape(raw_pos)[1] @@ -618,7 +575,12 @@ def _tf_get_negs(self, all_embed, all_raw, raw_pos): return neg_embed, bad_negs - def _sample_negatives(self, all_actions): + def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor', + 'tf.Tensor']: # sample negatives pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] @@ -637,11 +599,7 @@ def _sample_negatives(self, all_actions): dial_bad_negs, bot_bad_negs) @staticmethod - def _tf_raw_sim( - a: 'tf.Tensor', - b: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) @@ -676,7 +634,7 @@ def _tf_sim( return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial @staticmethod - def _tf_calc_accuracy(sim_pos, sim_neg): + def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), @@ -792,7 +750,7 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self): + def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # session data are int counts but we need a float tensors (self.a_in, @@ -843,8 +801,8 @@ def _build_tf_train_graph(self): mask) return loss, acc - def _create_tf_placeholders(self, session_data): - dialogue_len = None # use dynamic time for rnn + def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, shape=(None, dialogue_len, session_data.X.shape[-1]), @@ -866,7 +824,7 @@ def _create_tf_placeholders(self, session_data): name="b_prev", ) - def _build_tf_pred_graph(self): + def _build_tf_pred_graph(self) -> 'tf.Tensor': self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( @@ -892,7 +850,7 @@ def _build_tf_pred_graph(self): return confidence - def _extract_attention(self): + def _extract_attention(self) -> Optional['tf.Tensor']: attention = [tf.expand_dims(t, 0) for name, t in self.attention_weights.items() if name.endswith('multihead_attention/dot_product_attention')] @@ -931,6 +889,7 @@ def train( "else set num_neg to the number of actions - 1" "".format(self.num_neg, domain.num_actions) ) + # noinspection PyAttributeOutsideInit self.num_neg = min(self.num_neg, domain.num_actions - 1) # extract actual training data to feed to tf session @@ -993,9 +952,9 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: return int(self.batch_size[0]) def _train_tf_dataset(self, - train_init_op, - eval_init_op, - batch_size_in, + train_init_op: 'tf.Operation', + eval_init_op: 'tf.Operation', + batch_size_in: 'tf.Tensor', loss: 'tf.Tensor', acc, ) -> None: @@ -1057,7 +1016,10 @@ def _train_tf_dataset(self, "loss={:.3f}, accuracy={:.3f}" "".format(eval_loss, eval_acc)) - def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, float]: + def _output_training_stat_dataset(self, + eval_init_op: 'tf.Operation', + loss: 'tf.Tensor', + acc: 'tf.Tensor') -> Tuple[float, float]: """Output training statistics""" 
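For intuition on `_tf_calc_iou_mask` above: negatives are sampled by index, so a sampled "negative" can coincide with the positive label, and the IOU of the binary bag-of-words vectors is used to flag such accidental positives. A rough NumPy sketch with invented vectors (illustrative only, not taken from the patch):

import numpy as np

def iou_bad_neg_mask(pos: np.ndarray, negs: np.ndarray) -> np.ndarray:
    """Return 1.0 for sampled negatives that are identical to the positive."""
    intersection = np.minimum(negs, pos).sum(-1)
    union = np.maximum(negs, pos).sum(-1)
    iou = intersection / union
    # mirrors 1 - relu(sign(1 - iou)): only iou == 1 is flagged as a "bad" negative
    return 1.0 - np.maximum(np.sign(1.0 - iou), 0.0)

pos = np.array([1., 0., 1., 0.])
negs = np.array([[1., 0., 1., 0.],    # same action as the positive -> flagged
                 [1., 1., 0., 0.],    # partial overlap -> kept
                 [0., 1., 0., 1.]])   # disjoint -> kept
print(iou_bad_neg_mask(pos, negs))  # -> [1. 0. 0.]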
self.session.run(eval_init_op) @@ -1066,8 +1028,8 @@ def _output_training_stat_dataset(self, eval_init_op, loss, acc) -> Tuple[float, def continue_training( self, - training_trackers: List[DialogueStateTracker], - domain: Domain, + training_trackers: List['DialogueStateTracker'], + domain: 'Domain', **kwargs: Any ) -> None: """Continue training an already trained policy.""" @@ -1095,8 +1057,9 @@ def continue_training( break def tf_feed_dict_for_prediction(self, - tracker: DialogueStateTracker, - domain: Domain) -> Dict: + tracker: 'DialogueStateTracker', + domain: 'Domain' + ) -> Dict['tf.Tensor', 'np.ndarray']: # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1106,7 +1069,7 @@ def tf_feed_dict_for_prediction(self, self.b_prev_in: session_data.previous_actions} def predict_action_probabilities( - self, tracker: DialogueStateTracker, domain: Domain + self, tracker: 'DialogueStateTracker', domain: 'Domain' ) -> List[float]: """Predict the next action the bot should take. @@ -1218,10 +1181,10 @@ def load(cls, path: Text) -> "EmbeddingPolicy": graph = tf.Graph() with graph.as_default(): - sess = tf.Session(config=_tf_config) + session = tf.Session(config=_tf_config) saver = tf.train.import_meta_graph(checkpoint + ".meta") - saver.restore(sess, checkpoint) + saver.restore(session, checkpoint) a_in = cls.load_tensor("intent_placeholder") b_in = cls.load_tensor("action_placeholder") @@ -1250,7 +1213,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy": priority=meta["priority"], encoded_all_actions=encoded_all_actions, graph=graph, - session=sess, + session=session, intent_placeholder=a_in, action_placeholder=b_in, slots_placeholder=c_in, From 3ae9b77c9d6ec6d4a3f60ec5b63fde8f3d215fd7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 21:04:21 +0200 Subject: [PATCH 11/50] Add method descriptions --- rasa/core/policies/embedding_policy.py | 85 ++++++++++++++++++-------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index caf653be37b9..2f5041222b9d 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -60,9 +60,9 @@ class EmbeddingPolicy(Policy): - """Recurrent Embedding Dialogue Policy (REDP) + """Transformer Embedding Dialogue Policy (TEDP) - Transformer version of the policy used in our paper https://arxiv.org/abs/1811.11707 + Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707 """ SUPPORTS_ONLINE_TRAINING = True @@ -259,10 +259,9 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: def _create_X_slots_previous_actions( self, data_X: 'np.ndarray' ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: - """Extract feature vectors + """Extract feature vectors from training data. - for user input (X), slots and - previously executed actions from training data. + For user input (X), slots and previously executed actions. 
""" featurizer = self.featurizer.state_featurizer @@ -279,6 +278,7 @@ def _create_X_slots_previous_actions( @staticmethod def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" + return data_Y.argmax(axis=-1) # noinspection PyPep8Naming @@ -329,6 +329,8 @@ def _create_session_data( @staticmethod def _sample_session_data(session_data: 'SessionData', num_samples: int) -> 'SessionData': + """Sample session data.""" + ids = np.random.permutation(len(session_data.X))[:num_samples] return SessionData( X=session_data.X[ids], @@ -343,6 +345,8 @@ def _sample_session_data(session_data: 'SessionData', def _create_tf_dataset(session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': + """Create tf dataset.""" + train_dataset = tf.data.Dataset.from_tensor_slices( (session_data.X, session_data.Y, session_data.slots, session_data.previous_actions) @@ -355,6 +359,8 @@ def _create_tf_dataset(session_data: 'SessionData', @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': + """Create tf iterator.""" + return tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes, output_classes=dataset.output_classes) @@ -382,20 +388,20 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, training=self._is_training) return x - def _tf_normalize_if_cosine(self, a: 'tf.Tensor') -> 'tf.Tensor': + def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': + """Normalize embedding if similarity type is cosine.""" - if self.similarity_type not in {"cosine", "inner"}: + if self.similarity_type == "cosine": + return tf.nn.l2_normalize(x, -1) + elif self.similarity_type == "inner": + return x + else: raise ValueError( "Wrong similarity type {}, " "should be 'cosine' or 'inner'" "".format(self.similarity_type) ) - if self.similarity_type == "cosine": - return tf.nn.l2_normalize(a, -1) - else: - return a - def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': """Create dense embedding layer with a name.""" @@ -422,7 +428,9 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_hparams(self) -> 'HParams': + def _create_t2t_hparams(self) -> 'HParams': + """Create parameters for t2t transformer.""" + hparams = transformer_base() hparams.num_hidden_layers = self.num_transformer_layers @@ -444,12 +452,14 @@ def _create_hparams(self) -> 'HParams': return hparams # noinspection PyUnresolvedReferences - def _create_tf_transformer_encoder(self, - x_in: 'tf.Tensor', - mask: 'tf.Tensor', - attention_weights: Dict[Text, 'tf.Tensor'], - ) -> 'tf.Tensor': - hparams = self._create_hparams() + def _create_t2t_transformer_encoder(self, + x_in: 'tf.Tensor', + mask: 'tf.Tensor', + attention_weights: Dict[Text, 'tf.Tensor'], + ) -> 'tf.Tensor': + """Create t2t transformer encoder.""" + + hparams = self._create_t2t_hparams() # When not in training mode, set all forms of dropout to zero. 
for key, value in hparams.values().items(): @@ -508,6 +518,8 @@ def _create_tf_transformer_encoder(self, return tf.nn.relu(x) def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + """Create dialogue level embedding and mask.""" + # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) @@ -515,7 +527,7 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) self.attention_weights = {} - x = self._create_tf_transformer_encoder(x_in, mask, self.attention_weights) + x = self._create_t2t_transformer_encoder(x_in, mask, self.attention_weights) dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") @@ -528,12 +540,15 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: @staticmethod def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': + """Make tensor 2D.""" + return tf.reshape(x, (-1, x.shape[-1])) @staticmethod def _tf_sample_neg(batch_size: 'tf.Tensor', all_bs: 'tf.Tensor', neg_ids: 'tf.Tensor') -> 'tf.Tensor': + """Sample negative examples for given indices""" tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) @@ -543,6 +558,7 @@ def _tf_calc_iou_mask(self, pos_b: 'tf.Tensor', all_bs: 'tf.Tensor', neg_ids: 'tf.Tensor') -> 'tf.Tensor': + """Calculate IOU mask for given indices""" pos_b_in_flat = pos_b[:, tf.newaxis, :] neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) @@ -557,6 +573,7 @@ def _tf_get_negs(self, all_embed: 'tf.Tensor', all_raw: 'tf.Tensor', raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: + """Get negative examples from given tensor.""" batch_size = tf.shape(raw_pos)[0] seq_length = tf.shape(raw_pos)[1] @@ -581,8 +598,8 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + """Sample negative examples.""" - # sample negatives pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), @@ -600,6 +617,7 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', @staticmethod def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + """Calculate similarity between given tensors.""" return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) @@ -635,6 +653,7 @@ def _tf_sim( @staticmethod def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': + """Calculate accuracy""" max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), @@ -730,6 +749,7 @@ def _choose_loss(self, sim_neg_dial_dial: 'tf.Tensor', sim_neg_bot_dial: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + """Use loss depending on given option.""" if self.loss_type == 'margin': return self._tf_loss_margin(sim_pos, sim_neg, @@ -751,6 +771,7 @@ def _choose_loss(self, ) def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + """Bulid train graph using iterator.""" # session data are int counts but we need a float tensors (self.a_in, @@ -802,6 +823,8 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return loss, acc def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + """Create placeholders for prediction.""" + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, @@ -824,7 +847,11 @@ def _create_tf_placeholders(self, 
session_data: 'SessionData') -> None: name="b_prev", ) - def _build_tf_pred_graph(self) -> 'tf.Tensor': + def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': + """Rebuild tf graph for prediction.""" + + self._create_tf_placeholders(session_data) + self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( @@ -851,6 +878,8 @@ def _build_tf_pred_graph(self) -> 'tf.Tensor': return confidence def _extract_attention(self) -> Optional['tf.Tensor']: + """Extract attention probabilities from t2t dict""" + attention = [tf.expand_dims(t, 0) for name, t in self.attention_weights.items() if name.endswith('multihead_attention/dot_product_attention')] @@ -928,8 +957,7 @@ def train( loss, acc) # rebuild the graph for prediction - self._create_tf_placeholders(session_data) - self.pred_confidence = self._build_tf_pred_graph() + self.pred_confidence = self._build_tf_pred_graph(session_data) self.attention_weights = self._extract_attention() @@ -1060,6 +1088,8 @@ def tf_feed_dict_for_prediction(self, tracker: 'DialogueStateTracker', domain: 'Domain' ) -> Dict['tf.Tensor', 'np.ndarray']: + """Create feed dictionary for tf session.""" + # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1091,6 +1121,8 @@ def predict_action_probabilities( return confidence[0, -1, :].tolist() def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: + """Add tensor to collection if it is not None""" + if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1148,6 +1180,8 @@ def persist(self, path: Text) -> None: @staticmethod def load_tensor(name: Text) -> Optional['tf.Tensor']: + """Load tensor or set it to None""" + tensor_list = tf.get_collection(name) return tensor_list[0] if tensor_list else None @@ -1155,7 +1189,8 @@ def load_tensor(name: Text) -> Optional['tf.Tensor']: def load(cls, path: Text) -> "EmbeddingPolicy": """Loads a policy from the storage. 
- **Needs to load its featurizer**""" + **Needs to load its featurizer** + """ if not os.path.exists(path): raise Exception( From 1d28d63427d5a73f49c062def26ee6a2db8b26ca Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 17 Jul 2019 21:18:13 +0200 Subject: [PATCH 12/50] break long lines --- rasa/core/policies/embedding_policy.py | 40 ++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 2f5041222b9d..06aa26e473fc 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -27,7 +27,9 @@ try: from tensor2tensor.layers import common_attention - from tensor2tensor.models.transformer import transformer_base, transformer_prepare_encoder, transformer_encoder + from tensor2tensor.models.transformer import (transformer_base, + transformer_prepare_encoder, + transformer_encoder) except ImportError: common_attention = None transformer_base = None @@ -131,7 +133,8 @@ def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturize if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: - return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), max_history=max_history) + return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), + max_history=max_history) @staticmethod def _check_t2t() -> None: @@ -289,7 +292,8 @@ def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': return np.stack( [ np.stack( - [self.encoded_all_actions[action_idx] for action_idx in action_ids] + [self.encoded_all_actions[action_idx] + for action_idx in action_ids] ) for action_ids in actions_for_Y ] @@ -493,7 +497,8 @@ def _create_t2t_transformer_encoder(self, ) = transformer_prepare_encoder(x, None, hparams) if hparams.pos == 'custom_timing': - x = common_attention.add_timing_signal_1d(x, max_timescale=self.pos_max_timescale) + x = common_attention.add_timing_signal_1d( + x, max_timescale=self.pos_max_timescale) x *= tf.expand_dims(mask, -1) @@ -566,7 +571,8 @@ def _tf_calc_iou_mask(self, intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) + iou = (tf.reduce_sum(intersection_b_in_flat, -1) + / tf.reduce_sum(union_b_in_flat, -1)) return 1. - tf.nn.relu(tf.sign(1. 
- iou)) def _tf_get_negs(self, @@ -586,7 +592,8 @@ def _tf_get_negs(self, bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) - neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, all_embed, neg_ids) + neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, + all_embed, neg_ids) neg_embed = tf.reshape(neg_embed_flat, (batch_size, seq_length, -1, all_embed.shape[-1])) @@ -777,7 +784,8 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: (self.a_in, self.b_in, self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) for x_in in self._iterator.get_next()) + self.b_prev_in) = (tf.cast(x_in, tf.float32) + for x_in in self._iterator.get_next()) all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -907,9 +915,8 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( - domain - ) + self.encoded_all_actions = \ + self.featurizer.state_featurizer.create_encoded_all_actions(domain) # check if number of negatives is less than number of actions logger.debug( @@ -939,8 +946,10 @@ def train( train_init_op = self._iterator.make_initializer(train_dataset) if self.evaluate_on_num_examples: - eval_session_data = self._sample_session_data(session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset(eval_session_data, self.evaluate_on_num_examples, shuffle=False) + eval_session_data = self._sample_session_data( + session_data, self.evaluate_on_num_examples) + eval_train_dataset = self._create_tf_dataset( + eval_session_data, self.evaluate_on_num_examples, shuffle=False) eval_init_op = self._iterator.make_initializer(eval_train_dataset) else: eval_init_op = None @@ -1030,12 +1039,13 @@ def _train_tf_dataset(self, }) if self.evaluate_on_num_examples and eval_init_op is not None: - if (ep == 0 or - (ep + 1) % self.evaluate_every_num_epochs == 0 or - (ep + 1) == self.epochs): + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + or (ep + 1) == self.epochs): eval_loss, eval_acc = self._output_training_stat_dataset( eval_init_op, loss, acc ) + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + and (ep + 1) != self.epochs): logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" "".format(eval_loss, eval_acc)) From 00604d5afc3e3fc92fe435271f82620523cb6215 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 18 Jul 2019 12:46:44 +0200 Subject: [PATCH 13/50] make methods more generic, by using expand_dims instead of tf.newaxis --- rasa/core/policies/embedding_policy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 06aa26e473fc..6756b6a77cad 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -351,15 +351,15 @@ def _create_tf_dataset(session_data: 'SessionData', shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - train_dataset = tf.data.Dataset.from_tensor_slices( + dataset = tf.data.Dataset.from_tensor_slices( (session_data.X, session_data.Y, session_data.slots, session_data.previous_actions) ) if shuffle: - train_dataset = train_dataset.shuffle(buffer_size=len(session_data.X)) - train_dataset = train_dataset.batch(batch_size) + dataset = dataset.shuffle(buffer_size=len(session_data.X)) + 
dataset = dataset.batch(batch_size) - return train_dataset + return dataset @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': @@ -565,7 +565,7 @@ def _tf_calc_iou_mask(self, neg_ids: 'tf.Tensor') -> 'tf.Tensor': """Calculate IOU mask for given indices""" - pos_b_in_flat = pos_b[:, tf.newaxis, :] + pos_b_in_flat = tf.expand_dims(pos_b, -2) neg_b_in_flat = self._tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) @@ -607,13 +607,13 @@ def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: """Sample negative examples.""" - pos_dial_embed = self.dial_embed[:, :, tf.newaxis, :] + pos_dial_embed = tf.expand_dims(self.dial_embed, -2) neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), self._tf_make_flat(self.b_in), self.b_in ) - pos_bot_embed = self.bot_embed[:, :, tf.newaxis, :] + pos_bot_embed = tf.expand_dims(self.bot_embed, -2) neg_bot_embed, bot_bad_negs = self._tf_get_negs( self.all_bot_embed, all_actions, From f2b0c91ff4c4597fe81395889bc47a68563959f7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 14:39:24 +0200 Subject: [PATCH 14/50] add first version of stratified batching --- rasa/core/featurizers.py | 8 - rasa/core/policies/embedding_policy.py | 197 ++++++++++++++++--------- 2 files changed, 127 insertions(+), 78 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index f0e722975078..4bdcbb9c6384 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -26,11 +26,6 @@ class SingleStateFeaturizer(object): the conversation state to a format which a classifier can read: feature vector.""" - def __init__(self): - """Declares instant variables.""" - self.user_feature_len = None - self.slot_feature_len = None - def prepare_from_domain(self, domain: Domain) -> None: """Helper method to init based on domain""" pass @@ -73,9 +68,6 @@ def prepare_from_domain(self, domain: Domain) -> None: self.num_features = domain.num_states self.input_state_map = domain.input_state_map - self.user_feature_len = len(domain.intent_states) + len(domain.entity_states) - self.slot_feature_len = len(domain.slot_states) - def encode(self, state: Dict[Text, float]) -> np.ndarray: """Returns a binary vector indicating which features are active. diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 6756b6a77cad..00030c260b4f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -5,6 +5,7 @@ import os import warnings +import pandas as pd import numpy as np import typing from tqdm import tqdm @@ -54,9 +55,7 @@ ( "X", "Y", - "slots", - "previous_actions", - "actions_for_Y", + "labels_for_Y", ), ) @@ -258,50 +257,31 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self._load_visual_params(config) # data helpers - # noinspection PyPep8Naming - def _create_X_slots_previous_actions( - self, data_X: 'np.ndarray' - ) -> Tuple['np.ndarray', 'np.ndarray', 'np.ndarray']: - """Extract feature vectors from training data. - - For user input (X), slots and previously executed actions. 
- """ - - featurizer = self.featurizer.state_featurizer - slot_start = featurizer.user_feature_len - previous_start = slot_start + featurizer.slot_feature_len - - X = data_X[:, :, :slot_start] - slots = data_X[:, :, slot_start:previous_start] - previous_actions = data_X[:, :, previous_start:] - - return X, slots, previous_actions - # noinspection PyPep8Naming @staticmethod - def _actions_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': + def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, actions_for_Y: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels_for_Y: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" - if len(actions_for_Y.shape) == 2: + if len(labels_for_Y.shape) == 2: return np.stack( [ np.stack( [self.encoded_all_actions[action_idx] for action_idx in action_ids] ) - for action_ids in actions_for_Y + for action_ids in labels_for_Y ] ) else: return np.stack( [ - self.encoded_all_actions[action_idx] for action_idx in actions_for_Y + self.encoded_all_actions[action_idx] for action_idx in labels_for_Y ] ) @@ -311,23 +291,19 @@ def _create_session_data( ) -> 'SessionData': """Combine all tf session related data into a named tuple""" - X, slots, previous_actions = self._create_X_slots_previous_actions(data_X) - if data_Y is not None: # training time - actions_for_Y = self._actions_for_Y(data_Y) - Y = self._action_features_for_Y(actions_for_Y) + labels_for_Y = self._labels_for_Y(data_Y) + Y = self._action_features_for_Y(labels_for_Y) else: # prediction time - actions_for_Y = None + labels_for_Y = None Y = None return SessionData( - X=X, + X=data_X, Y=Y, - slots=slots, - previous_actions=previous_actions, - actions_for_Y=actions_for_Y, + labels_for_Y=labels_for_Y, ) @staticmethod @@ -339,25 +315,121 @@ def _sample_session_data(session_data: 'SessionData', return SessionData( X=session_data.X[ids], Y=session_data.Y[ids], - slots=session_data.slots[ids], - previous_actions=session_data.previous_actions[ids], - actions_for_Y=session_data.actions_for_Y[ids], + labels_for_Y=session_data.labels_for_Y[ids], ) # tf helpers: + # noinspection PyPep8Naming + @staticmethod + def gen_stratified_batch(session_data, batch_size): + + num_examples = len(session_data.X) + ids = np.random.permutation(num_examples) + X = session_data.X[ids] + Y = session_data.Y[ids] + labels_for_Y = session_data.labels_for_Y[ids] + + labels = list(set(labels_for_Y)) + np.random.shuffle(labels) + + class_data = [] + for label in labels: + label_X = X[labels_for_Y == label] + label_Y = Y[labels_for_Y == label] + label_labels_for_Y = labels_for_Y[labels_for_Y == label] + session_data_label = SessionData( + X=label_X, + Y=label_Y, + labels_for_Y=label_labels_for_Y, + ) + + class_data.append(session_data_label) + + num_classes = len(class_data) + + data_idx = [0] * num_classes + num_data_cycles = [0] * num_classes + print(batch_size) + print(X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0)) + # print([len(class_i.X) / num_examples for class_i in class_data]) + class_idx = 0 + bbb = 0 + while min(num_data_cycles) == 0: + batch_x = [] + batch_y = [] + batch_len = 0 + while batch_len < batch_size: + + class_i = class_data[class_idx] + + num_i = int(len(class_i.X) / num_examples * batch_size) + 1 + + if batch_len + num_i > batch_size: + num_i = batch_size - batch_len + + if data_idx[class_idx] + num_i > 
len(class_i.X): + num_i = len(class_i.X) - data_idx[class_idx] + + batch_x.append(class_i.X[data_idx[class_idx]:data_idx[class_idx]+num_i]) + batch_y.append(class_i.Y[data_idx[class_idx]:data_idx[class_idx]+num_i]) + batch_len += num_i + + data_idx[class_idx] += num_i + if data_idx[class_idx] >= len(class_i.X): + num_data_cycles[class_idx] += 1 + data_idx[class_idx] = 0 + + class_idx += 1 + if class_idx >= num_classes: + class_idx = 0 + if max(num_data_cycles) > 0 and max(num_data_cycles) == num_data_cycles[class_idx]: + class_idx += 1 + if class_idx >= num_classes: + class_idx = 0 + bbb+=1 + if min(num_data_cycles) > 0: + print(num_data_cycles) + print(bbb) + yield np.concatenate(batch_x), np.concatenate(batch_y) + + # noinspection PyPep8Naming @staticmethod - def _create_tf_dataset(session_data: 'SessionData', + def gen_sequence_batch(session_data, batch_size): + + ids = np.random.permutation(len(session_data.X)) + X = session_data.X[ids] + Y = session_data.Y[ids] + + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + + for batch_num in range(num_batches): + batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + + yield batch_x, batch_y + + # @staticmethod + def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - dataset = tf.data.Dataset.from_tensor_slices( - (session_data.X, session_data.Y, - session_data.slots, session_data.previous_actions) - ) - if shuffle: - dataset = dataset.shuffle(buffer_size=len(session_data.X)) - dataset = dataset.batch(batch_size) + def train_gen_func(batch_size_): + return self.gen_stratified_batch(session_data, batch_size_) + # return self.gen_sequence_batch(session_data, batch_size_) + + dpt_types = (tf.float32, tf.float32) + dpt_shapes = ([None] + list(session_data.X[0].shape), + [None] + list(session_data.Y[0].shape)) + + dataset = tf.data.Dataset.from_generator(train_gen_func, dpt_types, dpt_shapes, args=([batch_size])) + # dataset = tf.data.Dataset.from_tensor_slices( + # (session_data.X, session_data.Y, + # session_data.slots, session_data.previous_actions) + # ) + # if shuffle: + # dataset = dataset.shuffle(buffer_size=len(session_data.X)) + # dataset = dataset.batch(batch_size) return dataset @@ -520,7 +592,8 @@ def _create_t2t_transformer_encoder(self, x *= tf.expand_dims(mask, -1) - return tf.nn.relu(x) + return tf.nn.dropout(tf.nn.relu(x), + 1.0 - hparams.layer_prepostprocess_dropout) def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: """Create dialogue level embedding and mask.""" @@ -529,12 +602,12 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - x_in = tf.concat([self.a_in, self.b_prev_in, self.c_in], -1) - self.attention_weights = {} - x = self._create_t2t_transformer_encoder(x_in, mask, self.attention_weights) + a = self._create_t2t_transformer_encoder(self.a_in, + mask, + self.attention_weights) - dial_embed = self._create_tf_embed(x, layer_name_suffix="dial") + dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # pick last action if max history featurizer is used @@ -781,11 +854,7 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: """Bulid train graph using iterator.""" # session data are int counts but we 
need a float tensors - (self.a_in, - self.b_in, - self.c_in, - self.b_prev_in) = (tf.cast(x_in, tf.float32) - for x_in in self._iterator.get_next()) + self.a_in, self.b_in = self._iterator.get_next() all_actions = tf.constant(self.encoded_all_actions, dtype=tf.float32, @@ -844,16 +913,6 @@ def _create_tf_placeholders(self, session_data: 'SessionData') -> None: shape=(None, dialogue_len, None, session_data.Y.shape[-1]), name="b", ) - self.c_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.slots.shape[-1]), - name="slt", - ) - self.b_prev_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data.Y.shape[-1]), - name="b_prev", - ) def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': """Rebuild tf graph for prediction.""" @@ -1104,9 +1163,7 @@ def tf_feed_dict_for_prediction(self, data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X, - self.c_in: session_data.slots, - self.b_prev_in: session_data.previous_actions} + return {self.a_in: session_data.X} def predict_action_probabilities( self, tracker: 'DialogueStateTracker', domain: 'Domain' From 435611f9643987a99997e7f63dd860eaa8f52413 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 17:40:13 +0200 Subject: [PATCH 15/50] update batching --- rasa/core/policies/embedding_policy.py | 141 ++++++++++++------------- 1 file changed, 66 insertions(+), 75 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 00030c260b4f..3f0311a924f0 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -5,7 +5,6 @@ import os import warnings -import pandas as pd import numpy as np import typing from tqdm import tqdm @@ -55,7 +54,7 @@ ( "X", "Y", - "labels_for_Y", + "labels", ), ) @@ -265,23 +264,23 @@ def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, labels_for_Y: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels: 'np.ndarray') -> 'np.ndarray': """Prepare Y data for training: features for action labels.""" - if len(labels_for_Y.shape) == 2: + if len(labels.shape) == 2: return np.stack( [ np.stack( [self.encoded_all_actions[action_idx] for action_idx in action_ids] ) - for action_ids in labels_for_Y + for action_ids in labels ] ) else: return np.stack( [ - self.encoded_all_actions[action_idx] for action_idx in labels_for_Y + self.encoded_all_actions[action_idx] for action_idx in labels ] ) @@ -293,17 +292,17 @@ def _create_session_data( if data_Y is not None: # training time - labels_for_Y = self._labels_for_Y(data_Y) - Y = self._action_features_for_Y(labels_for_Y) + labels = self._labels_for_Y(data_Y) + Y = self._action_features_for_Y(labels) else: # prediction time - labels_for_Y = None + labels = None Y = None return SessionData( X=data_X, Y=Y, - labels_for_Y=labels_for_Y, + labels=labels, ) @staticmethod @@ -315,7 +314,7 @@ def _sample_session_data(session_data: 'SessionData', return SessionData( X=session_data.X[ids], Y=session_data.Y[ids], - labels_for_Y=session_data.labels_for_Y[ids], + labels=session_data.labels[ids], ) # tf helpers: @@ -327,70 +326,60 @@ def gen_stratified_batch(session_data, batch_size): ids = np.random.permutation(num_examples) X = session_data.X[ids] Y = session_data.Y[ids] - labels_for_Y = session_data.labels_for_Y[ids] - - labels = 
list(set(labels_for_Y)) - np.random.shuffle(labels) - - class_data = [] - for label in labels: - label_X = X[labels_for_Y == label] - label_Y = Y[labels_for_Y == label] - label_labels_for_Y = labels_for_Y[labels_for_Y == label] - session_data_label = SessionData( - X=label_X, - Y=label_Y, - labels_for_Y=label_labels_for_Y, - ) + labels = session_data.labels[ids] + + unique_labels, counts_labels = np.unique(labels, return_counts=True) + num_labels = len(unique_labels) + ids = np.random.permutation(num_labels) + unique_labels = unique_labels[ids] + counts_labels = counts_labels[ids] + + label_data = [] + for label in unique_labels: + label_data.append(SessionData(X=X[labels == label], + Y=Y[labels == label], + labels=labels[labels == label])) + + data_idx = [0] * num_labels + num_data_cycles = [0] * num_labels + skipped = [False] * num_labels + new_X = [] + new_Y = [] + while min(num_data_cycles) == 0: + for i in range(num_labels): + if num_data_cycles[i] > 0 and not skipped[i]: + skipped[i] = True + continue + else: + skipped[i] = False - class_data.append(session_data_label) + num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - num_classes = len(class_data) + new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) + new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) - data_idx = [0] * num_classes - num_data_cycles = [0] * num_classes - print(batch_size) - print(X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0)) - # print([len(class_i.X) / num_examples for class_i in class_data]) - class_idx = 0 - bbb = 0 - while min(num_data_cycles) == 0: - batch_x = [] - batch_y = [] - batch_len = 0 - while batch_len < batch_size: - - class_i = class_data[class_idx] - - num_i = int(len(class_i.X) / num_examples * batch_size) + 1 - - if batch_len + num_i > batch_size: - num_i = batch_size - batch_len - - if data_idx[class_idx] + num_i > len(class_i.X): - num_i = len(class_i.X) - data_idx[class_idx] - - batch_x.append(class_i.X[data_idx[class_idx]:data_idx[class_idx]+num_i]) - batch_y.append(class_i.Y[data_idx[class_idx]:data_idx[class_idx]+num_i]) - batch_len += num_i - - data_idx[class_idx] += num_i - if data_idx[class_idx] >= len(class_i.X): - num_data_cycles[class_idx] += 1 - data_idx[class_idx] = 0 - - class_idx += 1 - if class_idx >= num_classes: - class_idx = 0 - if max(num_data_cycles) > 0 and max(num_data_cycles) == num_data_cycles[class_idx]: - class_idx += 1 - if class_idx >= num_classes: - class_idx = 0 - bbb+=1 - if min(num_data_cycles) > 0: - print(num_data_cycles) - print(bbb) - yield np.concatenate(batch_x), np.concatenate(batch_y) + data_idx[i] += num_i + if data_idx[i] >= counts_labels[i]: + num_data_cycles[i] += 1 + data_idx[i] = 0 + + if min(num_data_cycles) > 0: + break + + print(num_data_cycles) + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + print(num_batches) + + X = np.concatenate(new_X) + Y = np.concatenate(new_Y) + + num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + print(num_batches) + for batch_num in range(num_batches): + batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + + yield batch_x, batch_y # noinspection PyPep8Naming @staticmethod @@ -658,9 +647,11 @@ def _tf_get_negs(self, seq_length = tf.shape(raw_pos)[1] raw_flat = self._tf_make_flat(raw_pos) - neg_ids = tf.random.categorical(tf.log(tf.ones((batch_size * seq_length, - tf.shape(all_raw)[0]))), - self.num_neg) + total_cands = 
tf.shape(all_embed)[0] + + all_indices = tf.tile(tf.expand_dims(tf.range(0, total_cands, 1), 0), (batch_size * seq_length, 1)) + shuffled_indices = tf.transpose(tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)) + neg_ids = shuffled_indices[:, :self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) From 0f48690cf4adc36fc833d2a59a5a1730c53f2569 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 23 Jul 2019 17:41:01 +0200 Subject: [PATCH 16/50] remove prints --- rasa/core/policies/embedding_policy.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 3f0311a924f0..ba528d7f38de 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -366,15 +366,11 @@ def gen_stratified_batch(session_data, batch_size): if min(num_data_cycles) > 0: break - print(num_data_cycles) - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) - print(num_batches) - X = np.concatenate(new_X) Y = np.concatenate(new_Y) num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) - print(num_batches) + for batch_num in range(num_batches): batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] From c6634be85c188687742924ae05f77b570835124e Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 10:14:28 +0200 Subject: [PATCH 17/50] add random perturbation of labels --- rasa/core/policies/embedding_policy.py | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index ba528d7f38de..334e56fdd10a 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -87,6 +87,7 @@ class EmbeddingPolicy(Policy): # initial and final batch sizes - batch size will be # linearly increased for each epoch "batch_size": [8, 32], + "batch_strategy": 'sequence', # string 'sequence' or 'balanced' # number of epochs "epochs": 1, # set random seed to any int to get reproducible results @@ -211,6 +212,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.num_transformer_layers = config["num_transformer_layers"] self.batch_size = config["batch_size"] + self.batch_strategy = config["batch_strategy"] self.epochs = config["epochs"] @@ -320,7 +322,7 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: # noinspection PyPep8Naming @staticmethod - def gen_stratified_batch(session_data, batch_size): + def gen_balanced_batch(session_data, batch_size): num_examples = len(session_data.X) ids = np.random.permutation(num_examples) @@ -330,9 +332,6 @@ def gen_stratified_batch(session_data, batch_size): unique_labels, counts_labels = np.unique(labels, return_counts=True) num_labels = len(unique_labels) - ids = np.random.permutation(num_labels) - unique_labels = unique_labels[ids] - counts_labels = counts_labels[ids] label_data = [] for label in unique_labels: @@ -346,7 +345,8 @@ def gen_stratified_batch(session_data, batch_size): new_X = [] new_Y = [] while min(num_data_cycles) == 0: - for i in range(num_labels): + ids = np.random.permutation(num_labels) + for i in ids: if num_data_cycles[i] > 0 and not skipped[i]: skipped[i] = True continue @@ -393,28 +393,28 @@ def gen_sequence_batch(session_data, 
batch_size): yield batch_x, batch_y - # @staticmethod + def train_gen_func(self, session_data, batch_size): + if self.batch_strategy == 'sequence': + return self.gen_sequence_batch(session_data, batch_size) + elif self.batch_strategy == 'balanced': + return self.gen_balanced_batch(session_data, batch_size) + else: + raise ValueError( + "Wrong batch strategy '{}', " + "should be 'sequence' or 'balanced'" + "".format(self.batch_strategy) + ) + def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], shuffle: bool = True) -> 'tf.data.Dataset': """Create tf dataset.""" - def train_gen_func(batch_size_): - return self.gen_stratified_batch(session_data, batch_size_) - # return self.gen_sequence_batch(session_data, batch_size_) - dpt_types = (tf.float32, tf.float32) dpt_shapes = ([None] + list(session_data.X[0].shape), [None] + list(session_data.Y[0].shape)) - dataset = tf.data.Dataset.from_generator(train_gen_func, dpt_types, dpt_shapes, args=([batch_size])) - # dataset = tf.data.Dataset.from_tensor_slices( - # (session_data.X, session_data.Y, - # session_data.slots, session_data.previous_actions) - # ) - # if shuffle: - # dataset = dataset.shuffle(buffer_size=len(session_data.X)) - # dataset = dataset.batch(batch_size) + dataset = tf.data.Dataset.from_generator(lambda x: self.train_gen_func(session_data, x), dpt_types, dpt_shapes, args=([batch_size])) return dataset @@ -458,7 +458,7 @@ def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': return x else: raise ValueError( - "Wrong similarity type {}, " + "Wrong similarity type '{}', " "should be 'cosine' or 'inner'" "".format(self.similarity_type) ) @@ -832,7 +832,7 @@ def _choose_loss(self, mask) else: raise ValueError( - "Wrong loss type {}, " + "Wrong loss type '{}', " "should be 'margin' or 'softmax'" "".format(self.loss_type) ) @@ -1248,7 +1248,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy": if not os.path.exists(path): raise Exception( - "Failed to load dialogue model. Path {} " + "Failed to load dialogue model. 
Path '{}' " "doesn't exist".format(os.path.abspath(path)) ) From b917e37b132c0fb198f5b188806d8b7f465d1266 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 16:49:29 +0200 Subject: [PATCH 18/50] add validation split --- rasa/core/policies/embedding_policy.py | 473 ++++++++++++++----------- 1 file changed, 266 insertions(+), 207 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 334e56fdd10a..687136c4363b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -8,7 +8,7 @@ import numpy as np import typing from tqdm import tqdm -from typing import Any, List, Optional, Text, Dict, Tuple, Union +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator import rasa.utils.io from rasa.core import utils @@ -23,18 +23,20 @@ from rasa.core.trackers import DialogueStateTracker from rasa.utils.common import is_logging_disabled +from sklearn.model_selection import train_test_split import tensorflow as tf try: - from tensor2tensor.layers import common_attention from tensor2tensor.models.transformer import (transformer_base, transformer_prepare_encoder, transformer_encoder) + from tensor2tensor.layers.common_attention import large_compatible_negative + except ImportError: - common_attention = None transformer_base = None transformer_prepare_encoder = None transformer_encoder = None + large_compatible_negative = None try: import cPickle as pickle @@ -73,20 +75,21 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], - - "pos_encoding": "timing", # {"timing", "emb", "custom_timing"} - # introduce phase shift in time encodings between transformers - # 0.5 - 0.8 works on small dataset - "pos_max_timescale": 1.0e1, + # type of positional encoding in transformer + "pos_encoding": "timing", # {"timing", "emb"} + # max sequence length if pos_encoding='emb' "max_seq_length": 256, + # number of attention heads in transformer "num_heads": 4, # number of units in transformer "transformer_size": 128, + # number of transformer layers "num_transformer_layers": 1, # training parameters - # initial and final batch sizes - batch size will be - # linearly increased for each epoch + # initial and final batch sizes: + # batch size will be linearly increased for each epoch "batch_size": [8, 32], + # how to create batches "batch_strategy": 'sequence', # string 'sequence' or 'balanced' # number of epochs "epochs": 1, @@ -102,6 +105,7 @@ class EmbeddingPolicy(Policy): "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' # the type of the similarity "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + # the type of the loss function "loss_type": 'softmax', # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training @@ -119,10 +123,10 @@ class EmbeddingPolicy(Policy): # dropout rate for dial nn "droprate_dial": 0.1, # visualization of accuracy - # how often calculate train accuracy + # how often calculate validation accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for calculation of train accuracy - "evaluate_on_num_examples": 100, # large values may hurt performance + # how many examples to use for hold out validation set + "evaluate_on_num_examples": 0, # large values may hurt performance } # end default properties (DOC MARKER - don't remove) @@ -137,7 +141,7 @@ def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturize @staticmethod def _check_t2t() -> None: - if common_attention is None: + if transformer_base is None: raise ImportError("Please install tensor2tensor") def __init__( @@ -204,7 +208,6 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] self.pos_encoding = config['pos_encoding'] - self.pos_max_timescale = config['pos_max_timescale'] self.max_seq_length = config['max_seq_length'] self.num_heads = config['num_heads'] @@ -296,6 +299,12 @@ def _create_session_data( # training time labels = self._labels_for_Y(data_Y) Y = self._action_features_for_Y(labels) + + # idea taken from sklearn's stratify split + if labels.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + labels = np.array([' '.join(row.astype('str')) for row in labels]) else: # prediction time labels = None @@ -307,12 +316,43 @@ def _create_session_data( labels=labels, ) + # noinspection PyPep8Naming + def _train_val_split(self, session_data: 'SessionData' + ) -> Tuple['SessionData', 'SessionData']: + """Create random hold out validation set using stratified split.""" + + label_counts = dict(zip(*np.unique(session_data.labels, + return_counts=True, axis=0))) + counts = np.array([label_counts[label] for label in session_data.labels]) + + multi_X = session_data.X[counts > 1] + multi_Y = session_data.Y[counts > 1] + multi_labels = session_data.labels[counts > 1] + + solo_X = session_data.X[counts == 1] + solo_Y = session_data.Y[counts == 1] + solo_labels = session_data.labels[counts == 1] + + (X_train, X_val, + Y_train, Y_val, + labels_train, labels_val) = train_test_split( + multi_X, multi_Y, multi_labels, + test_size=self.evaluate_on_num_examples, + random_state=self.random_seed, + stratify=multi_labels + ) + X_train = np.concatenate([X_train, solo_X]) + Y_train = np.concatenate([Y_train, solo_Y]) + labels_train = np.concatenate([labels_train, solo_labels]) + + return (SessionData(X=X_train, Y=Y_train, labels=labels_train), + SessionData(X=X_val, Y=Y_val, labels=labels_val)) + @staticmethod - def _sample_session_data(session_data: 'SessionData', - num_samples: int) -> 'SessionData': - """Sample session data.""" + def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': + """Shuffle session data.""" - ids = np.random.permutation(len(session_data.X))[:num_samples] + ids = np.random.permutation(len(session_data.X)) return SessionData( 
X=session_data.X[ids], Y=session_data.Y[ids], @@ -321,102 +361,94 @@ def _sample_session_data(session_data: 'SessionData', # tf helpers: # noinspection PyPep8Naming - @staticmethod - def gen_balanced_batch(session_data, batch_size): - - num_examples = len(session_data.X) - ids = np.random.permutation(num_examples) - X = session_data.X[ids] - Y = session_data.Y[ids] - labels = session_data.labels[ids] - - unique_labels, counts_labels = np.unique(labels, return_counts=True) - num_labels = len(unique_labels) - - label_data = [] - for label in unique_labels: - label_data.append(SessionData(X=X[labels == label], - Y=Y[labels == label], - labels=labels[labels == label])) - - data_idx = [0] * num_labels - num_data_cycles = [0] * num_labels - skipped = [False] * num_labels - new_X = [] - new_Y = [] - while min(num_data_cycles) == 0: - ids = np.random.permutation(num_labels) - for i in ids: - if num_data_cycles[i] > 0 and not skipped[i]: - skipped[i] = True - continue + def _gen_batch(self, + session_data: 'SessionData', + batch_size: int, + batch_strategy: Text = 'sequence', + shuffle: bool = False + ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: + """Generate batches.""" + + if shuffle: + session_data = self._shuffle_session_data(session_data) + + if batch_strategy == 'balanced': + num_examples = len(session_data.X) + unique_labels, counts_labels = np.unique(session_data.labels, + return_counts=True, + axis=0) + num_labels = len(unique_labels) + + label_data = [] + for label in unique_labels: + label_data.append(SessionData( + X=session_data.X[session_data.labels == label], + Y=session_data.Y[session_data.labels == label], + labels=None # ignore new labels + )) + + data_idx = [0] * num_labels + num_data_cycles = [0] * num_labels + skipped = [False] * num_labels + new_X = [] + new_Y = [] + while min(num_data_cycles) == 0: + if shuffle: + ids = np.random.permutation(num_labels) else: - skipped[i] = False + ids = range(num_labels) - num_i = int(counts_labels[i] / num_examples * batch_size) + 1 + for i in ids: + if num_data_cycles[i] > 0 and not skipped[i]: + skipped[i] = True + continue + else: + skipped[i] = False - new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) - new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) + num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - data_idx[i] += num_i - if data_idx[i] >= counts_labels[i]: - num_data_cycles[i] += 1 - data_idx[i] = 0 + new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) + new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) - if min(num_data_cycles) > 0: - break + data_idx[i] += num_i + if data_idx[i] >= counts_labels[i]: + num_data_cycles[i] += 1 + data_idx[i] = 0 - X = np.concatenate(new_X) - Y = np.concatenate(new_Y) + if min(num_data_cycles) > 0: + break - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + session_data = SessionData(X=np.concatenate(new_X), + Y=np.concatenate(new_Y), + labels=None) # ignore new labels - for batch_num in range(num_batches): - batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] - batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] - - yield batch_x, batch_y - - # noinspection PyPep8Naming - @staticmethod - def gen_sequence_batch(session_data, batch_size): - - ids = np.random.permutation(len(session_data.X)) - X = session_data.X[ids] - Y = session_data.Y[ids] - - num_batches = X.shape[0] // batch_size + int(X.shape[0] % batch_size > 0) + num_batches = (session_data.X.shape[0] // batch_size + + 
int(session_data.X.shape[0] % batch_size > 0)) for batch_num in range(num_batches): - batch_x = X[batch_num * batch_size: (batch_num + 1) * batch_size] - batch_y = Y[batch_num * batch_size: (batch_num + 1) * batch_size] + batch_x = session_data.X[ + batch_num * batch_size: (batch_num + 1) * batch_size] + batch_y = session_data.Y[ + batch_num * batch_size: (batch_num + 1) * batch_size] yield batch_x, batch_y - def train_gen_func(self, session_data, batch_size): - if self.batch_strategy == 'sequence': - return self.gen_sequence_batch(session_data, batch_size) - elif self.batch_strategy == 'balanced': - return self.gen_balanced_batch(session_data, batch_size) - else: - raise ValueError( - "Wrong batch strategy '{}', " - "should be 'sequence' or 'balanced'" - "".format(self.batch_strategy) - ) - def _create_tf_dataset(self, session_data: 'SessionData', batch_size: Union['tf.Tensor', int], - shuffle: bool = True) -> 'tf.data.Dataset': + batch_strategy: Text = 'sequence', + shuffle: bool = False) -> 'tf.data.Dataset': """Create tf dataset.""" - dpt_types = (tf.float32, tf.float32) - dpt_shapes = ([None] + list(session_data.X[0].shape), - [None] + list(session_data.Y[0].shape)) - - dataset = tf.data.Dataset.from_generator(lambda x: self.train_gen_func(session_data, x), dpt_types, dpt_shapes, args=([batch_size])) - - return dataset + return tf.data.Dataset.from_generator( + lambda batch_size_: self._gen_batch(session_data, + batch_size_, + batch_strategy, + shuffle), + output_types=(tf.float32, tf.float32), + output_shapes=([None] + list(session_data.X[0].shape), # set batch to None + [None] + list(session_data.Y[0].shape)), # set batch to None + args=([batch_size]) + ) @staticmethod def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': @@ -553,10 +585,6 @@ def _create_t2t_transformer_encoder(self, encoder_decoder_attention_bias ) = transformer_prepare_encoder(x, None, hparams) - if hparams.pos == 'custom_timing': - x = common_attention.add_timing_signal_1d( - x, max_timescale=self.pos_max_timescale) - x *= tf.expand_dims(mask, -1) x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) @@ -643,10 +671,13 @@ def _tf_get_negs(self, seq_length = tf.shape(raw_pos)[1] raw_flat = self._tf_make_flat(raw_pos) - total_cands = tf.shape(all_embed)[0] + total_candidates = tf.shape(all_embed)[0] - all_indices = tf.tile(tf.expand_dims(tf.range(0, total_cands, 1), 0), (batch_size * seq_length, 1)) - shuffled_indices = tf.transpose(tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)) + all_indices = tf.tile(tf.expand_dims(tf.range(0, total_candidates, 1), 0), + (batch_size * seq_length, 1)) + shuffled_indices = tf.transpose( + tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) + ) neg_ids = shuffled_indices[:, :self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) @@ -702,7 +733,7 @@ def _tf_sim( # calculate similarity with several # embedded actions for the loss - neg_inf = common_attention.large_compatible_negative(pos_dial_embed.dtype) + neg_inf = large_compatible_negative(pos_dial_embed.dtype) sim_pos = self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, @@ -886,6 +917,119 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: mask) return loss, acc + # training helpers + def _linearly_increasing_batch_size(self, epoch: int) -> int: + """Linearly increase batch size with every epoch. + + The idea comes from https://arxiv.org/abs/1711.00489. 
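        A minimal illustration (an added sketch, not part of the original
        patch), assuming the hypothetical values batch_size = [8, 32] and
        epochs = 10, the schedule above evaluates to

            >>> [int(8 + e * (32 - 8) / 9) for e in range(10)]
            [8, 10, 13, 16, 18, 21, 24, 26, 29, 32]

        i.e. the batch size grows linearly from the first to the second
        configured value over the training run.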
+ """ + + if not isinstance(self.batch_size, list): + return int(self.batch_size) + + if self.epochs > 1: + return int( + self.batch_size[0] + + epoch * (self.batch_size[1] - self.batch_size[0]) / (self.epochs - 1) + ) + else: + return int(self.batch_size[0]) + + def _train_tf_dataset(self, + train_init_op: 'tf.Operation', + eval_init_op: 'tf.Operation', + batch_size_in: 'tf.Tensor', + loss: 'tf.Tensor', + acc, + ) -> None: + """Train tf graph""" + + self.session.run(tf.global_variables_initializer()) + + if self.evaluate_on_num_examples: + logger.info( + "Validation accuracy is calculated every {} epochs" + "".format(self.evaluate_every_num_epochs) + ) + pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) + + train_loss = 0 + train_acc = 0 + eval_loss = 0 + eval_acc = 0 + for ep in pbar: + + batch_size = self._linearly_increasing_batch_size(ep) + + self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) + + ep_train_loss = 0 + ep_train_acc = 0 + batches_per_epoch = 0 + while True: + try: + _, batch_train_loss, batch_train_acc = self.session.run( + [self._train_op, loss, acc], feed_dict={self._is_training: True} + ) + batches_per_epoch += 1 + ep_train_loss += batch_train_loss + ep_train_acc += batch_train_acc + + except tf.errors.OutOfRangeError: + break + + train_loss = ep_train_loss / batches_per_epoch + train_acc = ep_train_acc / batches_per_epoch + + pbar.set_postfix({ + "loss": "{:.3f}".format(train_loss), + "acc": "{:.3f}".format(train_acc) + }) + + if eval_init_op is not None: + if ((ep + 1) % self.evaluate_every_num_epochs == 0 + or (ep + 1) == self.epochs): + eval_loss, eval_acc = self._output_training_stat_dataset( + eval_init_op, loss, acc + ) + if (ep + 1) != self.epochs: + logger.info("Evaluation results: " + "validation loss: {:.3f}, " + "validation accuracy: {:.3f}" + "".format(eval_loss, eval_acc)) + + final_message = ("Finished training embedding policy, " + "train loss={:.3f}, train accuracy={:.3f}" + "".format(train_loss, train_acc)) + if eval_init_op is not None: + final_message += (", validation loss={:.3f}, validation accuracy={:.3f}" + "".format(eval_loss, eval_acc)) + logger.info(final_message) + + def _output_training_stat_dataset(self, + eval_init_op: 'tf.Operation', + loss: 'tf.Tensor', + acc: 'tf.Tensor') -> Tuple[float, float]: + """Output training statistics""" + + self.session.run(eval_init_op) + ep_val_loss = 0 + ep_val_acc = 0 + batches_per_epoch = 0 + while True: + try: + batch_val_loss, batch_val_acc = self.session.run( + [loss, acc], feed_dict={self._is_training: False} + ) + batches_per_epoch += 1 + ep_val_loss += batch_val_loss + ep_val_acc += batch_val_acc + except tf.errors.OutOfRangeError: + break + + return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch + + # prepare for prediction def _create_tf_placeholders(self, session_data: 'SessionData') -> None: """Create placeholders for prediction.""" @@ -977,6 +1121,11 @@ def train( # extract actual training data to feed to tf session session_data = self._create_session_data(training_data.X, training_data.y) + if self.evaluate_on_num_examples: + session_data, eval_session_data = self._train_val_split(session_data) + else: + eval_session_data = None + self.graph = tf.Graph() with self.graph.as_default(): @@ -985,18 +1134,22 @@ def train( # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = self._create_tf_dataset(session_data, batch_size_in) + train_dataset = self._create_tf_dataset(session_data, + batch_size_in, 
+ batch_strategy=self.batch_strategy, + shuffle=True) self._iterator = self._create_tf_iterator(train_dataset) train_init_op = self._iterator.make_initializer(train_dataset) - if self.evaluate_on_num_examples: - eval_session_data = self._sample_session_data( - session_data, self.evaluate_on_num_examples) - eval_train_dataset = self._create_tf_dataset( - eval_session_data, self.evaluate_on_num_examples, shuffle=False) - eval_init_op = self._iterator.make_initializer(eval_train_dataset) + if eval_session_data is not None: + eval_init_op = self._iterator.make_initializer( + self._create_tf_dataset( + eval_session_data, + # pick maximum batch_size for eval + self._linearly_increasing_batch_size(self.epochs)) + ) else: eval_init_op = None @@ -1016,100 +1169,6 @@ def train( self.attention_weights = self._extract_attention() - # training helpers - def _linearly_increasing_batch_size(self, epoch: int) -> int: - """Linearly increase batch size with every epoch. - - The idea comes from https://arxiv.org/abs/1711.00489. - """ - - if not isinstance(self.batch_size, list): - return int(self.batch_size) - - if self.epochs > 1: - return int( - self.batch_size[0] - + epoch * (self.batch_size[1] - self.batch_size[0]) / (self.epochs - 1) - ) - else: - return int(self.batch_size[0]) - - def _train_tf_dataset(self, - train_init_op: 'tf.Operation', - eval_init_op: 'tf.Operation', - batch_size_in: 'tf.Tensor', - loss: 'tf.Tensor', - acc, - ) -> None: - """Train tf graph""" - - self.session.run(tf.global_variables_initializer()) - - if self.evaluate_on_num_examples: - logger.info( - "Accuracy is updated every {} epochs" - "".format(self.evaluate_every_num_epochs) - ) - pbar = tqdm(range(self.epochs), desc="Epochs", disable=is_logging_disabled()) - - eval_acc = 0 - eval_loss = 0 - for ep in pbar: - - batch_size = self._linearly_increasing_batch_size(ep) - - self.session.run(train_init_op, feed_dict={batch_size_in: batch_size}) - - ep_train_loss = 0 - ep_train_acc = 0 - batches_per_epoch = 0 - while True: - try: - _, batch_train_loss, batch_train_acc = self.session.run( - [self._train_op, loss, acc], - feed_dict={self._is_training: True} - ) - batches_per_epoch += 1 - ep_train_loss += batch_train_loss - ep_train_acc += batch_train_acc - - except tf.errors.OutOfRangeError: - break - - ep_train_loss /= batches_per_epoch - ep_train_acc /= batches_per_epoch - - pbar.set_postfix({ - "loss": "{:.3f}".format(ep_train_loss), - "acc": "{:.3f}".format(ep_train_acc) - }) - - if self.evaluate_on_num_examples and eval_init_op is not None: - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs): - eval_loss, eval_acc = self._output_training_stat_dataset( - eval_init_op, loss, acc - ) - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - and (ep + 1) != self.epochs): - logger.info("Evaluation results: loss: {:.3f}, acc: {:.3f}" - "".format(eval_loss, eval_acc)) - - if self.evaluate_on_num_examples: - logger.info("Finished training embedding classifier, " - "loss={:.3f}, accuracy={:.3f}" - "".format(eval_loss, eval_acc)) - - def _output_training_stat_dataset(self, - eval_init_op: 'tf.Operation', - loss: 'tf.Tensor', - acc: 'tf.Tensor') -> Tuple[float, float]: - """Output training statistics""" - - self.session.run(eval_init_op) - - return self.session.run([loss, acc], feed_dict={self._is_training: False}) - def continue_training( self, training_trackers: List['DialogueStateTracker'], From 717d6e0f7bd60987fe167e8a161c2ef1640f40c3 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 16:56:50 
+0200 Subject: [PATCH 19/50] black --- rasa/core/policies/embedding_policy.py | 711 +++++++++++++------------ 1 file changed, 377 insertions(+), 334 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 687136c4363b..5145039c7a02 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -17,7 +17,7 @@ TrackerFeaturizer, FullDialogueTrackerFeaturizer, LabelTokenizerSingleStateFeaturizer, - MaxHistoryTrackerFeaturizer + MaxHistoryTrackerFeaturizer, ) from rasa.core.policies.policy import Policy from rasa.core.trackers import DialogueStateTracker @@ -27,9 +27,11 @@ import tensorflow as tf try: - from tensor2tensor.models.transformer import (transformer_base, - transformer_prepare_encoder, - transformer_encoder) + from tensor2tensor.models.transformer import ( + transformer_base, + transformer_prepare_encoder, + transformer_encoder, + ) from tensor2tensor.layers.common_attention import large_compatible_negative except ImportError: @@ -51,14 +53,7 @@ logger = logging.getLogger(__name__) # namedtuple for all tf session related data -SessionData = namedtuple( - "SessionData", - ( - "X", - "Y", - "labels", - ), -) +SessionData = namedtuple("SessionData", ("X", "Y", "labels")) class EmbeddingPolicy(Policy): @@ -90,7 +85,7 @@ class EmbeddingPolicy(Policy): # batch size will be linearly increased for each epoch "batch_size": [8, 32], # how to create batches - "batch_strategy": 'sequence', # string 'sequence' or 'balanced' + "batch_strategy": "sequence", # string 'sequence' or 'balanced' # number of epochs "epochs": 1, # set random seed to any int to get reproducible results @@ -106,7 +101,7 @@ class EmbeddingPolicy(Policy): # the type of the similarity "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' # the type of the loss function - "loss_type": 'softmax', # string 'softmax' or 'margin' + "loss_type": "softmax", # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "num_neg": 20, @@ -132,12 +127,13 @@ class EmbeddingPolicy(Policy): # end default properties (DOC MARKER - don't remove) @staticmethod - def _standard_featurizer(max_history: Optional[int] = None) -> 'TrackerFeaturizer': + def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturizer": if max_history is None: return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) else: - return MaxHistoryTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer(), - max_history=max_history) + return MaxHistoryTrackerFeaturizer( + LabelTokenizerSingleStateFeaturizer(), max_history=max_history + ) @staticmethod def _check_t2t() -> None: @@ -146,22 +142,22 @@ def _check_t2t() -> None: def __init__( self, - featurizer: Optional['TrackerFeaturizer'] = None, + featurizer: Optional["TrackerFeaturizer"] = None, priority: int = 1, - encoded_all_actions: Optional['np.ndarray'] = None, - graph: Optional['tf.Graph'] = None, - session: Optional['tf.Session'] = None, - intent_placeholder: Optional['tf.Tensor'] = None, - action_placeholder: Optional['tf.Tensor'] = None, - slots_placeholder: Optional['tf.Tensor'] = None, - prev_act_placeholder: Optional['tf.Tensor'] = None, - similarity_all: Optional['tf.Tensor'] = None, - pred_confidence: Optional['tf.Tensor'] = None, - similarity: Optional['tf.Tensor'] = None, - dial_embed: Optional['tf.Tensor'] = None, - bot_embed: Optional['tf.Tensor'] = None, - all_bot_embed: 
Optional['tf.Tensor'] = None, - attention_weights=None, + encoded_all_actions: Optional["np.ndarray"] = None, + graph: Optional["tf.Graph"] = None, + session: Optional["tf.Session"] = None, + intent_placeholder: Optional["tf.Tensor"] = None, + action_placeholder: Optional["tf.Tensor"] = None, + slots_placeholder: Optional["tf.Tensor"] = None, + prev_act_placeholder: Optional["tf.Tensor"] = None, + similarity_all: Optional["tf.Tensor"] = None, + pred_confidence: Optional["tf.Tensor"] = None, + similarity: Optional["tf.Tensor"] = None, + dial_embed: Optional["tf.Tensor"] = None, + bot_embed: Optional["tf.Tensor"] = None, + all_bot_embed: Optional["tf.Tensor"] = None, + attention_weights: Optional["tf.Tensor"] = None, max_history: Optional[int] = None, **kwargs: Any ) -> None: @@ -207,9 +203,9 @@ def __init__( def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] - self.pos_encoding = config['pos_encoding'] - self.max_seq_length = config['max_seq_length'] - self.num_heads = config['num_heads'] + self.pos_encoding = config["pos_encoding"] + self.max_seq_length = config["max_seq_length"] + self.num_heads = config["num_heads"] self.transformer_size = config["transformer_size"] self.num_transformer_layers = config["num_transformer_layers"] @@ -226,12 +222,12 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.mu_pos = config["mu_pos"] self.mu_neg = config["mu_neg"] self.similarity_type = config["similarity_type"] - self.loss_type = config['loss_type'] - if self.similarity_type == 'auto': - if self.loss_type == 'softmax': - self.similarity_type = 'inner' - elif self.loss_type == 'margin': - self.similarity_type = 'cosine' + self.loss_type = config["loss_type"] + if self.similarity_type == "auto": + if self.loss_type == "softmax": + self.similarity_type = "inner" + elif self.loss_type == "margin": + self.similarity_type = "cosine" self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] @@ -239,10 +235,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.droprate = { - "bot": config["droprate_bot"], - "dial": config["droprate_dial"], - } + self.droprate = {"bot": config["droprate_bot"], "dial": config["droprate_dial"]} def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] @@ -263,36 +256,36 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: # data helpers # noinspection PyPep8Naming @staticmethod - def _labels_for_Y(data_Y: 'np.ndarray') -> 'np.ndarray': + def _labels_for_Y(data_Y: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: extract actions indices.""" return data_Y.argmax(axis=-1) # noinspection PyPep8Naming - def _action_features_for_Y(self, labels: 'np.ndarray') -> 'np.ndarray': + def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": """Prepare Y data for training: features for action labels.""" if len(labels.shape) == 2: return np.stack( [ np.stack( - [self.encoded_all_actions[action_idx] - for action_idx in action_ids] + [ + self.encoded_all_actions[action_idx] + for action_idx in action_ids + ] ) for action_ids in labels ] ) else: return np.stack( - [ - self.encoded_all_actions[action_idx] for action_idx in labels - ] + [self.encoded_all_actions[action_idx] for action_idx in labels] ) # 
noinspection PyPep8Naming def _create_session_data( - self, data_X: 'np.ndarray', data_Y: Optional['np.ndarray'] = None - ) -> 'SessionData': + self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None + ) -> "SessionData": """Combine all tf session related data into a named tuple""" if data_Y is not None: @@ -304,25 +297,23 @@ def _create_session_data( if labels.ndim == 2: # for multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 - labels = np.array([' '.join(row.astype('str')) for row in labels]) + labels = np.array([" ".join(row.astype("str")) for row in labels]) else: # prediction time labels = None Y = None - return SessionData( - X=data_X, - Y=Y, - labels=labels, - ) + return SessionData(X=data_X, Y=Y, labels=labels) # noinspection PyPep8Naming - def _train_val_split(self, session_data: 'SessionData' - ) -> Tuple['SessionData', 'SessionData']: + def _train_val_split( + self, session_data: "SessionData" + ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" - label_counts = dict(zip(*np.unique(session_data.labels, - return_counts=True, axis=0))) + label_counts = dict( + zip(*np.unique(session_data.labels, return_counts=True, axis=0)) + ) counts = np.array([label_counts[label] for label in session_data.labels]) multi_X = session_data.X[counts > 1] @@ -333,23 +324,25 @@ def _train_val_split(self, session_data: 'SessionData' solo_Y = session_data.Y[counts == 1] solo_labels = session_data.labels[counts == 1] - (X_train, X_val, - Y_train, Y_val, - labels_train, labels_val) = train_test_split( - multi_X, multi_Y, multi_labels, + (X_train, X_val, Y_train, Y_val, labels_train, labels_val) = train_test_split( + multi_X, + multi_Y, + multi_labels, test_size=self.evaluate_on_num_examples, random_state=self.random_seed, - stratify=multi_labels + stratify=multi_labels, ) X_train = np.concatenate([X_train, solo_X]) Y_train = np.concatenate([Y_train, solo_Y]) labels_train = np.concatenate([labels_train, solo_labels]) - return (SessionData(X=X_train, Y=Y_train, labels=labels_train), - SessionData(X=X_val, Y=Y_val, labels=labels_val)) + return ( + SessionData(X=X_train, Y=Y_train, labels=labels_train), + SessionData(X=X_val, Y=Y_val, labels=labels_val), + ) @staticmethod - def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': + def _shuffle_session_data(session_data: "SessionData") -> "SessionData": """Shuffle session data.""" ids = np.random.permutation(len(session_data.X)) @@ -361,31 +354,34 @@ def _shuffle_session_data(session_data: 'SessionData') -> 'SessionData': # tf helpers: # noinspection PyPep8Naming - def _gen_batch(self, - session_data: 'SessionData', - batch_size: int, - batch_strategy: Text = 'sequence', - shuffle: bool = False - ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: + def _gen_batch( + self, + session_data: "SessionData", + batch_size: int, + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: """Generate batches.""" if shuffle: session_data = self._shuffle_session_data(session_data) - if batch_strategy == 'balanced': + if batch_strategy == "balanced": num_examples = len(session_data.X) - unique_labels, counts_labels = np.unique(session_data.labels, - return_counts=True, - axis=0) + unique_labels, counts_labels = np.unique( + session_data.labels, return_counts=True, axis=0 + ) num_labels = len(unique_labels) label_data = [] for label in 
unique_labels: - label_data.append(SessionData( - X=session_data.X[session_data.labels == label], - Y=session_data.Y[session_data.labels == label], - labels=None # ignore new labels - )) + label_data.append( + SessionData( + X=session_data.X[session_data.labels == label], + Y=session_data.Y[session_data.labels == label], + labels=None, # ignore new labels + ) + ) data_idx = [0] * num_labels num_data_cycles = [0] * num_labels @@ -407,8 +403,8 @@ def _gen_batch(self, num_i = int(counts_labels[i] / num_examples * batch_size) + 1 - new_X.append(label_data[i].X[data_idx[i]:data_idx[i]+num_i]) - new_Y.append(label_data[i].Y[data_idx[i]:data_idx[i]+num_i]) + new_X.append(label_data[i].X[data_idx[i] : data_idx[i] + num_i]) + new_Y.append(label_data[i].Y[data_idx[i] : data_idx[i] + num_i]) data_idx[i] += num_i if data_idx[i] >= counts_labels[i]: @@ -418,53 +414,62 @@ def _gen_batch(self, if min(num_data_cycles) > 0: break - session_data = SessionData(X=np.concatenate(new_X), - Y=np.concatenate(new_Y), - labels=None) # ignore new labels + session_data = SessionData( + X=np.concatenate(new_X), Y=np.concatenate(new_Y), labels=None + ) # ignore new labels - num_batches = (session_data.X.shape[0] // batch_size - + int(session_data.X.shape[0] % batch_size > 0)) + num_batches = session_data.X.shape[0] // batch_size + int( + session_data.X.shape[0] % batch_size > 0 + ) for batch_num in range(num_batches): batch_x = session_data.X[ - batch_num * batch_size: (batch_num + 1) * batch_size] + batch_num * batch_size : (batch_num + 1) * batch_size + ] batch_y = session_data.Y[ - batch_num * batch_size: (batch_num + 1) * batch_size] + batch_num * batch_size : (batch_num + 1) * batch_size + ] yield batch_x, batch_y - def _create_tf_dataset(self, session_data: 'SessionData', - batch_size: Union['tf.Tensor', int], - batch_strategy: Text = 'sequence', - shuffle: bool = False) -> 'tf.data.Dataset': + def _create_tf_dataset( + self, + session_data: "SessionData", + batch_size: Union["tf.Tensor", int], + batch_strategy: Text = "sequence", + shuffle: bool = False, + ) -> "tf.data.Dataset": """Create tf dataset.""" return tf.data.Dataset.from_generator( - lambda batch_size_: self._gen_batch(session_data, - batch_size_, - batch_strategy, - shuffle), + lambda batch_size_: self._gen_batch( + session_data, batch_size_, batch_strategy, shuffle + ), output_types=(tf.float32, tf.float32), - output_shapes=([None] + list(session_data.X[0].shape), # set batch to None - [None] + list(session_data.Y[0].shape)), # set batch to None - args=([batch_size]) + output_shapes=( + [None] + list(session_data.X[0].shape), # set batch to None + [None] + list(session_data.Y[0].shape), # set batch to None + ), + args=([batch_size]), ) @staticmethod - def _create_tf_iterator(dataset: 'tf.data.Dataset') -> 'tf.data.Iterator': + def _create_tf_iterator(dataset: "tf.data.Dataset") -> "tf.data.Iterator": """Create tf iterator.""" - return tf.data.Iterator.from_structure(dataset.output_types, - dataset.output_shapes, - output_classes=dataset.output_classes) + return tf.data.Iterator.from_structure( + dataset.output_types, + dataset.output_shapes, + output_classes=dataset.output_classes, + ) def _create_tf_nn( self, - x_in: 'tf.Tensor', + x_in: "tf.Tensor", layer_sizes: List[int], droprate: float, layer_name_suffix: Text, - ) -> 'tf.Tensor': + ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -481,7 +486,7 @@ def _create_tf_nn( x = tf.layers.dropout(x, rate=droprate, 
training=self._is_training) return x - def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': + def _tf_normalize_if_cosine(self, x: "tf.Tensor") -> "tf.Tensor": """Normalize embedding if similarity type is cosine.""" if self.similarity_type == "cosine": @@ -495,7 +500,7 @@ def _tf_normalize_if_cosine(self, x: 'tf.Tensor') -> 'tf.Tensor': "".format(self.similarity_type) ) - def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tensor': + def _create_tf_embed(self, x: "tf.Tensor", layer_name_suffix: Text) -> "tf.Tensor": """Create dense embedding layer with a name.""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -510,7 +515,7 @@ def _create_tf_embed(self, x: 'tf.Tensor', layer_name_suffix: Text) -> 'tf.Tenso # normalize embedding vectors for cosine similarity return self._tf_normalize_if_cosine(embed_x) - def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': + def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" b = self._create_tf_nn( @@ -521,9 +526,9 @@ def _create_tf_bot_embed(self, b_in: 'tf.Tensor') -> 'tf.Tensor': ) return self._create_tf_embed(b, layer_name_suffix="bot") - def _create_t2t_hparams(self) -> 'HParams': + def _create_t2t_hparams(self) -> "HParams": """Create parameters for t2t transformer.""" - + hparams = transformer_base() hparams.num_hidden_layers = self.num_transformer_layers @@ -545,13 +550,14 @@ def _create_t2t_hparams(self) -> 'HParams': return hparams # noinspection PyUnresolvedReferences - def _create_t2t_transformer_encoder(self, - x_in: 'tf.Tensor', - mask: 'tf.Tensor', - attention_weights: Dict[Text, 'tf.Tensor'], - ) -> 'tf.Tensor': + def _create_t2t_transformer_encoder( + self, + x_in: "tf.Tensor", + mask: "tf.Tensor", + attention_weights: Dict[Text, "tf.Tensor"], + ) -> "tf.Tensor": """Create t2t transformer encoder.""" - + hparams = self._create_t2t_hparams() # When not in training mode, set all forms of dropout to zero. 
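A minimal NumPy sketch (not part of the patch, values made up for illustration) of the padding mask used by the dialogue-level transformer in this file: padded dialogue turns are filled with -1, so sign(max(features) + 1) is 0 for padding and 1 for real turns, and the encoder activations are multiplied by this mask:

    import numpy as np

    a_in = np.array([[[0., 1., 0.],        # real turn
                      [1., 0., 1.],        # real turn
                      [-1., -1., -1.]]])   # padded turn
    mask = np.sign(a_in.max(-1) + 1)       # -> array([[1., 1., 0.]])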
@@ -566,24 +572,27 @@ def _create_t2t_transformer_encoder(self, units=hparams.hidden_size, use_bias=False, kernel_initializer=tf.random_normal_initializer( - 0.0, hparams.hidden_size ** -0.5), + 0.0, hparams.hidden_size ** -0.5 + ), kernel_regularizer=reg, - name='transformer_embed_layer', - reuse=tf.AUTO_REUSE + name="transformer_embed_layer", + reuse=tf.AUTO_REUSE, + ) + x = tf.layers.dropout( + x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training ) - x = tf.layers.dropout(x, rate=hparams.layer_prepostprocess_dropout, - training=self._is_training) if hparams.multiply_embedding_mode == "sqrt_depth": x *= hparams.hidden_size ** 0.5 x *= tf.expand_dims(mask, -1) - with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE): - (x, - self_attention_bias, - encoder_decoder_attention_bias - ) = transformer_prepare_encoder(x, None, hparams) + with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + ( + x, + self_attention_bias, + encoder_decoder_attention_bias, + ) = transformer_prepare_encoder(x, None, hparams) x *= tf.expand_dims(mask, -1) @@ -605,20 +614,21 @@ def _create_t2t_transformer_encoder(self, x *= tf.expand_dims(mask, -1) - return tf.nn.dropout(tf.nn.relu(x), - 1.0 - hparams.layer_prepostprocess_dropout) + return tf.nn.dropout( + tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout + ) - def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + def _create_tf_dial(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Create dialogue level embedding and mask.""" - + # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) self.attention_weights = {} - a = self._create_t2t_transformer_encoder(self.a_in, - mask, - self.attention_weights) + a = self._create_t2t_transformer_encoder( + self.a_in, mask, self.attention_weights + ) dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") @@ -630,25 +640,24 @@ def _create_tf_dial(self) -> Tuple['tf.Tensor', 'tf.Tensor']: return dial_embed, mask @staticmethod - def _tf_make_flat(x: 'tf.Tensor') -> 'tf.Tensor': + def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor": """Make tensor 2D.""" return tf.reshape(x, (-1, x.shape[-1])) @staticmethod - def _tf_sample_neg(batch_size: 'tf.Tensor', - all_bs: 'tf.Tensor', - neg_ids: 'tf.Tensor') -> 'tf.Tensor': + def _tf_sample_neg( + batch_size: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": """Sample negative examples for given indices""" tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) return tf.batch_gather(tiled_all_bs, neg_ids) - def _tf_calc_iou_mask(self, - pos_b: 'tf.Tensor', - all_bs: 'tf.Tensor', - neg_ids: 'tf.Tensor') -> 'tf.Tensor': + def _tf_calc_iou_mask( + self, pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" + ) -> "tf.Tensor": """Calculate IOU mask for given indices""" pos_b_in_flat = tf.expand_dims(pos_b, -2) @@ -657,14 +666,14 @@ def _tf_calc_iou_mask(self, intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - iou = (tf.reduce_sum(intersection_b_in_flat, -1) - / tf.reduce_sum(union_b_in_flat, -1)) - return 1. - tf.nn.relu(tf.sign(1. 
- iou)) + iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum( + union_b_in_flat, -1 + ) + return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) - def _tf_get_negs(self, - all_embed: 'tf.Tensor', - all_raw: 'tf.Tensor', - raw_pos: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: + def _tf_get_negs( + self, all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor" + ) -> Tuple["tf.Tensor", "tf.Tensor"]: """Get negative examples from given tensor.""" batch_size = tf.shape(raw_pos)[0] @@ -673,62 +682,69 @@ def _tf_get_negs(self, total_candidates = tf.shape(all_embed)[0] - all_indices = tf.tile(tf.expand_dims(tf.range(0, total_candidates, 1), 0), - (batch_size * seq_length, 1)) + all_indices = tf.tile( + tf.expand_dims(tf.range(0, total_candidates, 1), 0), + (batch_size * seq_length, 1), + ) shuffled_indices = tf.transpose( tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) ) - neg_ids = shuffled_indices[:, :self.num_neg] + neg_ids = shuffled_indices[:, : self.num_neg] bad_negs_flat = self._tf_calc_iou_mask(raw_flat, all_raw, neg_ids) bad_negs = tf.reshape(bad_negs_flat, (batch_size, seq_length, -1)) - neg_embed_flat = self._tf_sample_neg(batch_size * seq_length, - all_embed, neg_ids) - neg_embed = tf.reshape(neg_embed_flat, - (batch_size, seq_length, -1, all_embed.shape[-1])) + neg_embed_flat = self._tf_sample_neg( + batch_size * seq_length, all_embed, neg_ids + ) + neg_embed = tf.reshape( + neg_embed_flat, (batch_size, seq_length, -1, all_embed.shape[-1]) + ) return neg_embed, bad_negs - def _sample_negatives(self, all_actions: 'tf.Tensor') -> Tuple['tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor', - 'tf.Tensor']: + def _sample_negatives( + self, all_actions: "tf.Tensor" + ) -> Tuple[ + "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor" + ]: """Sample negative examples.""" pos_dial_embed = tf.expand_dims(self.dial_embed, -2) neg_dial_embed, dial_bad_negs = self._tf_get_negs( self._tf_make_flat(self.dial_embed), self._tf_make_flat(self.b_in), - self.b_in + self.b_in, ) pos_bot_embed = tf.expand_dims(self.bot_embed, -2) neg_bot_embed, bot_bad_negs = self._tf_get_negs( - self.all_bot_embed, - all_actions, - self.b_in + self.all_bot_embed, all_actions, self.b_in + ) + return ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, ) - return (pos_dial_embed, pos_bot_embed, neg_dial_embed, neg_bot_embed, - dial_bad_negs, bot_bad_negs) @staticmethod - def _tf_raw_sim(a: 'tf.Tensor', b: 'tf.Tensor', mask: 'tf.Tensor') -> 'tf.Tensor': + def _tf_raw_sim(a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor": """Calculate similarity between given tensors.""" return tf.reduce_sum(a * b, -1) * tf.expand_dims(mask, 2) def _tf_sim( self, - pos_dial_embed: 'tf.Tensor', - pos_bot_embed: 'tf.Tensor', - neg_dial_embed: 'tf.Tensor', - neg_bot_embed: 'tf.Tensor', - dial_bad_negs: 'tf.Tensor', - bot_bad_negs: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> Tuple['tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor', 'tf.Tensor']: + pos_dial_embed: "tf.Tensor", + pos_bot_embed: "tf.Tensor", + neg_dial_embed: "tf.Tensor", + neg_bot_embed: "tf.Tensor", + dial_bad_negs: "tf.Tensor", + bot_bad_negs: "tf.Tensor", + mask: "tf.Tensor", + ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]: """Define similarity.""" # calculate similarity with several @@ -736,61 +752,70 @@ def _tf_sim( neg_inf = large_compatible_negative(pos_dial_embed.dtype) sim_pos = 
self._tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) - sim_neg = self._tf_raw_sim(pos_dial_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs - sim_neg_bot_bot = self._tf_raw_sim(pos_bot_embed, neg_bot_embed, - mask) + neg_inf * bot_bad_negs - sim_neg_dial_dial = self._tf_raw_sim(pos_dial_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs - sim_neg_bot_dial = self._tf_raw_sim(pos_bot_embed, neg_dial_embed, - mask) + neg_inf * dial_bad_negs + sim_neg = ( + self._tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_bot_bot = ( + self._tf_raw_sim(pos_bot_embed, neg_bot_embed, mask) + + neg_inf * bot_bad_negs + ) + sim_neg_dial_dial = ( + self._tf_raw_sim(pos_dial_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) + sim_neg_bot_dial = ( + self._tf_raw_sim(pos_bot_embed, neg_dial_embed, mask) + + neg_inf * dial_bad_negs + ) # output similarities between user input and bot actions # and similarities between bot actions and similarities between user inputs return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial @staticmethod - def _tf_calc_accuracy(sim_pos: 'tf.Tensor', sim_neg: 'tf.Tensor') -> 'tf.Tensor': + def _tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor": """Calculate accuracy""" max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) - return tf.reduce_mean(tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), - tf.float32)) + return tf.reduce_mean( + tf.cast(tf.math.equal(max_all_sim, sim_pos[:, :, 0]), tf.float32) + ) def _tf_loss_margin( self, - sim_pos: 'tf.Tensor', - sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Define max margin loss.""" # loss for maximizing similarity with correct action - loss = tf.maximum(0., self.mu_pos - sim_pos[:, :, 0]) + loss = tf.maximum(0.0, self.mu_pos - sim_pos[:, :, 0]) # loss for minimizing similarity with `num_neg` incorrect actions if self.use_max_sim_neg: # minimize only maximum similarity over incorrect actions max_sim_neg = tf.reduce_max(sim_neg, -1) - loss += tf.maximum(0., self.mu_neg + max_sim_neg) + loss += tf.maximum(0.0, self.mu_neg + max_sim_neg) else: # minimize all similarities with incorrect actions - max_margin = tf.maximum(0., self.mu_neg + sim_neg) + max_margin = tf.maximum(0.0, self.mu_neg + sim_neg) loss += tf.reduce_sum(max_margin, -1) # penalize max similarity between pos bot and neg bot embeddings - max_sim_neg_bot = tf.maximum(0., tf.reduce_max(sim_neg_bot_bot, -1)) + max_sim_neg_bot = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_bot, -1)) loss += max_sim_neg_bot * self.C_emb # penalize max similarity between pos dial and neg dial embeddings - max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_dial_dial, -1)) + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_dial_dial, -1)) loss += max_sim_neg_dial * self.C_emb # penalize max similarity between pos bot and neg dial embeddings - max_sim_neg_dial = tf.maximum(0., tf.reduce_max(sim_neg_bot_dial, -1)) + max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_dial, -1)) loss += max_sim_neg_dial * self.C_emb # mask loss for different length sequences @@ -807,21 +832,18 @@ def _tf_loss_margin( @staticmethod def _tf_loss_softmax( - sim_pos: 'tf.Tensor', 
- sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor', - ) -> 'tf.Tensor': + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Define softmax loss.""" - logits = tf.concat([sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial - ], -1) + logits = tf.concat( + [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1 + ) # create labels for softmax pos_labels = tf.ones_like(logits[:, :, :1]) @@ -832,35 +854,41 @@ def _tf_loss_softmax( pred = tf.nn.softmax(logits) already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - loss = tf.losses.softmax_cross_entropy(labels, - logits, - mask * already_learned) + loss = tf.losses.softmax_cross_entropy(labels, logits, mask * already_learned) # add regularization losses loss += tf.losses.get_regularization_loss() return loss - def _choose_loss(self, - sim_pos: 'tf.Tensor', - sim_neg: 'tf.Tensor', - sim_neg_bot_bot: 'tf.Tensor', - sim_neg_dial_dial: 'tf.Tensor', - sim_neg_bot_dial: 'tf.Tensor', - mask: 'tf.Tensor') -> 'tf.Tensor': + def _choose_loss( + self, + sim_pos: "tf.Tensor", + sim_neg: "tf.Tensor", + sim_neg_bot_bot: "tf.Tensor", + sim_neg_dial_dial: "tf.Tensor", + sim_neg_bot_dial: "tf.Tensor", + mask: "tf.Tensor", + ) -> "tf.Tensor": """Use loss depending on given option.""" - if self.loss_type == 'margin': - return self._tf_loss_margin(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) - elif self.loss_type == 'softmax': - return self._tf_loss_softmax(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) + if self.loss_type == "margin": + return self._tf_loss_margin( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, + ) + elif self.loss_type == "softmax": + return self._tf_loss_softmax( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + sim_neg_bot_dial, + mask, + ) else: raise ValueError( "Wrong loss type '{}', " @@ -868,15 +896,15 @@ def _choose_loss(self, "".format(self.loss_type) ) - def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: + def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Bulid train graph using iterator.""" # session data are int counts but we need a float tensors self.a_in, self.b_in = self._iterator.get_next() - all_actions = tf.constant(self.encoded_all_actions, - dtype=tf.float32, - name="all_actions") + all_actions = tf.constant( + self.encoded_all_actions, dtype=tf.float32, name="all_actions" + ) self.dial_embed, mask = self._create_tf_dial() @@ -888,33 +916,37 @@ def _build_tf_train_graph(self) -> Tuple['tf.Tensor', 'tf.Tensor']: self.b_in = self.b_in[:, tf.newaxis, :] self.bot_embed = self.bot_embed[:, tf.newaxis, :] - (pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs) = self._sample_negatives(all_actions) + ( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + ) = self._sample_negatives(all_actions) # calculate similarities - (sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial) = self._tf_sim(pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - mask) + ( + sim_pos, + sim_neg, + sim_neg_bot_bot, + sim_neg_dial_dial, + 
sim_neg_bot_dial, + ) = self._tf_sim( + pos_dial_embed, + pos_bot_embed, + neg_dial_embed, + neg_bot_embed, + dial_bad_negs, + bot_bad_negs, + mask, + ) acc = self._tf_calc_accuracy(sim_pos, sim_neg) - loss = self._choose_loss(sim_pos, sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask) + loss = self._choose_loss( + sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial, mask + ) return loss, acc # training helpers @@ -935,13 +967,14 @@ def _linearly_increasing_batch_size(self, epoch: int) -> int: else: return int(self.batch_size[0]) - def _train_tf_dataset(self, - train_init_op: 'tf.Operation', - eval_init_op: 'tf.Operation', - batch_size_in: 'tf.Tensor', - loss: 'tf.Tensor', - acc, - ) -> None: + def _train_tf_dataset( + self, + train_init_op: "tf.Operation", + eval_init_op: "tf.Operation", + batch_size_in: "tf.Tensor", + loss: "tf.Tensor", + acc: "tf.Tensor", + ) -> None: """Train tf graph""" self.session.run(tf.global_variables_initializer()) @@ -981,35 +1014,40 @@ def _train_tf_dataset(self, train_loss = ep_train_loss / batches_per_epoch train_acc = ep_train_acc / batches_per_epoch - pbar.set_postfix({ - "loss": "{:.3f}".format(train_loss), - "acc": "{:.3f}".format(train_acc) - }) + pbar.set_postfix( + {"loss": "{:.3f}".format(train_loss), "acc": "{:.3f}".format(train_acc)} + ) if eval_init_op is not None: - if ((ep + 1) % self.evaluate_every_num_epochs == 0 - or (ep + 1) == self.epochs): + if (ep + 1) % self.evaluate_every_num_epochs == 0 or ( + ep + 1 + ) == self.epochs: eval_loss, eval_acc = self._output_training_stat_dataset( eval_init_op, loss, acc ) if (ep + 1) != self.epochs: - logger.info("Evaluation results: " - "validation loss: {:.3f}, " - "validation accuracy: {:.3f}" - "".format(eval_loss, eval_acc)) - - final_message = ("Finished training embedding policy, " - "train loss={:.3f}, train accuracy={:.3f}" - "".format(train_loss, train_acc)) + logger.info( + "Evaluation results: " + "validation loss: {:.3f}, " + "validation accuracy: {:.3f}" + "".format(eval_loss, eval_acc) + ) + + final_message = ( + "Finished training embedding policy, " + "train loss={:.3f}, train accuracy={:.3f}" + "".format(train_loss, train_acc) + ) if eval_init_op is not None: - final_message += (", validation loss={:.3f}, validation accuracy={:.3f}" - "".format(eval_loss, eval_acc)) + final_message += ( + ", validation loss={:.3f}, validation accuracy={:.3f}" + "".format(eval_loss, eval_acc) + ) logger.info(final_message) - def _output_training_stat_dataset(self, - eval_init_op: 'tf.Operation', - loss: 'tf.Tensor', - acc: 'tf.Tensor') -> Tuple[float, float]: + def _output_training_stat_dataset( + self, eval_init_op: "tf.Operation", loss: "tf.Tensor", acc: "tf.Tensor" + ) -> Tuple[float, float]: """Output training statistics""" self.session.run(eval_init_op) @@ -1030,9 +1068,9 @@ def _output_training_stat_dataset(self, return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch # prepare for prediction - def _create_tf_placeholders(self, session_data: 'SessionData') -> None: + def _create_tf_placeholders(self, session_data: "SessionData") -> None: """Create placeholders for prediction.""" - + dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, @@ -1045,17 +1083,17 @@ def _create_tf_placeholders(self, session_data: 'SessionData') -> None: name="b", ) - def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': + def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": """Rebuild tf 
graph for prediction.""" - + self._create_tf_placeholders(session_data) - + self.dial_embed, mask = self._create_tf_dial() self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], - mask + mask, ) if self.similarity_type == "cosine": @@ -1068,19 +1106,19 @@ def _build_tf_pred_graph(self, session_data: 'SessionData') -> 'tf.Tensor': self.bot_embed = self._create_tf_bot_embed(self.b_in) self.sim = self._tf_raw_sim( - self.dial_embed[:, :, tf.newaxis, :], - self.bot_embed, - mask + self.dial_embed[:, :, tf.newaxis, :], self.bot_embed, mask ) return confidence - def _extract_attention(self) -> Optional['tf.Tensor']: + def _extract_attention(self) -> Optional["tf.Tensor"]: """Extract attention probabilities from t2t dict""" - - attention = [tf.expand_dims(t, 0) - for name, t in self.attention_weights.items() - if name.endswith('multihead_attention/dot_product_attention')] + + attention = [ + tf.expand_dims(t, 0) + for name, t in self.attention_weights.items() + if name.endswith("multihead_attention/dot_product_attention") + ] if attention: return tf.concat(attention, 0) @@ -1090,8 +1128,8 @@ def _extract_attention(self) -> Optional['tf.Tensor']: # training methods def train( self, - training_trackers: List['DialogueStateTracker'], - domain: 'Domain', + training_trackers: List["DialogueStateTracker"], + domain: "Domain", **kwargs: Any ) -> None: """Train the policy on given training trackers.""" @@ -1105,8 +1143,9 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = \ - self.featurizer.state_featurizer.create_encoded_all_actions(domain) + self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( + domain + ) # check if number of negatives is less than number of actions logger.debug( @@ -1134,10 +1173,12 @@ def train( # allows increasing batch size batch_size_in = tf.placeholder(tf.int64) - train_dataset = self._create_tf_dataset(session_data, - batch_size_in, - batch_strategy=self.batch_strategy, - shuffle=True) + train_dataset = self._create_tf_dataset( + session_data, + batch_size_in, + batch_strategy=self.batch_strategy, + shuffle=True, + ) self._iterator = self._create_tf_iterator(train_dataset) @@ -1148,7 +1189,8 @@ def train( self._create_tf_dataset( eval_session_data, # pick maximum batch_size for eval - self._linearly_increasing_batch_size(self.epochs)) + self._linearly_increasing_batch_size(self.epochs), + ) ) else: eval_init_op = None @@ -1161,8 +1203,9 @@ def train( # train tensorflow graph self.session = tf.Session(config=self._tf_config) - self._train_tf_dataset(train_init_op, eval_init_op, batch_size_in, - loss, acc) + self._train_tf_dataset( + train_init_op, eval_init_op, batch_size_in, loss, acc + ) # rebuild the graph for prediction self.pred_confidence = self._build_tf_pred_graph(session_data) @@ -1171,8 +1214,8 @@ def train( def continue_training( self, - training_trackers: List['DialogueStateTracker'], - domain: 'Domain', + training_trackers: List["DialogueStateTracker"], + domain: "Domain", **kwargs: Any ) -> None: """Continue training an already trained policy.""" @@ -1193,18 +1236,18 @@ def continue_training( # fit to one extra example using updated trackers while True: try: - self.session.run(self._train_op, - feed_dict={self._is_training: True}) + self.session.run( + self._train_op, feed_dict={self._is_training: True} + ) except tf.errors.OutOfRangeError: break - 
def tf_feed_dict_for_prediction(self, - tracker: 'DialogueStateTracker', - domain: 'Domain' - ) -> Dict['tf.Tensor', 'np.ndarray']: + def tf_feed_dict_for_prediction( + self, tracker: "DialogueStateTracker", domain: "Domain" + ) -> Dict["tf.Tensor", "np.ndarray"]: """Create feed dictionary for tf session.""" - + # noinspection PyPep8Naming data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) @@ -1212,7 +1255,7 @@ def tf_feed_dict_for_prediction(self, return {self.a_in: session_data.X} def predict_action_probabilities( - self, tracker: 'DialogueStateTracker', domain: 'Domain' + self, tracker: "DialogueStateTracker", domain: "Domain" ) -> List[float]: """Predict the next action the bot should take. @@ -1233,9 +1276,9 @@ def predict_action_probabilities( return confidence[0, -1, :].tolist() - def _persist_tensor(self, name: Text, tensor: 'tf.Tensor') -> None: + def _persist_tensor(self, name: Text, tensor: "tf.Tensor") -> None: """Add tensor to collection if it is not None""" - + if tensor is not None: self.graph.clear_collection(name) self.graph.add_to_collection(name, tensor) @@ -1292,7 +1335,7 @@ def persist(self, path: Text) -> None: pickle.dump(self._tf_config, f) @staticmethod - def load_tensor(name: Text) -> Optional['tf.Tensor']: + def load_tensor(name: Text) -> Optional["tf.Tensor"]: """Load tensor or set it to None""" tensor_list = tf.get_collection(name) From 7d30f55ec072ca8c15928f5d1a7d6fa99234fcf5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 17:25:44 +0200 Subject: [PATCH 20/50] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f219c13016e8..30116b0c1107 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,8 @@ Changed - ``Agent.update_model()`` and ``Agent.handle_message()`` now work without needing to set a domain or a policy ensemble - Update pytype to ``2019.7.11`` +- Substitute LSTM with Transformer in ``EmbeddingPolicy`` +- ``EmbeddingPolicy`` can now use ``MaxHistoryTrackerFeaturizer`` Removed ------- From dc27bfcf0b7e5d3ae305c54eaf889e2e411efa32 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 17:36:32 +0200 Subject: [PATCH 21/50] update tests --- tests/core/test_policies.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 483e797deead..95b22d96beb1 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -393,43 +393,27 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicyNoAttention(PolicyTestCollection): +class TestEmbeddingPolicyWithFeaturizer(PolicyTestCollection): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=False - ) + p = EmbeddingPolicy(featurizer=featurizer, priority=priority) return p -class TestEmbeddingPolicyAttentionBeforeRNN(PolicyTestCollection): +class TestEmbeddingPolicyWithFullDialogue(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=False - ) + # if max_history is not specified + p = EmbeddingPolicy(priority=priority) return 
p -class TestEmbeddingPolicyAttentionAfterRNN(PolicyTestCollection): +class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=False, attn_after_rnn=True - ) - return p - - -class TestEmbeddingPolicyAttentionBoth(PolicyTestCollection): - def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - p = EmbeddingPolicy( - priority=priority, attn_before_rnn=True, attn_after_rnn=True - ) + # since it is using MaxHistoryTrackerFeaturizer + # if max_history is specified + p = EmbeddingPolicy(priority=priority, max_history=self.max_history) return p @@ -437,6 +421,7 @@ class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer + # if max_history is not specified p = EmbeddingPolicy(priority=priority, **tf_defaults()) return p From 2faf24a2ac58dfe58aaff878374d69752c1e8bd4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:38:36 +0200 Subject: [PATCH 22/50] fix featurizer, add t2t requirements --- docs/core/policies.rst | 41 +- rasa/core/featurizers.py | 2 +- rasa/core/policies/tf_utils.py | 957 --------------------------------- requirements.txt | 3 +- 4 files changed, 9 insertions(+), 994 deletions(-) delete mode 100644 rasa/core/policies/tf_utils.py diff --git a/docs/core/policies.rst b/docs/core/policies.rst index a64dc15fac64..97606621df81 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -165,8 +165,10 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer. Embedding Policy ^^^^^^^^^^^^^^^^ -The Recurrent Embedding Dialogue Policy (REDP) -described in our paper: ``_ +Transformer Embedding Dialogue Policy (TEDP) + +Transformer version of the Recurrent Embedding Dialogue Policy (REDP) +used in our paper: ``_ This policy has a pre-defined architecture, which comprises the following steps: @@ -201,11 +203,6 @@ following steps: This step is based on the `StarSpace `_ idea. -.. note:: - - This policy only works with - ``FullDialogueTrackerFeaturizer(state_featurizer)``. - It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` (see :ref:`featurization` for details). @@ -219,40 +216,14 @@ It is recommended to use Pass an appropriate number of ``epochs`` to the ``EmbeddingPolicy``, otherwise the policy will be trained only for ``1`` - epoch. Since this is an embedding based policy, it requires a large - number of epochs, which depends on the complexity of the - training data and whether attention is used or not. - - The main feature of this policy is an **attention** mechanism over - previous user input and system actions. 
- **Attention is turned on by default**; in order to turn it off, - configure the following parameters: - - - ``attn_before_rnn`` if ``true`` the algorithm will use - attention mechanism over previous user input, default ``true``; - - ``attn_after_rnn`` if ``true`` the algorithm will use - attention mechanism over previous system actions and will be - able to copy previously executed action together with LSTM's - hidden state from its history, default ``true``; - - ``sparse_attention`` if ``true`` ``sparsemax`` will be used - instead of ``softmax`` for attention probabilities, default - ``false``; - - ``attn_shift_range`` the range of allowed location-based - attention shifts for system memory (``attn_after_rnn``), see - ``_ for details; + epoch. - .. note:: - - Attention requires larger values of ``epochs`` and takes longer - to train. But it can learn more complicated and nonlinear behaviour. + The main feature of this policy is **transformer**. The algorithm also has hyper-parameters to control: - neural network's architecture: - - ``hidden_layers_sizes_a`` sets a list of hidden layers - sizes before embedding layer for user inputs, the number - of hidden layers is equal to the length of the list; - ``hidden_layers_sizes_b`` sets a list of hidden layers sizes before embedding layer for system actions, the number of hidden layers is equal to the length of the list; diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 4bdcbb9c6384..4c32158260ee 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -355,7 +355,7 @@ def _featurize_labels( y = np.array(labels) # if it is MaxHistoryFeaturizer, squeeze out time axis - if y.shape[1] == 1 and isinstance(self, MaxHistoryTrackerFeaturizer): + if y.ndim == 3 and isinstance(self, MaxHistoryTrackerFeaturizer): y = y[:, 0, :] return y diff --git a/rasa/core/policies/tf_utils.py b/rasa/core/policies/tf_utils.py deleted file mode 100644 index 2cfddda81bdd..000000000000 --- a/rasa/core/policies/tf_utils.py +++ /dev/null @@ -1,957 +0,0 @@ -from collections import namedtuple -import tensorflow as tf - -tf.contrib._warning = None # avoid warning println on contrib import - remove for tf 2 - - -class TimedNTM(object): - """Timed Neural Turing Machine - - Inspired by paper: - https://arxiv.org/pdf/1410.5401.pdf - Implementation inspired by: - https://github.com/carpedm20/NTM-tensorflow/blob/master/ntm_cell.py - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__(self, attn_shift_range, sparse_attention, name): - """Construct the `TimedNTM`. - - Args: - attn_shift_range: Python int. - A time range within which to attend to the memory by location - sparse_attention: Python bool. - If `True` use sparsemax instead of softmax for probs - name: Name to use when creating ops. 
- """ - - # interpolation gate - self.name = "timed_ntm_" + name - - self._inter_gate = tf.layers.Dense( - units=1, activation=tf.sigmoid, name=self.name + "/inter_gate" - ) - # if use sparsemax instead of softmax for probs - self._sparse_attention = sparse_attention - - if sparse_attention: - # sparsemax doesn't support inf - self._inf = float(5000) - else: - self._inf = float("inf") - - # shift weighting if range is provided - if attn_shift_range: - self._shift_weight = tf.layers.Dense( - units=2 * attn_shift_range + 1, - activation=tf.nn.softmax, - name=self.name + "/shift_weight", - ) - else: - self._shift_weight = None - - # sharpening parameter - self._gamma_sharp = tf.layers.Dense( - units=1, - activation=lambda a: tf.nn.softplus(a) + 1, - bias_initializer=tf.constant_initializer(1), - name=self.name + "/gamma_sharp", - ) - - def __call__(self, attn_inputs, scores, scores_state, mask): - # apply exponential moving average with interpolation gate weight - # to scores from previous time which are equal to probs at this point - # different from original NTM where it is applied after softmax - i_g = self._inter_gate(attn_inputs) - - # scores limited by time - scores = tf.concat( - [i_g * scores[:, :-1] + (1 - i_g) * scores_state, scores[:, -1:]], 1 - ) - next_scores_state = scores - - if mask is not None: - # apply mask to scores - if self._shift_weight is not None: - # rearrange scores to make them continuous for convolution - scores = tf.map_fn( - self._rearrange_fn, [scores, mask], dtype=scores.dtype - ) - else: - scores = tf.where(mask > 0, scores, -self._inf * tf.ones_like(scores)) - - # create probabilities for attention - if self._sparse_attention: - probs = tf.contrib.sparsemax.sparsemax(scores) - else: - probs = tf.nn.softmax(scores) - - if self._shift_weight is not None: - s_w = self._shift_weight(attn_inputs) - - # we want to go back in time during convolution - conv_probs = tf.reverse(probs, axis=[1]) - - # preare probs for tf.nn.depthwise_conv2d - # [in_width, in_channels=batch] - conv_probs = tf.transpose(conv_probs, [1, 0]) - # [batch=1, in_height=1, in_width=time+1, in_channels=batch] - conv_probs = conv_probs[tf.newaxis, tf.newaxis, :, :] - - # [filter_height=1, filter_width=2*attn_shift_range+1, - # in_channels=batch, channel_multiplier=1] - conv_s_w = tf.transpose(s_w, [1, 0]) - conv_s_w = conv_s_w[tf.newaxis, :, :, tf.newaxis] - - # perform 1d convolution - # [batch=1, out_height=1, out_width=time+1, out_channels=batch] - conv_probs = tf.nn.depthwise_conv2d_native( - conv_probs, conv_s_w, [1, 1, 1, 1], "SAME" - ) - conv_probs = conv_probs[0, 0, :, :] - conv_probs = tf.transpose(conv_probs, [1, 0]) - - probs = tf.reverse(conv_probs, axis=[1]) - - if mask is not None: - # arrange probs back to their original time order - probs = tf.map_fn( - self._arrange_back_fn, [probs, mask], dtype=probs.dtype - ) - - # sharpening - g_sh = self._gamma_sharp(attn_inputs) - - powed_probs = tf.pow(probs, g_sh) - probs = powed_probs / (tf.reduce_sum(powed_probs, 1, keepdims=True) + 1e-32) - - return probs, next_scores_state - - def _rearrange_fn(self, list_tensor_1d_mask_1d): - """Rearranges tensor_1d to put all the values - where mask_1d=1 to the right and - where mask_1d=0 to the left and sets them to -infinity""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - partitioned_tensor = tf.dynamic_partition(tensor_1d, mask_1d, 2) - partitioned_tensor[0] = -self._inf * tf.ones_like(partitioned_tensor[0]) - - return tf.concat(partitioned_tensor, 0) - - @staticmethod - def 
_arrange_back_fn(list_tensor_1d_mask_1d): - """Arranges back tensor_1d to restore original order - modified by `_rearrange_fn` according to mask_1d: - - number of 0s in mask_1d values on the left are set to - their corresponding places where mask_1d=0, - - number of 1s in mask_1d values on the right are set to - their corresponding places where mask_1d=1""" - tensor_1d, mask_1d = list_tensor_1d_mask_1d - - mask_indices = tf.dynamic_partition( - tf.range(tf.shape(tensor_1d)[0]), mask_1d, 2 - ) - - mask_sum = tf.reduce_sum(mask_1d, axis=0) - partitioned_tensor = [ - tf.zeros_like(tensor_1d[:-mask_sum]), - tensor_1d[-mask_sum:], - ] - - return tf.dynamic_stitch(mask_indices, partitioned_tensor) - - -def _compute_time_attention( - attention_mechanism, - attn_inputs, - attention_state, - # time is added to calculate time attention - time, - timed_ntm, - time_mask, - ignore_mask, - attention_layer, -): - """Computes the attention and alignments limited by time - for a given attention_mechanism. - - Modified helper method from tensorflow.""" - - scores, _ = attention_mechanism(attn_inputs, state=attention_state) - - # take only scores from current and past times - timed_scores = scores[:, : time + 1] - timed_scores_state = attention_state[:, :time] - - # get mask for past times - timed_time_mask = time_mask[:, :time] - if ignore_mask is not None: - timed_time_mask *= 1 - ignore_mask[:, :time] - - # set mask for current time to 1 - timed_time_mask = tf.concat([timed_time_mask, tf.ones_like(time_mask[:, :1])], 1) - - # pass these scores to NTM - probs, next_scores_state = timed_ntm( - attn_inputs, timed_scores, timed_scores_state, timed_time_mask - ) - - # concatenate probs with zeros to get new alignments - zeros = tf.zeros_like(scores) - # remove current time from attention - alignments = tf.concat([probs[:, :-1], zeros[:, time:]], 1) - - # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] - expanded_alignments = tf.expand_dims(alignments, 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. - # alignments shape is - # [batch_size, 1, memory_time] - # attention_mechanism.values shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - context = tf.matmul(expanded_alignments, attention_mechanism.values) - context = tf.squeeze(context, [1]) - - if attention_layer is not None: - attention = attention_layer(tf.concat([attn_inputs, context], 1)) - else: - attention = context - - # return current time to attention - alignments = tf.concat([probs, zeros[:, time + 1 :]], 1) - next_attention_state = tf.concat([next_scores_state, zeros[:, time + 1 :]], 1) - return attention, alignments, next_attention_state - - -# noinspection PyProtectedMember -class TimeAttentionWrapperState( - namedtuple( - "TimeAttentionWrapperState", - tf.contrib.seq2seq.AttentionWrapperState._fields - + ("all_time_masks", "all_cell_states"), - ) -): # added - """Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters - - Additional fields: - - `all_time_masks`: A mask applied to a memory - that filters certain time steps - - `all_cell_states`: All states of the wrapped `RNNCell` - at all the previous time steps. 
- """ - - def clone(self, **kwargs): - """Copied from tensorflow's tf.contrib.seq2seq.AttentionWrapperState - see there for description of the parameters""" - - def with_same_shape(old, new): - """Check and set new tensor's shape.""" - if isinstance(old, tf.Tensor) and isinstance(new, tf.Tensor): - return tf.contrib.framework.with_same_shape(old, new) - return new - - return tf.contrib.framework.nest.map_structure( - with_same_shape, - self, - super(TimeAttentionWrapperState, self)._replace(**kwargs), - ) - - -class TimeAttentionWrapper(tf.contrib.seq2seq.AttentionWrapper): - """Custom AttentionWrapper that takes into account time - when calculating attention. - Attention is calculated before calling rnn cell. - - Modified from tensorflow's tf.contrib.seq2seq.AttentionWrapper. - - See our paper for details: https://arxiv.org/abs/1811.11707 - """ - - def __init__( - self, - cell, - attention_mechanism, - sequence_len, - attn_shift_range=0, - sparse_attention=False, - attention_layer_size=None, - alignment_history=False, - rnn_and_attn_inputs_fn=None, - ignore_mask=None, - cell_input_fn=None, - index_of_attn_to_copy=None, - likelihood_fn=None, - tensor_not_to_copy=None, - output_attention=False, - initial_cell_state=None, - name=None, - attention_layer=None, - ): - """Construct the `TimeAttentionWrapper`. - See the super class for the original arguments description. - - Additional args: - sequence_len: Python integer. - Maximum length of the sequence, used to create - appropriate TensorArray for all cell states - in TimeAttentionWrapperState - attn_shift_range: Python integer (`0` by default). - A time range within which to attend to the memory - by location in Neural Turing Machine. - sparse_attention: Python bool. - A flag to use sparsemax (if `True`) instead of - softmax (if `False`, default) for probabilities - inputs_and_attn_inputs_fn: (optional) A `callable`. - A function that creates inputs and attention inputs tensors. - ignore_mask: (optional) Boolean Tensor. - Determines which time steps to ignore in attention - index_of_attn_to_copy: (optional) Python integer. - An index of attention mechanism that picks - which part of attention tensor to use for copying to output, - the default is `None`, which turns off copying mechanism. - Copy inspired by: https://arxiv.org/pdf/1603.06393.pdf - likelihood_fn: (optional) A `callable`. - A method to perform likelihood calculation to - filter time step in copy mechanism. - Returns a tuple of binary likelihood and likelihood - tensor_not_to_copy: (optional) A Tensor. - A tensor, which shouldn't be copied from previous time steps - - Modified args: - output_attention: Python bool. If `True`, the output at each - time step is the concatenated cell outputs, - attention values and additional values described in - `additional_output_size()`, used in copy mechanism. 
- """ - super(TimeAttentionWrapper, self).__init__( - cell, - attention_mechanism, - attention_layer_size, - alignment_history, - cell_input_fn, - output_attention, - initial_cell_state, - name, - attention_layer, - ) - self._sequence_len = sequence_len - - if not isinstance(attn_shift_range, list): - # attn_shift_range might not be a list - attn_shift_range = [attn_shift_range] - self._timed_ntms = [TimedNTM(attn_shift_range[0], sparse_attention, name="0")] - if self._is_multi: - # if there are several attention mechanisms, - # create additional TimedNTMs for them - if len(attn_shift_range) == 1: - # original attn_shift_range might not be a list - attn_shift_range *= len(attention_mechanism) - elif len(attn_shift_range) != len(attention_mechanism): - raise ValueError( - "If provided, `attn_shift_range` must contain exactly one " - "integer per attention_mechanism, saw: {} vs {}" - "".format(len(attn_shift_range), len(attention_mechanism)) - ) - for i in range(1, len(attention_mechanism)): - self._timed_ntms.append( - TimedNTM(attn_shift_range[i], sparse_attention, name=str(i)) - ) - - if rnn_and_attn_inputs_fn is None: - rnn_and_attn_inputs_fn = self._default_rnn_and_attn_inputs_fn - else: - if not callable(rnn_and_attn_inputs_fn): - raise TypeError( - "`rnn_and_attn_inputs_fn` must be callable, saw type: {}" - "".format(type(rnn_and_attn_inputs_fn).__name__) - ) - self._rnn_and_attn_inputs_fn = rnn_and_attn_inputs_fn - - if not isinstance(ignore_mask, list): - self._ignore_mask = [tf.cast(ignore_mask, tf.int32)] - else: - self._ignore_mask = [tf.cast(i_m, tf.int32) for i_m in ignore_mask] - - self._index_of_attn_to_copy = index_of_attn_to_copy - - self._likelihood_fn = likelihood_fn - self._tensor_not_to_copy = tensor_not_to_copy - - @staticmethod - def _default_rnn_and_attn_inputs_fn(inputs, cell_state): - if isinstance(cell_state, tf.contrib.rnn.LSTMStateTuple): - return inputs, tf.concat([inputs, cell_state.h], -1) - else: - return inputs, tf.concat([inputs, cell_state], -1) - - @staticmethod - def additional_output_size(): - """Number of additional outputs: - - likelihoods: - attn_likelihood, state_likelihood - debugging info: - current_time_prob, - bin_likelihood_not_to_copy, bin_likelihood_to_copy - - **Method should be static** - """ - return 2 + 3 - - @property - def output_size(self): - if self._output_attention: - if self._index_of_attn_to_copy is not None: - # output both raw rnn cell_output and - # cell_output with copied attention - # together with attention vector itself - # and additional output - return ( - 2 * self._cell.output_size - + self._attention_layer_size - + self.additional_output_size() - ) - else: - return self._cell.output_size + self._attention_layer_size - else: - return self._cell.output_size - - @property - def state_size(self): - """The `state_size` property of `TimeAttentionWrapper`. - Returns: - A `TimeAttentionWrapperState` tuple containing shapes - used by this object. 
- """ - - # use AttentionWrapperState from superclass - state_size = super(TimeAttentionWrapper, self).state_size - - all_cell_states = self._cell.state_size - - return TimeAttentionWrapperState( - cell_state=state_size.cell_state, - time=state_size.time, - attention=state_size.attention, - alignments=state_size.alignments, - attention_state=state_size.attention_state, - alignment_history=state_size.alignment_history, - all_time_masks=self._sequence_len, - all_cell_states=all_cell_states, - ) - - def zero_state(self, batch_size, dtype): - """Modified from tensorflow's zero_state - see there for description of the parameters""" - - # use AttentionWrapperState from superclass - zero_state = super(TimeAttentionWrapper, self).zero_state(batch_size, dtype) - - with tf.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): - # store time masks - all_time_masks = tf.TensorArray( - tf.int32, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, tf.zeros([batch_size, self.state_size.all_time_masks], tf.int32)) - - # store all cell states into a tensor array to allow - # copy mechanism to go back in time - if isinstance(self._cell.state_size, tf.contrib.rnn.LSTMStateTuple): - all_cell_states = tf.contrib.rnn.LSTMStateTuple( - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.c), - tf.TensorArray( - dtype, - size=self._sequence_len + 1, - dynamic_size=False, - clear_after_read=False, - ).write(0, zero_state.cell_state.h), - ) - else: - all_cell_states = tf.TensorArray( - dtype, size=0, dynamic_size=False, clear_after_read=False - ).write(0, zero_state.cell_state) - - return TimeAttentionWrapperState( - cell_state=zero_state.cell_state, - time=zero_state.time, - attention=zero_state.attention, - alignments=zero_state.alignments, - attention_state=zero_state.attention_state, - alignment_history=zero_state.alignment_history, - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - - def call(self, inputs, state): - """Perform a step of attention-wrapped RNN. - - The order has changed: - - Step 1: Calculate attention inputs based on the previous cell state - and current inputs - - Step 2: Score the output with `attention_mechanism`. - - Step 3: Calculate the alignments by passing the score through the - `normalizer` and limit them by time. - - Step 4: Calculate the context vector as the inner product between the - alignments and the attention_mechanism's values (memory). - - Step 5: Calculate the attention output by concatenating - the cell output and context through the attention layer - (a linear layer with `attention_layer_size` outputs). - - Step 6: Mix the `inputs` and `attention` output via - `cell_input_fn` to get cell inputs. - - Step 7: Call the wrapped `cell` with these cell inputs and - its previous state. - - Step 8: (optional) Maybe copy output and cell state from history - - Args: - inputs: (Possibly nested tuple of) Tensor, - the input at this time step. - state: An instance of `TimeAttentionWrapperState` - containing tensors from the previous time step. - - Returns: - A tuple `(attention_or_cell_output, next_state)`, where: - - - `attention_or_cell_output` depending on `output_attention`. - - `next_state` is an instance of `TimeAttentionWrapperState` - containing the state calculated at this time step. - - Raises: - TypeError: If `state` is not an instance of - `TimeAttentionWrapperState`. 
- """ - if not isinstance(state, TimeAttentionWrapperState): - raise TypeError( - "Expected state to be instance of " - "TimeAttentionWrapperState. " - "Received type {} instead.".format(type(state)) - ) - - # Step 1: Calculate attention based on - # the previous output and current input - cell_state = state.cell_state - - rnn_inputs, attn_inputs = self._rnn_and_attn_inputs_fn(inputs, cell_state) - - cell_batch_size = attn_inputs.shape[0].value or tf.shape(attn_inputs)[0] - error_message = ( - "When applying AttentionWrapper %s: " % self.name - + "Non-matching batch sizes between the memory " - "(encoder output) and the query (decoder output). " - "Are you using " - "the BeamSearchDecoder? " - "You may need to tile your memory input via " - "the tf.contrib.seq2seq.tile_batch function with argument " - "multiple=beam_width." - ) - with tf.control_dependencies( - self._batch_size_checks(cell_batch_size, error_message) - ): - attn_inputs = tf.identity(attn_inputs, name="checked_attn_inputs") - - if self._is_multi: - previous_attention_state = state.attention_state - previous_alignment_history = state.alignment_history - else: - previous_attention_state = [state.attention_state] - previous_alignment_history = [state.alignment_history] - - all_alignments = [] - all_attentions = [] - all_attention_states = [] - maybe_all_histories = [] - - prev_time_masks = self._read_from_tensor_array(state.all_time_masks, state.time) - prev_time_mask = prev_time_masks[:, -1, :] - - for i, attention_mechanism in enumerate(self._attention_mechanisms): - # Steps 2 - 5 are performed inside `_compute_time_attention` - (attention, alignments, next_attention_state) = _compute_time_attention( - attention_mechanism, - attn_inputs, - previous_attention_state[i], - # time is added to calculate time attention - state.time, - self._timed_ntms[i], - # provide boolean masks, to ignore some time steps - prev_time_mask, - self._ignore_mask[i], - self._attention_layers[i] if self._attention_layers else None, - ) - - alignment_history = ( - previous_alignment_history[i].write(state.time, alignments) - if self._alignment_history - else () - ) - - all_attention_states.append(next_attention_state) - all_alignments.append(alignments) - all_attentions.append(attention) - maybe_all_histories.append(alignment_history) - - attention = tf.concat(all_attentions, 1) - - # Step 6: Mix the `inputs` and `attention` output via - # `cell_input_fn` to get cell inputs. - cell_inputs = self._cell_input_fn(rnn_inputs, attention) - - # Step 7: Call the wrapped `cell` with these cell inputs and - # its previous state. 
- cell_output, next_cell_state = self._cell(cell_inputs, cell_state) - - prev_all_cell_states = state.all_cell_states - - time_mask = tf.concat( - [ - prev_time_mask[:, : state.time], - tf.ones_like(prev_time_mask[:, :1]), - prev_time_mask[:, state.time + 1 :], - ], - 1, - ) - - if self._index_of_attn_to_copy is not None: - # Step 8: Maybe copy output and cell state from history - - # get relevant previous outputs from history - attn_to_copy = all_attentions[self._index_of_attn_to_copy] - # copy them to current output - cell_output_with_attn = cell_output + attn_to_copy - - memory_probs = self._get_memory_probs(all_alignments, state.time) - - # check that we do not pay attention to `tensor_not_to_copy` - bin_likelihood_not_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, self._tensor_not_to_copy - ) - # recalculate probs - memory_probs *= 1 - bin_likelihood_not_to_copy - - history_alignments = self._history_alignments(memory_probs) - - # get previous output from the history - prev_output = self._prev_output( - cell_output_with_attn, history_alignments, state.time - ) - - # check that current output is close to - # the one in the history to which we pay attention to - bin_likelihood_to_copy, _ = self._likelihood_fn( - cell_output_with_attn, prev_output - ) - # recalculate probs - memory_probs *= bin_likelihood_to_copy - - history_alignments = self._history_alignments(memory_probs) - current_time_prob = history_alignments[:, -1:] - - # create additional likelihoods to maximize - attn_likelihood = self._additional_likelihood( - attn_to_copy, prev_output, current_time_prob - ) - state_likelihood = self._additional_likelihood( - cell_output + tf.stop_gradient(attn_to_copy), - prev_output, - current_time_prob, - ) - - # recalculate time_mask - time_mask = self._apply_alignments_to_history( - tf.cast(history_alignments, time_mask.dtype), - prev_time_masks[:, :-1, :], - time_mask, - ) - - # recalculate new next_cell_state based on history_alignments - next_cell_state = self._new_next_cell_state( - prev_all_cell_states, - next_cell_state, - cell_output_with_attn, - history_alignments, - state.time, - ) - - all_cell_states = self._all_cell_states( - prev_all_cell_states, next_cell_state, state.time - ) - - if self._output_attention: - # concatenate cell outputs, attention, additional likelihoods - # and copy_attn_debug - output = tf.concat( - [ - cell_output_with_attn, - cell_output, - attention, - # additional likelihoods - attn_likelihood, - state_likelihood, - # copy_attn_debug - bin_likelihood_not_to_copy, - bin_likelihood_to_copy, - current_time_prob, - ], - 1, - ) - else: - output = cell_output_with_attn - - else: - # do not waste resources on storing history - all_cell_states = prev_all_cell_states - - if self._output_attention: - output = tf.concat([cell_output, attention], 1) - else: - output = cell_output - - all_time_masks = state.all_time_masks.write(state.time + 1, time_mask) - - next_state = TimeAttentionWrapperState( - time=state.time + 1, - cell_state=next_cell_state, - attention=attention, - attention_state=self._item_or_tuple(all_attention_states), - alignments=self._item_or_tuple(all_alignments), - alignment_history=self._item_or_tuple(maybe_all_histories), - all_time_masks=all_time_masks, - all_cell_states=all_cell_states, - ) - return output, next_state - - # helper for TensorArray - @staticmethod - def _read_from_tensor_array(tensor_array, time): - """TensorArray time reader""" - return tf.transpose(tensor_array.gather(tf.range(0, time + 1)), [1, 0, 2]) - - # helper 
methods for copy mechanism - def _get_memory_probs(self, all_alignments, time): - """Helper method to get memory_probs from all_alignments""" - - memory_probs = tf.stop_gradient( - all_alignments[self._index_of_attn_to_copy][:, :time] - ) - - # binarize memory_probs only if max value is larger than margin=0.1 - memory_probs_max = tf.reduce_max(memory_probs, axis=1, keepdims=True) - memory_probs_max = tf.where( - memory_probs_max > 0.1, memory_probs_max, -memory_probs_max - ) - - return tf.where( - tf.equal(memory_probs, memory_probs_max), - tf.ones_like(memory_probs), - tf.zeros_like(memory_probs), - ) - - @staticmethod - def _history_alignments(memory_probs): - """Helper method to apply binary mask to memory_probs""" - - current_time_prob = 1 - tf.reduce_sum(memory_probs, 1, keepdims=True) - return tf.concat([memory_probs, current_time_prob], 1) - - @staticmethod - def _apply_alignments_to_history(alignments, history_states, state): - """Helper method to apply attention probabilities to rnn history - - copied from tf's `_compute_attention(...)`""" - - expanded_alignments = tf.stop_gradient(tf.expand_dims(alignments, 1)) - - history_states = tf.concat([history_states, tf.expand_dims(state, 1)], 1) - - # Context is the inner product of alignments and values along the - # memory time dimension. - # expanded_alignments shape is - # [batch_size, 1, memory_time] - # history_states shape is - # [batch_size, memory_time, memory_size] - # the batched matmul is over memory_time, so the output shape is - # [batch_size, 1, memory_size]. - # we then squeeze out the singleton dim. - - return tf.squeeze(tf.matmul(expanded_alignments, history_states), [1]) - - def _prev_output(self, state, alignments, time): - """Helper method to get previous output from memory""" - - # get all previous outputs from appropriate - # attention mechanism's memory limited by current time - prev_outputs = tf.stop_gradient( - self._attention_mechanisms[self._index_of_attn_to_copy].values[:, :time, :] - ) - - # multiply by alignments to get one vector from one time step - return self._apply_alignments_to_history(alignments, prev_outputs, state) - - def _additional_likelihood(self, output, prev_output, current_time_prob): - """Helper method to create additional likelihood to maximize""" - - _, likelihood = self._likelihood_fn(output, tf.stop_gradient(prev_output)) - return tf.where(current_time_prob < 0.5, likelihood, tf.ones_like(likelihood)) - - def _new_hidden_state(self, prev_all_cell_states, new_state, alignments, time): - """Helper method to look into rnn history""" - - # reshape to (batch, time, memory_time) and - # do not include current time because - # we do not want to pay attention to it, - # but we need to read it instead of - # adding conditional flow if time == 0 - prev_cell_states = self._read_from_tensor_array(prev_all_cell_states, time)[ - :, :-1, : - ] - - return self._apply_alignments_to_history( - alignments, prev_cell_states, new_state - ) - - def _new_next_cell_state( - self, prev_all_cell_states, next_cell_state, new_cell_output, alignments, time - ): - """Helper method to recalculate new next_cell_state""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - next_cell_state_c = self._new_hidden_state( - prev_all_cell_states.c, next_cell_state.c, alignments, time - ) - next_cell_state_h = self._new_hidden_state( - prev_all_cell_states.h, new_cell_output, alignments, time - ) - return tf.contrib.rnn.LSTMStateTuple(next_cell_state_c, next_cell_state_h) - else: - return 
self._new_hidden_state( - prev_all_cell_states, alignments, new_cell_output, time - ) - - @staticmethod - def _all_cell_states(prev_all_cell_states, next_cell_state, time): - """Helper method to recalculate all_cell_states tensor array""" - - if isinstance(next_cell_state, tf.contrib.rnn.LSTMStateTuple): - return tf.contrib.rnn.LSTMStateTuple( - prev_all_cell_states.c.write(time + 1, next_cell_state.c), - prev_all_cell_states.h.write(time + 1, next_cell_state.h), - ) - else: - return prev_all_cell_states.write(time + 1, next_cell_state) - - -class ChronoBiasLayerNormBasicLSTMCell(tf.contrib.rnn.LayerNormBasicLSTMCell): - """Custom LayerNormBasicLSTMCell that allows chrono initialization - of gate biases. - - See super class for description. - - See https://arxiv.org/abs/1804.11188 - for details about chrono initialization - """ - - def __init__( - self, - num_units, - forget_bias=1.0, - input_bias=0.0, - activation=tf.tanh, - layer_norm=True, - norm_gain=1.0, - norm_shift=0.0, - dropout_keep_prob=1.0, - dropout_prob_seed=None, - out_layer_size=None, - reuse=None, - ): - """Initializes the basic LSTM cell - - Additional args: - input_bias: float, The bias added to input gates. - out_layer_size: (optional) integer, The number of units in - the optional additional output layer. - """ - super(ChronoBiasLayerNormBasicLSTMCell, self).__init__( - num_units, - forget_bias=forget_bias, - activation=activation, - layer_norm=layer_norm, - norm_gain=norm_gain, - norm_shift=norm_shift, - dropout_keep_prob=dropout_keep_prob, - dropout_prob_seed=dropout_prob_seed, - reuse=reuse, - ) - self._input_bias = input_bias - self._out_layer_size = out_layer_size - - @property - def output_size(self): - return self._out_layer_size or self._num_units - - @property - def state_size(self): - return tf.contrib.rnn.LSTMStateTuple(self._num_units, self.output_size) - - @staticmethod - def _dense_layer(args, layer_size): - """Optional out projection layer""" - proj_size = args.get_shape()[-1] - dtype = args.dtype - weights = tf.get_variable("kernel", [proj_size, layer_size], dtype=dtype) - bias = tf.get_variable("bias", [layer_size], dtype=dtype) - out = tf.nn.bias_add(tf.matmul(args, weights), bias) - return out - - def call(self, inputs, state): - """LSTM cell with layer normalization and recurrent dropout.""" - c, h = state - args = tf.concat([inputs, h], 1) - concat = self._linear(args) - dtype = args.dtype - - i, j, f, o = tf.split(value=concat, num_or_size_splits=4, axis=1) - if self._layer_norm: - i = self._norm(i, "input", dtype=dtype) - j = self._norm(j, "transform", dtype=dtype) - f = self._norm(f, "forget", dtype=dtype) - o = self._norm(o, "output", dtype=dtype) - - g = self._activation(j) - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - g = tf.nn.dropout(g, self._keep_prob, seed=self._seed) - - new_c = c * tf.sigmoid(f + self._forget_bias) + g * tf.sigmoid( - i + self._input_bias - ) # added input_bias - - # do not do layer normalization on the new c, - # because there are no trainable weights - # if self._layer_norm: - # new_c = self._norm(new_c, "state", dtype=dtype) - - new_h = self._activation(new_c) * tf.sigmoid(o) - - # added dropout to the hidden state h - if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: - new_h = tf.nn.dropout(new_h, self._keep_prob, seed=self._seed) - - # add postprocessing of the output - if self._out_layer_size is not None: - with tf.variable_scope("out_layer"): - new_h = self._dense_layer(new_h, self._out_layer_size) - - new_state = 
tf.contrib.rnn.LSTMStateTuple(new_c, new_h) - return new_h, new_state diff --git a/requirements.txt b/requirements.txt index cff22f35bc60..04edbadfa16c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,8 @@ fakeredis==1.0.3 pymongo==3.8.0 numpy==1.16.3 scipy==1.2.1 -tensorflow==1.13.1 +tensorflow==1.14.0 +tensor2tensor=1.13.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.3 From bcd3bffd6b634c9e88eb908b47e2015fdbec55ee Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:38:39 +0200 Subject: [PATCH 23/50] fix featurizer, add t2t requirements --- rasa/core/policies/embedding_policy.py | 27 ++++++-------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 5145039c7a02..8c033315aa1f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -25,20 +25,12 @@ from sklearn.model_selection import train_test_split import tensorflow as tf - -try: - from tensor2tensor.models.transformer import ( - transformer_base, - transformer_prepare_encoder, - transformer_encoder, - ) - from tensor2tensor.layers.common_attention import large_compatible_negative - -except ImportError: - transformer_base = None - transformer_prepare_encoder = None - transformer_encoder = None - large_compatible_negative = None +from tensor2tensor.models.transformer import ( + transformer_base, + transformer_prepare_encoder, + transformer_encoder, +) +from tensor2tensor.layers.common_attention import large_compatible_negative try: import cPickle as pickle @@ -135,11 +127,6 @@ def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturize LabelTokenizerSingleStateFeaturizer(), max_history=max_history ) - @staticmethod - def _check_t2t() -> None: - if transformer_base is None: - raise ImportError("Please install tensor2tensor") - def __init__( self, featurizer: Optional["TrackerFeaturizer"] = None, @@ -161,8 +148,6 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: - # check if t2t is installed - self._check_t2t() if not featurizer: featurizer = self._standard_featurizer(max_history) From 9342569cab780f4e1fd646f16c8fbc83f2c0b7b7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:41:26 +0200 Subject: [PATCH 24/50] add tfp for t2t to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 04edbadfa16c..51d6de84a827 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ pymongo==3.8.0 numpy==1.16.3 scipy==1.2.1 tensorflow==1.14.0 +tensorflow-probability==0.7.0 tensor2tensor=1.13.4 apscheduler==3.6.0 tqdm==4.31.0 From 3c204b98849598f3815c4720c4e4613f9fec265a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 19:44:46 +0200 Subject: [PATCH 25/50] fix requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 51d6de84a827..bc58d8d43640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ numpy==1.16.3 scipy==1.2.1 tensorflow==1.14.0 tensorflow-probability==0.7.0 -tensor2tensor=1.13.4 +tensor2tensor==1.13.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.3 From 2cd22a865b6daa215193c0803a29c91fae8643a4 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 22:14:25 +0200 Subject: [PATCH 26/50] remove check tf, update setup.py --- alt_requirements/requirements_full.txt | 2 +- rasa/core/policies/embedding_policy.py | 
7 ++---- .../embedding_intent_classifier.py | 23 ++++--------------- setup.py | 3 ++- 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt index e9114035ccea..a700141c6b13 100644 --- a/alt_requirements/requirements_full.txt +++ b/alt_requirements/requirements_full.txt @@ -1,4 +1,4 @@ -# Minimum Instal Requirements +# Minimum Install Requirements -r ../requirements.txt # Spacy Requirements diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 8c033315aa1f..c4f63d295230 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -3,6 +3,7 @@ import json import logging import os +import pickle import warnings import numpy as np @@ -32,11 +33,6 @@ ) from tensor2tensor.layers.common_attention import large_compatible_negative -try: - import cPickle as pickle -except ImportError: - import pickle - if typing.TYPE_CHECKING: from tensor2tensor.utils.hparam import HParams @@ -148,6 +144,7 @@ def __init__( max_history: Optional[int] = None, **kwargs: Any ) -> None: + """Declare instant variables with default values""" if not featurizer: featurizer = self._standard_featurizer(max_history) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index a613321cb6ba..3fa504c3c79d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -11,6 +11,11 @@ from rasa.nlu.components import Component from rasa.utils.common import is_logging_disabled +import tensorflow as tf + +# avoid warning println on contrib import - remove for tf 2 +tf.contrib._warning = None + logger = logging.getLogger(__name__) if typing.TYPE_CHECKING: @@ -20,14 +25,6 @@ from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message -try: - import tensorflow as tf - - # avoid warning println on contrib import - remove for tf 2 - tf.contrib._warning = None -except ImportError: - tf = None - class EmbeddingIntentClassifier(Component): """Intent classifier using supervised embeddings. @@ -120,7 +117,6 @@ def __init__( ) -> None: """Declare instant variables with default values""" - self._check_tensorflow() super(EmbeddingIntentClassifier, self).__init__(component_config) self._load_params() @@ -195,15 +191,6 @@ def _load_params(self) -> None: def required_packages(cls) -> List[Text]: return ["tensorflow"] - @staticmethod - def _check_tensorflow(): - if tf is None: - raise ImportError( - "Failed to import `tensorflow`. " - "Please install `tensorflow`. " - "For example with `pip install tensorflow`." 
- ) - # training data helpers: @staticmethod def _create_intent_dict(training_data: "TrainingData") -> Dict[Text, int]: diff --git a/setup.py b/setup.py index 7efa8ec2e8cf..310bf0214908 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,8 @@ "pymongo~=3.8", "numpy~=1.16", "scipy~=1.2", - "tensorflow~=1.13.0", + "tensorflow~=1.14.0", + "tensor2tensor~=1.13.4", "apscheduler~=3.0", "tqdm~=4.0", "networkx~=2.3", From 0c089678e1e8924d00b5ee8301e19775b6ccb37d Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 24 Jul 2019 22:37:08 +0200 Subject: [PATCH 27/50] remove unused variables --- rasa/core/policies/embedding_policy.py | 30 +++++++++----------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index c4f63d295230..200118f63466 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -130,10 +130,8 @@ def __init__( encoded_all_actions: Optional["np.ndarray"] = None, graph: Optional["tf.Graph"] = None, session: Optional["tf.Session"] = None, - intent_placeholder: Optional["tf.Tensor"] = None, - action_placeholder: Optional["tf.Tensor"] = None, - slots_placeholder: Optional["tf.Tensor"] = None, - prev_act_placeholder: Optional["tf.Tensor"] = None, + user_placeholder: Optional["tf.Tensor"] = None, + bot_placeholder: Optional["tf.Tensor"] = None, similarity_all: Optional["tf.Tensor"] = None, pred_confidence: Optional["tf.Tensor"] = None, similarity: Optional["tf.Tensor"] = None, @@ -162,10 +160,8 @@ def __init__( # tf related instances self.graph = graph self.session = session - self.a_in = intent_placeholder - self.b_in = action_placeholder - self.c_in = slots_placeholder - self.b_prev_in = prev_act_placeholder + self.a_in = user_placeholder + self.b_in = bot_placeholder self.sim_all = similarity_all self.pred_confidence = pred_confidence self.sim = similarity @@ -1288,10 +1284,8 @@ def persist(self, path: Text) -> None: rasa.utils.io.create_directory_for_file(checkpoint) with self.graph.as_default(): - self._persist_tensor("intent_placeholder", self.a_in) - self._persist_tensor("action_placeholder", self.b_in) - self._persist_tensor("slots_placeholder", self.c_in) - self._persist_tensor("prev_act_placeholder", self.b_prev_in) + self._persist_tensor("user_placeholder", self.a_in) + self._persist_tensor("bot_placeholder", self.b_in) self._persist_tensor("similarity_all", self.sim_all) self._persist_tensor("pred_confidence", self.pred_confidence) @@ -1359,10 +1353,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": saver.restore(session, checkpoint) - a_in = cls.load_tensor("intent_placeholder") - b_in = cls.load_tensor("action_placeholder") - c_in = cls.load_tensor("slots_placeholder") - b_prev_in = cls.load_tensor("prev_act_placeholder") + a_in = cls.load_tensor("user_placeholder") + b_in = cls.load_tensor("bot_placeholder") sim_all = cls.load_tensor("similarity_all") pred_confidence = cls.load_tensor("pred_confidence") @@ -1387,10 +1379,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy": encoded_all_actions=encoded_all_actions, graph=graph, session=session, - intent_placeholder=a_in, - action_placeholder=b_in, - slots_placeholder=c_in, - prev_act_placeholder=b_prev_in, + user_placeholder=a_in, + bot_placeholder=b_in, similarity_all=sim_all, pred_confidence=pred_confidence, similarity=sim, From 8bfd238e34a3871587ca44637f759c1edca14d3e Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 00:18:26 +0200 Subject: [PATCH 28/50] update setuptools --- 
requirements.txt | 1 + setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index bc58d8d43640..56d5bfe5c6eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,3 +49,4 @@ SQLAlchemy~=1.3.3 kafka-python==1.4.6 sklearn-crfsuite==0.3.6 psycopg2-binary==2.8.2 +setuptools==41.0.1 diff --git a/setup.py b/setup.py index 310bf0214908..71853a681f3f 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ "numpy~=1.16", "scipy~=1.2", "tensorflow~=1.14.0", + "tensorflow-probability~=0.7.0", "tensor2tensor~=1.13.4", "apscheduler~=3.0", "tqdm~=4.0", @@ -75,6 +76,7 @@ "SQLAlchemy~=1.3.0", "kafka-python~=1.4", "sklearn-crfsuite~=0.3.6", + "setuptools~=41.0.1" ] extras_requires = { From 185d38fff2e2d192cf064ea9e5f76ffcb0598ea1 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 10:01:39 +0200 Subject: [PATCH 29/50] change default tf config test --- tests/core/test_policies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 95b22d96beb1..70440b2531a8 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -149,12 +149,13 @@ def test_persist_and_load_empty_policy(self, tmpdir): def test_tf_config(self, trained_policy, tmpdir): if hasattr(trained_policy, "session"): + import tensorflow as tf # noinspection PyProtectedMember - assert trained_policy.session._config is None + assert trained_policy.session._config == tf.Session()._config trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) # noinspection PyProtectedMember - assert loaded.session._config is None + assert loaded.session._config == tf.Session()._config class TestKerasPolicy(PolicyTestCollection): From 45b225b550c57f0cdb37d5a3e88563aae4cc6e02 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 11:32:27 +0200 Subject: [PATCH 30/50] black it --- setup.py | 2 +- tests/core/test_policies.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 71853a681f3f..785138b45300 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ "SQLAlchemy~=1.3.0", "kafka-python~=1.4", "sklearn-crfsuite~=0.3.6", - "setuptools~=41.0.1" + "setuptools~=41.0.1", ] extras_requires = { diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 70440b2531a8..80f34fad89ff 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -150,6 +150,7 @@ def test_persist_and_load_empty_policy(self, tmpdir): def test_tf_config(self, trained_policy, tmpdir): if hasattr(trained_policy, "session"): import tensorflow as tf + # noinspection PyProtectedMember assert trained_policy.session._config == tf.Session()._config trained_policy.persist(tmpdir.strpath) From 7c3a1eaeafe5cf79dbf1878e5235b01512082f5a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 12:29:25 +0200 Subject: [PATCH 31/50] refactor pre transformer embedding --- docs/core/policies.rst | 7 ++-- rasa/core/policies/embedding_policy.py | 46 ++++++++++++-------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 97606621df81..8049e280362f 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -175,10 +175,9 @@ following steps: - apply dense layers to create embeddings for user intents, entities and system actions including previous actions and slots; - - use the embeddings of previous user inputs as a user memory - and embeddings of previous 
system actions as a system memory; - - concatenate user input, previous system action and slots - embeddings for current time into an input vector to rnn; + - concatenate user input (user intents and entities), + previous system action and slots + for current time into an input vector to pre-transformer embedding layer; - using user and previous system action embeddings from the input vector, calculate attention probabilities over the user and system memories (for system memory, this policy uses diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 200118f63466..0afc2146eecf 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -9,7 +9,7 @@ import numpy as np import typing from tqdm import tqdm -from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Generator, Callable import rasa.utils.io from rasa.core import utils @@ -447,6 +447,9 @@ def _create_tf_nn( layer_sizes: List[int], droprate: float, layer_name_suffix: Text, + activation: Optional[Callable] = tf.nn.relu, + use_bias: bool = True, + kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" @@ -456,7 +459,9 @@ def _create_tf_nn( x = tf.layers.dense( inputs=x, units=layer_size, - activation=tf.nn.relu, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, kernel_regularizer=reg, name="hidden_layer_{}_{}".format(layer_name_suffix, i), reuse=tf.AUTO_REUSE, @@ -542,30 +547,23 @@ def _create_t2t_transformer_encoder( for key, value in hparams.values().items(): if key.endswith("dropout") or key == "label_smoothing": setattr(hparams, key, value * tf.cast(self._is_training, tf.float32)) - reg = tf.contrib.layers.l2_regularizer(self.C2) - - x = tf.nn.relu(x_in) - x = tf.layers.dense( - inputs=x, - units=hparams.hidden_size, - use_bias=False, - kernel_initializer=tf.random_normal_initializer( - 0.0, hparams.hidden_size ** -0.5 - ), - kernel_regularizer=reg, - name="transformer_embed_layer", - reuse=tf.AUTO_REUSE, - ) - x = tf.layers.dropout( - x, rate=hparams.layer_prepostprocess_dropout, training=self._is_training - ) - - if hparams.multiply_embedding_mode == "sqrt_depth": - x *= hparams.hidden_size ** 0.5 - - x *= tf.expand_dims(mask, -1) with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): + x = self._create_tf_nn( + x_in, + [hparams.hidden_size], + hparams.layer_prepostprocess_dropout, + layer_name_suffix="pre_embed", + activation=None, + use_bias=False, + kernel_initializer=tf.random_normal_initializer( + 0.0, hparams.hidden_size ** -0.5 + ), + ) + if hparams.multiply_embedding_mode == "sqrt_depth": + x *= hparams.hidden_size ** 0.5 + + x *= tf.expand_dims(mask, -1) ( x, self_attention_bias, From 5765fd0041358c31dfe7a88694af6d064660ba27 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 13:01:21 +0200 Subject: [PATCH 32/50] update docs --- docs/core/policies.rst | 84 +++++++++++++------------- rasa/core/policies/embedding_policy.py | 30 ++++----- 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 8049e280362f..7cf7a4b8882d 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -173,31 +173,14 @@ used in our paper: ``_ This policy has a pre-defined architecture, which comprises the following steps: - - apply dense layers to create embeddings for 
user intents, - entities and system actions including previous actions and slots; - - concatenate user input (user intents and entities), - previous system action and slots - for current time into an input vector to pre-transformer embedding layer; - - using user and previous system action embeddings from the input - vector, calculate attention probabilities over the user and - system memories (for system memory, this policy uses - `NTM mechanism `_ with attention - by location); - - sum the user embedding and user attention vector and feed it - and the embeddings of the slots as an input to an LSTM cell; - - apply a dense layer to the output of the LSTM to get a raw - recurrent embedding of a dialogue; - - sum this raw recurrent embedding of a dialogue with system - attention vector to create dialogue level embedding, this step - allows the algorithm to repeat previous system action by copying - its embedding vector directly to the current time output; - - weight previous LSTM states with system attention probabilities - to get the previous action embedding, the policy is likely payed - attention to; - - if the similarity between this previous action embedding and - current time dialogue embedding is high, overwrite current LSTM - state with the one from the time when this action happened; - - for each LSTM time step, calculate the similarity between the + - concatenate user input (user intent and entities), + previous system action and slots for current time into an input vector + to pre-transformer embedding layer; + - feed it to tranformer; + - apply a dense layer to the output of the transformer + to get embeddings of a dialogue for each time step; + - apply a dense layer to create embeddings for system actions for each time step; + - calculate the similarity between the dialogue embedding and embedded system actions. This step is based on the `StarSpace `_ idea. 
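A minimal NumPy sketch of this final, StarSpace-style step (illustrative only: the array names and sizes below are invented, and this is not the policy's actual TensorFlow graph)::

    import numpy as np

    rng = np.random.RandomState(0)

    # hypothetical sizes; only embed_dim mirrors the "embed_dim" default
    num_time_steps, dial_feat_dim = 5, 32    # stand-in for the transformer output
    num_actions, bot_feat_dim = 10, 24       # stand-in for the encoded system actions
    embed_dim = 20

    transformer_out = rng.randn(num_time_steps, dial_feat_dim)
    encoded_all_actions = rng.randn(num_actions, bot_feat_dim)

    # dense layers project dialogue states and system actions into one embedding space
    dial_embed = transformer_out @ rng.randn(dial_feat_dim, embed_dim)
    bot_embed = encoded_all_actions @ rng.randn(bot_feat_dim, embed_dim)

    # inner-product similarity between every time step and every action,
    # turned into confidences by a softmax over the actions
    sim = dial_embed @ bot_embed.T                      # shape (time, num_actions)
    confidence = np.exp(sim) / np.exp(sim).sum(axis=-1, keepdims=True)
    print(confidence[-1])                               # confidences for the last time step

During training the similarity to the correct action is increased while the similarities to ``num_neg`` sampled incorrect actions are decreased, which is the idea borrowed from StarSpace.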
@@ -226,15 +209,21 @@ It is recommended to use - ``hidden_layers_sizes_b`` sets a list of hidden layers sizes before embedding layer for system actions, the number of hidden layers is equal to the length of the list; - - ``rnn_size`` sets the number of units in the LSTM cell; + - ``transformer_size`` sets the number of units in the transfomer; + - ``num_transformer_layers`` sets the number of transformer layers; + - ``pos_encoding`` sets the type of positional encoding in transformer, + it should be either ``timing`` or ``emb``; + - ``max_seq_length`` sets maximum sequence length + if embedding positional encodings are used; + - ``num_heads`` sets the number of heads in multihead attention; - training: - - ``layer_norm`` if ``true`` layer normalization for lstm - cell is turned on, default ``true``; - ``batch_size`` sets the number of training examples in one forward/backward pass, the higher the batch size, the more memory space you'll need; + - ``batch_strategy`` sets the type of batching strategy, + it should be either ``sequence`` or ``balanced``; - ``epochs`` sets the number of times the algorithm will see training data, where one ``epoch`` equals one forward pass and one backward pass of all the training examples; @@ -244,38 +233,49 @@ It is recommended to use - embedding: - ``embed_dim`` sets the dimension of embedding space; - - ``mu_pos`` controls how similar the algorithm should try - to make embedding vectors for correct intent labels; - - ``mu_neg`` controls maximum negative similarity for - incorrect intents; - - ``similarity_type`` sets the type of the similarity, - it should be either ``cosine`` or ``inner``; - ``num_neg`` sets the number of incorrect intent labels, the algorithm will minimize their similarity to the user input during training; + - ``similarity_type`` sets the type of the similarity, + it should be either ``auto``, ``cosine`` or ``inner``, + if ``auto``, it will be set depending on ``loss_type``, + ``inner`` for ``softmax``, ``cosine`` for ``margin``; + - ``loss_type`` sets the type of the loss function, + it should be either ``softmax`` or ``margin``; + - ``mu_pos`` controls how similar the algorithm should try + to make embedding vectors for correct intent labels, + used only if ``loss_type`` is set to ``margin``; + - ``mu_neg`` controls maximum negative similarity for + incorrect intents, + used only if ``loss_type`` is set to ``margin``; - ``use_max_sim_neg`` if ``true`` the algorithm only - minimizes maximum similarity over incorrect intent labels; + minimizes maximum similarity over incorrect intent labels, + used only if ``loss_type`` is set to ``margin``; - regularization: - ``C2`` sets the scale of L2 regularization - ``C_emb`` sets the scale of how important is to minimize the maximum similarity between embeddings of different - intent labels; - - ``droprate_a`` sets the dropout rate between hidden + intent labels, used only if ``loss_type`` is set to ``margin``; + - ``droprate_a`` sets the dropout rate between layers before embedding layer for user inputs; - - ``droprate_b`` sets the dropout rate between hidden layers + - ``droprate_b`` sets the dropout rate between layers before embedding layer for system actions; - - ``droprate_rnn`` sets the recurrent dropout rate on - the LSTM hidden state ``_; - train accuracy calculation: - ``evaluate_every_num_epochs`` sets how often to calculate train accuracy, small values may hurt performance; - ``evaluate_on_num_examples`` how many examples to use for - calculation of train accuracy, large values may 
hurt - performance. + hold out validation set to calculate of validation accuracy, + large values may hurt performance. + + .. warning:: + + if ``evaluate_on_num_examples`` is non zero, random examples will be + picked by stratified split and used as **hold out** validation set, + so they will be excluded from training data. .. note:: diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 0afc2146eecf..b8534c7e22c6 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -58,16 +58,16 @@ class EmbeddingPolicy(Policy): # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], + # number of units in transformer + "transformer_sizes": 128, + # number of transformer layers + "num_transformer_layers": 1, # type of positional encoding in transformer - "pos_encoding": "timing", # {"timing", "emb"} + "pos_encoding": "timing", # string 'timing' or 'emb' # max sequence length if pos_encoding='emb' "max_seq_length": 256, # number of attention heads in transformer "num_heads": 4, - # number of units in transformer - "transformer_size": 128, - # number of transformer layers - "num_transformer_layers": 1, # training parameters # initial and final batch sizes: # batch size will be linearly increased for each epoch @@ -81,19 +81,19 @@ class EmbeddingPolicy(Policy): # embedding parameters # dimension size of embedding vectors "embed_dim": 20, + # the type of the similarity + "num_neg": 20, + # flag if minimize only maximum similarity over incorrect actions + "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' + # the type of the loss function + "loss_type": "softmax", # string 'softmax' or 'margin' # how similar the algorithm should try # to make embedding vectors for correct actions "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' # maximum negative similarity for incorrect actions "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' - # the type of the similarity - "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' - # the type of the loss function - "loss_type": "softmax", # string 'softmax' or 'margin' # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect actions "use_max_sim_neg": True, # flag which loss function to use # regularization # the scale of L2 regularization @@ -101,10 +101,10 @@ class EmbeddingPolicy(Policy): # the scale of how important is to minimize the maximum similarity # between embeddings of different actions "C_emb": 0.8, - # dropout rate for bot nn - "droprate_bot": 0.0, # dropout rate for dial nn - "droprate_dial": 0.1, + "droprate_a": 0.1, + # dropout rate for bot nn + "droprate_b": 0.0, # visualization of accuracy # how often calculate validation accuracy "evaluate_every_num_epochs": 20, # small values may hurt performance @@ -213,7 +213,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] self.C_emb = config["C_emb"] - self.droprate = {"bot": config["droprate_bot"], "dial": config["droprate_dial"]} + self.droprate = {"bot": config["droprate_b"], "dial": config["droprate_a"]} def _load_visual_params(self, config: Dict[Text, Any]) -> None: self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] From bed12fa69a740f3592306c7e3f7b49e9cb6b5429 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 13:02:36 +0200 Subject: [PATCH 33/50] update docs --- docs/core/policies.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 7cf7a4b8882d..068ebe9d1510 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -174,7 +174,8 @@ This policy has a pre-defined architecture, which comprises the following steps: - concatenate user input (user intent and entities), - previous system action and slots for current time into an input vector + previous system action, slots and active form + for each time step into an input vector to pre-transformer embedding layer; - feed it to tranformer; - apply a dense layer to the output of the transformer From aaa4dc883b8db9554e6104b9d4a99405caf2931b Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 14:24:52 +0200 Subject: [PATCH 34/50] fix typo in defaults --- rasa/core/policies/embedding_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index b8534c7e22c6..f8c08470e4a4 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -59,7 +59,7 @@ class EmbeddingPolicy(Policy): # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], # number of units in transformer - "transformer_sizes": 128, + "transformer_size": 128, # number of transformer layers "num_transformer_layers": 1, # type of positional encoding in transformer From 7cbba6ec0f4af301f4ae565ee3313bbe4b97262c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 14:40:19 +0200 Subject: [PATCH 35/50] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e07da8e4a551..e87e1cee3578 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,8 @@ Changed - Update pytype to 
``2019.7.11`` - Substitute LSTM with Transformer in ``EmbeddingPolicy`` - ``EmbeddingPolicy`` can now use ``MaxHistoryTrackerFeaturizer`` +- in ``EmbeddingPolicy``, non zero ``evaluate_on_num_examples`` is the size of + hold out validation set that is excluded from training data Removed ------- From f9917dcacf200d3fddfd8478b2675ad529f50e2c Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 15:06:46 +0200 Subject: [PATCH 36/50] update docstrings in featurizers --- rasa/core/featurizers.py | 149 ++++++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 55 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 4c32158260ee..2d6b0c750631 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -19,18 +19,21 @@ class SingleStateFeaturizer(object): - """Base class for mechanisms to transform the conversations state - into machine learning formats. + """Base class for mechanisms to transform the conversations state into ML formats. Subclasses of SingleStateFeaturizer decide how the bot will transform the conversation state to a format which a classifier can read: - feature vector.""" + feature vector. + """ def prepare_from_domain(self, domain: Domain) -> None: - """Helper method to init based on domain""" + """Helper method to init based on domain.""" + pass def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Encode user input.""" + raise NotImplementedError( "SingleStateFeaturizer must have " "the capacity to " @@ -39,6 +42,8 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: @staticmethod def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: + """Encode system action as one-hot vector.""" + if action is None: return np.ones(domain.num_actions, dtype=int) * -1 @@ -47,46 +52,50 @@ def action_as_one_hot(action: Text, domain: Domain) -> np.ndarray: return y def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows.""" + """Create matrix with all actions from domain encoded in rows.""" + pass class BinarySingleStateFeaturizer(SingleStateFeaturizer): """Assumes all features are binary. - All features should be either on or off, denoting them with 1 or 0.""" + All features should be either on or off, denoting them with 1 or 0. + """ def __init__(self): """Declares instant variables.""" + super(BinarySingleStateFeaturizer, self).__init__() self.num_features = None self.input_state_map = None def prepare_from_domain(self, domain: Domain) -> None: + """Use Domain to prepare featurizer.""" + self.num_features = domain.num_states self.input_state_map = domain.input_state_map def encode(self, state: Dict[Text, float]) -> np.ndarray: """Returns a binary vector indicating which features are active. - Given a dictionary of states (e.g. 'intent_greet', - 'prev_action_listen',...) return a binary vector indicating which - features of `self.input_features` are in the bag. NB it's a - regular double precision float array type. + Given a dictionary of states (e.g. 'intent_greet', + 'prev_action_listen',...) return a binary vector indicating which + features of `self.input_features` are in the bag. NB it's a + regular double precision float array type. 
- For example with two active features out of five possible features - this would return a vector like `[0 0 1 0 1]` + For example with two active features out of five possible features + this would return a vector like `[0 0 1 0 1]` - If intent features are given with a probability, for example - with two active features and two uncertain intents out - of five possible features this would return a vector - like `[0.3, 0.7, 1.0, 0, 1.0]`. + If intent features are given with a probability, for example + with two active features and two uncertain intents out + of five possible features this would return a vector + like `[0.3, 0.7, 1.0, 0, 1.0]`. - If this is just a padding vector we set all values to `-1`. - padding vectors are specified by a `None` or `[None]` - value for states. + If this is just a padding vector we set all values to `-1`. + padding vectors are specified by a `None` or `[None]` + value for states. """ if not self.num_features: @@ -119,15 +128,16 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + return np.eye(domain.num_actions) class LabelTokenizerSingleStateFeaturizer(SingleStateFeaturizer): - """SingleStateFeaturizer that splits user intents and - bot action names into tokens and uses these tokens to - create bag-of-words feature vectors. + """Creates bag-of-words feature vectors. + + User intents and bot action names are split into tokens + and used to create bag-of-words feature vectors. Args: split_symbol: The symbol that separates words in @@ -157,8 +167,10 @@ def __init__( @staticmethod def _create_label_token_dict(labels, split_symbol="_"): """Splits labels into tokens by using provided symbol. + Creates the lookup dictionary for this tokens. - Values in this dict are used for featurization.""" + Values in this dict are used for featurization. 
+ """ distinct_tokens = set( [token for label in labels for token in label.split(split_symbol)] @@ -166,8 +178,8 @@ def _create_label_token_dict(labels, split_symbol="_"): return {token: idx for idx, token in enumerate(sorted(distinct_tokens))} def prepare_from_domain(self, domain: Domain) -> None: - """Creates internal vocabularies for user intents - and bot actions to use for featurization""" + """Creates internal vocabularies for user intents and bot actions.""" + self.user_labels = domain.intent_states + domain.entity_states self.slot_labels = domain.slot_states + domain.form_states self.bot_labels = domain.action_names @@ -189,10 +201,9 @@ def prepare_from_domain(self, domain: Domain) -> None: len(self.user_vocab) + len(self.slot_labels) + len(self.bot_vocab) ) - self.user_feature_len = len(self.user_vocab) - self.slot_feature_len = len(self.slot_labels) - def encode(self, state: Dict[Text, float]) -> np.ndarray: + """Returns a binary vector indicating which tokens are present.""" + if not self.num_features: raise Exception( "LabelTokenizerSingleStateFeaturizer " @@ -238,8 +249,8 @@ def encode(self, state: Dict[Text, float]) -> np.ndarray: return used_features def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: - """Create matrix with all actions from domain - encoded in rows as bag of words.""" + """Create matrix with all actions from domain encoded in rows as bag of words""" + encoded_all_actions = np.zeros( (domain.num_actions, len(self.bot_vocab)), dtype=np.int32 ) @@ -250,7 +261,7 @@ def create_encoded_all_actions(self, domain: Domain) -> np.ndarray: class TrackerFeaturizer(object): - """Base class for actual tracker featurizers""" + """Base class for actual tracker featurizers.""" def __init__( self, @@ -268,9 +279,12 @@ def _create_states( is_binary_training: bool = False, ) -> List[Dict[Text, float]]: """Create states: a list of dictionaries. - If use_intent_probabilities is False (default behaviour), - pick the most probable intent out of all provided ones and - set its probability to 1.0, while all the others to 0.0.""" + + If use_intent_probabilities is False (default behaviour), + pick the most probable intent out of all provided ones and + set its probability to 1.0, while all the others to 0.0. 
+ """ + states = tracker.past_states(domain) # during training we encounter only 1 or 0 @@ -304,12 +318,15 @@ def _create_states( return [dict(state) for state in states] def _pad_states(self, states: List[Any]) -> List[Any]: + """Pads states.""" + return states def _featurize_states( self, trackers_as_states: List[List[Dict[Text, float]]] ) -> Tuple[np.ndarray, List[int]]: - """Create X""" + """Create X.""" + features = [] true_lengths = [] @@ -338,7 +355,7 @@ def _featurize_states( def _featurize_labels( self, trackers_as_actions: List[List[Text]], domain: Domain ) -> np.ndarray: - """Create y""" + """Create y.""" labels = [] for tracker_actions in trackers_as_actions: @@ -363,7 +380,8 @@ def _featurize_labels( def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions""" + """Transforms list of trackers to lists of states and actions.""" + raise NotImplementedError( "Featurizer must have the capacity to encode trackers to feature vectors" ) @@ -371,7 +389,8 @@ def training_states_and_actions( def featurize_trackers( self, trackers: List[DialogueStateTracker], domain: Domain ) -> DialogueTrainingData: - """Create training data""" + """Create training data.""" + self.state_featurizer.prepare_from_domain(domain) (trackers_as_states, trackers_as_actions) = self.training_states_and_actions( @@ -387,7 +406,8 @@ def featurize_trackers( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: - """Transforms list of trackers to lists of states for prediction""" + """Transforms list of trackers to lists of states for prediction.""" + raise NotImplementedError( "Featurizer must have the capacity to create feature vector" ) @@ -396,7 +416,7 @@ def prediction_states( def create_X( self, trackers: List[DialogueStateTracker], domain: Domain ) -> np.ndarray: - """Create X for prediction""" + """Create X for prediction.""" trackers_as_states = self.prediction_states(trackers, domain) X, _ = self._featurize_states(trackers_as_states) @@ -411,6 +431,8 @@ def persist(self, path): @staticmethod def load(path): + """Loads the featurizer from file.""" + featurizer_file = os.path.join(path, "featurizer.json") if os.path.isfile(featurizer_file): return jsonpickle.decode(rasa.utils.io.read_file(featurizer_file)) @@ -423,17 +445,18 @@ def load(path): class FullDialogueTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers - and creates full dialogue training data for - time distributed rnn. - Training data is padded up to the length of the longest - dialogue with -1""" + """Creates full dialogue training data for time distributed architectures. + + Creates training data that uses each time output for prediction. + Training data is padded up to the length of the longest dialogue with -1. 
+ """ def __init__( self, state_featurizer: SingleStateFeaturizer, use_intent_probabilities: bool = False, ) -> None: + super(FullDialogueTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -441,13 +464,15 @@ def __init__( @staticmethod def _calculate_max_len(trackers_as_actions): + """Calculate the length of the longest dialogue.""" + if trackers_as_actions: return max([len(states) for states in trackers_as_actions]) else: return None def _pad_states(self, states: List[Any]) -> List[Any]: - """Pads states up to max_len""" + """Pads states up to max_len.""" if len(states) < self.max_len: states += [None] * (self.max_len - len(states)) @@ -457,6 +482,10 @@ def _pad_states(self, states: List[Any]) -> List[Any]: def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Dict]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the length of the longest dialogue with -1. + """ trackers_as_states = [] trackers_as_actions = [] @@ -504,6 +533,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers @@ -513,11 +543,11 @@ def prediction_states( class MaxHistoryTrackerFeaturizer(TrackerFeaturizer): - """Tracker featurizer that takes the trackers, - slices them into max_history batches and - creates training data for rnn that uses last output - for prediction. - Training data is padded up to the max_history with -1""" + """Slices the tracker history into max_history batches. + + Creates training data that uses last output for prediction. + Training data is padded up to the max_history with -1. + """ MAX_HISTORY_DEFAULT = 5 @@ -528,6 +558,7 @@ def __init__( remove_duplicates: bool = True, use_intent_probabilities: bool = False, ) -> None: + super(MaxHistoryTrackerFeaturizer, self).__init__( state_featurizer, use_intent_probabilities ) @@ -541,7 +572,8 @@ def slice_state_history( """Slices states from the trackers history. If the slice is at the array borders, padding will be added to ensure - the slice length.""" + the slice length. + """ slice_end = len(states) slice_start = max(0, slice_end - slice_length) @@ -552,6 +584,8 @@ def slice_state_history( @staticmethod def _hash_example(states, action): + """Hash states for efficient deduplication.""" + frozen_states = tuple( (s if s is None else frozenset(s.items()) for s in states) ) @@ -561,6 +595,10 @@ def _hash_example(states, action): def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[Optional[Dict[Text, float]]]], List[List[Text]]]: + """Transforms list of trackers to lists of states and actions. + + Training data is padded up to the max_history with -1. 
+ """ trackers_as_states = [] trackers_as_actions = [] @@ -615,6 +653,7 @@ def training_states_and_actions( def prediction_states( self, trackers: List[DialogueStateTracker], domain: Domain ) -> List[List[Dict[Text, float]]]: + """Transforms list of trackers to lists of states for prediction.""" trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers From 8364807241ff815fbda8c779428a6d301f1874e7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 25 Jul 2019 15:14:37 +0200 Subject: [PATCH 37/50] do not persist encoded_all_actions --- rasa/core/policies/embedding_policy.py | 29 +++++--------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index f8c08470e4a4..bf834d14bcfe 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -127,7 +127,6 @@ def __init__( self, featurizer: Optional["TrackerFeaturizer"] = None, priority: int = 1, - encoded_all_actions: Optional["np.ndarray"] = None, graph: Optional["tf.Graph"] = None, session: Optional["tf.Session"] = None, user_placeholder: Optional["tf.Tensor"] = None, @@ -150,12 +149,8 @@ def __init__( self._load_params(**kwargs) - # chrono initialization for forget bias - self.characteristic_time = None - # encode all actions with numbers - # persist this array for prediction time - self.encoded_all_actions = encoded_all_actions + self._encoded_all_actions = None # tf related instances self.graph = graph @@ -248,7 +243,7 @@ def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": [ np.stack( [ - self.encoded_all_actions[action_idx] + self._encoded_all_actions[action_idx] for action_idx in action_ids ] ) @@ -257,7 +252,7 @@ def _action_features_for_Y(self, labels: "np.ndarray") -> "np.ndarray": ) else: return np.stack( - [self.encoded_all_actions[action_idx] for action_idx in labels] + [self._encoded_all_actions[action_idx] for action_idx in labels] ) # noinspection PyPep8Naming @@ -879,7 +874,7 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self.a_in, self.b_in = self._iterator.get_next() all_actions = tf.constant( - self.encoded_all_actions, dtype=tf.float32, name="all_actions" + self._encoded_all_actions, dtype=tf.float32, name="all_actions" ) self.dial_embed, mask = self._create_tf_dial() @@ -1119,7 +1114,7 @@ def train( training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # encode all actions with policies' featurizer - self.encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( + self._encoded_all_actions = self.featurizer.state_featurizer.create_encoded_all_actions( domain ) @@ -1298,12 +1293,6 @@ def persist(self, path: Text) -> None: saver = tf.train.Saver() saver.save(self.session, checkpoint) - encoded_actions_file = os.path.join( - path, file_name + ".encoded_all_actions.pkl" - ) - with open(encoded_actions_file, "wb") as f: - pickle.dump(self.encoded_all_actions, f) - tf_config_file = os.path.join(path, file_name + ".tf_config.pkl") with open(tf_config_file, "wb") as f: pickle.dump(self._tf_config, f) @@ -1364,17 +1353,9 @@ def load(cls, path: Text) -> "EmbeddingPolicy": attention_weights = cls.load_tensor("attention_weights") - encoded_actions_file = os.path.join( - path, "{}.encoded_all_actions.pkl".format(file_name) - ) - - with open(encoded_actions_file, "rb") as f: - encoded_all_actions = pickle.load(f) - return cls( featurizer=featurizer, priority=meta["priority"], - 
encoded_all_actions=encoded_all_actions, graph=graph, session=session, user_placeholder=a_in, From 1d4470f37d3740bf028c63603ba9fa37f0a84867 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 17:33:30 +0200 Subject: [PATCH 38/50] refactor test_policies, add test for policy.featurizer types --- rasa/core/featurizers.py | 2 +- rasa/core/policies/mapping_policy.py | 4 + tests/core/test_policies.py | 264 +++++++++++++++++---------- 3 files changed, 169 insertions(+), 101 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index 2d6b0c750631..ae1dab2f49d0 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -269,7 +269,7 @@ def __init__( use_intent_probabilities: bool = False, ) -> None: - self.state_featurizer = state_featurizer or SingleStateFeaturizer() + self.state_featurizer = state_featurizer self.use_intent_probabilities = use_intent_probabilities def _create_states( diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 1d194fba9cd2..a6b653b70118 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -27,6 +27,10 @@ class MappingPolicy(Policy): executed whenever the intent is detected. This policy takes precedence over any other policy.""" + @staticmethod + def _standard_featurizer(): + return None + def __init__(self, priority: int = 3) -> None: """Create a new Mapping policy.""" diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 80f34fad89ff..7ba4b42cc5e3 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -5,7 +5,7 @@ import pytest import rasa.utils.io -from rasa.core import training, utils +from rasa.core import training from rasa.core.actions.action import ( ACTION_DEFAULT_ASK_AFFIRMATION_NAME, ACTION_DEFAULT_ASK_REPHRASE_NAME, @@ -18,7 +18,9 @@ from rasa.core.events import ActionExecuted from rasa.core.featurizers import ( BinarySingleStateFeaturizer, + LabelTokenizerSingleStateFeaturizer, MaxHistoryTrackerFeaturizer, + FullDialogueTrackerFeaturizer, ) from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy from rasa.core.policies.embedding_policy import EmbeddingPolicy @@ -113,6 +115,20 @@ async def trained_policy(self, featurizer, priority): policy.train(training_trackers, default_domain) return policy + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is self.featurizer().__class__ + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is self.featurizer().state_featurizer.__class__ + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is self.featurizer().__class__ + assert ( + loaded.featurizer.state_featurizer.__class__ + is self.featurizer().state_featurizer.__class__ + ) + async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) @@ -179,99 +195,6 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFallbackPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = FallbackPolicy(priority=priority) - return p - - @pytest.mark.parametrize( - "nlu_confidence, last_action_name, should_nlu_fallback", - [ - (0.1, "some_action", False), - (0.1, "action_listen", True), - (0.9, "some_action", False), - (0.9, "action_listen", False), - ], - 
) - def test_should_nlu_fallback( - self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback - ): - assert ( - trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) - is should_nlu_fallback - ) - - -class TestMappingPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = MappingPolicy() - return p - - -class TestMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = MemoizationPolicy(priority=priority, max_history=max_history) - return p - - async def test_memorise(self, trained_policy, default_domain): - trackers = await train_trackers(default_domain, augmentation_factor=20) - trained_policy.train(trackers, default_domain) - lookup_with_augmentation = trained_policy.lookup - - trackers = [ - t for t in trackers if not hasattr(t, "is_augmented") or not t.is_augmented - ] - - ( - all_states, - all_actions, - ) = trained_policy.featurizer.training_states_and_actions( - trackers, default_domain - ) - - for tracker, states, actions in zip(trackers, all_states, all_actions): - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled == default_domain.index_for_action(actions[0]) - - nums = np.random.randn(default_domain.num_states) - random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] - assert trained_policy._recall_states(random_states) is None - - # compare augmentation for augmentation_factor of 0 and 20: - trackers_no_augmentation = await train_trackers( - default_domain, augmentation_factor=0 - ) - trained_policy.train(trackers_no_augmentation, default_domain) - lookup_no_augmentation = trained_policy.lookup - - assert lookup_no_augmentation == lookup_with_augmentation - - def test_memorise_with_nlu(self, trained_policy, default_domain): - filename = "data/test_dialogues/default.json" - dialogue = read_dialogue_file(filename) - - tracker = DialogueStateTracker(dialogue.name, default_domain.slots) - tracker.recreate_from_dialogue(dialogue) - states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ - 0 - ] - - recalled = trained_policy.recall(states, tracker, default_domain) - assert recalled is not None - - -class TestAugmentedMemoizationPolicy(PolicyTestCollection): - def create_policy(self, featurizer, priority): - max_history = None - if isinstance(featurizer, MaxHistoryTrackerFeaturizer): - max_history = featurizer.max_history - p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) - return p - - class TestSklearnPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority, **kwargs): p = SklearnPolicy(featurizer, priority, **kwargs) @@ -409,6 +332,20 @@ def create_policy(self, featurizer, priority): p = EmbeddingPolicy(priority=priority) return p + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is FullDialogueTrackerFeaturizer + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is FullDialogueTrackerFeaturizer + assert ( + loaded.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): def create_policy(self, featurizer, 
priority): @@ -418,13 +355,24 @@ def create_policy(self, featurizer, priority): p = EmbeddingPolicy(priority=priority, max_history=self.max_history) return p + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert ( + trained_policy.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert ( + loaded.featurizer.state_featurizer.__class__ + is LabelTokenizerSingleStateFeaturizer + ) + class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, - # since it is using FullDialogueTrackerFeaturizer - # if max_history is not specified - p = EmbeddingPolicy(priority=priority, **tf_defaults()) + p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) return p def test_tf_config(self, trained_policy, tmpdir): @@ -436,7 +384,79 @@ def test_tf_config(self, trained_policy, tmpdir): assert loaded.session._config == session_config() -class TestFormPolicy(PolicyTestCollection): +class TestMemoizationPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = MemoizationPolicy(priority=priority, max_history=max_history) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert trained_policy.featurizer.state_featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert loaded.featurizer.state_featurizer is None + + async def test_memorise(self, trained_policy, default_domain): + trackers = await train_trackers(default_domain, augmentation_factor=20) + trained_policy.train(trackers, default_domain) + lookup_with_augmentation = trained_policy.lookup + + trackers = [ + t for t in trackers if not hasattr(t, "is_augmented") or not t.is_augmented + ] + + ( + all_states, + all_actions, + ) = trained_policy.featurizer.training_states_and_actions( + trackers, default_domain + ) + + for tracker, states, actions in zip(trackers, all_states, all_actions): + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled == default_domain.index_for_action(actions[0]) + + nums = np.random.randn(default_domain.num_states) + random_states = [{f: num for f, num in zip(default_domain.input_states, nums)}] + assert trained_policy._recall_states(random_states) is None + + # compare augmentation for augmentation_factor of 0 and 20: + trackers_no_augmentation = await train_trackers( + default_domain, augmentation_factor=0 + ) + trained_policy.train(trackers_no_augmentation, default_domain) + lookup_no_augmentation = trained_policy.lookup + + assert lookup_no_augmentation == lookup_with_augmentation + + def test_memorise_with_nlu(self, trained_policy, default_domain): + filename = "data/test_dialogues/default.json" + dialogue = read_dialogue_file(filename) + + tracker = DialogueStateTracker(dialogue.name, default_domain.slots) + tracker.recreate_from_dialogue(dialogue) + states = trained_policy.featurizer.prediction_states([tracker], default_domain)[ + 
0 + ] + + recalled = trained_policy.recall(states, tracker, default_domain) + assert recalled is not None + + +class TestAugmentedMemoizationPolicy(TestMemoizationPolicy): + def create_policy(self, featurizer, priority): + max_history = None + if isinstance(featurizer, MaxHistoryTrackerFeaturizer): + max_history = featurizer.max_history + p = AugmentedMemoizationPolicy(priority=priority, max_history=max_history) + return p + + +class TestFormPolicy(TestMemoizationPolicy): def create_policy(self, featurizer, priority): p = FormPolicy(priority=priority) return p @@ -499,8 +519,52 @@ async def test_memorise(self, trained_policy, default_domain): random_states = [{f: num for f, num in zip(domain.input_states, nums)}] assert trained_policy.recall(random_states, None, domain) is None + def test_memorise_with_nlu(self, trained_policy, default_domain): + pass + + +class TestMappingPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = MappingPolicy() + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + +class TestFallbackPolicy(PolicyTestCollection): + def create_policy(self, featurizer, priority): + p = FallbackPolicy(priority=priority) + return p + + def test_featurizer(self, trained_policy, tmpdir): + assert trained_policy.featurizer is None + trained_policy.persist(tmpdir.strpath) + loaded = trained_policy.__class__.load(tmpdir.strpath) + assert loaded.featurizer is None + + @pytest.mark.parametrize( + "nlu_confidence, last_action_name, should_nlu_fallback", + [ + (0.1, "some_action", False), + (0.1, "action_listen", True), + (0.9, "some_action", False), + (0.9, "action_listen", False), + ], + ) + def test_should_nlu_fallback( + self, trained_policy, nlu_confidence, last_action_name, should_nlu_fallback + ): + assert ( + trained_policy.should_nlu_fallback(nlu_confidence, last_action_name) + is should_nlu_fallback + ) + -class TestTwoStageFallbackPolicy(PolicyTestCollection): +class TestTwoStageFallbackPolicy(TestFallbackPolicy): def create_policy(self, featurizer, priority): p = TwoStageFallbackPolicy( priority=priority, deny_suggestion_intent_name="deny" From 565f7303b92159f6f2543e5beb3885cd2f4eb6be Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 18:26:23 +0200 Subject: [PATCH 39/50] fix test_policies --- tests/core/test_policies.py | 48 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 7ba4b42cc5e3..13e6d89f2719 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -116,17 +116,15 @@ async def trained_policy(self, featurizer, priority): return policy def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is self.featurizer().__class__ - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is self.featurizer().state_featurizer.__class__ + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is self.featurizer().__class__ - assert ( - loaded.featurizer.state_featurizer.__class__ - is self.featurizer().state_featurizer.__class__ 
+ assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): @@ -333,17 +331,16 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is FullDialogueTrackerFeaturizer - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(trained_policy.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is FullDialogueTrackerFeaturizer - assert ( - loaded.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(loaded.featurizer, FullDialogueTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) @@ -356,17 +353,16 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer - assert ( - trained_policy.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + trained_policy.featurizer.state_featurizer, + LabelTokenizerSingleStateFeaturizer, ) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer - assert ( - loaded.featurizer.state_featurizer.__class__ - is LabelTokenizerSingleStateFeaturizer + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert isinstance( + loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) @@ -393,11 +389,11 @@ def create_policy(self, featurizer, priority): return p def test_featurizer(self, trained_policy, tmpdir): - assert trained_policy.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) assert trained_policy.featurizer.state_featurizer is None trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) - assert loaded.featurizer.__class__ is MaxHistoryTrackerFeaturizer + assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) assert loaded.featurizer.state_featurizer is None async def test_memorise(self, trained_policy, default_domain): From 03f0892cdd66e0d8330e4d6f4fe646d687e32931 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 26 Jul 2019 18:29:22 +0200 Subject: [PATCH 40/50] also check for max_history --- tests/core/test_policies.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 13e6d89f2719..1551583a68a9 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -117,12 +117,14 @@ async def trained_policy(self, featurizer, priority): def test_featurizer(self, trained_policy, tmpdir): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer 
) trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history assert isinstance( loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) @@ -354,6 +356,7 @@ def create_policy(self, featurizer, priority): def test_featurizer(self, trained_policy, tmpdir): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) + assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer, @@ -361,6 +364,7 @@ def test_featurizer(self, trained_policy, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) + assert loaded.featurizer.max_history == self.max_history assert isinstance( loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer ) From 4123d65855b8bdea2932ea3bbdcc71e6a50f235f Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 17:57:59 +0200 Subject: [PATCH 41/50] add continue_training tests --- rasa/core/featurizers.py | 5 ++-- rasa/core/policies/embedding_policy.py | 37 ++++++++++++++------------ tests/core/test_policies.py | 36 ++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index ae1dab2f49d0..fa728b669103 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -525,8 +525,9 @@ def training_states_and_actions( trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) - self.max_len = self._calculate_max_len(trackers_as_actions) - logger.debug("The longest dialogue has {} actions.".format(self.max_len)) + if self.max_len is None: + self.max_len = self._calculate_max_len(trackers_as_actions) + logger.debug("The longest dialogue has {} actions.".format(self.max_len)) return trackers_as_states, trackers_as_actions diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index bf834d14bcfe..f996bc5ab6dd 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -1194,25 +1194,28 @@ def continue_training( batch_size = kwargs.get("batch_size", 5) epochs = kwargs.get("epochs", 50) - for _ in range(epochs): - training_data = self._training_data_for_continue_training( - batch_size, training_trackers, domain - ) - - session_data = self._create_session_data(training_data.X, training_data.y) - train_dataset = self._create_tf_dataset(session_data, batch_size) - train_init_op = self._iterator.make_initializer(train_dataset) - self.session.run(train_init_op) + with self.graph.as_default(): + for _ in range(epochs): + training_data = self._training_data_for_continue_training( + batch_size, training_trackers, domain + ) - # fit to one extra example using updated trackers - while True: - try: - self.session.run( - self._train_op, feed_dict={self._is_training: True} - ) + session_data = self._create_session_data( + training_data.X, training_data.y + ) + train_dataset = self._create_tf_dataset(session_data, batch_size) + train_init_op = self._iterator.make_initializer(train_dataset) + self.session.run(train_init_op) + + # fit to one extra example using updated trackers + while True: + try: + self.session.run( + self._train_op, feed_dict={self._is_training: True} + ) - except 
tf.errors.OutOfRangeError: - break + except tf.errors.OutOfRangeError: + break def tf_feed_dict_for_prediction( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 1551583a68a9..b8a64a270891 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -129,6 +129,12 @@ def test_featurizer(self, trained_policy, tmpdir): loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) + async def test_continue_training(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + trained_policy.continue_training( + training_trackers, default_domain, **{"epochs": 1} + ) + async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): trained_policy.persist(tmpdir.strpath) loaded = trained_policy.__class__.load(tmpdir.strpath) @@ -318,13 +324,35 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicyWithFeaturizer(PolicyTestCollection): +class TestEmbeddingPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority): p = EmbeddingPolicy(featurizer=featurizer, priority=priority) return p + def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "inner" + + +class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"loss_type": "margin"} + ) + return p + + def test_similarity_type(self, trained_policy): + assert trained_policy.similarity_type == "cosine" + + +class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"evaluate_on_num_examples": 4} + ) + return p + -class TestEmbeddingPolicyWithFullDialogue(PolicyTestCollection): +class TestEmbeddingPolicyWithFullDialogue(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using FullDialogueTrackerFeaturizer @@ -346,7 +374,7 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithMaxHistory(PolicyTestCollection): +class TestEmbeddingPolicyWithMaxHistory(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): # use standard featurizer from EmbeddingPolicy, # since it is using MaxHistoryTrackerFeaturizer @@ -370,7 +398,7 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithTfConfig(PolicyTestCollection): +class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) return p From d06aeeeb67c18327ed7070630a26df2140e9c5e8 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 18:18:49 +0200 Subject: [PATCH 42/50] use dynamic sequence length, because of continue_trainig --- rasa/core/featurizers.py | 5 ++--- rasa/core/policies/embedding_policy.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/rasa/core/featurizers.py b/rasa/core/featurizers.py index fa728b669103..ae1dab2f49d0 100644 --- a/rasa/core/featurizers.py +++ b/rasa/core/featurizers.py @@ -525,9 +525,8 @@ def training_states_and_actions( trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) - if self.max_len is None: - self.max_len = 
self._calculate_max_len(trackers_as_actions) - logger.debug("The longest dialogue has {} actions.".format(self.max_len)) + self.max_len = self._calculate_max_len(trackers_as_actions) + logger.debug("The longest dialogue has {} actions.".format(self.max_len)) return trackers_as_states, trackers_as_actions diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index f996bc5ab6dd..03995eaf807f 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -405,6 +405,7 @@ def _gen_batch( yield batch_x, batch_y + # noinspection PyPep8Naming def _create_tf_dataset( self, session_data: "SessionData", @@ -414,15 +415,20 @@ def _create_tf_dataset( ) -> "tf.data.Dataset": """Create tf dataset.""" + # set batch and sequence length to None + shape_X = (None, None, session_data.X[0].shape[-1]) + + if session_data.Y[0].ndim == 1: + shape_Y = (None, session_data.Y[0].shape[-1]) + else: + shape_Y = (None, None, session_data.Y[0].shape[-1]) + return tf.data.Dataset.from_generator( lambda batch_size_: self._gen_batch( session_data, batch_size_, batch_strategy, shuffle ), output_types=(tf.float32, tf.float32), - output_shapes=( - [None] + list(session_data.X[0].shape), # set batch to None - [None] + list(session_data.Y[0].shape), # set batch to None - ), + output_shapes=(shape_X, shape_Y), args=([batch_size]), ) From 5ae7f112012457a7a4d59c07f96b0cacdb4c1e13 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 22:55:32 +0200 Subject: [PATCH 43/50] add one more test --- tests/core/test_policies.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b8a64a270891..31f65211cdc7 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -333,6 +333,14 @@ def test_similarity_type(self, trained_policy): assert trained_policy.similarity_type == "inner" +class TestEmbeddingPolicySequence(TestEmbeddingPolicy): + def create_policy(self, featurizer, priority): + p = EmbeddingPolicy( + featurizer=featurizer, priority=priority, **{"batch_strategy": "sequence"} + ) + return p + + class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( From 956ca961d2d2675bb58beef31b1a536084a514ec Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 29 Jul 2019 23:20:38 +0200 Subject: [PATCH 44/50] add test_gen_batch --- tests/core/test_policies.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 31f65211cdc7..3bb2c745ab24 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -332,13 +332,36 @@ def create_policy(self, featurizer, priority): def test_similarity_type(self, trained_policy): assert trained_policy.similarity_type == "inner" - -class TestEmbeddingPolicySequence(TestEmbeddingPolicy): - def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"batch_strategy": "sequence"} + async def test_gen_batch(self, trained_policy, default_domain): + training_trackers = await train_trackers(default_domain, augmentation_factor=0) + training_data = trained_policy.featurize_for_training( + training_trackers, default_domain + ) + session_data = trained_policy._create_session_data( + training_data.X, training_data.y + ) + batch_size = 2 + batch_x, batch_y = next( + trained_policy._gen_batch(session_data=session_data, 
batch_size=batch_size) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape + ) + batch_x, batch_y = next( + trained_policy._gen_batch( + session_data=session_data, + batch_size=batch_size, + batch_strategy="balanced", + shuffle=True, + ) + ) + assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size + assert ( + batch_x[0].shape == session_data.X[0].shape + and batch_y[0].shape == session_data.Y[0].shape ) - return p class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): From b97bf0bacb66308408fcedc56250d29d7eda833a Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:23:59 +0200 Subject: [PATCH 45/50] return pre dial nn --- rasa/core/policies/embedding_policy.py | 27 ++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 03995eaf807f..2846f6d0a4f5 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -55,6 +55,9 @@ class EmbeddingPolicy(Policy): # default properties (DOC MARKER - don't remove) defaults = { # nn architecture + # a list of hidden layers sizes before user embed layer + # number of hidden layers is equal to the length of this list + "hidden_layers_sizes_dial": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], @@ -174,7 +177,10 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes_bot = config["hidden_layers_sizes_bot"] + self.hidden_layers_sizes = { + "a": config["hidden_layers_sizes_dial"], + "b": config["hidden_layers_sizes_bot"], + } self.pos_encoding = config["pos_encoding"] self.max_seq_length = config["max_seq_length"] @@ -504,7 +510,7 @@ def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": b = self._create_tf_nn( b_in, - self.hidden_layer_sizes_bot, + self.hidden_layers_sizes["b"], self.droprate["bot"], layer_name_suffix="bot", ) @@ -595,18 +601,23 @@ def _create_t2t_transformer_encoder( tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout ) - def _create_tf_dial(self) -> Tuple["tf.Tensor", "tf.Tensor"]: + def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: """Create dialogue level embedding and mask.""" # mask different length sequences # if there is at least one `-1` it should be masked mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - self.attention_weights = {} - a = self._create_t2t_transformer_encoder( - self.a_in, mask, self.attention_weights + a = self._create_tf_nn( + a_in, + self.hidden_layers_sizes["a"], + self.droprate["dial"], + layer_name_suffix="dial", ) + self.attention_weights = {} + a = self._create_t2t_transformer_encoder(a, mask, self.attention_weights) + dial_embed = self._create_tf_embed(a, layer_name_suffix="dial") if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): @@ -883,7 +894,7 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self._encoded_all_actions, dtype=tf.float32, name="all_actions" ) - self.dial_embed, mask = self._create_tf_dial() + self.dial_embed, mask = self._create_tf_dial(self.a_in) self.bot_embed = self._create_tf_bot_embed(self.b_in) self.all_bot_embed = self._create_tf_bot_embed(all_actions) @@ -1065,7 +1076,7 @@ def _build_tf_pred_graph(self, session_data: 
"SessionData") -> "tf.Tensor": self._create_tf_placeholders(session_data) - self.dial_embed, mask = self._create_tf_dial() + self.dial_embed, mask = self._create_tf_dial(self.a_in) self.sim_all = self._tf_raw_sim( self.dial_embed[:, :, tf.newaxis, :], From d9ea8be8d7dc5aeb79ded4fc19d293191098f142 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:25:35 +0200 Subject: [PATCH 46/50] return pre dial nn --- rasa/core/policies/embedding_policy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 2846f6d0a4f5..75be9ab62878 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -57,7 +57,7 @@ class EmbeddingPolicy(Policy): # nn architecture # a list of hidden layers sizes before user embed layer # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_dial": [], + "hidden_layers_sizes_pre_dial": [], # a list of hidden layers sizes before bot embed layer # number of hidden layers is equal to the length of this list "hidden_layers_sizes_bot": [], @@ -178,8 +178,8 @@ def __init__( # init helpers def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: self.hidden_layers_sizes = { - "a": config["hidden_layers_sizes_dial"], - "b": config["hidden_layers_sizes_bot"], + "pre_dial": config["hidden_layers_sizes_pre_dial"], + "bot": config["hidden_layers_sizes_bot"], } self.pos_encoding = config["pos_encoding"] @@ -510,7 +510,7 @@ def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": b = self._create_tf_nn( b_in, - self.hidden_layers_sizes["b"], + self.hidden_layers_sizes["bot"], self.droprate["bot"], layer_name_suffix="bot", ) @@ -610,9 +610,9 @@ def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: a = self._create_tf_nn( a_in, - self.hidden_layers_sizes["a"], + self.hidden_layers_sizes["pre_dial"], self.droprate["dial"], - layer_name_suffix="dial", + layer_name_suffix="pre_dial", ) self.attention_weights = {} From 4ef6660b8d51fef9d288c93929df3c93af0bb739 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:41:48 +0200 Subject: [PATCH 47/50] add scale_loss option --- docs/core/policies.rst | 3 +++ rasa/core/policies/embedding_policy.py | 14 +++++++++----- tests/core/test_policies.py | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 068ebe9d1510..f5250b2479af 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -252,6 +252,9 @@ It is recommended to use - ``use_max_sim_neg`` if ``true`` the algorithm only minimizes maximum similarity over incorrect intent labels, used only if ``loss_type`` is set to ``margin``; + - ``scale_loss`` if ``true`` the algorithm will downscale the loss + for examples where correct label is predicted with high confidence, + used only if ``loss_type`` is set to ``softmax``; - regularization: diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 75be9ab62878..47b9f473293b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -98,6 +98,8 @@ class EmbeddingPolicy(Policy): # the number of incorrect actions, the algorithm will minimize # their similarity to the user input during training "use_max_sim_neg": True, # flag which loss function to use + # scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True, # regularization # 
the scale of L2 regularization "C2": 0.001, @@ -210,6 +212,7 @@ def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.num_neg = config["num_neg"] self.use_max_sim_neg = config["use_max_sim_neg"] + self.scale_loss = config["scale_loss"] def _load_regularization_params(self, config: Dict[Text, Any]) -> None: self.C2 = config["C2"] @@ -818,8 +821,8 @@ def _tf_loss_margin( return loss - @staticmethod def _tf_loss_softmax( + self, sim_pos: "tf.Tensor", sim_neg: "tf.Tensor", sim_neg_bot_bot: "tf.Tensor", @@ -838,11 +841,12 @@ def _tf_loss_softmax( neg_labels = tf.zeros_like(logits[:, :, 1:]) labels = tf.concat([pos_labels, neg_labels], -1) - # mask loss by prediction confidence - pred = tf.nn.softmax(logits) - already_learned = tf.pow((1 - pred[:, :, 0]) / 0.5, 4) + if self.scale_loss: + # mask loss by prediction confidence + pred = tf.nn.softmax(logits) + mask *= tf.pow((1 - pred[:, :, 0]) / 0.5, 4) - loss = tf.losses.softmax_cross_entropy(labels, logits, mask * already_learned) + loss = tf.losses.softmax_cross_entropy(labels, logits, mask) # add regularization losses loss += tf.losses.get_regularization_loss() diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 3bb2c745ab24..b894aac5c7cd 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -378,7 +378,8 @@ def test_similarity_type(self, trained_policy): class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"evaluate_on_num_examples": 4} + featurizer=featurizer, priority=priority, **{"scale_loss": False, + "evaluate_on_num_examples": 4} ) return p From 54950c1458acc7486eb53dbef4974651b1eee560 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 30 Jul 2019 18:42:54 +0200 Subject: [PATCH 48/50] black --- tests/core/test_policies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b894aac5c7cd..ca0db2c987c0 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -378,8 +378,9 @@ def test_similarity_type(self, trained_policy): class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): def create_policy(self, featurizer, priority): p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"scale_loss": False, - "evaluate_on_num_examples": 4} + featurizer=featurizer, + priority=priority, + **{"scale_loss": False, "evaluate_on_num_examples": 4} ) return p From b2f9f890c92be36bf0f2f4412b14573fcaa75728 Mon Sep 17 00:00:00 2001 From: Vladimir Vlasov Date: Mon, 5 Aug 2019 17:11:58 +0200 Subject: [PATCH 49/50] Update docs/core/policies.rst Co-Authored-By: Tanja --- docs/core/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 1bdfa6a34baa..f4a02d392f52 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -177,7 +177,7 @@ following steps: previous system action, slots and active form for each time step into an input vector to pre-transformer embedding layer; - - feed it to tranformer; + - feed it to transformer; - apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step; - apply a dense layer to create embeddings for system actions for each time step; From 1a5221936dfd22edf2d9f7a5cb63e4fe6929e8b7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 5 Aug 2019 17:19:55 +0200 Subject: [PATCH 50/50] fix changelog, remove unneeded 
else

---
 CHANGELOG.rst                          | 3 ---
 rasa/core/policies/embedding_policy.py | 2 --
 2 files changed, 5 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fd24dc0af564..97ee7bbdd0f9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -43,9 +43,6 @@ Changed
 -------
 - new event broker class: ``SQLProducer``. This event broker is now used when
   running locally with Rasa X
-
-Removed
--------
 - API requests are not longer logged to ``rasa_core.log`` by default in order to
   avoid problems when running on OpenShift (use ``--log-file rasa_core.log``
   to retain the old behavior)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 47b9f473293b..6224513951cc 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1114,8 +1114,6 @@ def _extract_attention(self) -> Optional["tf.Tensor"]:

         if attention:
             return tf.concat(attention, 0)
-        else:
-            return

     # training methods
     def train(