In [1]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    ! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py
    import setup_google_colab
    setup_google_colab.setup_week4()

--2020-08-28 07:38:46--  https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1939 (1.9K) [text/plain]
Saving to: ‘setup_google_colab.py’


2020-08-28 07:38:46 (29.1 MB/s) - ‘setup_google_colab.py’ saved [1939/1939]



In [2]:
import random

In [3]:
def generate_equations(allowed_operators, dataset_size, min_value, max_value):
    """Builds a random dataset of (equation, solution) string pairs.

       An equation is two integers joined by one operator, e.g. '12+7'.
       A solution is the evaluated result of the equation, as a string.

        allowed_operators: list of strings, operators to choose from.
        dataset_size: an integer, number of pairs to generate.
        min_value: an integer, lower bound (inclusive) of each operand.
        max_value: an integer, upper bound (inclusive) of each operand.

        result: a list of tuples of strings (equation, solution).
    """

    def _draw_pair():
        # Draw order (left operand, right operand, operator) is kept fixed
        # so that a seeded RNG always yields the same dataset.
        left = random.randint(min_value, max_value)
        right = random.randint(min_value, max_value)
        operator = random.choice(allowed_operators)
        equation = ''.join((str(left), operator, str(right)))
        # The equation is built from our own operands/operators, so eval
        # is only ever applied to trusted, locally generated text.
        return equation, str(eval(equation))

    return [_draw_pair() for _ in range(dataset_size)]

In [4]:
def test_generate_equations():
    """Sanity-checks generate_equations on ten '+'/'-' equations over [0, 100].

    Returns a status message (it does not raise): either a description of
    the first failed check or "Tests passed.".
    """
    operators = ['+', '-']
    n_samples = 10
    generated = generate_equations(operators, n_samples, 0, 100)
    for equation, solution in generated:
        both_are_strings = type(equation) is str and type(solution) is str
        if not both_are_strings:
            return "Both parts should be strings."
        # Re-evaluate the equation and compare with the stored solution.
        recomputed = eval(equation)
        if recomputed != int(solution):
            return "The (equation: {!r}, solution: {!r}) pair is incorrect.".format(equation, solution)
    return "Tests passed."

In [5]:
# Run the sanity check; it returns a status message, which is printed here.
print(test_generate_equations())

Tests passed.


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
allowed_operators = ['+', '-']
dataset_size = 100000

# Seed Python's RNG before generating the data: train_test_split is already
# seeded, but without this the equations themselves differ on every run, so
# the train/test sets would not be reproducible.
random.seed(42)
data = generate_equations(allowed_operators, dataset_size, min_value=0, max_value=9999)

# Hold out 20% of the pairs for evaluation.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
# Vocabulary: every symbol gets its position in the string below as its id,
# and id2word is the inverse mapping.
word2id = dict((symbol, index) for index, symbol in enumerate('#^$+-1234567890'))
id2word = dict((index, symbol) for symbol, index in word2id.items())

In [9]:
# Special symbols of the vocabulary:
#   '^' - start symbol, '$' - end symbol, '#' - padding symbol.
start_symbol, end_symbol, padding_symbol = '^', '$', '#'

In [10]:
def sentence_to_ids(sentence, word2id, padded_len):
    """ Encodes a string as a fixed-length list of ids.

      The sentence is cut to at most padded_len - 1 symbols, the end symbol
      is appended, and the rest is filled with the padding symbol.

      sentence: a string, input/output sequence of symbols.
      word2id: a dict, a mapping from original symbols to ids.
      padded_len: an integer, a desirable length of the sequence.

      result: a tuple of (a list of ids, an actual length of sentence);
              the actual length counts the end symbol but not the padding.
    """

    # One slot is always reserved for the end symbol.
    body_limit = padded_len - 1
    actual_len = min(body_limit, len(sentence)) + 1

    all_ids = [word2id[symbol] for symbol in sentence]
    n_padding = max(0, body_limit - len(all_ids))

    end_id = [word2id[end_symbol]]
    padded_ids = all_ids[:body_limit] + end_id + [word2id[padding_symbol]] * n_padding

    return padded_ids, actual_len

In [11]:
def test_sentence_to_ids():
    """Checks sentence_to_ids on truncation, exact-fit and padding cases.

    Returns a status message (it does not raise): either a description of
    the first mismatch or "Tests passed.".
    """
    # (sentence, padded_len, expected ids, expected actual length)
    cases = [
        ("123+123", 7, [5, 6, 7, 3, 5, 6, 2], 7),
        ("123+123", 8, [5, 6, 7, 3, 5, 6, 7, 2], 8),
        ("123+123", 10, [5, 6, 7, 3, 5, 6, 7, 2, 0, 0], 8),
    ]
    for sentence, padded_len, expected_ids, expected_length in cases:
        output, length = sentence_to_ids(sentence, word2id, padded_len)
        if output != expected_ids:
            message = "Convertion of '{}' for padded_len={} to {} is incorrect."
            return message.format(sentence, padded_len, output)
        if length != expected_length:
            message = "Convertion of '{}' for padded_len={} has incorrect actual length {}."
            return message.format(sentence, padded_len, length)
    return "Tests passed."

In [12]:
# Run the sanity check; it returns a status message, which is printed here.
print(test_sentence_to_ids())

Tests passed.


In [13]:
def ids_to_sentence(ids, id2word):
    """ Maps every id of a sequence back to its symbol.

          ids: a list, indices for the padded sequence.
          id2word:  a dict, a mapping from ids to original symbols.

          result: a list of symbols, in the same order as ids.
    """

    symbols = []
    for index in ids:
        symbols.append(id2word[index])
    return symbols

In [14]:
def batch_to_ids(sentences, word2id, max_len):
    """Converts a batch of strings to equally long lists of ids.

       The common length is the longest sentence plus one slot for the end
       symbol, capped at max_len.

        sentences: a list of strings, original sequences.
        word2id: a dict, a mapping from original symbols to ids.
        max_len: an integer, max len of sequences allowed.

        result: a list of lists of ids, a list of actual lengths.
    """

    longest = max(map(len, sentences))
    target_len = min(longest + 1, max_len)

    converted = [sentence_to_ids(sentence, word2id, target_len) for sentence in sentences]
    batch_ids = [ids for ids, _ in converted]
    batch_ids_len = [ids_len for _, ids_len in converted]
    return batch_ids, batch_ids_len

In [15]:
def generate_batches(samples, batch_size=64):
    """Yields (inputs, targets) batches from an iterable of (x, y) pairs.

        samples: an iterable of (x, y) pairs.
        batch_size: an integer, number of pairs per batch.

        result: a generator of tuples (list of x, list of y); the last
                batch may be smaller than batch_size.
    """
    batch_inputs, batch_targets = [], []
    seen = 0
    for pair in samples:
        source, target = pair
        seen += 1
        batch_inputs.append(source)
        batch_targets.append(target)
        if seen % batch_size == 0:
            yield batch_inputs, batch_targets
            # Start fresh lists so already yielded batches are not mutated.
            batch_inputs, batch_targets = [], []
    # Emit the incomplete tail, if any.
    if len(batch_inputs) > 0 and len(batch_targets) > 0:
        yield batch_inputs, batch_targets

In [16]:
# Demo of batch_to_ids. Note that train_set[0] is a single (equation, solution)
# pair, so the "batch" below consists of two sequences: the equation and its
# solution, padded to a common length.
sentences = train_set[0]
ids, sent_lens = batch_to_ids(sentences, word2id, max_len=10)
print('Input:', sentences)
print('Ids: {}\nSentences lengths: {}'.format(ids, sent_lens))

Input: ('8797+597', '9394')
Ids: [[12, 11, 13, 11, 3, 9, 13, 11, 2], [13, 7, 13, 8, 2, 0, 0, 0, 0]]
Sentences lengths: [9, 5]


In [17]:
import tensorflow as tf

In [18]:
class Seq2SeqModel(object):
    """Encoder-decoder model; its methods are attached in the cells below."""
    pass

In [19]:
def declare_placeholders(self):
    """Creates the placeholders through which batches and hyperparameters are fed.

    Sets on self: input_batch, input_batch_lengths, ground_truth,
    ground_truth_lengths, dropout_ph and learning_rate_ph.
    """

    # Encoder input: a 2-D int32 batch of ids and one int32 length per row.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
    self.input_batch_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='input_batch_lengths')

    # Decoder target: same layout as the encoder input.
    self.ground_truth = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth')
    self.ground_truth_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='ground_truth_lengths')

    # Keep probability for dropout; falls back to 1.0 when nothing is fed.
    default_keep_probability = tf.cast(1.0, tf.float32)
    self.dropout_ph = tf.placeholder_with_default(default_keep_probability, shape=[])

    # Scalar learning rate; has no default, so it must be fed for training.
    self.learning_rate_ph = tf.placeholder(shape=[], dtype=tf.float32)

In [20]:
# Attach the function to the class. Wrapped in classmethod, its `self`
# argument receives the Seq2SeqModel class itself, so the attributes it sets
# are stored on the class. The double-underscore name is not mangled because
# the assignment happens outside a class body.
Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)

In [21]:
def create_embeddings(self, vocab_size, embeddings_size):
    """Creates the trainable embedding matrix and embeds self.input_batch.

        vocab_size: number of rows of the embedding matrix.
        embeddings_size: number of columns of the embedding matrix.

    Sets self.embeddings and self.input_batch_embedded.
    """

    # Initial values are drawn uniformly from [-1, 1).
    initial_values = tf.random_uniform(shape=[vocab_size, embeddings_size], minval=-1.0, maxval=1.0)
    self.embeddings = tf.Variable(initial_values, name='embeddings_matrix', dtype=tf.float32)

    # Replace every id of the input batch with its embedding vector.
    self.input_batch_embedded = tf.nn.embedding_lookup(params=self.embeddings, ids=self.input_batch)

In [22]:
# Attach as a classmethod: `self` inside create_embeddings is the class itself.
Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)

In [23]:
def build_encoder(self, hidden_size):
    """Builds the GRU encoder and stores its final state.

        hidden_size: number of units of the GRU cell.

    Sets self.final_encoder_state; the per-step encoder outputs are discarded.
    """

    # GRU cell with dropout applied to its inputs.
    gru_cell = tf.contrib.rnn.GRUCell(hidden_size)
    encoder_cell = tf.contrib.rnn.DropoutWrapper(gru_cell,
                                                 input_keep_prob=self.dropout_ph,
                                                 dtype=tf.float32)

    # Unroll the cell over the embedded input; sequence_length gives the
    # actual length of every row of the batch.
    _unused_outputs, final_state = tf.nn.dynamic_rnn(cell=encoder_cell,
                                                     inputs=self.input_batch_embedded,
                                                     sequence_length=self.input_batch_lengths,
                                                     dtype=tf.float32)
    self.final_encoder_state = final_state

In [24]:
# Attach as a classmethod: `self` inside build_encoder is the class itself.
Seq2SeqModel.__build_encoder = classmethod(build_encoder)

In [25]:
def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):
    """Specifies decoder architecture and computes the output.
    
        Two decoders are built in the same variable scope ('decode'), the
        second one with reuse=True, and they use different helpers:
          - for train: feeding ground truth (TrainingHelper)
          - for inference: feeding generated output (GreedyEmbeddingHelper)

        As a result, self.train_outputs and self.infer_outputs are created. 
        Each of them contains two fields:
          rnn_output (predicted logits)
          sample_id (predictions).

        hidden_size: number of units of the decoder GRU cell.
        vocab_size: output size of the projection applied on top of the cell.
        max_iter: passed to dynamic_decode as maximum_iterations
                  (for both the train and the inference decoder).
        start_symbol_id: id used as the decoder input at the first time step.
        end_symbol_id: id passed to GreedyEmbeddingHelper as the end token.
    """
    
    # Use start symbols as the decoder inputs at the first time step.
    batch_size = tf.shape(self.input_batch)[0]
    start_tokens = tf.fill([batch_size], start_symbol_id)
    ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
    
    # Use the embedding layer defined before to look up embeddings for ground_truth_as_input.
    self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings, ground_truth_as_input)
     
    # Create TrainingHelper for the train stage.
    train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded, 
                                                     self.ground_truth_lengths)
    
    # Create GreedyEmbeddingHelper for the inference stage.
    # It is given the embedding layer, start_tokens and index of the end symbol.
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings,
                                                           start_tokens,
                                                           end_symbol_id)
    
  
    def decode(helper, scope, reuse=None):
        """Creates decoder and return the results of the decoding with a given helper."""
        
        with tf.variable_scope(scope, reuse=reuse):
            # GRUCell with dropout on its inputs; the reuse flag is forwarded
            # so that the second call shares the variables of the first one.
            decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(hidden_size, reuse=reuse),
                                                        dtype=tf.float32, input_keep_prob=self.dropout_ph)
            
            # Create a projection wrapper.
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)
            
            # Create BasicDecoder, pass the defined cell, a helper, and initial state.
            # The initial state is the final state of the encoder.
            decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, 
                                                      helper=helper, 
                                                      initial_state=self.final_encoder_state)
            
            # The first returning argument of dynamic_decode contains two fields:
            #   rnn_output (predicted logits)
            #   sample_id (predictions)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter, 
                                                              output_time_major=False, impute_finished=True)

            return outputs
        
    # The train decoder creates the variables; the inference decoder reuses them.
    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)

In [26]:
# Attach as a classmethod: `self` inside build_decoder is the class itself.
Seq2SeqModel.__build_decoder = classmethod(build_decoder)

In [27]:
def compute_loss(self):
    """Computes the sequence loss (masked cross-entropy with logits).

    Sets self.loss from the train decoder logits and self.ground_truth.
    """

    # 1.0 for positions inside the actual target length, 0.0 for the rest,
    # so positions past the actual length do not contribute to the loss.
    inside_sequence = tf.sequence_mask(self.ground_truth_lengths)
    weights = tf.cast(inside_sequence, dtype=tf.float32)

    self.loss = tf.contrib.seq2seq.sequence_loss(logits=self.train_outputs.rnn_output,
                                                 targets=self.ground_truth,
                                                 weights=weights)

In [28]:
# Attach as a classmethod: `self` inside compute_loss is the class itself.
Seq2SeqModel.__compute_loss = classmethod(compute_loss)

In [29]:
def perform_optimization(self):
    """Creates self.train_op, the operation that minimizes self.loss.

    Uses the Adam optimizer with the learning rate fed through
    self.learning_rate_ph and gradient clipping set to 1.0.
    """

    global_step = tf.train.get_global_step()
    self.train_op = tf.contrib.layers.optimize_loss(loss=self.loss,
                                                    global_step=global_step,
                                                    learning_rate=self.learning_rate_ph,
                                                    optimizer='Adam',
                                                    clip_gradients=1.0)

In [30]:
# Attach as a classmethod: `self` inside perform_optimization is the class itself.
Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)

In [31]:
def init_model(self, vocab_size, embeddings_size, hidden_size, 
               max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):
    """Builds the whole model: placeholders, embeddings, encoder, decoder, loss and train op.

        vocab_size: forwarded to the embeddings and to the decoder.
        embeddings_size: forwarded to the embeddings.
        hidden_size: forwarded to the encoder and to the decoder.
        max_iter, start_symbol_id, end_symbol_id: forwarded to the decoder.
        padding_symbol_id: accepted but not used by this function.

    Also sets self.train_predictions and self.infer_predictions.
    """
    
    # The order matters: each step reads attributes created by the previous ones.
    self.__declare_placeholders()
    self.__create_embeddings(vocab_size, embeddings_size)
    self.__build_encoder(hidden_size)
    self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)
    
    # Compute loss and back-propagate.
    self.__compute_loss()
    self.__perform_optimization()
    
    # Get predictions for evaluation.
    self.train_predictions = self.train_outputs.sample_id
    self.infer_predictions = self.infer_outputs.sample_id

In [32]:
# Use init_model as the constructor. Because it is wrapped in classmethod,
# Seq2SeqModel(...) runs it with the class (not the new instance) as `self`,
# so the whole graph state is stored on the class.
Seq2SeqModel.__init__ = classmethod(init_model)

In [33]:
def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
    """Runs one training step on a batch.

        session: the session used to run the graph.
        X, X_seq_len: input ids and their actual lengths.
        Y, Y_seq_len: ground-truth ids and their actual lengths.
        learning_rate: value fed to the learning rate placeholder.
        dropout_keep_probability: value fed to the dropout placeholder.

        result: a tuple (train predictions, loss).
    """
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len
    feed_dict[self.ground_truth] = Y
    feed_dict[self.ground_truth_lengths] = Y_seq_len
    feed_dict[self.learning_rate_ph] = learning_rate
    feed_dict[self.dropout_ph] = dropout_keep_probability

    # train_op is fetched only for its side effect (the parameter update).
    fetches = [self.train_predictions, self.loss, self.train_op]
    predictions, batch_loss, _unused = session.run(fetches, feed_dict=feed_dict)
    return predictions, batch_loss

In [34]:
# Attach as a classmethod: `self` inside train_on_batch is the class itself.
Seq2SeqModel.train_on_batch = classmethod(train_on_batch)

In [35]:
def predict_for_batch(self, session, X, X_seq_len):
    """Returns the inference predictions for a batch.

        session: the session used to run the graph.
        X, X_seq_len: input ids and their actual lengths.

        result: the value of self.infer_predictions for the batch.
    """
    # Dropout is switched off at prediction time (keep probability 1.0).
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len
    feed_dict[self.dropout_ph] = 1.0

    fetched = session.run([self.infer_predictions], feed_dict=feed_dict)
    return fetched[0]

def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):
    """Returns the inference predictions and the loss for a batch.

        session: the session used to run the graph.
        X, X_seq_len: input ids and their actual lengths.
        Y, Y_seq_len: ground-truth ids and their actual lengths.

        result: a tuple (inference predictions, loss).
    """
    # Dropout is switched off at prediction time (keep probability 1.0).
    feed_dict = {}
    feed_dict[self.input_batch] = X
    feed_dict[self.input_batch_lengths] = X_seq_len
    feed_dict[self.ground_truth] = Y
    feed_dict[self.ground_truth_lengths] = Y_seq_len
    feed_dict[self.dropout_ph] = 1.0

    fetches = [self.infer_predictions, self.loss]
    predictions, batch_loss = session.run(fetches, feed_dict=feed_dict)
    return predictions, batch_loss

In [36]:
# Attach both prediction functions as classmethods: `self` inside them is the class itself.
Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)
Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)

In [37]:
# Start from an empty graph so that re-running this cell does not fail on
# already existing variables.
tf.reset_default_graph()

# Training hyperparameters.
batch_size = 128                # pairs per batch
n_epochs = 10                   # passes over the train set
learning_rate = 0.001           # fed to the learning rate placeholder
dropout_keep_probability = 0.5  # fed to the dropout placeholder while training
max_len = 20                    # cap on the padded sequence length

model = Seq2SeqModel(
    vocab_size=len(word2id),
    embeddings_size=20,
    hidden_size=512,
    max_iter=7,
    start_symbol_id=word2id[start_symbol],
    end_symbol_id=word2id[end_symbol],
    padding_symbol_id=word2id[padding_symbol],
)

# Number of full batches per epoch; only used in the progress messages.
n_step = int(len(train_set) / batch_size)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [38]:
def _ids_to_int(ids):
    """Decodes a sequence of symbol ids into the integer written before the end symbol.

    Raises ValueError when the decoded text is not a plain integer literal
    (e.g. '', '1-1' or '12#').
    """
    text = ''.join(ids_to_sentence(ids, id2word)).split(end_symbol)[0]
    # int() is used instead of eval(): eval would happily evaluate '1-1' to 0
    # or treat '#' as a comment start, counting invalid outputs as numbers.
    return int(text)


session = tf.Session()
session.run(tf.global_variables_initializer())

# Per-epoch evaluation results, consumed when computing the metrics.
invalid_number_prediction_counts = []
all_model_predictions = []
all_ground_truth = []

print('Start training... \n')
for epoch in range(n_epochs):
    random.shuffle(train_set)
    random.shuffle(test_set)

    print('Train: epoch', epoch + 1)
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):
        # Convert the batch of strings to padded ids and run one training step.
        X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)
        Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)
        predictions, loss = model.train_on_batch(session,
                                                 X,
                                                 X_seq_len,
                                                 Y,
                                                 Y_seq_len,
                                                 learning_rate,
                                                 dropout_keep_probability)

        if n_iter % 200 == 0:
            print("Epoch: [%d/%d], step: [%d/%d], loss: %f" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))

    # Report the loss and a few decoded examples on the first test batch.
    X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))
    X, X_seq_len = batch_to_ids(X_sent, word2id, max_len)
    Y, Y_seq_len = batch_to_ids(Y_sent, word2id, max_len)
    predictions, loss = model.predict_for_batch_with_loss(session,
                                                          X,
                                                          X_seq_len,
                                                          Y,
                                                          Y_seq_len)

    print('Test: epoch', epoch + 1, 'loss:', loss,)
    for x, y, p in list(zip(X, Y, predictions))[:3]:
        print('X:', ''.join(ids_to_sentence(x, id2word)))
        print('Y:', ''.join(ids_to_sentence(y, id2word)))
        print('O:', ''.join(ids_to_sentence(p, id2word)))
        print('')

    model_predictions = []
    ground_truth = []
    invalid_number_prediction_count = 0
    # For the whole test set calculate ground-truth values (as integer numbers)
    # and prediction values (also as integers) to calculate metrics.
    # If the number generated by the model is not correct (e.g. '1-1'),
    # increase invalid_number_prediction_count and don't append this and the
    # corresponding ground-truth value to the arrays.
    for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):
        X, X_seq_len = batch_to_ids(X_batch, word2id, max_len)
        Y, Y_seq_len = batch_to_ids(Y_batch, word2id, max_len)
        predictions, loss = model.predict_for_batch_with_loss(session, X, X_seq_len, Y, Y_seq_len)
        for y, p in zip(Y, predictions):
            try:
                truth_val = _ids_to_int(y)
                p_val = _ids_to_int(p)
            except ValueError:
                # Only a malformed number is counted as invalid; any other
                # error (or Ctrl-C) is allowed to propagate.
                invalid_number_prediction_count += 1
            else:
                model_predictions.append(p_val)
                ground_truth.append(truth_val)

    all_model_predictions.append(model_predictions)
    all_ground_truth.append(ground_truth)
    invalid_number_prediction_counts.append(invalid_number_prediction_count)

print('\n...training finished.')

Start training... 

Train: epoch 1
Epoch: [1/10], step: [1/625], loss: 2.748499
Epoch: [1/10], step: [201/625], loss: 1.807244
Epoch: [1/10], step: [401/625], loss: 1.722589
Epoch: [1/10], step: [601/625], loss: 1.653003
Test: epoch 1 loss: 1.5735583
X: 9297+6292$
Y: 15589$
O: 16444$

X: 3846-6267$
Y: -2421$
O: -3144$

X: 4023+3386$
Y: 7409$#
O: 8044$#

Train: epoch 2
Epoch: [2/10], step: [1/625], loss: 1.609485
Epoch: [2/10], step: [201/625], loss: 1.536041
Epoch: [2/10], step: [401/625], loss: 1.488267
Epoch: [2/10], step: [601/625], loss: 1.488860
Test: epoch 2 loss: 1.4064794
X: 5097+6981$
Y: 12078$
O: 12302$

X: 8061+2256$
Y: 10317$
O: 10662$

X: 566+9167$#
Y: 9733$#
O: 9662$#

Train: epoch 3
Epoch: [3/10], step: [1/625], loss: 1.460178
Epoch: [3/10], step: [201/625], loss: 1.456057
Epoch: [3/10], step: [401/625], loss: 1.379740
Epoch: [3/10], step: [601/625], loss: 1.392766
Test: epoch 3 loss: 1.3364289
X: 2227-276$#
Y: 1951$#
O: 1965$#

X: 2092-8108$
Y: -6016$
O: -5665$

X: 2569

In [39]:
from sklearn.metrics import mean_absolute_error

In [40]:
# Report, for every epoch, the mean absolute error between the parsed
# predictions and the ground truth, plus the number of invalid predictions.
epoch_results = zip(all_ground_truth, all_model_predictions, invalid_number_prediction_counts)
for i, epoch_result in enumerate(epoch_results, start=1):
    gts, predictions, invalid_number_prediction_count = epoch_result
    mae = mean_absolute_error(gts, predictions)
    print("Epoch: %i, MAE: %f, Invalid numbers: %i" % (i, mae, invalid_number_prediction_count))

Epoch: 1, MAE: 808.840300, Invalid numbers: 0
Epoch: 2, MAE: 400.126700, Invalid numbers: 0
Epoch: 3, MAE: 270.173400, Invalid numbers: 0
Epoch: 4, MAE: 182.016100, Invalid numbers: 0
Epoch: 5, MAE: 149.721750, Invalid numbers: 0
Epoch: 6, MAE: 83.784250, Invalid numbers: 0
Epoch: 7, MAE: 43.268350, Invalid numbers: 0
Epoch: 8, MAE: 35.178850, Invalid numbers: 0
Epoch: 9, MAE: 31.102050, Invalid numbers: 0
Epoch: 10, MAE: 29.398700, Invalid numbers: 0
