In [1]:
# Install the third-party packages this notebook imports further down:
# `unidecode` (imported with the other modules in the next cell) and
# `observations` (used to fetch the PTB corpus).
!pip install unidecode
!pip install observations



In [2]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf

tf.enable_eager_execution()

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import time
import unidecode

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.10.0


## Load PTB data

In [3]:
import observations

# The PTB loader is called with the cache directory 'data/' and its result is
# unpacked into three values; `text` is the corpus used by the cells below.
text, testfile, valfile = observations.ptb('data/')

In [4]:
len(text)

5269890

In [5]:
def preprocess_sentence(w):
    """Normalise raw corpus text to a small character alphabet.

    The text is lower-cased, the PTB marker tokens ``<eos>`` and ``<unk>``
    are replaced by a space, and every run of characters outside
    ``a-z ? . ! ,`` is collapsed into a single space.

    Args:
        w: the raw text (str).

    Returns:
        The cleaned text (str). It can only contain lower-case ASCII
        letters, ``?``, ``.``, ``!``, ``,`` and single spaces.
    """
    w = w.lower()

    # Drop the marker tokens first; otherwise the letters inside them
    # ("eos", "unk") would survive the character filter below.
    for marker in ("<eos>", "<unk>"):
        w = w.replace(marker, " ")

    # Replace each run of disallowed characters (spaces included) with one
    # space. Because whole runs are matched, the result can never contain
    # two consecutive spaces, so no separate whitespace-squeezing pass is
    # needed.
    return re.sub(r"[^a-z?.!,]+", " ", w)

# Rebind `text` to its cleaned version, so every later cell works on the
# normalised corpus, then report its length in characters.
text = preprocess_sentence(text)
print(len(text))

4715474


In [6]:
# Vocabulary: the special '<start>' symbol takes index 0, followed by every
# distinct character of the corpus in sorted order.
unique = ['<start>'] + sorted(set(text))

# Lookup tables in both directions: index -> symbol and symbol -> index.
idx2char = dict(enumerate(unique))
char2idx = {symbol: index for index, symbol in idx2char.items()}

print("number of unique characters ", len(unique))

number of unique characters  29


In [7]:
# length, in characters, of every training window (inputs and targets are
# sliced from the corpus with exactly this length)
max_length = 35

# number of symbols in the vocabulary ('<start>' plus the distinct characters)
vocab_size = len(unique)

# size of the character embedding vectors
embedding_dim = 256

# number of RNN (here GRU) units, used for both encoder and decoder
units = 1024

# number of windows per training batch
BATCH_SIZE = 64

# size of the shuffle buffer used when building the tf.data pipeline
BUFFER_SIZE = 12000

In [8]:
# Slide a window of `max_length` characters over the corpus in steps of 3 and
# encode each window as a list of vocabulary indices.
# Inputs and targets are deliberately the same clean window: the training
# loop corrupts the input side on the fly and the model learns to restore it.
window_offsets = range(0, len(text) - max_length, 3)

input_text = [[char2idx[ch] for ch in text[offset:offset + max_length]]
              for offset in window_offsets]
target_text = [[char2idx[ch] for ch in text[offset:offset + max_length]]
               for offset in window_offsets]

print (np.array(input_text).shape)
print (np.array(target_text).shape)

(1571813, 35)
(1571813, 35)


In [9]:
# Stream (input, target) window pairs, shuffle them through a buffer of
# BUFFER_SIZE elements and group them into batches of exactly BATCH_SIZE.
# The final short batch is dropped because the models are built for a fixed
# batch size. `Dataset.batch(..., drop_remainder=True)` replaces the
# deprecated `tf.contrib.data.batch_and_drop_remainder`.
dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

Instructions for updating:
Use `tf.data.Dataset.batch(..., drop_remainder=True)`.


# Build the model

In [10]:
tf.test.is_gpu_available()

True

In [11]:
def gru(units):
  """Create the recurrent layer shared by the encoder and the decoder.

  Picks `CuDNNGRU` when TensorFlow reports an available GPU and the plain
  `GRU` layer otherwise. Either way the layer returns the full output
  sequence as well as its final state.

  Args:
    units: number of GRU units.

  Returns:
    A freshly constructed Keras GRU layer.
  """
  # The conditional expression is lazy, so the CuDNN class is only looked up
  # when a GPU is actually present.
  layer_cls = (tf.keras.layers.CuDNNGRU if tf.test.is_gpu_available()
               else tf.keras.layers.GRU)
  return layer_cls(units,
                   return_sequences=True,
                   return_state=True,
                   recurrent_initializer='glorot_uniform')

In [12]:
class Encoder(tf.keras.Model):
    """Embeds a batch of character indices and runs it through a GRU."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding table and the recurrent layer.

        Args:
            vocab_size: number of distinct symbols that can be embedded.
            embedding_dim: size of each embedding vector.
            enc_units: number of GRU units.
            batch_sz: batch size used for the initial hidden state.
        """
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """Encode `x`, starting the GRU from `hidden`.

        Returns:
            A pair (per-step GRU outputs, final GRU state).
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [13]:
class Decoder(tf.keras.Model):
    """One-step GRU decoder with additive attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: number of symbols; also the width of the output logits.
            embedding_dim: size of each embedding vector.
            dec_units: number of GRU units and width of the attention layers.
            batch_sz: batch size used for the initial hidden state.
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # layers that compute the attention scores
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: indices of the current decoder input; the training loop in
                this notebook passes one symbol per row, shape (batch_size, 1).
            hidden: previous state, shape (batch_size, hidden_size). It is
                used as the attention query.
            enc_output: encoder outputs, shape
                (batch_size, max_length, hidden_size).

        Returns:
            A triple (logits over the vocabulary, new GRU state,
            attention weights of shape (batch_size, max_length, 1)).
        """
        # Give the state a time axis, (batch_size, 1, hidden_size), so it
        # broadcasts against every encoder position in the sum below.
        query = tf.expand_dims(hidden, 1)

        # energy shape == (batch_size, max_length, dec_units)
        energy = tf.nn.tanh(self.W1(enc_output) + self.W2(query))

        # self.V reduces the last axis to 1; the softmax then normalises over
        # the encoder positions (axis 1).
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum of the encoder outputs -> (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * enc_output, axis=1)

        # (batch_size, 1, embedding_dim) for a one-symbol input
        embedded = self.embedding(x)

        # (batch_size, 1, hidden_size + embedding_dim): context first, then embedding
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): the GRU is called without `initial_state`, so `hidden`
        # only influences this step through the attention context. Confirm
        # this is intended before changing it.
        output, state = self.gru(gru_input)

        # Fold the time axis into the batch axis: (batch_size * steps, hidden_size),
        # i.e. (batch_size, hidden_size) for a one-symbol input.
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # logits shape == (batch_size * steps, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, dec_units)."""
        state_shape = (self.batch_sz, self.dec_units)
        return tf.zeros(state_shape)

In [14]:
# Encoder and decoder are built with identical vocabulary size, embedding
# size, state size and batch size.
model_args = (len(unique), embedding_dim, units, BATCH_SIZE)
encoder = Encoder(*model_args)
decoder = Decoder(*model_args)

In [15]:
optimizer = tf.train.AdamOptimizer(2.5e-5)


def loss_function(real, pred):
  """Masked sparse softmax cross-entropy, averaged over the batch.

  Args:
    real: integer class labels, one per row of `pred`.
    pred: unnormalised logits, one row per label.

  Returns:
    A scalar tensor: the mean of the per-row losses, where rows whose label
    is 0 contribute a loss of zero (they still count in the denominator).
  """
  # Build the mask with TensorFlow ops instead of NumPy: the values are the
  # same (0.0 where the label is 0, 1.0 elsewhere) but nothing is copied to
  # the host and the function no longer depends on eager execution.
  mask = tf.cast(tf.not_equal(real, 0), pred.dtype)
  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
  return tf.reduce_mean(loss_)

In [16]:
# Checkpoints for this run are written under
# ./training_checkpoints/lr.000025_error4/ with the file prefix "ckpt".
checkpoint_dir = os.path.join('./training_checkpoints', "lr.000025_error4")
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track the optimizer together with both models so they are saved and
# restored as a unit.
tracked_objects = dict(optimizer=optimizer,
                       encoder=encoder,
                       decoder=decoder)
checkpoint = tf.train.Checkpoint(**tracked_objects)

In [17]:
# Look up the most recent checkpoint in `checkpoint_dir` and restore it into
# the optimizer, encoder and decoder tracked by `checkpoint`. The returned
# load-status object is displayed as the cell output.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x1e1d9181f98>

In [18]:
EPOCHS = 100


def _corrupt_batch(clean_ids, positions, replacements):
    """Return a copy of `clean_ids` with random characters overwritten.

    For every row b and every error slot d, the entry at column
    `positions[b, d]` of row b is replaced by `replacements[b, d]`.
    `positions` and `replacements` both have shape (rows, nb_error);
    nb_error may be 0, in which case the copy is returned unchanged.
    """
    noisy = np.array(clean_ids, copy=True)
    rows = np.arange(noisy.shape[0])[:, np.newaxis]  # broadcasts against (rows, nb_error)
    noisy[rows, positions] = replacements
    return noisy


def _decode(ids):
    """Turn a 1-D sequence of vocabulary indices back into a string."""
    return ''.join(idx2char[int(i)] for i in ids)


for epoch in range(EPOCHS):
    start = time.time()

    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    num_batches = 0

    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        show_sample = batch % 500 == 0

        clean = inp.numpy()
        if show_sample:
            # print the first window of the batch before corruption
            print(_decode(clean[0]) + '   (original)')

        # Simulate typing errors: overwrite between 0 and 4 random positions
        # of every input window with a random character. Index 0 ('<start>')
        # is never used as a replacement.
        nb_error = np.random.randint(0, 5)
        random_index = np.random.randint(0, max_length, (BATCH_SIZE, nb_error))
        random_value = np.random.randint(1, len(unique), (BATCH_SIZE, nb_error))

        # Each row uses its own positions (random_index[b, d]); the whole
        # batch is corrupted in one vectorised assignment instead of
        # round-tripping the tensor through NumPy once per error.
        noisy = _corrupt_batch(clean, random_index, random_value)
        inp = tf.convert_to_tensor(noisy)

        if show_sample:
            # print the same window after corruption
            print(_decode(noisy[0]) + ' ==>  ')

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([char2idx['<start>']] * BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(0, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

                if show_sample:
                    # print the model's current guess for the first window
                    output = predictions[0]
                    print(idx2char[output.numpy().argmax(-1)], end='')

                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        # average loss per decoded character, used for reporting only
        batch_loss = (loss / int(targ.shape[1]))

        total_loss += batch_loss
        num_batches += 1

        variables = encoder.variables + decoder.variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        if batch % 50 == 0:
            print('  Epoch {} Batch {} @{} erroes Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         nb_error,
                                                         batch_loss.numpy()))
        if batch % 3000 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

    # saving (checkpoint) the model at the end of every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)

    # Divide by the number of batches actually processed (the last value of
    # `batch` is one less than that, and is undefined for an empty dataset).
    print('Epoch {} Loss {:.4f}\n'.format(epoch + 1,
                                        total_loss / max(num_batches, 1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

rnational business machines corp. a   (original)
rnation l business mychines corp. a ==>  
rnation l business mychines corp. a  Epoch 1 Batch 0 @2 erroes Loss 0.2373
  Epoch 1 Batch 50 @3 erroes Loss 0.3878
  Epoch 1 Batch 100 @2 erroes Loss 0.2420
  Epoch 1 Batch 150 @4 erroes Loss 0.4566
  Epoch 1 Batch 200 @0 erroes Loss 0.0420
  Epoch 1 Batch 250 @4 erroes Loss 0.4631
  Epoch 1 Batch 300 @1 erroes Loss 0.1431
  Epoch 1 Batch 350 @2 erroes Loss 0.2312
  Epoch 1 Batch 400 @0 erroes Loss 0.0355
  Epoch 1 Batch 450 @3 erroes Loss 0.3549
increasingly trade debate foreigner   (original)
increasinglc trade deb te foreimder ==>  
increasingly trade deb te foreimder  Epoch 1 Batch 500 @4 erroes Loss 0.4243
  Epoch 1 Batch 550 @2 erroes Loss 0.2292
  Epoch 1 Batch 600 @2 erroes Loss 0.2388
  Epoch 1 Batch 650 @3 erroes Loss 0.3318
  Epoch 1 Batch 700 @2 erroes Loss 0.2480
  Epoch 1 Batch 750 @0 erroes Loss 0.0399
  Epoch 1 Batch 800 @1 erroes Loss 0.1387
  Epoch 1 Batch 850 @4 erroes Loss 0.

KeyboardInterrupt: 