In [55]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import math
import os

In [56]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [57]:
normal = pd.read_csv('../data/sentence-aligned.v2/normal.aligned',sep='\t',header=None)
normal.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,"Cherokee is a city in Alfalfa County , Oklahom..."
2,Skateboard,5,Skateboard decks are usually between 28 and 33...
3,Skateboard,5,The underside of the deck can be printed with ...
4,Skateboard,6,This was created by two surfers ; Ben Whatson ...


In [58]:
simple = pd.read_csv('../data/sentence-aligned.v2/simple.aligned',sep='\t',header=None)
simple.head()

Unnamed: 0,0,1,2
0,"Cherokee, Oklahoma",0,It is the county seat of Alfalfa County .
1,"Cherokee, Oklahoma",0,Cherokee is a city of Oklahoma in the United S...
2,Skateboard,2,Skateboard decks are normally between 28 and 3...
3,Skateboard,2,The bottom of the deck can be printed with a d...
4,Skateboard,3,The longboard was made by two surfers ; Ben Wh...


In [59]:
bos = 'bos ' # Beginning of sentence token
eos = ' eos' # End of sentence token
identical_filter = (normal[2] != simple[2])
input_texts = normal[2][identical_filter]
input_texts = np.array([text.lower().split(' ') for text in input_texts])
target_texts = simple[2][identical_filter]
target_texts = np.array([f'{bos}{text}{eos}'.lower().split(' ') for text in target_texts])
print(f'No. pairs before preprocessing: {len(normal[2])}')
print(f'No. pairs after preprocessing: {len(input_texts)}')

No. pairs before preprocessing: 167689
No. pairs after preprocessing: 117952


In [60]:
# Once again, credit to Hvass Labs, https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/21_Machine_Translation.ipynb
class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality."""
    
    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """
        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        :param num_words: Max number of words to use.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))

        # Convert all texts to lists of integer-tokens.
        # Note that the sequences may have different lengths.
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            # Reverse the token-sequences.
            self.tokens = [list(reversed(x)) for x in self.tokens]
        
            # Sequences that are too long should now be truncated
            # at the beginning, which corresponds to the end of
            # the original sequences.
            truncating = 'pre'
        else:
            # Sequences that are too long should be truncated
            # at the end.
            truncating = 'post'

        # The number of integer-tokens in each sequence.
        self.num_tokens = [len(x) for x in self.tokens]

        # Max number of tokens to use in all sequences.
        # We will pad / truncate all sequences to this length.
        # This is a compromise so we save a lot of memory and
        # only have to truncate maybe 5% of all the sequences.
        self.max_tokens = np.mean(self.num_tokens) \
                          + 2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)

        # Pad / truncate all token-sequences to the given length.
        # This creates a 2-dim numpy matrix that is easier to use.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Lookup a single word from an integer-token."""

        word = " " if token == 0 else self.index_to_word[token]
        return word 

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a string."""

        # Create a list of the individual words.
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        
        # Concatenate the words to a single string
        # with space between all the words.
        text = " ".join(words)

        return text
    
    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to tokens with optional
        reversal and padding.
        """

        # Convert to tokens. Note that we assume there is only
        # a single text-string so we wrap it in a list.
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            # Reverse the tokens.
            tokens = np.flip(tokens, axis=1)

            # Sequences that are too long should now be truncated
            # at the beginning, which corresponds to the end of
            # the original sequences.
            truncating = 'pre'
        else:
            # Sequences that are too long should be truncated
            # at the end.
            truncating = 'post'

        if padding:
            # Pad and truncate sequences to the given length.
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating=truncating)

        return tokens

In [61]:
vocab_size=30_000             # CHANGE TO 60_000

In [62]:
%%time
tokenizer_src = TokenizerWrap(texts=input_texts,
                              padding='pre',
                              reverse=True,
                              num_words=vocab_size)

CPU times: user 7.1 s, sys: 258 ms, total: 7.35 s
Wall time: 7.62 s


In [63]:
%%time
tokenizer_dest = TokenizerWrap(texts=target_texts,
                               padding='post',
                               reverse=False,
                               num_words=vocab_size)

CPU times: user 5.46 s, sys: 115 ms, total: 5.57 s
Wall time: 5.63 s


In [64]:
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape)
print(tokens_dest.shape)

(117952, 54)
(117952, 48)


In [65]:
token_start = tokenizer_dest.word_index[bos.strip()]
token_start

5

In [66]:
token_end = tokenizer_dest.word_index[eos.strip()]
token_end

4

In [67]:
tokenizer_src.tokens_to_string(tokens_src[2])

'. means other any by decorated or , blank , manufacturer the by design a with printed be can deck the of underside the'

In [68]:
normal[2][3]

'The underside of the deck can be printed with a design by the manufacturer , blank , or decorated by any other means .'

In [69]:
tokens_dest[2]

array([   5,    1, 2112,    6,    1, 5594,   71,   35, 1988,   23,    8,
        720,   19,    1, 3294,    2,   30,   15,   71,   35, 9011,    2,
          4,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

In [70]:
tokenizer_dest.tokens_to_string(tokens_dest[2])

'bos the bottom of the deck can be printed with a design by the maker . or it can be blank . eos'

In [71]:
encoder_input_data=tokens_src

In [72]:
decoder_input_data=tokens_dest[:,:-1]

In [73]:
decoder_input_data.shape

(117952, 47)

In [74]:
decoder_output_data = tokens_dest[:, 1:]
decoder_output_data.shape

(117952, 47)

In [75]:
decoder_input_data[4]

array([    5,    61,  1886,    65,     7,   469, 19625, 17811,     3,
        9742,     3, 11725,     3,     3,  1679,  6058,     3,  9386,
           3,     9,  2750,     2,     4,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [76]:
tokenizer_dest.tokens_to_string(decoder_input_data[15])

'bos mitsubishi motors has been around since 1970 . eos'

In [77]:
decoder_output_data[4]

array([   61,  1886,    65,     7,   469, 19625, 17811,     3,  9742,
           3, 11725,     3,     3,  1679,  6058,     3,  9386,     3,
           9,  2750,     2,     4,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [78]:
tokenizer_dest.tokens_to_string(decoder_output_data[15])

'mitsubishi motors has been around since 1970 . eos'

# Neural Net Time

## Encoder

In [79]:
encoder_input = Input(shape=(None, ), name='encoder_input')

In [80]:
embedding_size = 300

In [81]:
%%time
encoder_embedding = Embedding(input_dim=vocab_size,
                              output_dim=embedding_size,
                              name='encoder_embedding')

CPU times: user 305 µs, sys: 69 µs, total: 374 µs
Wall time: 378 µs


In [82]:
state_size = 512

In [83]:
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
# encoder_gru2 = GRU(state_size, name='encoder_gru2',
#                    return_sequences=True)
# encoder_gru3 = GRU(state_size, name='encoder_gru3',
#                    return_sequences=False)

In [84]:
def connect_encoder():
    # Start the neural network with its input-layer.
    net = encoder_input
    
    # Connect the embedding-layer.
    net = encoder_embedding(net)

    # Connect all the GRU-layers.
    net = encoder_gru1(net)
#     net = encoder_gru2(net)
#     net = encoder_gru3(net)

    # This is the output of the encoder.
    encoder_output = net
    
    return encoder_output

In [85]:
encoder_output = connect_encoder()

## Decoder

In [95]:
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

In [96]:
decoder_input = Input(shape=(None, ), name='decoder_input')

In [97]:
decoder_embedding = Embedding(input_dim=vocab_size,
                              output_dim=embedding_size,
                              name='decoder_embedding')

In [94]:
decoder_gru = GRU(state_size, name='decoder_gru',
                   return_sequences=True)

In [90]:
decoder_dense = Dense(vocab_size,
                      activation='linear',
                      name='decoder_output')

In [91]:
def connect_decoder(initial_state):
    # Start the decoder-network with its input-layer.
    net = decoder_input

    # Connect the embedding-layer.
    net = decoder_embedding(net)
    
    # Connect all the GRU-layers.
    net = decoder_gru1(net, initial_state=initial_state)

    # Connect the final dense layer that converts to
    # one-hot encoded arrays.
    decoder_output = decoder_dense(net)
    
    return decoder_output

## Connect encoder and decoder

In [92]:
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

ValueError: An `initial_state` was passed that is not compatible with `cell.state_size`. Received `state_spec`=[InputSpec(shape=(None, None, 512), ndim=3)]; however `cell.state_size` is [512]

In [None]:

model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

In [None]:
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

In [None]:
def sparse_cross_entropy(y_true, y_pred):
    """
    Calculate the cross-entropy loss between y_true and y_pred.
    
    y_true is a 2-rank tensor with the desired output.
    The shape is [batch_size, sequence_length] and it
    contains sequences of integer-tokens.

    y_pred is the decoder's output which is a 3-rank tensor
    with shape [batch_size, sequence_length, num_words]
    so that for each sequence in the batch there is a one-hot
    encoded array of length num_words.
    """

    # Calculate the loss. This outputs a
    # 2-rank tensor of shape [batch_size, sequence_length]
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)

    # Keras may reduce this across the first axis (the batch)
    # but the semantics are unclear, so to be sure we use
    # the loss across the entire 2-rank tensor, we reduce it
    # to a single scalar with the mean function.
    loss_mean = tf.reduce_mean(loss)

    return loss_mean

## Compile

In [None]:
optimizer = RMSprop(lr=1e-3)

In [None]:
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

In [None]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

In [None]:
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

In [None]:
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

In [None]:
callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [None]:
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

In [None]:
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)

In [None]:
x_data = \
{
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}

In [None]:
y_data = \
{
    'decoder_output': decoder_output_data
}

In [None]:
validation_split = 10000 / len(encoder_input_data)
validation_split

In [None]:
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=10,
                validation_split=validation_split,
                callbacks=callbacks)