In [1]:
%matplotlib inline
import tensorflow as tf
import numpy as np
import os

In [2]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
# "ssss " marks the start of the decoding process while " eeee" tells the decoder to stop.
# translate() strips the surrounding space from these markers before looking
# them up in the tokenizer's word_index.
mark_start = 'ssss '
mark_end = ' eeee'

In [4]:
# Maximum number of words in the vocabulary. Used below as the input dimension
# of both embedding layers and as the number of units in the decoder's dense
# output layer.
num_words = 8000

In [5]:
import pickle

In [6]:
# Load the pickled tokenizer from the file 'tokenizer' in the working directory.
# It is used as a module-level object by TokenizerWrap and translate().
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('tokenizer', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [7]:
# performing tokenizer related operations
class TokenizerWrap:
    """
    Holds a tokenized, padded version of a data-set and offers helpers
    for converting between text and integer-tokens.

    Note that all conversions are delegated to the module-level
    `tokenizer` object, so that object must exist before this class
    is used.
    """

    def __init__(self, texts, padding,
                 reverse=False):
        """
        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        """

        # Turn every text into a list of integer-tokens.
        # The resulting lists may have different lengths.
        sequences = tokenizer.texts_to_sequences(texts)

        if reverse:
            # Flip each token-list so the last word comes first.
            sequences = [list(seq[::-1]) for seq in sequences]

        self.tokens = sequences

        # Length of each individual token-list.
        self.num_tokens = [len(seq) for seq in sequences]

        # A batch of sequences must share one length. Using the longest
        # sequence would waste memory, so we use the mean length plus
        # two standard deviations instead; only a small share of the
        # sequences (maybe 5%) is then longer and gets truncated.
        lengths = self.num_tokens
        self.max_tokens = int(np.mean(lengths) + 2 * np.std(lengths))

        # Truncating throws away part of a sequence, padding adds zeros.
        # 'pre' / 'post' selects whether that happens at the beginning
        # or at the end. For reversed sequences the beginning holds the
        # end of the original text, so that is the part to cut off.
        truncating = 'pre' if reverse else 'post'

        # Pad / truncate everything to max_tokens, which gives a
        # 2-dim numpy matrix that is easier to work with.
        self.tokens_padded = pad_sequences(sequences,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Lookup a single word from an integer-token (0 gives a space)."""

        if token == 0:
            # Token 0 is the padding value and has no word.
            return " "

        return tokenizer.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a space-separated string."""

        # Skip the padding value 0 and look up every remaining token.
        return " ".join(tokenizer.index_to_word[tok]
                        for tok in tokens
                        if tok != 0)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to tokens with optional
        reversal and padding.

        The result is a 2-dim numpy array holding one row.
        """

        # Only one text is given, so wrap it in a list for the tokenizer.
        tokens = np.array(tokenizer.texts_to_sequences([text]))

        if reverse:
            # Reverse the order of the tokens within the row.
            tokens = np.flip(tokens, axis=1)

        if padding:
            # Pad at the beginning. A reversed sequence is truncated at
            # the beginning as well (the end of the original text),
            # otherwise truncation happens at the end.
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating='pre' if reverse else 'post')

        return tokens

In [8]:
# Load the pickled source-side tokenizer object from the file 'tokenizer_src'.
# NOTE(review): translate() calls text_to_tokens() on it, so it is presumably
# a pickled TokenizerWrap instance — confirm.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('tokenizer_src', 'rb') as handle:
    tokenizer_src = pickle.load(handle)

In [9]:
# Load the pickled destination-side tokenizer object from the file
# 'tokenizer_dest'.
# NOTE(review): translate() reads max_tokens and calls token_to_word() on it,
# so it is presumably a pickled TokenizerWrap instance — confirm.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('tokenizer_dest', 'rb') as handle_:
    tokenizer_dest = pickle.load(handle_)

In [10]:
# Input layer of the encoder. shape=(None, ) leaves the sequence length open.
# translate() feeds it the integer-tokens of the source text.
encoder_input = Input(shape=(None, ), name='encoder_input')

In [11]:
# Length of the embedding vectors (output_dim) for both the encoder's and the
# decoder's Embedding layer.
embedding_size = 128

In [12]:
# Embedding layer of the encoder: maps each integer-token (vocabulary of
# num_words entries) to a vector of length embedding_size.
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')

In [13]:
# Number of units in every GRU layer of the encoder and the decoder. It is
# also the size of the decoder_initial_state input.
state_size = 512

In [14]:
# The three GRU layers of the encoder. The first two return the whole sequence
# so the next GRU layer receives a sequence as input. The last one returns only
# its final output, which is later passed to the decoder as its initial state.
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)

In [15]:

def connect_encoder():
    """
    Chain the encoder layers together and return the result of the
    last GRU layer.

    Uses the module-level objects encoder_input, encoder_embedding and
    encoder_gru1 .. encoder_gru3.
    """

    # Feed the input-layer into the embedding-layer.
    net = encoder_embedding(encoder_input)

    # Pass the result through the GRU-layers, one after the other.
    for gru_layer in (encoder_gru1, encoder_gru2, encoder_gru3):
        net = gru_layer(net)

    # Whatever the last GRU-layer produced is the encoder's output.
    return net

In [16]:
# Build the encoder and keep its output; it is used below both as the
# decoder's initial state during training and as the output of model_encoder.
encoder_output = connect_encoder()

In [17]:

# Input for the decoder's initial state, one vector of length state_size.
# It is used when the decoder is run as a separate model (model_decoder),
# where translate() feeds it the output of model_encoder.
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

In [18]:
# Input layer of the decoder. shape=(None, ) leaves the sequence length open.
# translate() feeds it the integer-tokens generated so far.
decoder_input = Input(shape=(None, ), name='decoder_input')

In [19]:
# Embedding layer of the decoder: maps each integer-token (vocabulary of
# num_words entries) to a vector of length embedding_size.
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

In [20]:
# The three GRU layers of the decoder. All of them return the whole sequence,
# so the dense output layer produces a prediction for every position.
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)

In [21]:
# Final layer of the decoder with one output per vocabulary word (num_words).
# The activation is linear, so the outputs are raw scores rather than
# probabilities; translate() picks the word with the highest score.
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

In [22]:

def connect_decoder(initial_state):
    """
    Chain the decoder layers together and return the result of the
    final dense layer.

    Uses the module-level objects decoder_input, decoder_embedding,
    decoder_gru1 .. decoder_gru3 and decoder_dense.

    :param initial_state: Passed as initial_state to each of the
                          three GRU-layers.
    """

    # Feed the input-layer into the embedding-layer.
    net = decoder_embedding(decoder_input)

    # Pass the result through the GRU-layers, one after the other.
    # Every GRU-layer is started from the same initial state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        net = gru_layer(net, initial_state=initial_state)

    # The dense layer converts the GRU output to one value per word
    # in the vocabulary.
    return decoder_dense(net)

In [23]:
# Build the decoder for training: its GRU layers start from the encoder's
# output, which connects the encoder and the decoder into one network.
decoder_output = connect_decoder(initial_state=encoder_output)

In [24]:
# End-to-end model: takes the encoder's and the decoder's input tokens and
# outputs the decoder's predictions. The saved weights are loaded into this
# model further below.
model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

In [25]:

# Stand-alone encoder model: maps the input tokens to the encoder's output.
# translate() uses it to compute the decoder's initial state.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

In [26]:
# Connect the same decoder layers a second time, now starting from the
# decoder_initial_state input instead of the encoder's output, so the decoder
# can be run on its own in translate().
# NOTE(review): this rebinds the name decoder_output. Cells that need the
# training-time output (e.g. the cell building model_train) presumably have to
# run before this one — confirm before re-running cells out of order.
decoder_output = connect_decoder(initial_state=decoder_initial_state)

# Stand-alone decoder model: takes the tokens generated so far plus an initial
# state and outputs the predictions for every position.
model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

In [27]:
from keras.models import load_model

Using TensorFlow backend.


In [28]:
# Load the saved weights from the file "22modelweight.h5" into the training
# model. model_encoder and model_decoder are built from the same layer objects,
# so they use these weights as well.
model_train.load_weights("22modelweight.h5")

In [54]:
def translate(input_text):
    """
    Translate a single text-string and print the input and the result.

    The encoder turns the reversed and padded input tokens into a state
    vector. The decoder is then run repeatedly, starting from the
    start-marker and each time picking the highest-scoring next token,
    until the end-marker is produced or the maximum number of tokens
    is reached.

    Uses the module-level objects tokenizer, tokenizer_src,
    tokenizer_dest, model_encoder, model_decoder, mark_start and
    mark_end.

    :param input_text: The text-string to translate.
    :return: None. The input text and the translated text are printed.
    """

    # Convert the input-text to integer-tokens.
    # Note the sequence of tokens has to be reversed.
    # The tokens are also padded to the source tokenizer's max_tokens.
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # Get the output of the encoder's GRU which will be
    # used as the initial state in the decoder's GRU.
    # This could also have been the encoder's final state
    # but that is really only necessary if the encoder
    # and decoder use the LSTM instead of GRU because
    # the LSTM has two internal states.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Pre-allocate the 2-dim array used as input to the decoder.
    # This holds just a single sequence of integer-tokens,
    # but the decoder-model expects a batch of sequences.
    # The builtin int is used as dtype because the alias np.int
    # has been removed from NumPy (it only ever meant the builtin int).
    shape = (1, max_tokens)
    decoder_input_data = np.zeros(shape=shape, dtype=int)

    # Wrap the input-data in a dict for clarity and safety,
    # so we are sure we input the data in the right order.
    # The dict is built once; decoder_input_data is updated in place
    # inside the loop, so the dict always refers to the current tokens.
    x_data = \
    {
        'decoder_initial_state': initial_state,
        'decoder_input': decoder_input_data
    }

    # The special start- and end-markers without surrounding spaces,
    # and their integer-tokens.
    word_end = mark_end.strip()
    token_start = tokenizer.word_index[mark_start.strip()]
    token_end = tokenizer.word_index[word_end]

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start

    # Initialize an empty output-text.
    output_text = ''

    # Initialize the number of tokens we have processed.
    count_tokens = 0

    # While we haven't sampled the special end-token for ' eeee'
    # and we haven't processed the max number of tokens.
    while token_int != token_end and count_tokens < max_tokens:
        # Update the input-sequence to the decoder
        # with the last token that was sampled.
        # In the first iteration this will set the
        # first element to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Note that we input the entire sequence of tokens
        # to the decoder. This wastes a lot of computation
        # because we are only interested in the last input
        # and output. We could modify the code to return
        # the GRU-states when calling predict() and then
        # feeding these GRU-states as well the next time
        # we call predict(), but it would make the code
        # much more complicated.

        # Input this data to the decoder and get the predicted output.
        decoder_output = model_decoder.predict(x_data)

        # Get the scores predicted for the current position,
        # one value per word in the vocabulary.
        token_onehot = decoder_output[0, count_tokens, :]

        # Convert to an integer-token by taking the highest score.
        token_int = np.argmax(token_onehot)

        # Lookup the word corresponding to this integer-token.
        sampled_word = tokenizer_dest.token_to_word(token_int)

        # Stop before the end-marker is added to the output-text.
        if sampled_word == word_end:
            break

        # Append the word to the output-text.
        output_text += " " + sampled_word

        # Increment the token-counter.
        count_tokens += 1

    # Print the input-text.
    print("Input text:")
    print(input_text)
    print()

    # Print the translated output-text.
    print("Translated text:")
    print(output_text)
    print()


In [57]:
# Example: translate a sentence that contains a misspelled word ("DEFINALTELY").
translate(input_text="But you will DEFINALTELY know when you are in love!")

Input text:
But you will DEFINALTELY know when you are in love!

Translated text:
 you will know when you are in love



In [58]:
# Example: translate a second, longer sentence.
translate(input_text="Many words can multitask, doing different jobs in different sentences.")

Input text:
Many words can multitask, doing different jobs in different sentences.

Translated text:
 there is many ways to make out of all of them

