# Neural Machine Translator with Seq2seq: German to English

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf
from PIL import Image
from collections import Counter
import csv
# Seq2Seq Items
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.python.ops.rnn_cell import LSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.python.layers.core import Dense

  from ._conv import register_converters as _register_converters


## Defining Hyperparameters
We define main hyperparameters required for the model

In [2]:
vocab_size= 50000
num_units = 128
input_size = 128
batch_size = 16
source_sequence_length=40
target_sequence_length=60
decoder_type = 'basic' # could be basic or attention

# Data Preprocessing (Copied from Chapter 10)

## Loading Data 

First, download the data from this [page](https://nlp.stanford.edu/projects/nmt/) and place them in the ch10 folder. You do not need to do anything if you have already run the exercises in `ch10` folder. The required files are:

* File containing German sentences: [`train.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de)
* File containing English sentences: [`train.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en)
* File containing German vocabulary: [`vocab.50K.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.de)
* File containing English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading Vocabulary

First we build the vocabulary dictionaries for both the source (German) and target (English) languages. The vocabularies are found in the `vocab.50K.de` (German) and `vocab.50K.en` files.

In [4]:
# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
src_dictionary = dict()

# Read the vocabulary file
with open(os.path.join('..','ch10','vocab.50K.de'), encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        src_dictionary[line[:-1]] = len(src_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
src_reverse_dictionary = dict(zip(src_dictionary.values(),src_dictionary.keys()))

# Print some of the words in the dictionary
print('Source')
print('\t',list(src_dictionary.items())[:10])
print('\t',list(src_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(src_dictionary))

# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
tgt_dictionary = dict()

# Read the vocabulary file
with open(os.path.join('..','ch10','vocab.50K.en'), encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        tgt_dictionary[line[:-1]] = len(tgt_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
tgt_reverse_dictionary = dict(zip(tgt_dictionary.values(),tgt_dictionary.keys()))

# Print some of the words in the dictionary
print('Target')
print('\t',list(tgt_dictionary.items())[:10])
print('\t',list(tgt_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(tgt_dictionary))

# Each language has 50000 words
vocabulary_size = 50000

Source
	 [('veraltete', 20446), ('Norditalien', 45398), ('89', 10378), ('WD', 19593), ('beiderseitigen', 24051), ('Filmfestivals', 49249), ('Wahlkommission', 34930), ('piscine', 14869), ('berechtigterweise', 34797), ('Budgetierung', 34984)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'die'), (6, 'der'), (7, 'und'), (8, 'in'), (9, 'zu')]
	 Vocabulary size:  50000
Target
	 [('younger', 5563), ('spade', 21549), ('89', 9143), ('CF', 19570), ('decentralised', 10082), ('piscine', 23926), ('Creek', 19483), ('JP', 36526), ('Single', 4328), ('forget', 1823)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
	 Vocabulary size:  50000


### Loading Training and Testing Data

Here we load the data in the `train.de` and `train.en` files. And split the data in the files into two sets; training and testing data.

In [5]:
# Contains the training sentences
source_sent = [] # Input
target_sent = [] # Output

# Contains the testing sentences
test_source_sent = [] # Input
test_target_sent = [] # Output

# We grab around 100 lines of data that are interleaved 
# in the first 50000 sentences
test_indices = [l_i for l_i in range(50,50001,500)]

# Read the source data file and read the first 250,000 lines (except first 50)
with open(os.path.join('..','ch10','train.de'), encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(source_sent)<250000 and l_i not in test_indices:
            source_sent.append(line)
        elif l_i in test_indices:
            test_source_sent.append(line)
        
# Read the target data file and read the first 250,000 lines (except first 50)            
with open(os.path.join('..','ch10','train.en'), encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(target_sent)<250000 and l_i not in test_indices:
            target_sent.append(line)
        elif l_i in test_indices:
            test_target_sent.append(line)
        
# Make sure we extracted same number of both extracted source and target sentences         
assert len(source_sent)==len(target_sent),'Source: %d, Target: %d'%(len(source_sent),len(target_sent))

# Print some source sentences
print('Sample translations (%d)'%len(source_sent))
for i in range(0,250000,10000):
    print('(',i,') DE: ', source_sent[i])
    print('(',i,') EN: ', target_sent[i])

# Print some target sentences
print('Sample test translations (%d)'%len(test_source_sent))
for i in range(0,100,10):
    print('DE: ', test_source_sent[i])
    print('EN: ', test_target_sent[i])



Sample translations (250000)
( 0 ) DE:  Hier erfahren Sie , wie Sie Creative Suite 2 und Creative Suite 3 am besten zusammen mit QuarkXPress nutzen können .

( 0 ) EN:  Here , you ’ ll find out how Creative Suite users can get the best possible interaction with QuarkXPress .

( 10000 ) DE:  Für die sehr günstigen Wochen- und Monatskarten ( 1 Monat ca.

( 10000 ) EN:  It is THE trendy area of Marseille .

( 20000 ) DE:  Freuen Sie sich auf die romantische Atmosphäre in den Zimmern und Apartments .

( 20000 ) EN:  Enjoy the romantic atmosphere of one of the guest rooms or apartments .

( 30000 ) DE:  Zu zwiespältig sind Dr. Gutherzens Erfahrungen aus frühen Studententagen verlaufen , in denen er sich in die Gefielde von durch Heidegger geprägten Autor / innen begeben hat und dort ständig mit strengem Blick darauf verwiesen wurde , er habe bestimmte Theorieressourcen und Gedankengebäude einfach noch nicht gründlich genug verstanden und könne deshalb nicht begreifen , warum seine Einwände 

### Preprocessing text
Here we preprocess the text by replacing words not found in the dictionary with `<unk>` as well as remove punctuation marks (`.`,`,`) and new-line characters.

In [7]:
# Keep track of how many unknown words were encountered
src_unk_count, tgt_unk_count = 0, 0

def split_to_tokens(sent,is_source):
    '''
    This function takes in a sentence (source or target)
    and preprocess the sentency with various steps (e.g. removing punctuation)
    '''
    
    global src_unk_count, tgt_unk_count

    # Remove punctuation and new-line chars
    sent = sent.replace(',',' ,')
    sent = sent.replace('.',' .')
    sent = sent.replace('\n',' ') 
    
    sent_toks = sent.split(' ')
    for t_i, tok in enumerate(sent_toks):
        if is_source:
            # src_dictionary contain the word -> word ID mapping for source vocabulary
            if tok not in src_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    src_unk_count += 1
        else:
            # tgt_dictionary contain the word -> word ID mapping for target vocabulary
            if tok not in tgt_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    #print(tok)
                    tgt_unk_count += 1
    return sent_toks

# Let us first look at some statistics of the sentences
# Train - source data
source_len = []
source_mean, source_std = 0,0
for sent in source_sent:
    source_len.append(len(split_to_tokens(sent,True)))

print('(Source) Sentence mean length: ', np.mean(source_len))
print('(Source) Sentence stddev length: ', np.std(source_len))

# Let us first look at some statistics of the sentences
# Train - target data
target_len = []
for sent in target_sent:
    target_len.append(len(split_to_tokens(sent,False)))

print('(Target) Sentence mean length: ', np.mean(target_len))
print('(Target) Sentence stddev length: ', np.std(target_len))

# Let us first look at some statistics of the sentences
# Test - source data
test_source_len = []
for sent in test_source_sent:
    test_source_len.append(len(split_to_tokens(sent, True)))
    
print('(Test-Source) Sentence mean length: ', np.mean(test_source_len))
print('(Test-Source) Sentence stddev length: ', np.std(test_source_len))

# Let us first look at some statistics of the sentences
# Test - target data
test_target_len = []
test_tgt_mean, test_tgt_std = 0,0
for sent in test_target_sent:
    test_target_len.append(len(split_to_tokens(sent, False)))
    
print('(Test-Target) Sentence mean length: ', np.mean(test_target_len))
print('(Test-Target) Sentence stddev length: ', np.std(test_target_len))

(Source) Sentence mean length:  26.244692
(Source) Sentence stddev length:  13.854376414156501
(Target) Sentence mean length:  28.275308
(Target) Sentence stddev length:  14.925498769057468
(Test-Source) Sentence mean length:  26.61
(Test-Source) Sentence stddev length:  14.800604717375572
(Test-Target) Sentence mean length:  29.08
(Test-Target) Sentence stddev length:  16.19424589167399


### Making training and testing data fixed length

Here we get all the source sentences and target sentences to a fixed length. This is, so that we can process the sentences as batches.

In [8]:
# ================================================================================
# Processing training data

src_unk_count, tgt_unk_count = 0, 0

train_inputs = []
train_outputs = []

# Chosen based on previously found statistics
src_max_sent_length = 41 
tgt_max_sent_length = 61

print('Processing Training Data ...\n')
for s_i, (src_sent, tgt_sent) in enumerate(zip(source_sent,target_sent)):
    # Break source and target sentences to word lists
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    # Append <s> token's ID to the beggining of source sentence
    num_src_sent = [src_dictionary['<s>']]
    # Add the rest of word IDs for words found in the source sentence 
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])

    # If the lenghth of the source sentence below the maximum allowed length
    # append </s> token's ID to the end
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])

    # If the length exceed the maximum allowed length
    # truncate the sentence
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    # Make sure the sentence is of length src_max_sent_length
    assert len(num_src_sent)==src_max_sent_length,len(num_src_sent)

    train_inputs.append(num_src_sent)
    
    # Create the numeric target sentence with word IDs
    # append <s> to the beginning and append actual words later
    num_tgt_sent = [tgt_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    ## Modifying the outputs such that all the outputs have max_length elements
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    train_outputs.append(num_tgt_sent)
    
print('Unk counts Src: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))
print('Sentences ',len(train_inputs))

assert len(train_inputs)  == len(source_sent),\
        'Size of total elements: %d, Total sentences: %d'\
                %(len(train_inputs),len(source_sent))

# Making inputs and outputs NumPy arrays
train_inputs = np.array(train_inputs, dtype=np.int32)
train_outputs = np.array(train_outputs, dtype=np.int32)

# Make sure number of inputs and outputs dividable by 100
train_inputs = train_inputs[:(train_inputs.shape[0]//100)*100,:]
train_outputs = train_outputs[:(train_outputs.shape[0]//100)*100,:]
print('\t Done processing training data \n')

# Printing some data
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in train_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in train_outputs[ti,:].tolist()])
print()
print('\tSentences ',train_inputs.shape[0])

# ================================================================================
# Processing Test data

src_unk_count, tgt_unk_count = 0, 0
print('Processing testing data ....\n')
test_inputs = []
test_outputs = []
for s_i, (src_sent,tgt_sent) in enumerate(zip(test_source_sent,test_target_sent)):
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    num_src_sent = [src_dictionary['<s>']]
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])
    
    num_tgt_sent = [src_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    # Append </s> if the length is not src_max_sent_length
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])
    # Truncate the sentence if length is over src_max_sent_length
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    assert len(num_src_sent)==src_max_sent_length, len(num_src_sent)

    test_inputs.append(num_src_sent)
    
    # Append </s> is length is not tgt_max_sent_length
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    # Truncate the sentence if length over tgt_max_sent_length
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    assert len(num_tgt_sent)==tgt_max_sent_length, len(num_tgt_sent)

    test_outputs.append(num_tgt_sent)

# Printing some data
print('Unk counts Tgt: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))    
print('Done processing testing data ....\n')
test_inputs = np.array(test_inputs,dtype=np.int32)
test_outputs = np.array(test_outputs,dtype=np.int32)
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in test_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in test_outputs[ti,:].tolist()])

Processing Training Data ...

Unk counts Src: 464223, Tgt: 214783
Sentences  250000
	 Done processing training data 

Samples from training data
	 ['<s>', 'Hier', 'erfahren', 'Sie', ',', 'wie', 'Sie', 'Creative', 'Suite', '2', 'und', 'Creative', 'Suite', '3', 'am', 'besten', 'zusammen', 'mit', 'QuarkXPress', 'nutzen', 'können', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Here', ',', 'you', '’', 'll', 'find', 'out', 'how', 'Creative', 'Suite', 'users', 'can', 'get', 'the', 'best', 'possible', 'interaction', 'with', 'QuarkXPress', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Sie',

## Flipping the Input Data
Changin the order of the sentence of the target language improves the performance of NMT systems. Because when reversed, it helps the NMT system to establish a strong connection as the last word of the source language and the last word of the target language will be closest to each other. *DON'T RUN THIS MULTIPLE TIMES as running two times gives original.*

In [9]:
## Reverse the Germen sentences
# Remember reversing the source sentence gives better performance
# DON'T RUN THIS MULTIPLE TIMES as running two times gives original
train_inputs = np.fliplr(train_inputs)
test_inputs = np.fliplr(test_inputs)

print('Training and Test source data after flipping ')
print('\t',[src_reverse_dictionary[w] for w in train_inputs[0,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])
print()
print('\t',[src_reverse_dictionary[w] for w in train_inputs[10,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[10,:].tolist()])

print()
print('\nTesting data after flipping')
print('\t',[src_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])

Training and Test source data after flipping 
	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', 'können', 'nutzen', 'QuarkXPress', 'mit', 'zusammen', 'besten', 'am', '3', 'Suite', 'Creative', 'und', '2', 'Suite', 'Creative', 'Sie', 'wie', ',', 'Sie', 'erfahren', 'Hier', '<s>']
	 ['tray', 'road', 'mistakes', 'of', 'expect', 'a', 'tabled', 'with', 'and', 'the', 'posts', 'useful', 'out', 'waiting', 'wounded', 'a', 'drinks', 'been', 'stand', '26th', 'and', 'senior', 'personal', ',', 'difficulties', 'qualifications', 'an', 'rather', 'road', 'rewriting', 'and', 'road', 'unsustainable', 'the', '2007', 'road', 'wounded', 'not', 'throughout', 'amendment', '<s>']

	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', ')', 'Import', '##AT##-##AT##', 'PSD', '&gt;', 'Fenster', '(', 'Import', '##AT##-##AT##', 'PSD', 'Palette', 'die', 

## Data Generations for MT

Now we define the data generator for our NMT.

In [24]:
emb_mat = np.load('en-embeddings.npy')
input_size = emb_mat.shape[1]

class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        global input_size
        # Number of data points in a batch
        self._batch_size = batch_size
        # Number of unrollings
        self._num_unroll = num_unroll
        # Cursors for each element in batch
        self._cursor = [0 for offset in range(self._batch_size)]
        
        # The sentence IDs being currently processed to create the 
        # current batch
        self._sent_ids = None
        
        # We want a batch of data from source or target?
        self._is_source = is_source
        # Is this training or testing data?
        self._is_train = is_train
                
    def next_batch(self, sent_ids):
        
        # Depending on wheter we want source or target data
        # change the maximum sentence length
        if self._is_source:
            max_sent_length = src_max_sent_length
        else:
            max_sent_length = tgt_max_sent_length
            
        # Arrays to hold input and output data
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)
        
        
        # Populate each index of the batch
        for b in range(self._batch_size):
            
            # Sentence IDs to get data from
            sent_id = sent_ids[b]
            
            # If generating data with source sentences
            # use src_word_embeddings
            if self._is_source:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testind data
                if self._is_train:
                    sent_text = train_inputs[sent_id]
                else:
                    sent_text = test_inputs[sent_id]
                             
                # Populate the batch data arrays
                batch_data[b] = sent_text[self._cursor[b]]
                batch_labels[b] = sent_text[self._cursor[b]+1]
            # If generating data with target sentences
            # use tgt_word_embeddings
            else:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testind data
                if self._is_train:
                    sent_text = train_outputs[sent_id]
                else:
                    sent_text = test_outputs[sent_id]
                
                # We cannot avoid having two different embedding vectors for <s> token
                # in soruce and target languages
                # Therefore, if the symbol appears, we always take the source embedding vector
                if sent_text[self._cursor[b]]!=tgt_dictionary['<s>']:
                    batch_data[b] = sent_text[self._cursor[b]]
                else:
                    batch_data[b] = sent_text[self._cursor[b]]
                
                # Populate the data arrays
                batch_labels[b] = sent_text[self._cursor[b]+1]
            
            # Update the cursor for each batch index
            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        # Only if new sentence IDs if provided
        # else it will use the previously defined 
        # sent_ids continuously
        if sent_ids is not None:
            
            self._sent_ids = sent_ids
            # Unlike in the previous exercises we do not process a single sequence
            # over many iterations of unrollings. We process either a source sentence or target sentence
            # at a single go. So we reset the _cursor evrytime we generate a batch
            self._cursor = [0 for _ in range(self._batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        # Unrolling data over time
        for ui in range(self._num_unroll):
            
            if self._is_source:
                data, labels = self.next_batch(self._sent_ids)
            else:
                data, labels = self.next_batch(self._sent_ids)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        # Return unrolled data and sentence IDs
        return unroll_data, unroll_labels, self._sent_ids
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
# Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    # the the string words for returned word IDs and display the results
    print([src_reverse_dictionary[w] for w in lbl.tolist()])

        # Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=30,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    # the the string words for returned word IDs and display the results
    print([tgt_reverse_dictionary[w] for w in lbl.tolist()])

Source data
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '.', '</s>', '</s>']
['</s>', '</s>', 'bietet', '.', '</s>']
['</s>', '</s>', 'Dateiformat', 'nutzen', '</s>']
['</s>', '</s>', '##AT##-##AT##', 'optimal', '</s>']
['</s>', '</s>', 'PSD', 'Bilder', '</s>']
['</s>', '</s>', 'das', 'Ihre', '.']
['</s>', '</s>', 'über', 'für', 'werden']
['.', '</s>', 'Photoshop', 'es', 'ausgewählt']
['können', '.', 'mit', 'Sie', 'Verwendungszweck']

Target data batch
['Here', 'QuarkXPress', 'In', 'For', 'If']
[',', '8', 'this', 'example', 'you']
['you', 'is', 'section', ',', 'use']
['

# NMT using Tensorflow seq2seq library

## Defining the TensorFlow inputs and outputs
Here we define placeholder to carry inputs and outputs that are required optimize the model

In [15]:
tf.reset_default_graph()

enc_train_inputs = []
dec_train_inputs, dec_train_labels = [],[]

# Need to use pre-trained word embeddings
encoder_emb_layer = tf.convert_to_tensor(np.load('de-embeddings.npy'))
decoder_emb_layer = tf.convert_to_tensor(np.load('en-embeddings.npy'))

# Defining unrolled training inputs
for ui in range(source_sequence_length):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))

# Define unrolled training outputs
for ui in range(target_sequence_length):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_outputs_%d'%ui))

# Define embedding lookup operation
encoder_emb_inp = [tf.nn.embedding_lookup(encoder_emb_layer, src) for src in enc_train_inputs]
encoder_emb_inp = tf.stack(encoder_emb_inp)

decoder_emb_inp = [tf.nn.embedding_lookup(decoder_emb_layer, src) for src in dec_train_inputs]
decoder_emb_inp = tf.stack(decoder_emb_inp)



## Define the Encoder

Now we define the encoder cell. Encoder is a simple LSTM cell provided in seq2seq library as `BasicLSTMCell`. Then we use the `dynamic_rnn`function to unroll our inputs and get the output cell state.

In [16]:
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp, initial_state=initial_state,
    sequence_length=[source_sequence_length for _ in range(batch_size)], 
    time_major=True, swap_memory=True)

## Define the Encoder
We now define the decoder cell and an output softmax layer (`projection_layer`) as well as a helper (that produce word embeddings). Note that we give the reader the option to change the type of decoder (that is, with or without attention).

In [17]:
# Build RNN cell
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

projection_layer = Dense(units=vocab_size, use_bias=True)

# Helper
helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, [target_sequence_length for _ in range(batch_size)], time_major=True)

# Decoder
if decoder_type == 'basic':
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)
    
elif decoder_type == 'attention':
    decoder = tf.contrib.seq2seq.BahdanauAttention(
        decoder_cell, helper, encoder_state,
        output_layer=projection_layer)
    
# Dynamic decoding
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder, output_time_major=True,
    swap_memory=True
)


# Computing logits and predictions
logits = outputs.rnn_output
train_prediction = outputs.sample_id

# Loss computation
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=dec_train_labels, logits=logits)
loss = tf.reduce_mean(crossent) 

## Defining optimizer
Here we define the optimizers to optimize the model parameters. As shown before, we use two optimizers Adam and SGD

In [18]:
print('Defining Optimizer')
# Adam Optimizer. And gradient clipping.
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.assign(global_step,global_step + 1)
learning_rate = tf.train.exponential_decay(
    0.01, global_step, decay_steps=10, decay_rate=0.9, staircase=True)
with tf.variable_scope('Adam'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
with tf.variable_scope('SGD'):
    sgd_optimizer = tf.train.GradientDescentOptimizer(learning_rate)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 25.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

sgd_gradients, v = zip(*sgd_optimizer.compute_gradients(loss))
sgd_gradients, _ = tf.clip_by_global_norm(sgd_gradients, 25.0)
sgd_optimize = optimizer.apply_gradients(zip(sgd_gradients, v))

sess = tf.InteractiveSession()

Defining Optimizer


## Running the Seq2seq NMT

We now run the seq2seq NMT system we defined.

In [25]:
loss_over_time = []

# Initialize TensorFlow variables
tf.global_variables_initializer().run()

# Load the word embeddings
src_word_embeddings = np.load('de-embeddings.npy')
tgt_word_embeddings = np.load('en-embeddings.npy')

# Defining data generators
enc_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=source_sequence_length,is_train=True, is_source=True)
dec_data_generator = DataGeneratorMT(
    batch_size=batch_size,num_unroll=target_sequence_length,is_train=True, is_source=False)

num_steps = 10001
avg_loss = 0

print('Started Training')

for step in range(num_steps):

    # num_enc_unrollings: 40
    # num_dec_unrollings: 60
    print('.',end='')
    if (step+1)%100==0:
        print('')
        
    # Pick a random batch of sentences to train the algorithm
    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))

    # Create a batch of data for the encoder
    eu_data, eu_labels, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Create a batch of data for the decoder
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    feed_dict = {}
    for ui,(dat,lbl) in enumerate(zip(eu_data,eu_labels)):            
        feed_dict[enc_train_inputs[ui]] = dat                
    
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl

    # Optimize the NMT with either Adam (first 10000 iterations)
    # or stochastic gradient descent (after 10000 iterations)
    if (step+1)<10000:
        _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
        tr_pred = tr_pred.flatten()
    else:
        _,l,tr_pred = sess.run([sgd_optimize,loss,train_prediction], feed_dict=feed_dict)
        tr_pred = tr_pred.flatten()
        
    # Print some training predictions
    if (step+1)%100==0:  
        
        print('Step ',step+1)

        # Print the train results (actual and predicted)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[::batch_size].tolist():
            print_str += tgt_reverse_dictionary[w] + ' '                    
            if tgt_reverse_dictionary[w] == '</s>':
                break
                      
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[::batch_size].tolist():
            print_str += tgt_reverse_dictionary[w] + ' '
            if tgt_reverse_dictionary[w] == '</s>':
                break
        print(print_str)
       
        print('\n')  
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[rand_idx::batch_size].tolist():
            print_str += tgt_reverse_dictionary[w] + ' '
            if tgt_reverse_dictionary[w] == '</s>':
                break
        print(print_str)

            
        print()
        print_str = 'Predicted: '
        for w in tr_pred[rand_idx::batch_size].tolist():
            print_str += tgt_reverse_dictionary[w] + ' '
            if tgt_reverse_dictionary[w] == '</s>':
                break
        print(print_str)
        print()        
        
    avg_loss += l # Update average loss
    
    # Print the loss
    if (step+1)%500==0:
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/500.0)
        
        loss_over_time.append(avg_loss/500.0)
             
        avg_loss = 0.0
        sess.run(inc_gstep)
            
        

Started Training
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
Step  500
Actual: the program <unk> ( Windows only ) helps Braille Music <unk> . </s> 

Predicted: The hotel is is <unk> <unk> the , be <unk> . . </s> 


Actual: Driving during morning and afternoon peak hours is not recommended , as traffic slows to a standstill and even a simple trip across a bridge can take up to 45 minutes . </s> 

Predicted: The <unk> the , the . . . a a to and the , . the <unk> . the to <unk> of . the <unk> . be the to the . . </s> 

	 Loss:  2.907963

....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
Step  4000
Actual: Location was excellent for walking to landmarks and monuments . The hotel staff was very friendly and accommodating . </s> 

Predicted: The is very and the distance the and the . </s> 


Actual: <unk> : User en ##AT##-##AT## 1 – <unk> <unk> <unk> <unk> , <unk> so <unk> . </s> 

Predicted: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s> 

	 Loss:  2.0885994465351105
.......................................................

KeyboardInterrupt: 