# Neural Machine Translation with Attention: German to English

Here we implement a neural machine translator with attention using standard TensorFlow operations.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf
from PIL import Image
from collections import Counter
import csv
import matplotlib.gridspec as gridspec
import word2vec

from nltk.translate.bleu_score import corpus_bleu
import nltk

  from ._conv import register_converters as _register_converters


## Loading Data 

First, download the data from this [page](https://nlp.stanford.edu/projects/nmt/). The required files are:

* File containing German sentences: [`train.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de)
* File containing English sentences: [`train.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en)
* File containing German vocabulary: [`vocab.50K.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.de)
* File containing English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading Vocabulary

First we build the vocabulary dictionaries for both the source (German) and target (English) languages. The vocabularies are found in the `vocab.50K.de` (German) and `vocab.50K.en` files.

In [2]:
# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
src_dictionary = dict()

# Read the vocabulary file
with open('vocab.50K.de', encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        src_dictionary[line[:-1]] = len(src_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
src_reverse_dictionary = dict(zip(src_dictionary.values(),src_dictionary.keys()))

# Print some of the words in the dictionary
print('Source')
print('\t',list(src_dictionary.items())[:10])
print('\t',list(src_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(src_dictionary))

# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
tgt_dictionary = dict()

# Read the vocabulary file
with open('vocab.50K.en', encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        tgt_dictionary[line[:-1]] = len(tgt_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
tgt_reverse_dictionary = dict(zip(tgt_dictionary.values(),tgt_dictionary.keys()))

# Print some of the words in the dictionary
print('Target')
print('\t',list(tgt_dictionary.items())[:10])
print('\t',list(tgt_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(tgt_dictionary))

# Each language has 50000 words
vocabulary_size = 50000

Source
	 [('Salinen', 49002), ('Carlo', 12416), ('aufbürden', 40870), ('ansässigen', 13646), ('feststellten', 33416), ('position', 16440), ('sechsten', 12946), ('BDSG', 28813), ('Lachs', 24120), ('fundamentalistischen', 26220)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'die'), (6, 'der'), (7, 'und'), (8, 'in'), (9, 'zu')]
	 Vocabulary size:  50000
Target
	 [('Carlo', 10590), ('Plato', 21977), ('Coconut', 49745), ('position', 311), ('duplications', 45666), ('disarm', 20083), ('622', 46724), ('reconstructing', 27679), ('light', 627), ('naturalist', 36451)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
	 Vocabulary size:  50000


### Loading Training and Testing Data

Here we load the data in the `train.de` and `train.en` files. And split the data in the files into two sets; training and testing data.

In [3]:
# Contains the training sentences
source_sent = [] # Input
target_sent = [] # Output

# Contains the testing sentences
test_source_sent = [] # Input
test_target_sent = [] # Output

# We grab around 100 lines of data that are interleaved 
# in the first 50000 sentences
test_indices = [l_i for l_i in range(50,50001,500)]

# Read the source data file and read the first 250,000 lines (except first 50)
with open('train.de', encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(source_sent)<250000 and l_i not in test_indices:
            source_sent.append(line)
        elif l_i in test_indices:
            test_source_sent.append(line)
        
# Read the target data file and read the first 250,000 lines (except first 50)            
with open('train.en', encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(target_sent)<250000 and l_i not in test_indices:
            target_sent.append(line)
        elif l_i in test_indices:
            test_target_sent.append(line)
        
# Make sure we extracted same number of both extracted source and target sentences         
assert len(source_sent)==len(target_sent),'Source: %d, Target: %d'%(len(source_sent),len(target_sent))

# Print some source sentences
print('Sample translations (%d)'%len(source_sent))
for i in range(0,250000,10000):
    print('(',i,') DE: ', source_sent[i])
    print('(',i,') EN: ', target_sent[i])

# Print some target sentences
print('Sample test translations (%d)'%len(test_source_sent))
for i in range(0,100,10):
    print('DE: ', test_source_sent[i])
    print('EN: ', test_target_sent[i])



Sample translations (250000)
( 0 ) DE:  Hier erfahren Sie , wie Sie Creative Suite 2 und Creative Suite 3 am besten zusammen mit QuarkXPress nutzen können .

( 0 ) EN:  Here , you ’ ll find out how Creative Suite users can get the best possible interaction with QuarkXPress .

( 10000 ) DE:  Für die sehr günstigen Wochen- und Monatskarten ( 1 Monat ca.

( 10000 ) EN:  It is THE trendy area of Marseille .

( 20000 ) DE:  Freuen Sie sich auf die romantische Atmosphäre in den Zimmern und Apartments .

( 20000 ) EN:  Enjoy the romantic atmosphere of one of the guest rooms or apartments .

( 30000 ) DE:  Zu zwiespältig sind Dr. Gutherzens Erfahrungen aus frühen Studententagen verlaufen , in denen er sich in die Gefielde von durch Heidegger geprägten Autor / innen begeben hat und dort ständig mit strengem Blick darauf verwiesen wurde , er habe bestimmte Theorieressourcen und Gedankengebäude einfach noch nicht gründlich genug verstanden und könne deshalb nicht begreifen , warum seine Einwände 

### Preprocessing text
Here we preprocess the text by replacing words not found in the dictionary with `<unk>` as well as remove punctuation marks (`.`,`,`) and new-line characters.

In [4]:
# Keep track of how many unknown words were encountered
src_unk_count, tgt_unk_count = 0, 0

def split_to_tokens(sent,is_source):
    '''
    This function takes in a sentence (source or target)
    and preprocess the sentency with various steps (e.g. removing punctuation)
    '''
    
    global src_unk_count, tgt_unk_count

    # Remove punctuation and new-line chars
    sent = sent.replace(',',' ,')
    sent = sent.replace('.',' .')
    sent = sent.replace('\n',' ') 
    
    sent_toks = sent.split(' ')
    for t_i, tok in enumerate(sent_toks):
        if is_source:
            # src_dictionary contain the word -> word ID mapping for source vocabulary
            if tok not in src_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    src_unk_count += 1
        else:
            # tgt_dictionary contain the word -> word ID mapping for target vocabulary
            if tok not in tgt_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    #print(tok)
                    tgt_unk_count += 1
    return sent_toks

# Let us first look at some statistics of the sentences
# Train - source data
source_len = []
source_mean, source_std = 0,0
for sent in source_sent:
    source_len.append(len(split_to_tokens(sent,True)))

print('(Source) Sentence mean length: ', np.mean(source_len))
print('(Source) Sentence stddev length: ', np.std(source_len))

# Let us first look at some statistics of the sentences
# Train - target data
target_len = []
for sent in target_sent:
    target_len.append(len(split_to_tokens(sent,False)))

print('(Target) Sentence mean length: ', np.mean(target_len))
print('(Target) Sentence stddev length: ', np.std(target_len))

# Let us first look at some statistics of the sentences
# Test - source data
test_source_len = []
for sent in test_source_sent:
    test_source_len.append(len(split_to_tokens(sent, True)))
    
print('(Test-Source) Sentence mean length: ', np.mean(test_source_len))
print('(Test-Source) Sentence stddev length: ', np.std(test_source_len))

# Let us first look at some statistics of the sentences
# Test - target data
test_target_len = []
test_tgt_mean, test_tgt_std = 0,0
for sent in test_target_sent:
    test_target_len.append(len(split_to_tokens(sent, False)))
    
print('(Test-Target) Sentence mean length: ', np.mean(test_target_len))
print('(Test-Target) Sentence stddev length: ', np.std(test_target_len))



(Source) Sentence mean length:  26.244692
(Source) Sentence stddev length:  13.854376414156501
(Target) Sentence mean length:  28.275308
(Target) Sentence stddev length:  14.925498769057468
(Test-Source) Sentence mean length:  26.61
(Test-Source) Sentence stddev length:  14.800604717375572
(Test-Target) Sentence mean length:  29.08
(Test-Target) Sentence stddev length:  16.19424589167399


### Making training and testing data fixed length

Here we get all the source sentences and target sentences to a fixed length. This is, so that we can process the sentences as batches.

In [5]:
# ================================================================================
# Processing training data

src_unk_count, tgt_unk_count = 0, 0

train_inputs = []
train_outputs = []

# Chosen based on previously found statistics
src_max_sent_length = 41 
tgt_max_sent_length = 61

print('Processing Training Data ...\n')
for s_i, (src_sent, tgt_sent) in enumerate(zip(source_sent,target_sent)):
    # Break source and target sentences to word lists
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    # Append <s> token's ID to the beggining of source sentence
    num_src_sent = [src_dictionary['<s>']]
    # Add the rest of word IDs for words found in the source sentence 
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])

    # If the lenghth of the source sentence below the maximum allowed length
    # append </s> token's ID to the end
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])

    # If the length exceed the maximum allowed length
    # truncate the sentence
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    # Make sure the sentence is of length src_max_sent_length
    assert len(num_src_sent)==src_max_sent_length,len(num_src_sent)

    train_inputs.append(num_src_sent)
    
    # Create the numeric target sentence with word IDs
    # append <s> to the beginning and append actual words later
    num_tgt_sent = [tgt_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    ## Modifying the outputs such that all the outputs have max_length elements
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    train_outputs.append(num_tgt_sent)
    
print('Unk counts Src: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))
print('Sentences ',len(train_inputs))

assert len(train_inputs)  == len(source_sent),\
        'Size of total elements: %d, Total sentences: %d'\
                %(len(train_inputs),len(source_sent))

# Making inputs and outputs NumPy arrays
train_inputs = np.array(train_inputs, dtype=np.int32)
train_outputs = np.array(train_outputs, dtype=np.int32)

# Make sure number of inputs and outputs dividable by 100
train_inputs = train_inputs[:(train_inputs.shape[0]//100)*100,:]
train_outputs = train_outputs[:(train_outputs.shape[0]//100)*100,:]
print('\t Done processing training data \n')

# Printing some data
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in train_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in train_outputs[ti,:].tolist()])
print()
print('\tSentences ',train_inputs.shape[0])

# ================================================================================
# Processing Test data

src_unk_count, tgt_unk_count = 0, 0
print('Processing testing data ....\n')
test_inputs = []
test_outputs = []
for s_i, (src_sent,tgt_sent) in enumerate(zip(test_source_sent,test_target_sent)):
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    num_src_sent = [src_dictionary['<s>']]
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])
    
    num_tgt_sent = [src_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    # Append </s> if the length is not src_max_sent_length
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])
    # Truncate the sentence if length is over src_max_sent_length
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    assert len(num_src_sent)==src_max_sent_length, len(num_src_sent)

    test_inputs.append(num_src_sent)
    
    # Append </s> is length is not tgt_max_sent_length
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    # Truncate the sentence if length over tgt_max_sent_length
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    assert len(num_tgt_sent)==tgt_max_sent_length, len(num_tgt_sent)

    test_outputs.append(num_tgt_sent)

# Printing some data
print('Unk counts Tgt: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))    
print('Done processing testing data ....\n')
test_inputs = np.array(test_inputs,dtype=np.int32)
test_outputs = np.array(test_outputs,dtype=np.int32)
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in test_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in test_outputs[ti,:].tolist()])

Processing Training Data ...

Unk counts Src: 464223, Tgt: 214783
Sentences  250000
	 Done processing training data 

Samples from training data
	 ['<s>', 'Hier', 'erfahren', 'Sie', ',', 'wie', 'Sie', 'Creative', 'Suite', '2', 'und', 'Creative', 'Suite', '3', 'am', 'besten', 'zusammen', 'mit', 'QuarkXPress', 'nutzen', 'können', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Here', ',', 'you', '’', 'll', 'find', 'out', 'how', 'Creative', 'Suite', 'users', 'can', 'get', 'the', 'best', 'possible', 'interaction', 'with', 'QuarkXPress', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Sie',

## Learning word embeddings

In this section, we learn word embeddings for both the languages using the sentences we have. After learning word embeddings, this will create two arrays (`en-embeddings-tmp.npy` and `de-embeddings-tmp.npy`) and store them on disk. To use this in the successive computations, go ahead and change the names to `en-embeddings.npy` and `de-embeddings.npy` respectively. ** You can skip this if you have run the code previously. **

In [6]:
# Total number of sentences
tot_sentences = train_inputs.shape[0]
print('Total number of training sentences: ',tot_sentences)

# we keep a cursor for each sentence in the training set
sentence_cursors = [0 for _ in range(tot_sentences)] 

batch_size = 64
embedding_size = 128 # Dimension of the embedding vector.

# Defining various things needed by the python script
word2vec.define_data_and_hyperparameters(
        tot_sentences, src_max_sent_length, tgt_max_sent_length, src_dictionary, tgt_dictionary,
        src_reverse_dictionary, tgt_reverse_dictionary, train_inputs, train_outputs, embedding_size,
    vocabulary_size)

# Print some batches to make sure the data generator is correct
word2vec.print_some_batches()

# Define TensorFlow ops for learning word embeddings
word2vec.define_word2vec_tensorflow(batch_size)

# Run embedding learning for source language
# Stores the de-embeddings-tmp.npy into the disk
word2vec.run_word2vec_source(batch_size)
# Run embedding learning for target language
# Stores the en-embeddings-tmp.npy to the disk
word2vec.run_word2vec_target(batch_size)

Total number of training sentences:  250000

with window_size = 1:
    batch: [['<s>', 'Florian'], ['<s>', 'Hotel'], ['<s>', 'Und'], ['<s>', 'alle'], ['<s>', '##AT##-##AT##'], ['<s>', 'ist'], ['<s>', 'können'], ['<s>', '-']]
    labels: ['Major', 'Das', '33', 'Für', '<unk>', 'Cadiz', 'Sie', '<unk>']

with window_size = 2:
    batch: [['<s>', 'Kleinigkeiten', 'es', ','], ['<s>', 'Badezimmer', '<unk>', 'Duschkabine'], ['<s>', 'Die', 'der', 'Räume'], ['<s>', 'Zur', 'Ihrer', 'Buchung'], ['<s>', 'Booking', ':', '<unk>'], ['<s>', '<unk>', '&apos;', 'autoroute'], ['<s>', 'Und', 'Wissen', 'über'], ['<s>', 'Das', 'ist', 'hier']]
    labels: ['sind', 'mit', 'meisten', 'Sicherung', '<unk>', 'l', 'ohne', 'Restaurant']
Defining 4 embedding lookups representing each word in the context
Stacked embedding size: [64, 128, 4]
Reduced mean embedding size: [64, 128]
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

S

KeyboardInterrupt: 

## Flipping the Input Data
Changin the order of the sentence of the target language improves the performance of NMT systems. Because when reversed, it helps the NMT system to establish a strong connection as the last word of the source language and the last word of the target language will be closest to each other. *DON'T RUN THIS MULTIPLE TIMES as running two times gives original.*

In [6]:
## Reverse the Germen sentences
# Remember reversing the source sentence gives better performance
# DON'T RUN THIS MULTIPLE TIMES as running two times gives original
train_inputs = np.fliplr(train_inputs)
test_inputs = np.fliplr(test_inputs)

print('Training and Test source data after flipping ')
print('\t',[src_reverse_dictionary[w] for w in train_inputs[0,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])
print()
print('\t',[src_reverse_dictionary[w] for w in train_inputs[10,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[10,:].tolist()])

print()
print('\nTesting data after flipping')
print('\t',[src_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])

Training and Test source data after flipping 
	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', 'können', 'nutzen', 'QuarkXPress', 'mit', 'zusammen', 'besten', 'am', '3', 'Suite', 'Creative', 'und', '2', 'Suite', 'Creative', 'Sie', 'wie', ',', 'Sie', 'erfahren', 'Hier', '<s>']
	 ['tray', 'road', 'mistakes', 'of', 'expect', 'a', 'tabled', 'with', 'and', 'the', 'posts', 'useful', 'out', 'waiting', 'wounded', 'a', 'drinks', 'been', 'stand', '26th', 'and', 'senior', 'personal', ',', 'difficulties', 'qualifications', 'an', 'rather', 'road', 'rewriting', 'and', 'road', 'unsustainable', 'the', '2007', 'road', 'wounded', 'not', 'throughout', 'amendment', '<s>']

	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', ')', 'Import', '##AT##-##AT##', 'PSD', '&gt;', 'Fenster', '(', 'Import', '##AT##-##AT##', 'PSD', 'Palette', 'die', 

## Data Generations for MT

Now we define the data generator for our NMT.

In [7]:
emb_mat = np.load('de-embeddings.npy')
embedding_size = emb_mat.shape[1]
input_size = embedding_size

class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        # Number of data points in a batch
        self._batch_size = batch_size
        # Number of unrollings
        self._num_unroll = num_unroll
        # Cursors for each element in batch
        self._cursor = [0 for offset in range(self._batch_size)]
        
        # Loading the learnt word embeddings
        self._src_word_embeddings = np.load('de-embeddings.npy')
        self._tgt_word_embeddings = np.load('en-embeddings.npy')
        
        # The sentence IDs being currently processed to create the 
        # current batch
        self._sent_ids = None
        
        # We want a batch of data from source or target?
        self._is_source = is_source
        # Is this training or testing data?
        self._is_train = is_train
                
    def next_batch(self, sent_ids):
        
        # Depending on wheter we want source or target data
        # change the maximum sentence length
        if self._is_source:
            max_sent_length = src_max_sent_length
        else:
            max_sent_length = tgt_max_sent_length
            
        # Arrays to hold input and output data
        # Word embeddings (current word)
        batch_data = np.zeros((self._batch_size,input_size),dtype=np.float32)
        # One-hot encoded label (next word)
        batch_labels = np.zeros((self._batch_size,vocabulary_size),dtype=np.float32)
        
        
        # Populate each index of the batch
        for b in range(self._batch_size):
            
            # Sentence IDs to get data from
            sent_id = sent_ids[b]
            
            # If generating data with source sentences
            # use src_word_embeddings
            if self._is_source:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testing data
                if self._is_train:
                    sent_text = train_inputs[sent_id]
                else:
                    sent_text = test_inputs[sent_id]
                             
                # Populate the batch data arrays
                batch_data[b] = self._src_word_embeddings[sent_text[self._cursor[b]],:]
                batch_labels[b] = np.zeros((vocabulary_size),dtype=np.float32)
                batch_labels[b,sent_text[self._cursor[b]+1]] = 1.0
            # If generating data with target sentences
            # use tgt_word_embeddings
            else:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testing data
                if self._is_train:
                    sent_text = train_outputs[sent_id]
                else:
                    sent_text = test_outputs[sent_id]
                
                # We cannot avoid having two different embedding vectors for <s> token
                # in soruce and target languages
                # Therefore, if the symbol appears, we always take the source embedding vector
                if sent_text[self._cursor[b]]!=tgt_dictionary['<s>']:
                    batch_data[b] = self._tgt_word_embeddings[sent_text[self._cursor[b]],:]
                else:
                    batch_data[b] = self._src_word_embeddings[sent_text[self._cursor[b]],:]
                
                # Populate the data arrays
                batch_labels[b] = np.zeros((vocabulary_size),dtype=np.float32)
                batch_labels[b,sent_text[self._cursor[b]+1]] = 1.0
            
            # Update the cursor for each batch index
            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        # Only if new sentence IDs if provided
        # else it will use the previously defined 
        # sent_ids continuously
        if sent_ids is not None:
            
            self._sent_ids = sent_ids
            # Unlike in the previous exercises we do not process a single sequence
            # over many iterations of unrollings. We process either a source sentence or target sentence
            # at a single go. So we reset the _cursor evrytime we generate a batch
            self._cursor = [0 for _ in range(self._batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        # Unrolling data over time
        for ui in range(self._num_unroll):
            
            if self._is_source:
                data, labels = self.next_batch(self._sent_ids)
            else:
                data, labels = self.next_batch(self._sent_ids)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        # Return unrolled data and sentence IDs
        return unroll_data, unroll_labels, self._sent_ids
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
# Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=20,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    # the the string words for returned word IDs and display the results
    print([src_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

        # Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=30,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    # the the string words for returned word IDs and display the results
    print([tgt_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])



Source data
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '.', '</s>', '</s>']
['</s>', '</s>', 'bietet', '.', '</s>']
['</s>', '</s>', 'Dateiformat', 'nutzen', '</s>']
['</s>', '</s>', '##AT##-##AT##', 'optimal', '</s>']
['</s>', '</s>', 'PSD', 'Bilder', '</s>']
['</s>', '</s>', 'das', 'Ihre', '.']
['</s>', '</s>', 'über', 'für', 'werden']
['.', '</s>', 'Photoshop', 'es', 'ausgewählt']
['können', '.', 'mit', 'Sie', 'Verwendungszweck']

Target data batch
['Here', 'QuarkXPress', 'In', 'For', 'If']
[',', '8', 'this', 'example', 'you']
['you', 'is', 'section', ',', 'use']
['

## Attention-Based NMT System

Here we define the attention based NMT system. Unlike the standard NMT attention based NMT has the ability to refer to any of the encoder states during any step of the decoding. This is achieved through the attention layer.

### Defining hyperparameters
Here we define various hyperparameters we use to define our model.

In [8]:
num_nodes = 128
batch_size = 10

# We unroll the full length at one go
# both source and target sentences
enc_num_unrollings = 40
dec_num_unrollings = 60


### Defining Input/Output Placeholders
Here we define the placeholder to feed in inputs/outputs. Additionally we define a mask placeholder that can mask certain outputs from the loss calculation.

In [9]:
tf.reset_default_graph()

tgt_word_embeddings = tf.convert_to_tensor(np.load('en-embeddings.npy'))

# Training Input data.
enc_train_inputs = []

# Defining unrolled training inputs
for ui in range(enc_num_unrollings):
    enc_train_inputs.append(tf.placeholder(tf.float32, shape=[batch_size,input_size],name='train_inputs_%d'%ui))

# Training Input data.
dec_train_inputs, dec_train_labels = [],[]
dec_train_masks = []

# Defining unrolled training inputs
for ui in range(dec_num_unrollings):
    dec_train_inputs.append(tf.placeholder(tf.float32, shape=[batch_size,input_size],name='dec_train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size], name = 'dec_train_labels_%d'%ui))
    dec_train_masks.append(tf.placeholder(tf.float32, shape=[batch_size,1],name='dec_train_masks_%d'%ui))

enc_test_input = [tf.placeholder(tf.float32, shape=[batch_size,input_size]) for _ in range(enc_num_unrollings)]
enc_test_mask = [tf.placeholder(tf.int32,shape=[batch_size]) for _ in range(enc_num_unrollings)] 

dec_test_input = tf.nn.embedding_lookup(tgt_word_embeddings,[tgt_dictionary['<s>']])

### Defining the Encoder Model

We define the encoder model. The encoder model is a single LSTM cell with TensorFlow variables for the state and output variables.

In [10]:
print('Defining Encoder Parameters')
with tf.variable_scope('Encoder'):
    
    # Input gate (i_t) - How much memory to write to cell state
    enc_ix = tf.get_variable('ix',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_im = tf.get_variable('im',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='ib')
    
    # Forget gate (f_t) - How much memory to discard from cell state
    enc_fx = tf.get_variable('fx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_fm = tf.get_variable('fm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='fb')
    
    # Candidate value (c~_t) - Used to compute the current cell state                    
    enc_cx = tf.get_variable('cx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_cm = tf.get_variable('cm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='cb') 
    
    # Output gate (o_t) - How much memory to output from the cell state
    enc_ox = tf.get_variable('ox',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_om = tf.get_variable('om',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='ob') 
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name='train_output')
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name = 'train_cell')
    
    # Variables for saving state for testing
    saved_test_output = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_output')
    saved_test_state = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_cell')

print('\tDone')

Defining Encoder Parameters
	Done


### Defining the Decoder Model

Decoder is a single LSTM cell with an additional softmax layer that can predict words.

In [11]:
print('Defining Decoder Parameters')
with tf.variable_scope('Decoder'):
    
    # Input gate (i_t) - How much memory to write to cell state
    dec_ix = tf.get_variable('ix',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_im = tf.get_variable('im',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ic = tf.get_variable('ic',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='ib')    
    
    # Forget gate (f_t) - How much memory to discard from cell state
    dec_fx = tf.get_variable('fx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fm = tf.get_variable('fm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fc = tf.get_variable('fc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='fb')    
    
    # Candidate value (c~_t) - Used to compute the current cell state                           
    dec_cx = tf.get_variable('cx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cm = tf.get_variable('cm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cc = tf.get_variable('cc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='cb')     
    
    # Output gate (o_t) - How much memory to output from the cell state
    dec_ox = tf.get_variable('ox',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_om = tf.get_variable('om',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_oc = tf.get_variable('oc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='ob') 
    
    # Softmax Classifier weights and biases.
    # If we are using sampled softmax loss, the weights dims shouldbe [50000, 64]
    # If not, then [64, 50000]
    w = tf.get_variable('softmax_weights',shape=[num_nodes*2, vocabulary_size], 
                        initializer = tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.random_uniform([vocabulary_size],-0.05,-0.05),name='softmax_bias')

print('\tDone')

Defining Decoder Parameters
	Done


### Attention Layer Related Variables

We define the weights used to compute the energy ($e_{ij}$) in the attention layer.

In [12]:
print('Defining Attention Variables ...')
with tf.variable_scope('Attention'):
    
    # Used to calculate e_{ij} as
    # e_{ij} = v_a' tanh(W_a . dec_output + U_a . enc_output)
    # Then alpha_{ij} is the softmax output (normalized) of e_{ij}
    W_a = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],stddev=0.05),name='W_a')
    U_a = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],stddev=0.05),name='U_a')
    v_a = tf.Variable(tf.truncated_normal([num_nodes,1],stddev=0.05),name='v_a')
    
print('\tDone')

Defining Attention Variables ...
	Done


### Defining Cell and Layer Computational Functions

We define several functions below:
* Encoder LSTM cell computations
* Decoder LSTM cell computations
* Attention layer computations.

In [13]:
# Definition of the cell computation (Encoder)
def enc_lstm_cell(i, o, state):
    """Create a LSTM cell"""
    input_gate = tf.sigmoid(tf.matmul(i, enc_ix) + tf.matmul(o, enc_im) + enc_ib)
    forget_gate = tf.sigmoid(tf.matmul(i, enc_fx) + tf.matmul(o, enc_fm) + enc_fb)
    update = tf.matmul(i, enc_cx) + tf.matmul(o, enc_cm) + enc_cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, enc_ox) + tf.matmul(o, enc_om) + enc_ob)
    return output_gate * tf.tanh(state), state

# Definition of the cell computation (Decoder)
def dec_lstm_cell(i, o, state, c):
    """Create a LSTM cell"""
    input_gate = tf.sigmoid(tf.matmul(i, dec_ix) + tf.matmul(o, dec_im) + tf.matmul(c, dec_ic) + dec_ib)
    forget_gate = tf.sigmoid(tf.matmul(i, dec_fx) + tf.matmul(o, dec_fm) + tf.matmul(c, dec_fc) + dec_fb)
    update = tf.matmul(i, dec_cx) + tf.matmul(o, dec_cm) + tf.matmul(c, dec_cc) +dec_cb 
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, dec_ox) + tf.matmul(o, dec_om) + tf.matmul(o, dec_oc) + dec_ob)
    return output_gate * tf.tanh(state), state
                    
def attn_layer(h_j_unrolled, s_i_minus_1):
    '''
    Computes attention values for a given decoding position
    h_j_unrolled : all the unrolled encoder outputs [[batch_size, num_nodes], [batch_size, num_nodes], ....] => enc_num_unrolling-many
    s_i_minus_1 : the previous decoder output [batch_size, num_nodes]
    '''
    # For the following four calculations we calculate by concatenating all encoder outputs (enc_num_unrollings)
    # get the encoder logits
    enc_logits = tf.concat(axis=0,values=h_j_unrolled)
    
    # W_a . encoder_output
    w_a_mul_s_i_minus_1 = tf.matmul(enc_logits,W_a) # of size [enc_num_unroll x batch_size, num_nodes]
    
    # U_a . decoder_output
    u_a_mul_h_j = tf.matmul(tf.tile(s_i_minus_1,[enc_num_unrollings,1]), U_a) # of size [enc_num_unroll x batch_size, num_nodes]   
    
    # calculate "energy"
    e_j = tf.matmul(tf.nn.tanh(w_a_mul_s_i_minus_1 + u_a_mul_h_j),v_a) # of size [enc_num_unroll x batch_size ,1]
    
    # we split the e_j s again into enc_num_unrollings batches
    batched_e_j = tf.split(axis=0,num_or_size_splits=enc_num_unrollings,value=e_j) # list of enc_num_unroll elements, each element [batch_size, 1]
    reshaped_e_j = tf.concat(axis=1,values=batched_e_j) # of size [batch_size, enc_num_unroll]

    # Now we calculate alpha_i for all the enc_num_unrollings time steps
    alpha_i = tf.nn.softmax(reshaped_e_j) # of size [batch_size, enc_num_unroll]
    # break alpha_i into list of enc_num_unroll elemtns, each of size [batch_size,1]
    alpha_i_list = tf.unstack(alpha_i,axis=1) 
    
    # list of enc_num_unroll elements, each of size [batch_size,num_nodes]
    c_i_list =  [tf.reshape(alpha_i_list[e_i],[-1,1])*h_j_unrolled[e_i] for e_i in range(enc_num_unrollings)] 
    
    # add_n batches all together
    c_i = tf.add_n(c_i_list) # of size [batch_size, num_nodes]
    
    return c_i,alpha_i
        

### Defining LSTM Computations

Here we define the computations to compute the final state variables of the encoder, feeding that into the decoder as the intial state, computing attention and finally computing the LSTM output, logit values and the predictions.

In [14]:
# ================================================
# Training related inference logic

# Store encoder outputs and decoder outputs across the unrolling
enc_outputs, dec_outputs = list(),list()

# Context vecs are the c_i values in the attention computation
context_vecs = list()

# These variables are initialized with saved_output and saved_sate
# values and then iteratively updated during unrollings
output = saved_output
state = saved_state
                             

print('Calculating Encoder Output')
# update the output and state values for all the inputs we have
for i in enc_train_inputs:
    output, state = enc_lstm_cell(i, output,state)
    # Accumulate all the output values in to a list
    enc_outputs.append(output)

print('Calculating Decoder Output with Attention')
# Before starting decoder computations, we make sure that
# the encoder outputs are computed
with tf.control_dependencies([saved_output.assign(output),
                             saved_state.assign(state)]):

    # Iterate through the decoder unrollings
    for ii,i in enumerate(dec_train_inputs):
        
        # Compute attention value for each decode position
        c_i,_ = attn_layer(enc_outputs, output)
        
        # Accumulate c_i in a list
        context_vecs.append(c_i)
        
        output, state = dec_lstm_cell(i, output, state, c_i)

        # Accumulate decoder outputs in a list
        dec_outputs.append(output)
    
    print('Calculating Softmax output')
    
    # Compute the logit values
    logits = tf.matmul(
        tf.concat(axis=1, values=[
            tf.concat(axis=0, values=dec_outputs),
            tf.concat(axis=0, values=context_vecs)
        ]), w) + b

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

# ================================================
# Testing related inference logic

# Initialize iteratively updated states with 
# saved_test_output and saved_test_state
test_output  = saved_test_output
test_state = saved_test_state

print("Calculations for test data")
test_predictions = []
test_enc_outputs = []

# Compute the encoder output iteratively
for i in enc_test_input:
    test_output, test_state = enc_lstm_cell(i, test_output,test_state)
    test_enc_outputs.append(test_output)

# This is used for visualization purposes
# To build the attention matrix discussed in the chapter
test_alpha_i_unrolled = []

# Make sure the encoder computations are done
with tf.control_dependencies([saved_test_output.assign(test_output),
                                 saved_test_state.assign(test_state)]):
    
    # Compute the decoder outputs iteratively
    for i in range(dec_num_unrollings):
        
        test_c_i,test_alpha = attn_layer(test_enc_outputs, test_output)
        
        # Used for attention visualization purposes
        test_alpha_i_unrolled.append(test_alpha)
        
        test_output, test_state = dec_lstm_cell(dec_test_input, test_output, test_state, test_c_i)

        # Compute predictions for each decoding step
        test_prediction = tf.nn.softmax(
            tf.nn.xw_plus_b(
                tf.concat(axis=1,values=[test_output,test_c_i]), w, b
            )
        )

        dec_test_input = tf.nn.embedding_lookup(tgt_word_embeddings,tf.argmax(test_prediction,axis=1))
        test_predictions.append(tf.argmax(test_prediction,axis=1))

print('\tDone')            

Calculating Encoder Output
Calculating Decoder Output with Attention
Calculating Softmax output
Calculations for test data
	Done


### Calculating the Loss

Here we calculate the loss. Loss is calculated by summing all the losses obtained across the time axis and averaging over the batch axis. You can see how the `dec_train_masks` is used to mask out irrelevant words from influencing loss

In [15]:
# Defining loss, cross-entropy loss summed across time axis averaged over batch axis
loss_batch = tf.concat(axis=0,values=dec_train_masks)*tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=tf.concat(axis=0, values=dec_train_labels))
loss = tf.reduce_mean(loss_batch)

### Optimizer
We define the model optimization specific operations. We use two optimizers here; Adam and SGD. I observed that using Adam only cause the model to exhibit some undesired behaviors in the long run. Therefore we use Adam to get a good initial estimate for the SGD and use SGD from that point onwards.

In [16]:
print('Defining Optimizer')

# These are used to decay learning rate over time
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.assign(global_step,global_step + 1)

# We use two optimizers, when the optimizer changes
# we reset the global step
reset_gstep = tf.assign(global_step,0)

# Calculate decaying learning rate
learning_rate = tf.maximum(
    tf.train.exponential_decay(
        0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True
    ), 0.0001)

sgd_learning_rate = tf.maximum(
    tf.train.exponential_decay(
        0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True
    ), 0.0001)

# We use two optimizers: Adam and naive SGD
# using Adam in the long run produced undesirable results 
# (e.g.) sudden fluctuations in BLEU
# Therefore we use Adam to get a good starting point for optimizing
# and then switch to SGD from that point onwards
with tf.variable_scope('Adam'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
with tf.variable_scope('SGD'):
    sgd_optimizer = tf.train.GradientDescentOptimizer(sgd_learning_rate)
    
# Calculates gradients with clipping for Adam
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 25.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

# Calculates gradients with clipping for SGD
sgd_gradients, v = zip(*sgd_optimizer.compute_gradients(loss))
sgd_gradients, _ = tf.clip_by_global_norm(sgd_gradients, 25.0)
sgd_optimize = optimizer.apply_gradients(zip(sgd_gradients, v))

# Make sure gradients exist flowing from decoder to encoder
print('Checking gradient flow from encoder-to-decoder')
for (g_i,v_i) in zip(gradients,v):
    assert g_i is not None, 'Gradient none for %s'%(v_i.name)
print('\t Ok...')
print('\tDone')

Defining Optimizer
Checking gradient flow from encoder-to-decoder
	 Ok...
	Done


### Resetting Train and Test States
We here define the state resetting functions

In [17]:
# Reset state
reset_train_state = tf.group(
    tf.assign(saved_output, tf.zeros([batch_size, num_nodes])),
    tf.assign(saved_state, tf.zeros([batch_size, num_nodes]))
                            )

reset_test_state = tf.group(
    saved_test_output.assign(tf.zeros([batch_size, num_nodes])),
    saved_test_state.assign(tf.zeros([batch_size, num_nodes]))
                             )


 ## Running the Neural Machine Translator with Attention
 
 With all the relevant TensorFlow operations defined we move on to defining several functions related to executing our NMT model as well as runnning the model to obtain translations for previously unseen source sentences.

### Functions for Evaulating and Printing Results

Next we define two functions to print and save the prediction results for training data as well as testing data, and finally define a function to obtain candidate and reference data to calculate the BLEU score.

In [18]:
def print_and_save_train_predictions(du_labels, tr_pred, rand_idx, train_prediction_text_fname):
    '''
    Use this to print some predicted training samples and save it to file
    du_labels: Decoder's unrolled labels (this is a list of dec_num_unrollings 
    where each item is [batch_size, vocabulary_size])
    tr_pred: This is an array [dec_num_unrollings*batch_size, vocabulary_size] array
    rand_idx: Some random index we use to pick a data point to print
    train_prediction_text_fname: The file we save the prediction results into
    '''

    # This print_str will be written to the text file as well as printed here
    print_str = 'Actual: ' 
    
    # We can get each label corresponding to some sentence by traversing the
    # concatenated labels array ([dec_num_unrollings*batch_size, vocabulary_size])
    # with a batch_size stride
    for w in np.argmax(np.concatenate(du_labels,axis=0)[rand_idx::batch_size],axis=1).tolist():
        # Update the print_str
        print_str += tgt_reverse_dictionary[w] + ' '
        # When we encounter the end of sentence </s> we stop printing
        if tgt_reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    
    # Write to file
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str+'\n')  

    # Now print the predicted data by following the same procedure as above
    print()
    print_str = 'Predicted: '
    for w in np.argmax(tr_pred[rand_idx::batch_size],axis=1).tolist():
        print_str += tgt_reverse_dictionary[w] + ' '
        # When we encounter the end of sentence </s> we stop printing
        if tgt_reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str+'\n')    
    
    
def print_and_save_test_predictions(test_du_labels, test_pred_unrolled, batch_id, test_rand_idx, test_prediction_text_fname):
    '''
    Use this to print some predicted training samples and save it to file
    test_du_labels: Decoder's unrolled labels (this is a list of dec_num_unrollings 
    where each item is [batch_size, vocabulary_size])
    test_pred_unrolled: This is an array [dec_num_unrollings*batch_size, vocabulary_size] array
    batch_id: We need this to retrieve the actual sentence for the predicted 
    test_rand_idx: Some random index we use to pick a data point to print
    test_prediction_text_fname: The file we save the prediction results into
    '''
    
    # Print the actual sentence
    print('DE: ',test_source_sent[(batch_id*batch_size)+test_rand_idx])
    # print_str is the string we display as results and write to a file
    print_str = '\t EN (TRUE):' + test_target_sent[(batch_id*batch_size)+test_rand_idx]
    print(print_str + '\n')

    # Printing predictions
    print_str = '\t EN (Predicted): ' 
    
    for test_pred in test_pred_unrolled:                            
        print_str += tgt_reverse_dictionary[test_pred[test_rand_idx]] + ' '
        if tgt_reverse_dictionary[test_pred[test_rand_idx]] == '</s>':
            break
    print(print_str + '\n')

    # Write the results to text file
    with open(os.path.join(log_dir, test_prediction_text_fname),'a',encoding='utf-8') as fa:                                
        fa.write(print_str+'\n') 
        
def create_bleu_ref_candidate_lists(all_preds, all_labels):
    '''
    Creates two lists (candidate list and reference list) for calcluating BLEU
    all_preds: All the predictions
    all_labels: Correspondign all the actual labels
    Returns
    cand_list: List (sentences) of lists (words in a sentence)
    ref_list: List (sentences) of lists (words in a sentence)
    '''
    bleu_labels, bleu_preds = [],[]
    
    # calculate bleu score:        
    # We iterate batch_size times as i=0,1,2,...,batch_size while grabbing 
    # i, i+batch_size, i+2*batch_size, i+3*batch_size elements from all_labels and all_preds
    # This because the labels/predicitons belonging to same sentence are interleaved by batch_size 
    # due to the way concatenate labels and predictions
    # Taking elements interleaved by batch_size gives the sequence of words belonging to the same sentence
    ref_list, cand_list = [],[]
    for b_i in range(batch_size):
        tmp_lbl = all_labels[b_i::batch_size]            
        tmp_lbl = tmp_lbl[np.where(tmp_lbl != tgt_dictionary['</s>'])]            
        ref_str = ' '.join([tgt_reverse_dictionary[lbl] for lbl in tmp_lbl])
        ref_list.append([ref_str])

        tmp_pred = all_preds[b_i::batch_size]
        tmp_pred = tmp_pred[np.where(tmp_pred != tgt_dictionary['</s>'])]
        cand_str = ' '.join([tgt_reverse_dictionary[pre] for pre in tmp_pred])
        cand_list.append(cand_str)

    return cand_list, ref_list

### Defining a Single Step of Training

We now define a function to train the NMT model for a single step. It takes in encoder inputs, decoder inputs and decoder labels and train the NMT for a single step.

In [19]:
def train_single_step(eu_data, du_data, du_labels):
    '''
    Define a single training step
    eu_data: Unrolled encoder inputs (word embeddings)
    du_data: Unrolled decoder inputs (word embeddings)
    du_labels: Unrolled decoder outputs (one hot encoded words)
    '''
    # Fill the feed dict (Encoder)
    feed_dict = {}
    for ui,dat in enumerate(eu_data):            
        feed_dict[enc_train_inputs[ui]] = dat    
    
    
    # Fill the feed dict (Decoder) 
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl
        # The mask masks the </s> items from being part of the loss
        d_msk = (np.logical_not(np.argmax(lbl,axis=1)==tgt_dictionary['</s>'])).astype(np.int32).reshape(-1,1)
        feed_dict[dec_train_masks[ui]] = d_msk
    
    # ======================= OPTIMIZATION ==========================
    # Using Adam in long term gives very weird behaviors in loss
    # so after 20000 iterations we change the optimizer to SGD
    if (step+1)<20000:
        _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    else:
        _,l,tr_pred = sess.run([sgd_optimize,loss,train_prediction], feed_dict=feed_dict)
        
    return l, tr_pred

### Defining Data Generators and Other Related Variables

Here we load the word embeddings and some other things as well as define a function to retrieve data generators

In [20]:
# This is where all the results will be logged into
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

# Filenames of the logs
train_prediction_text_fname = 'train_predictions_attn.txt'
test_prediction_text_fname = 'test_predictions_attn.txt'

# Some configuration for the TensorFlow session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement=True
sess = tf.InteractiveSession(config=config)

# Initialize global variables
tf.global_variables_initializer().run()

# Load the word embeddings
src_word_embeddings = np.load('de-embeddings.npy')
tgt_word_embeddings = np.load('en-embeddings.npy')

# Defining data generators
def define_data_generators(batch_size, enc_num_unrollings, dec_num_unrollings):
    # Training data generators (Encoder and Decoder)
    enc_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=enc_num_unrollings,is_source=True, is_train=True)
    dec_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=dec_num_unrollings,is_source=False, is_train=True)

    # Testing data generators (Encoder and Decoder)
    test_enc_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=enc_num_unrollings,is_source=True, is_train=False)
    test_dec_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=dec_num_unrollings,is_source=False, is_train=False)
    
    return enc_data_generator,dec_data_generator,test_enc_data_generator,test_dec_data_generator


### Running Training and Testing for NMT

With all the TensorFlow operations, helper functions defined we train and test the NMT system.

In [None]:
# Training and test BLEU scores
attn_train_bleu_scores_over_time,attn_test_bleu_scores_over_time = [],[]
# Loss over time
loss_over_time = []

# Labels and predictions required to calculate the BLEU scores
# for both train and test data
train_bleu_refs, train_bleu_cands = [],[]
test_bleu_refs, test_bleu_cands = [],[]

# Training and test BLEU scores
num_steps = 100001
avg_loss = 0

# Defining data generators for encoder/decoder and training/testing
enc_data_generator, dec_data_generator, \
test_enc_data_generator, test_dec_data_generator = \
define_data_generators(batch_size, enc_num_unrollings, dec_num_unrollings)

print('Started Training')

for step in range(num_steps):

    # input (encoder) unrolling length: 40
    # output (decoder) unrolling length: 60
    if (step+1)%10==0:
        print('.',end='')

    # Sample a random batch of IDs from training data
    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))
    
    # Getting an unrolled set of data batches for the encoder
    eu_data, eu_labels, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Getting an unrolled set of data batches for the decoder
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Train for single step
    l, tr_pred = train_single_step(eu_data, du_data, du_labels)
    
    # We don't calculate BLEU scores all the time as this is expensive, 
    # it slows down the code
    if np.random.random()<0.1:
        
        # all_labels are labels obtained by concatinating all the labels in batches
        all_labels = np.argmax(np.concatenate(du_labels,axis=0),axis=1)
        # all_preds are predictions for all unrolled steps
        all_preds = np.argmax(tr_pred,axis=1)
        
        # Get training BLEU candidates and references
        batch_cands, batch_refs = create_bleu_ref_candidate_lists(all_preds, all_labels)
        
        # Accumulate training candidates/references for calculating
        # BLEU later
        train_bleu_refs.extend(batch_refs)
        train_bleu_cands.extend(batch_cands)

    if (step+1)%500==0:  
        
        # Writing actual and predicte data to train_prediction.txt file for some random sentence
        print('Step ',step+1)
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                                
            fa.write('============= Step ' +  str(step+1) + ' =============\n') 
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_and_save_train_predictions(du_labels, tr_pred, rand_idx, train_prediction_text_fname)        
        
        # Calculating the BLEU score for the accumulated candidates/references
        bscore = 0.0
        bscore = corpus_bleu(train_bleu_refs,train_bleu_cands,smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method4)
        attn_train_bleu_scores_over_time.append(bscore)
        print('(Train) BLEU (%d elements): '%(len(train_bleu_refs)),bscore)
        
        # Reset the candidate/reference accumulators
        train_bleu_refs, train_bleu_cands = [],[]
        
        # Write BLEU score to file
        with open(log_dir + os.sep +'blue_scores_attn.txt','a') as fa_bleu:
            fa_bleu.write(str(step+1) +','+str(bscore)+'\n')
        
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                
            fa.write('(Train) BLEU: %.5f\n'%bscore)        
        
    avg_loss += l # Update average loss
    
    sess.run(reset_train_state) # resetting hidden state for each batch
    
    # ============================= TEST PHASE ==================================
    if (step+1)%1000==0:
        
        # calculate average loss
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/1000.0)
        loss_over_time.append(avg_loss/1000.0)
        
        # write losses to file
        with open(log_dir + os.sep + 'losses_attn.txt','a') as fa_loss:
            fa_loss.write(str(step+1) +','+str(avg_loss/1000.0)+'\n')
        
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                                
            fa.write('============= Step ' +  str(step+1) + ' =============\n') 
            fa.write('\t Loss: %.5f\n'%(avg_loss/1000.0))
            
        avg_loss = 0.0
        
        # Increase gstep to decay learning rate
        sess.run(inc_gstep)
        
        # reset global step when we change the optimizer
        if (step+1)==20000: 
            sess.run(reset_gstep)
        
        print('=====================================================')
        print('(Test) Translating test sentences ...')
        

        print('Processing test data ... ')
        
        # ===================================================================================
        # Predictions for Test data
        for in_i in range(test_inputs.shape[0]//batch_size):
            
            # Generate encoder / decoder data for testing data
            test_eu_data, test_eu_labels, _ = test_enc_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))
            test_du_data, test_du_labels, _ = test_dec_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))
            
            # fill the feed dict
            feed_dict = {}
            for ui,(dat,lbl) in enumerate(zip(test_eu_data,test_eu_labels)):            
                feed_dict[enc_test_input[ui]] = dat             

            # Get predictions out with decoder          
            # run prediction calculation this returns a list of prediction dec_num_unrollings long
            test_pred_unrolled = sess.run(test_predictions, feed_dict=feed_dict)
            
            # We print a randomly selected sample from each batch
            test_rand_idx = np.random.randint(0,batch_size) # used for printing test output
            
            print_and_save_test_predictions(test_du_labels, test_pred_unrolled, in_i, test_rand_idx, test_prediction_text_fname)
            
            # Things required to calculate test BLEU score
            all_labels = np.argmax(np.concatenate(test_du_labels,axis=0),axis=1)
            all_preds = np.concatenate(test_pred_unrolled, axis=0)
            batch_cands, batch_refs = create_bleu_ref_candidate_lists(all_preds, all_labels)
            test_bleu_refs.extend(batch_refs)
            test_bleu_cands.extend(batch_cands)
            
            # Reset the test state
            sess.run(reset_test_state)
        
        # Calculate test BLEU score
        test_bleu_score = 0.0
        test_bleu_score = corpus_bleu(test_bleu_refs,test_bleu_cands,
                                      smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method4)
        attn_test_bleu_scores_over_time.append(test_bleu_score)
        print('(Test) BLEU (%d elements): '%(len(test_bleu_refs)),test_bleu_score)
        
        test_bleu_refs, test_bleu_cands = [],[]        
        print('=====================================================')

Started Training
..................................................Step  500
Actual: You can reserve your treatment at the spa from Tuesday to Saturday 11 : 00 to 19 : 30 . </s> 

Predicted: The is the the the , the <unk> of the . the . . <unk> . the . <unk> . </s> 
(Train) BLEU (530 elements):  0.08305565521562465
..................................................Step  1000
Actual: The southern half is the section where <unk> and great gurus live . </s> 

Predicted: The hotel is of a <unk> of the , the . ##AT##-##AT## . </s> 
(Train) BLEU (400 elements):  0.1793990845295268
	 Loss:  1.1863172934949398
(Test) Translating test sentences ...
Processing test data ... 
DE:  Zum klimatisierten Hotel gehören auch ein Whirpool und eine traumhafte Sonnenterrasse .

	 EN (TRUE):Apart from this , the guests can enjoy the facility of an independent air ##AT##-##AT## conditioning system , a jacuzzi and a beautiful sun terrace .


	 EN (Predicted): The hotel is situated in the heart of the heart of

DE:  Das Haus liegt in der CCZ ##AT##-##AT## Umweltzone und bietet eine sehr gute Anbindung an das Bus- und U ##AT##-##AT## Bahnnetz .

	 EN (TRUE):Set inside the central London congestion ##AT##-##AT## charging zone , this modern hotel has superb transport links , with access to the Tube and the bus network practically on the doorstep .


	 EN (Predicted): The hotel is located in the heart of the hotel , the hotel is located in the heart of the hotel . </s> 

DE:  Dazu kam die deutsche Beteiligung an AWACS ##AT##-##AT## Flügen .

	 EN (TRUE):Additionally , the Germans participated in AWACS flights .


	 EN (Predicted): The <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 

DE:  Ein ält

(Train) BLEU (410 elements):  0.29593363042257675
	 Loss:  0.9000037522614002
(Test) Translating test sentences ...
Processing test data ... 
DE:  34 Diese a Worte sind wahr und treu ; darum übertretet sie nicht , und b nehmt auch nichts davon weg .

	 EN (TRUE):34 These sayings are a true and faithful ; wherefore , transgress them not , neither b take therefrom .


	 EN (Predicted): 18 And it came to pass that they are a <unk> , and I have a b b and the b of the Lord . </s> 

DE:  Alle unterstützten Barcode Varianten sind in einem einzigen Interface konfigurierbar .

	 EN (TRUE):All supported bar code formats are configurable through one single interface .


	 EN (Predicted): All can be used to the <unk> . </s> 

DE:  Es existieren Busverbindungen in nahezu jeden Ort der Provence ( eventuell mit Umsteigen in Aix ##AT##-##AT## en ##AT##-##AT## Provence ) , allerdings sollte beachtet werden , dass die letzten Busse abends ca. um 19 Uhr fahren .

	 EN (TRUE):As always in France those hig

DE:  Booking.com : Best Western Hotell SöderH , Söderhamn , Schweden - 29 Gästebewertungen .

	 EN (TRUE):Booking.com : Best Western Hotell SöderH , Söderhamn , Sweden - 29 Guest reviews .


	 EN (Predicted): Booking .com : <unk> <unk> , <unk> , <unk> , France , France - 13 Guest reviews . </s> 

DE:  Zusätzlich enthält TBarCode / SAPwin eine Menge neuer Strichcode ##AT##-##AT## Symbologien .

	 EN (TRUE):In addition TBarCode / SAPwin comes with a bunch of new bar code symbologies .


	 EN (Predicted): O <unk> / <unk> / <unk> / <unk> / <unk> / <unk> / <unk> / <unk> / <unk> / <unk> / <unk> ) . </s> 

DE:  Das ist eine Metapher , wird jemand von der Propagandaabteilung entgegnen .

	 EN (TRUE):It &apos;s only a metaphor , people from the propaganda department will say .


	 EN (Predicted): The is a &quot; <unk> &quot; &quot; . </s> 

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes

(Train) BLEU (520 elements):  0.32406730404646117
..................................................Step  7000
Actual: Room Notes : Advance <unk> Rate – Bed and Breakfast . </s> 

Predicted: Room Notes : <unk> <unk> <unk> - <unk> &amp; Breakfast . </s> 
(Train) BLEU (560 elements):  0.3168551081419188
	 Loss:  0.8473638187050819
(Test) Translating test sentences ...
Processing test data ... 
DE:  Mag sein , dass du deine ersten Gehversuche in einem rostigen , undichten Kahn beginnst - aber mit der Zeit wirst du dich zum schnittigen Speedboat oder edlen Katamaran vorarbeiten .

	 EN (TRUE):You may be starting in a ramshackle old tub of a boat , but in no time at all you &apos;ll be able to buy a fancy speedboat , or a classy catamaran . Turn your newfound fame into money , and spend it to buy lavish new homes .


	 EN (Predicted): <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

DE:  Das ist eine Metapher , wird jemand von der Propagandaabteilung entgegnen .

	 EN (TRUE):It &apos;s only a metaphor , people from the propaganda department will say .


	 EN (Predicted): The <unk> is a <unk> of <unk> . </s> 

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes there are problems trying to get it to work .


	 EN (Predicted): It is also possible to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be 

DE:  Die Musikant entspricht dem Folkrock . Dennoch finden sich in den Liedern viele musikalische Elemente aus klassisch- folklorischer Liedern aus Anatolien bis zum Mittelmeer , aus Latein- amerikanischen Märschen bis hin zu klassischen Rockklängen .

	 EN (TRUE):Yorum continues to sing hopeful songs in the name of al

DE:  Das Hotel Sempione verfügt über eine ideale , ruhige Lage in einem geschäftigen Viertel mit guter Verkehrsanbindung . Der Bahnhof und eine U ##AT##-##AT## Bahnstation liegen in der Nähe .

	 EN (TRUE):Hotel Sempione welcomes you to a busy yet quiet area of Milan , within walking distance of excellent transport links , including the central railway station and the Repubblica metro station .


	 EN (Predicted): The Hotel <unk> is a very central location , a short walk from the city centre and the main railway station . </s> 

DE:  18 Denn siehe , er richtet , und sein Richterspruch ist gerecht ; und das Kleinkind , das im Kindesalter stirbt , geht nicht zugrunde ; aber die Menschen trinken Verdammnis für ihre eigene Seele , außer sie demütigen sich und a werden so wie kleine Kinder und glauben daran , daß die Errettung im b sühnenden Blut Christi , des Herrn , des Allmächtigen , und durch dasselbe war und ist und sein wird .

	 EN (TRUE):18 For behold he judgeth , and his judgment i

DE:  Mitglieder geniessen viele zus � tzliche Leistungen wie optimierter Sicherheit , schnelleren Auszahlungszeiten und der Aufhebung von Kreditkarteneinzahlungslimits .

	 EN (TRUE):Members enjoy a range of perks including enhanced security and protection , faster withdrawals and increased credit card deposit limits .


	 EN (Predicted): <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

DE:  Das Cleddau Bridge Hotel ist der ideale Platz um zu entspannen oder geschäftlich zu reisen .

	 EN (TRUE):Cleddau Bridge hotel is the ideal place for those who want a relaxing holiday or who travel for business .


	 EN (Predicted): The <unk> Hotel is a perfect place for exploring the city . </s> 

DE:  Zimmerbeschreibung : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fabrics and f

(Test) BLEU (100 elements):  0.2523678229731235
..................................................Step  12500
Actual: The last ten years <unk> became desirable quiet place for holidays . </s> 

Predicted: The 2 ##AT##-##AT## of ago <unk> a , and in the . </s> 
(Train) BLEU (420 elements):  0.34947250587745593
..................................................Step  13000
Actual: 124 <unk> to be a idle ; cease to be b unclean ; cease to c find fault one with another ; cease to d sleep longer than is <unk> ; retire to thy bed early , that ye may not be weary ; arise early , that your bodies and your minds may be e <unk> . </s> 

Predicted: <unk> , , be a <unk> , and to be a , , yea to be <unk> , , of a a yea to be <unk> , , a not . for , be <unk> . . and they may be be able . for , . and they c is the <unk> . be b . . </s> 
(Train) BLEU (420 elements):  0.3421802868491514
	 Loss:  0.7911088198721409
(Test) Translating test sentences ...
Processing test data ... 
DE:  Zum klimatisierten Ho

DE:  Niedrigere Preise durch mehr Wettbewerb . Die Kosten für Kapital können durch Währungsstabilität , niedrigere Zinssätze und eine bessere Organisation der Kapitalmärkte gesenkt werden .

	 EN (TRUE):In a knowledge ##AT##-##AT## based society the opportunity of education is the key to progress and equality and sustainability .


	 EN (Predicted): <unk> for the <unk> , the <unk> , the <unk> and the <unk> , the <unk> and the <unk> . </s> 

DE:  shower was ok but leaked needed updating .

	 EN (TRUE):the response to to requests was poor , phone 3 time for milk in the room over 4 hours .


	 EN (Predicted): I would like to stay the <unk> . </s> 

DE:  Das „ Ladino di Fassa “ ist jedoch mehr als ein Dialekt – es ist eine richtige Sprache .

	 EN (TRUE):This is Ladin from Fassa which is more than a dialect : it is a language in its own right .


	 EN (Predicted): The “ <unk> ” is a more than more than more than a matter of a <unk> . </s> 

DE:  Booking.com : Best Western Hotell SöderH , S

DE:  Das Personal war immer hilfsbereit und freundlich .

	 EN (TRUE):The location and helpfulness of staff was excellent .


	 EN (Predicted): staff was very friendly and helpful . </s> 

(Test) BLEU (100 elements):  0.2435647966753814
..................................................Step  15500
Actual: L &apos;accueil n &apos;est pas très <unk> ; il a <unk> <unk> plusieurs fois un extra ##AT##-##AT## bed ; la salle de petit déjeuner est très <unk> et à <unk> dans le hall de passage ; <unk> réception pour la <unk> , le bar de notre chambre était <unk> , il a <unk> <unk> qu &apos;il ne le <unk> pas 

Predicted: The <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> . <unk> <unk> . <unk> <unk> <unk> <unk> <unk> <unk> , <unk> <unk> <unk> <unk> <unk> <unk> . <unk> <unk> . <unk> . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
(Train) BLEU (470 elements):  0.3519446842119

DE:  Dieser bietet doppelten Schutz durch den OnExecution Scan , der Programme noch bevor Sie gestartet werden mit dem Signaturenscanner überprüft , sowie dem Malware ##AT##-##AT## IDS .

	 EN (TRUE):It includes the double protection using the OnExecution Scan , which scans programs right before they are started with the signature scanner , as well as the Malware ##AT##-##AT## IDS .


	 EN (Predicted): This is the first step of the <unk> , which is the same time to the <unk> , and the <unk> ##AT##-##AT## <unk> is the <unk> . </s> 

DE:  Leicht und ergonomisch gebaut , mit einer Hand zu bedienen , stellen diese Messgeräte eine wirtschaftliche Lösung dar , wenn bei Verdacht auf Wanddickenverlust schnell geprüft werden soll .

	 EN (TRUE):Lightweight and ergonomically designed for easy one ##AT##-##AT## hand operation , these gages provide cost ##AT##-##AT## effective measurement solutions in many applications that require quick inspection of materials suspected of metal wall thinning .



DE:  Einige der ursprünglichen Charakteristika des Gebäudes - wie beispielsweise die einzigartige denkmalgeschützte Fassade und die bezaubernde Innenausstattung der Bar ##AT##-##AT## Bodega De Blauwe Parde - wurden bis heute bewahrt .

	 EN (TRUE):Some unchanged features include the unique frontage , which has listed building status , and also the unique interior of De Blauwe Parade bar ##AT##-##AT## bodega .


	 EN (Predicted): The <unk> of the hotel is the ideal venue for the <unk> Bar , the <unk> <unk> De <unk> - <unk> <unk> . </s> 

(Test) BLEU (100 elements):  0.24447349247943093
..................................................Step  18500
Actual: Some of the articles about what has been published about our environmental program . . . </s> 

Predicted: Many of the most of the you been published . the new . . </s> 
(Train) BLEU (480 elements):  0.34952736745117896
..................................................Step  19000
Actual: Click Purchase Selected Items in the toolbar at 

DE:  Niedrigere Preise durch mehr Wettbewerb . Die Kosten für Kapital können durch Währungsstabilität , niedrigere Zinssätze und eine bessere Organisation der Kapitalmärkte gesenkt werden .

	 EN (TRUE):In a knowledge ##AT##-##AT## based society the opportunity of education is the key to progress and equality and sustainability .


	 EN (Predicted): <unk> <unk> <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

DE:  Eine Woche später wird Dianne Feinstein , Vorstandsvorsitzende der Inspektoren , als Nachfolgerin Moscones ernannt . Sie ist die erste Bürgermeisterin der Stadt .

	 EN (TRUE):Returning by the parallel Stockton or Powell will give you a better feeling of the day to day life of the residents , and are both good for those looking for imported commodities such as tea or herbs .


	 EN (Predict

(Test) BLEU (100 elements):  0.19767268928534704
..................................................Step  21500
Actual: Prior to casting , a number of identical parts are attached to a wax ‘ trunk ’ so that when completed the structure has the form of a wax ‘ tree ’ , the parts being the ‘ branches ’ . </s> 

Predicted: The to a <unk> a number of a in of not to the number of <unk> , s that the the by <unk> of been same of a number of <unk> . s which <unk> of used same <unk> . s </s> 
(Train) BLEU (500 elements):  0.3507924819950598
..................................................Step  22000
Actual: The aim of the task was to fully mill an extrusion die in PM steel ( 1 <unk> <unk> ##AT##-##AT## 13 ) , with a hardness of <unk> . </s> 

Predicted: <unk> <unk> of the <unk> was a be a in <unk> ##AT##-##AT## <unk> <unk> 1 , <unk> ) ) ) <unk> ##AT##-##AT## . which a <unk> of <unk> . </s> 
(Train) BLEU (480 elements):  0.3728390171583663
	 Loss:  0.7777903161942958
(Test) Translating test sen

DE:  Mitglieder geniessen viele zus � tzliche Leistungen wie optimierter Sicherheit , schnelleren Auszahlungszeiten und der Aufhebung von Kreditkarteneinzahlungslimits .

	 EN (TRUE):Members enjoy a range of perks including enhanced security and protection , faster withdrawals and increased credit card deposit limits .


	 EN (Predicted): <unk> <unk> , <unk> , <unk> , <unk> , <unk> , and <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , 

DE:  With a unique location in the heart of Peneda / Gerês National Park , this Pousada has a breathking view over the river Cávado and the peaceful Caniçada dam .

	 EN (TRUE):Located in the heart of Peneda ##AT##-##AT## Gerês National Park , this guest house boasts panoramic views of the surrounding mountains and is a welcome retreat for nature enthusiasts .


	 EN (Predicted): a family ##AT##-##AT## run hotel

(Test) BLEU (100 elements):  0.22278646486425938
..................................................Step  24500
Actual: Barcelona , Spain is a city located at the northeast side of the Iberian Peninsula , in the heart of Catalonia and bordered by the Mediterranean Sea to the east . </s> 

Predicted: Barcelona is the is located perfect centre in the heart of of the city Peninsula . which the heart of the . is by the <unk> Sea . the north . </s> 
(Train) BLEU (490 elements):  0.36802720431623925
..................................................Step  25000
Actual: In addition to the standard lens , the camera offers support to an abundance of lenses , among which the Type G and D AF Nikkor lenses without built ##AT##-##AT## in motor . </s> 

Predicted: The the , the <unk> of , the <unk> and a to the array of the and and the the <unk> ##AT##-##AT## ##AT##-##AT## D ##AT##-##AT## . ##AT##-##AT## . a ##AT##-##AT## in . . </s> 
(Train) BLEU (550 elements):  0.3759763580119075
	 Loss:  0.763728

DE:  Die schlanke , einfache Oberfläche und die gute Performance machen es zum idealen Werkzeug , um dein Netbook ( oder normales Notebook ) in einen e ##AT##-##AT## Book Reader zu verwandeln .

	 EN (TRUE):Its low resource use , simple interface and fast performance makes it the ideal tool to turn your netbook ( or regular laptop ) into an e ##AT##-##AT## book reader .


	 EN (Predicted): the <unk> and the <unk> of the <unk> , the <unk> of the <unk> ( or &quot; ) &quot; . </s> 

DE:  aufgerufen wird , fügt Sie die Flash Nachricht &quot; Eintrag gespeichert !

	 EN (TRUE):is called , it adds the flash message &quot; Record Saved !


	 EN (Predicted): the <unk> Flash Player ! </s> 

DE:  Baustelle zwischen See und Hotel . Altmodische Einrichtung .

	 EN (TRUE):Shared lobby with campsite next door , apparently , and hotel check in / out were not handled by lobby staff but by restaurant staff .


	 EN (Predicted): between the hotel . </s> 

DE:  Hotelparkplätze sind gegen eine kleine Gebü

(Test) BLEU (100 elements):  0.2049243678268133
..................................................Step  27500
Actual: Our mission is providing poker players with information about the world of online poker . Learn the poker rules , use the right poker strategy and use this knowledge to earn a lot of money . </s> 

Predicted: The first in the a of for the for the <unk> &apos;s the . ! </s> 
(Train) BLEU (500 elements):  0.37910238492846365
..................................................Step  28000
Actual: Booking .com : Sheraton Neues Schloss Hotel , Zürich , Switzerland - 23 Guest reviews . </s> 

Predicted: Booking .com : Sheraton Sheraton Schloss Hotel , Zürich , Switzerland - 23 Guest reviews . </s> 
(Train) BLEU (620 elements):  0.37671297743257043
	 Loss:  0.7586548285782337
(Test) Translating test sentences ...
Processing test data ... 
DE:  Das Hotel Sempione verfügt über eine ideale , ruhige Lage in einem geschäftigen Viertel mit guter Verkehrsanbindung . Der Bahnhof und ein

DE:  Im Allgemeinen basieren sie auf Datenbanken , Templates und Skripts .

	 EN (TRUE):In general they are based on databases , template and scripts .


	 EN (Predicted): the new data and data is available . </s> 

DE:  Zitate mit unterschiedlichsten stilistischen Effekten treffen aufeinander : Referenzen auf das narrative Autorenkino ( Hitchcock , Eisenstein , Godard , Brian De Palma ) , poetische oder theoretische Texte ( Tschechow , Duras , Barthes , Žižek , Weibel , Gržinić ) und Verweise auf Massenmedien – B ##AT##-##AT## Filme , TV ##AT##-##AT## Shows , Werbespots , politische Nachrichtensendungen .

	 EN (TRUE):Out of all this material , mixed with disnarrative polysemy and an astonishing lack of inhibition , strange “ fictions ” are reconstructed — fragmentary fictions that are constantly interrupted .


	 EN (Predicted): reviews , including a variety of accomodations options , including the <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , 

DE:  18 Denn siehe , er richtet , und sein Richterspruch ist gerecht ; und das Kleinkind , das im Kindesalter stirbt , geht nicht zugrunde ; aber die Menschen trinken Verdammnis für ihre eigene Seele , außer sie demütigen sich und a werden so wie kleine Kinder und glauben daran , daß die Errettung im b sühnenden Blut Christi , des Herrn , des Allmächtigen , und durch dasselbe war und ist und sein wird .

	 EN (TRUE):18 For behold he judgeth , and his judgment is just ; and the infant perisheth not that dieth in his infancy ; but men drink a damnation to their own souls except they humble themselves and b become as little children , and believe that c salvation was , and is , and is to come , in and through the d atoning blood of Christ , the Lord Omnipotent .


	 EN (Predicted): 18 For behold , he also shall be b judged , and the <unk> of the people , and the people of men , and they shall be b judged , and they shall be b judged . </s> 

DE:  Private Parkplätze stehen für EUR 3,50 pro

DE:  Standort war sehr praktisch . In 5 Minuten ist man am Hauptbahnhof , in 10 Minuten im Bankenviertel .

	 EN (TRUE):very central only a few minutes walk from Bohr / Ryanair bus stop and main train station.Generally cheap and cheerful .


	 EN (Predicted): the room was very small . </s> 

DE:  Bei der Installation von Adobe Presenter 6 wird das ältere Programm Breeze Presenter 5.1 deinstalliert .

	 EN (TRUE):Installing Adobe Presenter 6 will uninstall the earlier Breeze Presenter 5.1 .


	 EN (Predicted): you will be able to install the <unk> <unk> 6 <unk> <unk> . </s> 

DE:  Ein älteres Kind oder Erwachsener zahlt USD 23,40 pro Übernachtung in einem der vorhandenen Betten .

	 EN (TRUE):One older child or adult is charged USD 23.40 per night when using existing bedding .


	 EN (Predicted): One older child or adult is charged USD 23 <unk> per night when using existing bedding . </s> 

DE:  Zimmerbeschreibung : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fa

(Train) BLEU (590 elements):  0.3819485167570491
..................................................Step  34000
Actual: And while Brussels insists Roma integration must remain a priority for Bulgaria and Romania – and throughout the bloc – few believe the amount of funds that has been made available can remove the obstacles preventing upward mobility among the Roma . </s> 

Predicted: All they they , , , with be be few of the . <unk> . and the the <unk> . <unk> years in <unk> of the . are been made . . be the same . . . . the <unk> . </s> 
(Train) BLEU (440 elements):  0.395429988175639
	 Loss:  0.7410071367770433
(Test) Translating test sentences ...
Processing test data ... 
DE:  Je mehr Zeit wir mit Gilad und dem Rest des Teams in Israel verbracht haben ( um nicht den lauten Hahn zu erwähnen der schreiend bei denen über den Campus rennt ) desto überzeugter waren wir – zusammen können wir mehr bewegen .

	 EN (TRUE):The more time we spent with Gilad as well as the rest of the team in 

DE:  Bei einer digitalen Bildkette wird das Intensitätssignal für jedes Pixel ohne analoge Zwischenschritte direkt in der Detektoreinheit digitalisiert , d.h. in Zahlen umgewandelt .

	 EN (TRUE):A digital image chain is an image chain that is equipped with a digital detector instead of an analogue one .


	 EN (Predicted): <unk> <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , 

DE:  Sehr freundliche Auszubildende an der Rezeption , die sehr bemüht noch einen Flug für mich gebucht hat .

	 EN (TRUE):First of all I did not like the price ... the next day I went to Milano to a 4 star Hotel for 10 Euro less and super service .. I had a problem with my Internetconnection and the Hotel Maritim did not react right .


	 EN (Predicted): the room was very comfortable and the location was very quiet . </s> 


DE:  Da jedes SCXI ##AT##-##AT## Modul die Signale auf einen einzigen Kanal des Datenerfassungsmoduls multiplext , lassen sich problemlos weitere Module hinzufügen , was für eine höhere Kanalanzahl sorgt .

	 EN (TRUE):It can multiplex its signals into a single channel of the DAQ device , and you can add modules to increase channel count .


	 EN (Predicted): This module module module is a low ##AT##-##AT## channel basis of a single channel . </s> 

DE:  Slimline ICE ist in einer Vielzahl von Geschmacksrichtungen sowohl als Eis am Stiel als auch im Becher erhältlich .

	 EN (TRUE):Palatinose ™ is a disaccharide derived from beet sugar .


	 EN (Predicted): <unk> is a great deal with a wide range of activities in the <unk> . </s> 

DE:  Dazu kam die deutsche Beteiligung an AWACS ##AT##-##AT## Flügen .

	 EN (TRUE):Additionally , the Germans participated in AWACS flights .


	 EN (Predicted): <unk> <unk> <unk> . </s> 

DE:  Die Lizenzgeberin haftet auch nach den gesetzlichen Bestimmungen

(Test) BLEU (100 elements):  0.2566278234166578
..................................................Step  38500
Actual: Cheap Monday jeans developed from the idea to offer the High Fashion customer jeans which keep up of the quality with all large marks , in the price is however unbeatable . </s> 

Predicted: <unk> <unk> through ##AT##-##AT## by the <unk> of the the highest edge , to , has the to the quality of all the , , but the price of not , . </s> 
(Train) BLEU (420 elements):  0.41003175572784756
..................................................Step  39000
Actual: Tarifa is also one of the biggest wind and kitesurfing locations in Europe . </s> 

Predicted: This is a known of the best best and the in in Europe and </s> 
(Train) BLEU (500 elements):  0.38183303887220604
	 Loss:  0.7311006060540676
(Test) Translating test sentences ...
Processing test data ... 
DE:  &#124; Ferienwohnungen 1 Zi &#124; Ferienhäuser &#124; Landhäuser &#124; Autovermietung &#124; Last Minute Angebote ! 

DE:  Jedes Stück Information kann eigene Eigenschaften und Aktionen besitzen .

	 EN (TRUE):Every bit of information and code can be given their own properties and actions .


	 EN (Predicted): Each may contain the same conditions . </s> 

DE:  Das Athens Gate Hotel liegt unterhalb der Akropolis nur 100 m vom neuen Akropolis ##AT##-##AT## Museum entfernt .

	 EN (TRUE):The Athens Gate Hotel rests under the Acropolis , just 100 metres from the new Acropolis museum .


	 EN (Predicted): The hotel is located just a few metres from the hotel . </s> 

DE:  Sie können hier auch Kanufahren , Windsurfen und Tauchen ...

	 EN (TRUE):Here , you can also practice aquatic sports such as yachting , windsurfing and canoeing ... you will find all kinds of water channels , from wild brooks to serene lakes .


	 EN (Predicted): you can also enjoy the best of the hotel . . . </s> 

DE:  Die Musikant entspricht dem Folkrock . Dennoch finden sich in den Liedern viele musikalische Elemente aus klassisch- f

DE:  Niedrigere Preise durch mehr Wettbewerb . Die Kosten für Kapital können durch Währungsstabilität , niedrigere Zinssätze und eine bessere Organisation der Kapitalmärkte gesenkt werden .

	 EN (TRUE):In a knowledge ##AT##-##AT## based society the opportunity of education is the key to progress and equality and sustainability .


	 EN (Predicted): <unk> <unk> <unk> , <unk> , <unk> , and <unk> , and <unk> , and <unk> . </s> 

DE:  Länge : 11,28m ; Breite : 3,66m ; Tiefgang : 0,30m ; Bj.2006 ; Liegeplatz : River Dart , Großbritannien ; 2 Motor ( en ) ; Volvo D4 260hp ; Description : The Bavaria Sport is a really impressive boat ...

	 EN (TRUE):Length : 11.40m ( ~ 37.40 ft ) ; Beam : 3.68m ; Draft : 0.90m ; built 2005 ; Location : Orsera Nautika / Kroatien ##AT##-##AT## Vrsar , Croatia ( Hrvatska ) ; 2 Engine / manufac . ; Volvo Penta 5,7 GXI ; 478kW ; 430 Engine Hours ; Weight : 6.800kg ; Neues Raymarine C70 , verlängerte Badeplattform mit Staufächern , Kühlschr ...


	 EN (Predicted)

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes there are problems trying to get it to work .


	 EN (Predicted): you can also find a single player . </s> 

DE:  in dieser Option ermöglicht , Dateien relativ zum aktuellen Verzeichnis einzubinden .

	 EN (TRUE):in the include path allows for relative includes as it means the current directory . However , it is more efficient to explicitly use include &apos; . / file &apos; than having PHP always check the current directory for every include .


	 EN (Predicted): about the files are available for the current directory . </s> 

(Test) BLEU (100 elements):  0.2561246190714773
..................................................Step  43500
Actual: our view was amazing . . we had a <unk> . </s> 

Predicted: The staff of very and </s> 
(Train) BLEU (500 elements):  0.41102075055377324
..................................................Ste

DE:  Das Athens Gate Hotel liegt unterhalb der Akropolis nur 100 m vom neuen Akropolis ##AT##-##AT## Museum entfernt .

	 EN (TRUE):The Athens Gate Hotel rests under the Acropolis , just 100 metres from the new Acropolis museum .


	 EN (Predicted): The hotel is located only a short walk from the <unk> <unk> . </s> 

DE:  Einst verwunschene Eilande , später Schlupfwinkel von Piraten , welche die goldbeladenen Schiffe der Spanier überfielen , sind diese unwirtlichen Inseln mit dem seltsamen Tierleben heute wohl eines der letzten großen Tierparadiese der Welt .

	 EN (TRUE):Apart from its beautiful beaches and unique and varied ecosystems , the Galapagos Islands are home to towering active volcanoes that reach altitudes up to 1,600 meters .


	 EN (Predicted): the <unk> <unk> <unk> , the <unk> <unk> , the <unk> of the <unk> , the <unk> <unk> , the <unk> <unk> , the <unk> of the <unk> , the <unk> <unk> , the <unk> <unk> , the <unk> <unk> , the <unk> <unk> , the <unk> <unk> , the <unk> <un

DE:  Das ist viel einfacher ... Nein , streiten Sie nicht mit mir ... es ist einfacher ... ach , wie auch immer !

	 EN (TRUE):This is far more easy ... no , don &apos;t argue with me ... it is easier ... ah whatever !


	 EN (Predicted): is the best way to get the best . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

DE:  Länge : 11,28m ; Breite : 3,66m ; Tiefgang : 0,30m ; Bj.2006 ; Liegeplatz : River Dart , Großbritannien ; 2 Motor ( en ) ; Volvo D4 260hp ; Description : The Bavaria Sport is a really impressive boat ...

	 EN (TRUE):Length : 11.40m ( ~ 37.40 ft ) ; Beam : 3.68m ; Draft : 0.90m ; built 2005 ; Location : Orsera Nautika / Kroatien ##AT##-##AT## Vrsar , Croatia ( Hrvatska ) ; 2 Engine / manufac . ; Volvo Penta 5,7 GXI ; 478kW ; 430 Engine Hours ; Weight : 6.800kg ; Neues Raymarine C70 , verlängerte Badeplattform mit Staufächern , Kühlschr ...


	 EN (Predicted): : 0 : <unk> <unk> <unk> : <unk> <unk> ; <unk> : <unk

DE:  in dieser Option ermöglicht , Dateien relativ zum aktuellen Verzeichnis einzubinden .

	 EN (TRUE):in the include path allows for relative includes as it means the current directory . However , it is more efficient to explicitly use include &apos; . / file &apos; than having PHP always check the current directory for every include .


	 EN (Predicted): this option is very useful for the current directory . </s> 

(Test) BLEU (100 elements):  0.25987847087195726
..................................................Step  48500
Actual: Research on megalithic graves and a fortified settlement ( Monte da Ponte ) in South Portugal . </s> 

Predicted: <unk> of the and and <unk> <unk> <unk> ( <unk> da <unk> ) , the Tyrol , </s> 
(Train) BLEU (620 elements):  0.41611564988868316
..................................................Step  49000
Actual: Authors such Norman Davies , Geert <unk> and Tony <unk> &#91; 5 &#93; stand for this strong need to see and understand how everything was connected

DE:  Niedrigere Preise durch mehr Wettbewerb . Die Kosten für Kapital können durch Währungsstabilität , niedrigere Zinssätze und eine bessere Organisation der Kapitalmärkte gesenkt werden .

	 EN (TRUE):In a knowledge ##AT##-##AT## based society the opportunity of education is the key to progress and equality and sustainability .


	 EN (Predicted): <unk> <unk> , <unk> , <unk> , <unk> , and <unk> , and the <unk> of the <unk> . </s> 

DE:  Länge : 11,28m ; Breite : 3,66m ; Tiefgang : 0,30m ; Bj.2006 ; Liegeplatz : River Dart , Großbritannien ; 2 Motor ( en ) ; Volvo D4 260hp ; Description : The Bavaria Sport is a really impressive boat ...

	 EN (TRUE):Length : 11.40m ( ~ 37.40 ft ) ; Beam : 3.68m ; Draft : 0.90m ; built 2005 ; Location : Orsera Nautika / Kroatien ##AT##-##AT## Vrsar , Croatia ( Hrvatska ) ; 2 Engine / manufac . ; Volvo Penta 5,7 GXI ; 478kW ; 430 Engine Hours ; Weight : 6.800kg ; Neues Raymarine C70 , verlängerte Badeplattform mit Staufächern , Kühlschr ...


	 EN (Pre

DE:  Bei der Installation von Adobe Presenter 6 wird das ältere Programm Breeze Presenter 5.1 deinstalliert .

	 EN (TRUE):Installing Adobe Presenter 6 will uninstall the earlier Breeze Presenter 5.1 .


	 EN (Predicted): the <unk> <unk> <unk> is the <unk> <unk> <unk> <unk> <unk> 5 <unk> . </s> 

DE:  Tux Racer wird Ihnen helfen , die Zeit totzuschlagen und sie können OpenOffice zum Arbeiten verwenden .

	 EN (TRUE):Tux Racer will help you pass the time while you wait , and you can use OpenOffice for work .


	 EN (Predicted): <unk> will be able to assist the time time to enable the time to the desired time . </s> 

DE:  Zimmerbeschreibung : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fabrics and furnishings , many with excellent views over the Castle grounds .

	 EN (TRUE):Room Notes : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fabrics and furnishings , many with excellent views over the Castle grounds .


	 EN (Predicted): Room Notes

(Train) BLEU (580 elements):  0.40057145666680943
	 Loss:  0.6960566946268082
(Test) Translating test sentences ...
Processing test data ... 
DE:  Heute verstehen sich QuarkXPress ® 8 , Photoshop ® und Illustrator ® besser als jemals zuvor . Dank HTML und CSS ­ können Anwender von QuarkXPress inzwischen alle Medien bedienen , und das unabhängig von Anwendungen der Adobe ® Creative Suite ® wie Adobe Flash ® ( SWF ) und Adobe Dreamweaver ® .

	 EN (TRUE):Today , QuarkXPress ® 8 has tighter integration with Photoshop ® and Illustrator ® than ever before , and through standards like HTML and CSS , QuarkXPress users can publish across media both independently and alongside Adobe ® Creative Suite ® applications like Adobe Flash ® ( SWF ) and Adobe Dreamweaver ® .


	 EN (Predicted): <unk> ® , QuarkXPress ® , and ® <unk> , <unk> ® , and <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

DE:  Im Allgemeinen basieren sie auf Datenbanken , Templates und Skripts .

	 EN (TRUE):In general they are based on databases , template and scripts .


	 EN (Predicted): they are based on databases , and scripts . </s> 

DE:  bietet ihnen die Möglichkeit , alte ABAP ##AT##-##AT## Reports mit samt ihrer betriebswirtschaftlichen Intelligenz trotz des Umstiegs auf die MS ##AT##-##AT## Umgebung weiter zu nutzen .

	 EN (TRUE):Uses the BW extractor API to extract data from productive R / 3 systems by using delta mechanisms .


	 EN (Predicted): offers the opportunity to learn more about the <unk> of the <unk> <unk> . </s> 

DE:  Wann möchten Sie im Leon &apos; s Place Hotel In Rome übernachten ?

	 EN (TRUE):When would you like to stay at the Leon &apos;s Place Hotel In Rome ?


	 EN (Predicted): When you would you like to stay at the <unk> &apos;s Place Hotel Rome in Rome ? </s> 

DE:  Jedes Stück Information kann eigene Eigenschaften und Aktionen besitzen .

	 EN (TRUE):Every bit of inf

DE:  Bei den romanischen Völkern paart sich die effektive Ohnmacht mit lächerlicher Anmaßung .

	 EN (TRUE):To material weakness the Latin countries add a quite fantastic pretentiousness .


	 EN (Predicted): <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 

(Test) BLEU (100 elements):  0.249245459732604
..................................................Step  55500
Actual: The hotel has a variety of rooms , including a full selection of rear ##AT##-##AT## facing quiet rooms in which guests can choose to relax in pure comfort . </s> 

Predicted: The hotel offers a large of rooms , including a wide range of rooms in based rooms rooms , the the can enjoy from the in the comfort . </

DE:  Das „ Ladino di Fassa “ ist jedoch mehr als ein Dialekt – es ist eine richtige Sprache .

	 EN (TRUE):This is Ladin from Fassa which is more than a dialect : it is a language in its own right .


	 EN (Predicted): “ <unk> ” is a more than more than one . </s> 

DE:  Baustelle zwischen See und Hotel . Altmodische Einrichtung .

	 EN (TRUE):Shared lobby with campsite next door , apparently , and hotel check in / out were not handled by lobby staff but by restaurant staff .


	 EN (Predicted): between the hotel and the <unk> . </s> 

DE:  Zusätzlich enthält TBarCode / SAPwin eine Menge neuer Strichcode ##AT##-##AT## Symbologien .

	 EN (TRUE):In addition TBarCode / SAPwin comes with a bunch of new bar code symbologies .


	 EN (Predicted): TBarCode TBarCode / <unk> is a real ##AT##-##AT## time ##AT##-##AT## box . </s> 

DE:  Alle älteren Kinder oder Erwachsene zahlen EUR 32,00 pro Übernachtung und Person für Zustellbetten .

	 EN (TRUE):All older children or adults are charged EUR 32

(Test) BLEU (100 elements):  0.2720175336017809
..................................................Step  58500
Actual: If you ’ ve been wondering whether you should make the move to Windows Vista , take a look at the evidence . </s> 

Predicted: If you are re already a whether you have be sure same to the , , where a look at home time of </s> 
(Train) BLEU (540 elements):  0.41540923556076387
..................................................Step  59000
Actual: For instance , a course may make use of many resources and have , as an aid to instructors , a wiki devoted to equipment located in several remote classrooms . </s> 

Predicted: The example , the <unk> of be a of the other . other to and well example . the ##AT##-##AT## and <unk> , to the . in the ways areas . </s> 
(Train) BLEU (530 elements):  0.40673129020430665
	 Loss:  0.6946328013837337
(Test) Translating test sentences ...
Processing test data ... 
DE:  Das Hotel Sempione verfügt über eine ideale , ruhige Lage in einem ges

DE:  Dieses 4 ##AT##-##AT## Sterne ##AT##-##AT## Landhotel aus dem 18. Jahrhundert inmitten einer ländlichen Umgebung ist nur eine kurze Fahrt vom Stadtzentrum von Londonderry und vom örtlichen Flughafen entfernt .

	 EN (TRUE):This 4 ##AT##-##AT## star 18th century country house hotel is situated in the countryside , just a short drive from Derry city centre and within reach of the City of Derry Airport .


	 EN (Predicted): a 4 ##AT##-##AT## star hotel , this 4 ##AT##-##AT## star hotel is situated in a quiet area of the city centre , just a short drive from the city centre and the airport . </s> 

DE:  Im Allgemeinen basieren sie auf Datenbanken , Templates und Skripts .

	 EN (TRUE):In general they are based on databases , template and scripts .


	 EN (Predicted): they are based on databases , and scripts . </s> 

DE:  aufgerufen wird , fügt Sie die Flash Nachricht &quot; Eintrag gespeichert !

	 EN (TRUE):is called , it adds the flash message &quot; Record Saved !


	 EN (Predicte

..................................................Step  62000
Actual: And he said unto them : On this wise shall ye baptize ; and there shall be b no <unk> among you . </s> 

Predicted: And he said unto them : I this shall , ye shall , and there shall be no none more among you . </s> 
(Train) BLEU (430 elements):  0.4254064557454336
	 Loss:  0.6981558171659707
(Test) Translating test sentences ...
Processing test data ... 
DE:  In der Hotelbeschreibung im Internet müßte die Zufahrt beschrieben werden .

	 EN (TRUE):There are no adverse comments about this hotel at all .


	 EN (Predicted): the Internet connection is the <unk> . </s> 

DE:  18 Denn siehe , er richtet , und sein Richterspruch ist gerecht ; und das Kleinkind , das im Kindesalter stirbt , geht nicht zugrunde ; aber die Menschen trinken Verdammnis für ihre eigene Seele , außer sie demütigen sich und a werden so wie kleine Kinder und glauben daran , daß die Errettung im b sühnenden Blut Christi , des Herrn , des Allmächtigen

DE:  Da jedes SCXI ##AT##-##AT## Modul die Signale auf einen einzigen Kanal des Datenerfassungsmoduls multiplext , lassen sich problemlos weitere Module hinzufügen , was für eine höhere Kanalanzahl sorgt .

	 EN (TRUE):It can multiplex its signals into a single channel of the DAQ device , and you can add modules to increase channel count .


	 EN (Predicted): With the SCXI ##AT##-##AT## <unk> module , the NI <unk> ##AT##-##AT## <unk> is a single channel ##AT##-##AT## channel , which can be used for a single ##AT##-##AT## channel module . </s> 

DE:  Die Bewohner des Nordens sind ein buntes Völkergemisch aus den verschiedensten Bergstämmen und den Nord ##AT##-##AT## Thais oder kon mueang ; die traditionell in den fruchtbaren Tiefebenen Nordthailands siedeln . In vielerlei Hinsicht halten sich die Nord Thais für die &quot; wahren &quot; Thais , die die Thai ##AT##-##AT## Kultur noch am besten über die Zeit gerettet haben .

	 EN (TRUE):From Pratu Chiang Mai market , songthaews also trave

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes there are problems trying to get it to work .


	 EN (Predicted): you need to make a single ##AT##-##AT## course protocol . </s> 

DE:  Bamberg , die &quot; Traumstadt der Deutschen &quot; , seine aufgeschlossenen Menschen und seine romantische Umgebung wird auch Sie begeistern , denn sie bietet für jeden etwas .

	 EN (TRUE):The beauty and rich cultural life of this town can be enjoyed at any time of year . Soak up the summer sun whilst relaxing at one of the many sidewalk cafés in the historic old town or savour a cool beer beneath a shady chestnut tree in one of the popular beer gardens .


	 EN (Predicted): <unk> , the <unk> of the <unk> , and his wife , and his wife , you will find the best of the best possible . </s> 

(Test) BLEU (100 elements):  0.2718733447908611
..................................................Step  6450

## Visualizing the Attention Model

Here we visualize the attention matrix for various translations the NMT system produced. The attention matrix is a `dec_num_unrollings x enc_num_unrollings` matrix. Where each cell denotes the $\alpha$ values obtained during attention calculation.

In [None]:
source_labels = []
target_labels = []

print('=====================================================')
print('(Test) Translating test sentences ...')

print('Processing test data ... ')

# Process each test input by batches
for in_i in range(test_inputs.shape[0]//batch_size):
    
    # Generate test data
    test_eu_data, test_eu_labels, _ = test_enc_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))
    test_du_data, test_du_labels, _ = test_dec_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))

    # Choose a random data point in the batch
    test_rand_idx = np.random.randint(0,batch_size) # used for printing test output

    # fill the feed dict
    feed_dict = {}
    source_labels = [] # This contains the source words of the test point considered
    for ui,(dat,lbl) in enumerate(zip(test_eu_data,test_eu_labels)):            
        feed_dict[enc_test_input[ui]] = dat
        source_labels.append(src_reverse_dictionary[test_inputs[(in_i*batch_size)+test_rand_idx,ui]])

    # Print the true source sentence
    print('DE: ',test_source_sent[(in_i*batch_size)+test_rand_idx])
    print_str = '\t EN (TRUE):' + test_target_sent[(in_i*batch_size)+test_rand_idx]
    print(print_str + '\n')

    
    print_str = '\t EN (Predicted): '            

    # run prediction calculation this returns a list of prediction dec_num_unrollings long
    # alpha_dec_unrolled is a list of dec_num_unrollings elements, 
    # where each element (another list) is num_enc_unrollings long
    test_pred_unrolled, alpha_dec_unrolled = sess.run([test_predictions,test_alpha_i_unrolled], feed_dict=feed_dict)

    target_labels = []
    
    # Building the attention matrix
    attention_matrix = []
    
    r_i,c_i = 0, 0
    
    # We build the attention matrix column by column
    for u_i, (test_pred, alpha_enc_unrolled) in enumerate(zip(test_pred_unrolled, alpha_dec_unrolled)): 
        # Column index
        c_i = 0
        # Current target word
        current_tgt = tgt_reverse_dictionary[test_pred[test_rand_idx]]
        
        # Only add if the word is not <s> or </s> or <unk>
        if current_tgt != '<s>' and current_tgt != '</s>' and current_tgt != '<unk>':            
            attention_matrix.append([])
            target_labels.append(tgt_reverse_dictionary[test_pred[test_rand_idx]])
            print_str += tgt_reverse_dictionary[test_pred[test_rand_idx]] + ' '
            filtered_src_labels = []
            # Fill each row position in that column
            for u_ii in range(enc_num_unrollings):
                # Only add if the word is not <s> or </s> or <unk>
                if source_labels[u_ii] != '<s>' and source_labels[u_ii] != '</s>' and source_labels[u_ii] != '<unk>':
                    filtered_src_labels.append(source_labels[u_ii])            
                    attention_matrix[r_i].append(alpha_enc_unrolled[test_rand_idx,u_ii])
                    c_i += 1
            r_i += 1

    assert r_i == len(target_labels)
    
    # Make the above to a matrix
    attention_matrix = np.array(attention_matrix)
    
    if attention_matrix.ndim == 1:
        attention_matrix = attention_matrix.reshape(1,-1)
    
    # Reset test state after each batch
    sess.run(reset_test_state)
    
    # Plot
    f,ax = pylab.subplots(1,1,figsize=(5.0 + 0.5*attention_matrix.shape[0], 
                                       5.0 + 0.5*attention_matrix.shape[1]))
    # Repetitions are used to make the attention value to a set of image pixels
    rep_attn = np.repeat(attention_matrix,5,axis=0)
    rep_attn = np.repeat(rep_attn,5,axis=1)
    
    # Correcting for source reversing
    rep_attn = np.fliplr(rep_attn)
    
    # Rendering image
    ax.imshow(rep_attn,vmin=0.0,vmax=1.0,cmap='jet')
    
    # Labels for columns
    for s_i,src_text in enumerate(reversed(filtered_src_labels)):
        ax.text(s_i*5+1,-2,src_text,rotation=90, verticalalignment='bottom',fontsize=18)
        
    # Labels for rows 
    for t_i,tgt_text in enumerate(target_labels):
        ax.text(-2, t_i*5+0.5,tgt_text, horizontalalignment = 'right', fontsize=18)
        
    ax.axis('off')

    f.savefig('attention_%d.png'%in_i)
    pylab.close(f)
    
 
print('=====================================================')