# Neural Machine Translation: German to English (With Pretrained Word Vectors)

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.

# ***********************************************************
# Please use "pip install unidecode"
# to install the unidecode library if you haven't installed it

%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf
from PIL import Image
from collections import Counter
import csv

import unidecode
from nltk.translate.bleu_score import corpus_bleu
import nltk

  from ._conv import register_converters as _register_converters


## Loading Data 

First, download the data from this [page](https://nlp.stanford.edu/projects/nmt/). The required files are:

* File containing German sentences: [`train.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de)
* File containing English sentences: [`train.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en)
* File containing German vocabulary: [`vocab.50K.de`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.de)
* File containing English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading Vocabulary

First we build the vocabulary dictionaries for both the source (German) and target (English) languages. The vocabularies are found in the `vocab.50K.de` (German) and `vocab.50K.en` files.

In [2]:
# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
src_dictionary = dict()

# Read the vocabulary file
with open('vocab.50K.de', encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        src_dictionary[line[:-1]] = len(src_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
src_reverse_dictionary = dict(zip(src_dictionary.values(),src_dictionary.keys()))

# Print some of the words in the dictionary
print('Source')
print('\t',list(src_dictionary.items())[:10])
print('\t',list(src_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(src_dictionary))

# ==========================================
# Building source language vocabulary

# Contains word string -> ID mapping
tgt_dictionary = dict()

# Read the vocabulary file
with open('vocab.50K.en', encoding='utf-8') as f:
    # Read and store every line
    for line in f:
        #we are discarding last char as it is new line char
        tgt_dictionary[line[:-1]] = len(tgt_dictionary)

# Build a reverse dictionary with the mapping ID -> word string
tgt_reverse_dictionary = dict(zip(tgt_dictionary.values(),tgt_dictionary.keys()))

# Print some of the words in the dictionary
print('Target')
print('\t',list(tgt_dictionary.items())[:10])
print('\t',list(tgt_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(tgt_dictionary))

# Each language has 50000 words
vocabulary_size = 50000

Source
	 [('recent', 33872), ('Strafmaß', 49477), ('Ukrainer', 24405), ('stecken', 6942), ('Wirtschaftsreformen', 17596), ('Retro', 30753), ('Brillen', 36529), ('verweilen', 13178), ('Familienhotel', 16611), ('Evangelium', 17121)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'die'), (6, 'der'), (7, 'und'), (8, 'in'), (9, 'zu')]
	 Vocabulary size:  50000
Target
	 [('Rhone', 24708), ('recent', 597), ('Gaston', 44095), ('Retro', 41945), ('Schultz', 34091), ('lapses', 16345), ('sacral', 36968), ('novembre', 46943), ('parc', 31063), ('Zoran', 41835)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
	 Vocabulary size:  50000


### Loading Training and Testing Data

Here we load the data in the `train.de` and `train.en` files. And split the data in the files into two sets; training and testing data.

In [3]:
# Contains the training sentences
source_sent = [] # Input
target_sent = [] # Output

# Contains the testing sentences
test_source_sent = [] # Input
test_target_sent = [] # Output

# We grab around 100 lines of data that are interleaved 
# in the first 50000 sentences
test_indices = [l_i for l_i in range(50,50001,500)]

# Read the source data file and read the first 250,000 lines (except first 50)
with open('train.de', encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(source_sent)<250000 and l_i not in test_indices:
            source_sent.append(line)
        elif l_i in test_indices:
            test_source_sent.append(line)
        
# Read the target data file and read the first 250,000 lines (except first 50)            
with open('train.en', encoding='utf-8') as f:
    for l_i, line in enumerate(f):
        # discarding first 50 translations as there was some
        # english to english mappings found in the first few lines. which are wrong
        if l_i<50:
            continue
        
        if len(target_sent)<250000 and l_i not in test_indices:
            target_sent.append(line)
        elif l_i in test_indices:
            test_target_sent.append(line)
        
# Make sure we extracted same number of both extracted source and target sentences         
assert len(source_sent)==len(target_sent),'Source: %d, Target: %d'%(len(source_sent),len(target_sent))

# Print some source sentences
print('Sample translations (%d)'%len(source_sent))
for i in range(0,250000,10000):
    print('(',i,') DE: ', source_sent[i])
    print('(',i,') EN: ', target_sent[i])

# Print some target sentences
print('Sample test translations (%d)'%len(test_source_sent))
for i in range(0,100,10):
    print('DE: ', test_source_sent[i])
    print('EN: ', test_target_sent[i])



Sample translations (250000)
( 0 ) DE:  Hier erfahren Sie , wie Sie Creative Suite 2 und Creative Suite 3 am besten zusammen mit QuarkXPress nutzen können .

( 0 ) EN:  Here , you ’ ll find out how Creative Suite users can get the best possible interaction with QuarkXPress .

( 10000 ) DE:  Für die sehr günstigen Wochen- und Monatskarten ( 1 Monat ca.

( 10000 ) EN:  It is THE trendy area of Marseille .

( 20000 ) DE:  Freuen Sie sich auf die romantische Atmosphäre in den Zimmern und Apartments .

( 20000 ) EN:  Enjoy the romantic atmosphere of one of the guest rooms or apartments .

( 30000 ) DE:  Zu zwiespältig sind Dr. Gutherzens Erfahrungen aus frühen Studententagen verlaufen , in denen er sich in die Gefielde von durch Heidegger geprägten Autor / innen begeben hat und dort ständig mit strengem Blick darauf verwiesen wurde , er habe bestimmte Theorieressourcen und Gedankengebäude einfach noch nicht gründlich genug verstanden und könne deshalb nicht begreifen , warum seine Einwände 

### Preprocessing text
Here we preprocess the text by replacing words not found in the dictionary with `<unk>` as well as remove punctuation marks (`.`,`,`) and new-line characters.

In [4]:
# Keep track of how many unknown words were encountered
src_unk_count, tgt_unk_count = 0, 0

def split_to_tokens(sent,is_source):
    '''
    This function takes in a sentence (source or target)
    and preprocess the sentency with various steps (e.g. removing punctuation)
    '''
    
    global src_unk_count, tgt_unk_count

    # Remove punctuation and new-line chars
    sent = sent.replace(',',' ,')
    sent = sent.replace('.',' .')
    sent = sent.replace('\n',' ') 
    
    sent_toks = sent.split(' ')
    for t_i, tok in enumerate(sent_toks):
        if is_source:
            # src_dictionary contain the word -> word ID mapping for source vocabulary
            if tok not in src_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    src_unk_count += 1
        else:
            # tgt_dictionary contain the word -> word ID mapping for target vocabulary
            if tok not in tgt_dictionary.keys():
                if not len(tok.strip())==0:
                    sent_toks[t_i] = '<unk>'
                    #print(tok)
                    tgt_unk_count += 1
    return sent_toks

# Let us first look at some statistics of the sentences
# Train - source data
source_len = []
source_mean, source_std = 0,0
for sent in source_sent:
    source_len.append(len(split_to_tokens(sent,True)))

print('(Source) Sentence mean length: ', np.mean(source_len))
print('(Source) Sentence stddev length: ', np.std(source_len))

# Let us first look at some statistics of the sentences
# Train - target data
target_len = []
for sent in target_sent:
    target_len.append(len(split_to_tokens(sent,False)))

print('(Target) Sentence mean length: ', np.mean(target_len))
print('(Target) Sentence stddev length: ', np.std(target_len))

# Let us first look at some statistics of the sentences
# Test - source data
test_source_len = []
for sent in test_source_sent:
    test_source_len.append(len(split_to_tokens(sent, True)))
    
print('(Test-Source) Sentence mean length: ', np.mean(test_source_len))
print('(Test-Source) Sentence stddev length: ', np.std(test_source_len))

# Let us first look at some statistics of the sentences
# Test - target data
test_target_len = []
test_tgt_mean, test_tgt_std = 0,0
for sent in test_target_sent:
    test_target_len.append(len(split_to_tokens(sent, False)))
    
print('(Test-Target) Sentence mean length: ', np.mean(test_target_len))
print('(Test-Target) Sentence stddev length: ', np.std(test_target_len))

(Source) Sentence mean length:  26.244692
(Source) Sentence stddev length:  13.854376414156501
(Target) Sentence mean length:  28.275308
(Target) Sentence stddev length:  14.925498769057468
(Test-Source) Sentence mean length:  26.61
(Test-Source) Sentence stddev length:  14.800604717375572
(Test-Target) Sentence mean length:  29.08
(Test-Target) Sentence stddev length:  16.19424589167399


### Making training and testing data fixed length

In [5]:
# ================================================================================
# Processing training data

src_unk_count, tgt_unk_count = 0, 0

train_inputs = []
train_outputs = []

# Chosen based on previously found statistics
src_max_sent_length = 41 
tgt_max_sent_length = 61

print('Processing Training Data ...\n')
for s_i, (src_sent, tgt_sent) in enumerate(zip(source_sent,target_sent)):
    # Break source and target sentences to word lists
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    # Append <s> token's ID to the beggining of source sentence
    num_src_sent = [src_dictionary['<s>']]
    # Add the rest of word IDs for words found in the source sentence 
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])

    # If the lenghth of the source sentence below the maximum allowed length
    # append </s> token's ID to the end
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])

    # If the length exceed the maximum allowed length
    # truncate the sentence
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    # Make sure the sentence is of length src_max_sent_length
    assert len(num_src_sent)==src_max_sent_length,len(num_src_sent)

    train_inputs.append(num_src_sent)
    
    # Create the numeric target sentence with word IDs
    # append <s> to the beginning and append actual words later
    num_tgt_sent = [tgt_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    ## Modifying the outputs such that all the outputs have max_length elements
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    train_outputs.append(num_tgt_sent)
    
print('Unk counts Src: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))
print('Sentences ',len(train_inputs))

assert len(train_inputs)  == len(source_sent),\
        'Size of total elements: %d, Total sentences: %d'\
                %(len(train_inputs),len(source_sent))

# Making inputs and outputs NumPy arrays
train_inputs = np.array(train_inputs, dtype=np.int32)
train_outputs = np.array(train_outputs, dtype=np.int32)

# Make sure number of inputs and outputs dividable by 100
train_inputs = train_inputs[:(train_inputs.shape[0]//100)*100,:]
train_outputs = train_outputs[:(train_outputs.shape[0]//100)*100,:]
print('\t Done processing training data \n')

# Printing some data
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in train_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in train_outputs[ti,:].tolist()])
print()
print('\tSentences ',train_inputs.shape[0])

# ================================================================================
# Processing Test data

src_unk_count, tgt_unk_count = 0, 0
print('Processing testing data ....\n')
test_inputs = []
test_outputs = []
for s_i, (src_sent,tgt_sent) in enumerate(zip(test_source_sent,test_target_sent)):
    src_sent_tokens = split_to_tokens(src_sent,True)
    tgt_sent_tokens = split_to_tokens(tgt_sent,False)
    
    num_src_sent = [src_dictionary['<s>']]
    for tok in src_sent_tokens:
        if tok in src_dictionary.keys():
            num_src_sent.append(src_dictionary[tok])
    
    num_tgt_sent = [src_dictionary['<s>']]
    for tok in tgt_sent_tokens:
        if tok in tgt_dictionary.keys():
            num_tgt_sent.append(tgt_dictionary[tok])
        
    # Append </s> if the length is not src_max_sent_length
    if len(num_src_sent)<src_max_sent_length:
        num_src_sent.extend([src_dictionary['</s>'] for _ in range(src_max_sent_length - len(num_src_sent))])
    # Truncate the sentence if length is over src_max_sent_length
    elif len(num_src_sent)>src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
        
    assert len(num_src_sent)==src_max_sent_length, len(num_src_sent)

    test_inputs.append(num_src_sent)
    
    # Append </s> is length is not tgt_max_sent_length
    if len(num_tgt_sent)<tgt_max_sent_length:
        num_tgt_sent.extend([tgt_dictionary['</s>'] for _ in range(tgt_max_sent_length - len(num_tgt_sent))])
    # Truncate the sentence if length over tgt_max_sent_length
    elif len(num_tgt_sent)>tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
        
    assert len(num_tgt_sent)==tgt_max_sent_length, len(num_tgt_sent)

    test_outputs.append(num_tgt_sent)

# Printing some data
print('Unk counts Tgt: %d, Tgt: %d'%(src_unk_count, tgt_unk_count))    
print('Done processing testing data ....\n')
test_inputs = np.array(test_inputs,dtype=np.int32)
test_outputs = np.array(test_outputs,dtype=np.int32)
print('Samples from training data')
for ti in range(10):
    print('\t',[src_reverse_dictionary[w]  for w in test_inputs[ti,:].tolist()])
    print('\t',[tgt_reverse_dictionary[w]  for w in test_outputs[ti,:].tolist()])

Processing Training Data ...

Unk counts Src: 464223, Tgt: 214783
Sentences  250000
	 Done processing training data 

Samples from training data
	 ['<s>', 'Hier', 'erfahren', 'Sie', ',', 'wie', 'Sie', 'Creative', 'Suite', '2', 'und', 'Creative', 'Suite', '3', 'am', 'besten', 'zusammen', 'mit', 'QuarkXPress', 'nutzen', 'können', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Here', ',', 'you', '’', 'll', 'find', 'out', 'how', 'Creative', 'Suite', 'users', 'can', 'get', 'the', 'best', 'possible', 'interaction', 'with', 'QuarkXPress', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']
	 ['<s>', 'Sie',

## Loading Word Embeddings

To train the NMT with pretrained embeddings, we will first download multilingual word embeddings from the [website](http://www.cs.cmu.edu/~afm/projects/multilingual_embeddings.html). We are going to need the [German embeddings](http://camoes.lx.it.pt/amartins/projects/data/multilingual_embeddings.de) and the [English embeddings](http://camoes.lx.it.pt/amartins/projects/data/multilingual_embeddings.en). After the files are downloaded, we will match the words from the embedding files and our vocabulary. We will also check for words with several variations if any of the following matches exist: 
* Lower case words
* First letter capitalized
* Words with the accent removed

The matched word embeddings are copied to a matrix. **Downloading the files in the provided links is essential to successfully run the code**.

In [6]:
embeddings_size = 300

# Randomly initialize German and English embeddings
de_embeddings = np.random.uniform(size=(vocabulary_size, embeddings_size),low=-1.0, high=1.0)
en_embeddings = np.random.uniform(size=(vocabulary_size, embeddings_size),low=-1.0, high=1.0)

def match_pretrained_dataset_words(filename, embeddings, dictionary):
    '''
    Here we match the words in our vocabulary with the pretrained embeddings
    '''
    
    words_found = 0
    words_found_ids = []
    
    # Get unaccented words
    unaccented_dict = \
    dict(zip([unidecode.unidecode(k.lower()) for k in dictionary.keys()],dictionary.values()))

    # Reading file. This is a space separated file
    with open(filename,'r',encoding='utf-8') as f:

        # We read line by line
        for l_i, line in enumerate(f):
            
            # Printing the progress
            if l_i%100==0:
                print('.',end='')
            if l_i%10000==0:
                print('')
                
            # Split the line by spaces
            line_tokens = line.split(' ')
            
            # Get the word
            lword = line_tokens[0]

            # If the word is empty skip
            if len(lword.strip())==0:
                continue
            
            # Decode the word to get rid of acccents
            lword = unidecode.unidecode(lword)
            # Get the vector
            vector = [float(v) for v in line_tokens[1:]]

            # Update the randomly initialized matrix for the embeddings
            # Update the number of words matched with pretrained embeddings
            try:
                dword = dictionary[lword]
                words_found_ids.append(dictionary[lword])
                embeddings[dictionary[lword],:] = vector
                words_found += 1
            
            # If a given word is not found in our vocabulary,
            except KeyError:
                try:
                    # First try to match the same word with first letter
                    # capitalized
                    if len(lword)>0:
                        firt_letter_cap = lword[0].upper()+lword[1:]

                    else:
                        continue
                        
                    # Update the word embeddings matrix
                    dword = dictionary[firt_letter_cap]
                    words_found_ids.append(dictionary[firt_letter_cap])
                    embeddings[dictionary[firt_letter_cap],:] = vector
                    words_found += 1
                
                except KeyError:
                    # If not found try to matrch the word with the unaccented word
                    try:
                        dword = unaccented_dict[lword]
                        words_found_ids.append(dictionary[lword])
                        embeddings[dictionary[lword],:] = vector
                        words_found += 1
                    except KeyError:

                        continue
                    
    return embeddings, words_found, words_found_ids                          

# Processing German vocabulary
print('Processing German Vocabulary')
de_embeddings, words_found, words_found_ids = \
match_pretrained_dataset_words('multilingual_embeddings.de', de_embeddings, src_dictionary)

# Print some statistics about the embedding matching
words_notfound_ids = list(set(list(range(0,len(src_dictionary)))) - set(words_found_ids))
print('\tVocabulary size: %d',vocabulary_size)
print('\tWords found in pretrained embeddings: ', words_found)

print([src_reverse_dictionary[wid] for wid in words_notfound_ids[:20]])

# Processing English vocabulary
print('\nProcessing English Vocabulary')
en_embeddings, words_found, words_found_ids = \
match_pretrained_dataset_words('multilingual_embeddings.en', en_embeddings, tgt_dictionary)

# Print some statistics about embedding matching
print('\tVocabulary size: %d',vocabulary_size)
print('\tWords found in pretrained embeddings: ', words_found)

words_notfound_ids = list(set(list(range(0,len(tgt_dictionary)))) - set(words_found_ids))
print([tgt_reverse_dictionary[wid] for wid in words_notfound_ids[:20]])

Processing German Vocabulary
.
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.....................................................	Vocabulary size: %d 50000
	Words found in pretrained embeddings:  20590
['<unk>', '<s>', '</s>', '##AT##-##AT##', 'für', 'Sie', 'Die', '&quot;', 'über', 'Das', 'Ich', 'können', 'Wir', 'Der', 'daß', 'Es', 'müssen', 'In', 'Europäischen', 'möchte']

Processing English Vocabulary
.
....................................................................................................
....................................................................................................
..............................

## Flipping the Input Data
Changin the order of the sentence of the target language improves the performance of NMT systems. Because when reversed, it helps the NMT system to establish a strong connection as the last word of the source language and the last word of the target language will be closest to each other. *DON'T RUN THIS MULTIPLE TIMES as running two times gives original.*

In [7]:
## Reverse the Germen sentences
# Remember reversing the source sentence gives better performance
# DON'T RUN THIS MULTIPLE TIMES as running two times gives original
train_inputs = np.fliplr(train_inputs)
test_inputs = np.fliplr(test_inputs)

print('Training and Test source data after flipping ')
print('\t',[src_reverse_dictionary[w] for w in train_inputs[0,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])
print()
print('\t',[src_reverse_dictionary[w] for w in train_inputs[10,:].tolist()])
print('\t',[tgt_reverse_dictionary[w] for w in test_inputs[10,:].tolist()])

print()
print('\nTesting data after flipping')
print('\t',[src_reverse_dictionary[w] for w in test_inputs[0,:].tolist()])

Training and Test source data after flipping 
	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', 'können', 'nutzen', 'QuarkXPress', 'mit', 'zusammen', 'besten', 'am', '3', 'Suite', 'Creative', 'und', '2', 'Suite', 'Creative', 'Sie', 'wie', ',', 'Sie', 'erfahren', 'Hier', '<s>']
	 ['tray', 'road', 'mistakes', 'of', 'expect', 'a', 'tabled', 'with', 'and', 'the', 'posts', 'useful', 'out', 'waiting', 'wounded', 'a', 'drinks', 'been', 'stand', '26th', 'and', 'senior', 'personal', ',', 'difficulties', 'qualifications', 'an', 'rather', 'road', 'rewriting', 'and', 'road', 'unsustainable', 'the', '2007', 'road', 'wounded', 'not', 'throughout', 'amendment', '<s>']

	 ['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '.', ')', 'Import', '##AT##-##AT##', 'PSD', '&gt;', 'Fenster', '(', 'Import', '##AT##-##AT##', 'PSD', 'Palette', 'die', 

## Data Generations for MT

Now we define the data generator for our NMT.

In [8]:


class DataGeneratorMT(object):
    
    def __init__(self,batch_size,num_unroll,is_source, is_train):
        # Number of data points in a batch
        self._batch_size = batch_size
        # Number of unrollings
        self._num_unroll = num_unroll
        # Cursors for each element in batch
        self._cursor = [0 for offset in range(self._batch_size)]
        
        # The sentence IDs being currently processed to create the 
        # current batch
        self._sent_ids = None
        
        # We want a batch of data from source or target?
        self._is_source = is_source
        # Is this training or testing data?
        self._is_train = is_train
        
        self._vocab_size = vocabulary_size 
        
    def next_batch(self, sent_ids, first_set):
        
        # Depending on wheter we want source or target data
        # change the maximum sentence length
        if self._is_source:
            max_sent_length = src_max_sent_length
        else:
            max_sent_length = tgt_max_sent_length
        
        # Arrays to hold input and output data
        # Word embeddings (current word)
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        # One-hot encoded label (next word)
        batch_labels = np.zeros((self._batch_size,self._vocab_size),dtype=np.float32)
        
        # Populate each index of the batch
        for b in range(self._batch_size):
            
            # Sentence IDs to get data from
            sent_id = sent_ids[b]
            
            # If generating data with source sentences
            # get data from train and test inputs
            if self._is_source:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testing data
                if self._is_train:
                    sent_text = train_inputs[sent_id]
                else:
                    sent_text = test_inputs[sent_id]
                             
                batch_data[b] = sent_text[self._cursor[b]]
                # Fill the label as a one hot encoded word
                batch_labels[b] = np.zeros((self._vocab_size),dtype=np.float32)
                batch_labels[b,sent_text[self._cursor[b]+1]] = 1.0

            # If generating data with target sentences
            # get data from train and test outputs
            else:
                # Depending on whether we need training data or testind data
                # choose the previously created training or testing data
                if self._is_train:
                    sent_text = train_outputs[sent_id]
                else:
                    sent_text = test_outputs[sent_id]
                
                batch_data[b] = sent_text[self._cursor[b]]
                # Fill the label as a one hot encoded word
                batch_labels[b] = np.zeros((self._vocab_size),dtype=np.float32)
                batch_labels[b,sent_text[self._cursor[b]+1]] = 1.0

            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)
             
        return batch_data,batch_labels
        
    def unroll_batches(self,sent_ids):
        
        # Only if new sentence IDs if provided
        # else it will use the previously defined 
        # sent_ids continuously
        if sent_ids is not None:
            
            self._sent_ids = sent_ids
            # Unlike in the previous exercises we do not process a single sequence
            # over many iterations of unrollings. We process either a source sentence or target sentence
            # at a single go. So we reset the _cursor evrytime we generate a batch
            self._cursor = [0 for _ in range(self._batch_size)]
                
        unroll_data,unroll_labels = [],[]
        
        # Unrolling data over time
        for ui in range(self._num_unroll):
            if self._is_source:
                data, labels = self.next_batch(self._sent_ids, False)
            else:
                data, labels = self.next_batch(self._sent_ids, False)
                    
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        return unroll_data, unroll_labels, self._sent_ids
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
# Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=40,is_source=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    # the the string words for returned word IDs and display the results
    print([src_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

        
# Running a tiny set to see if the implementation correct
dg = DataGeneratorMT(batch_size=5,num_unroll=60,is_source=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,2,3,4,5])
print('\nTarget data batch (first time)')

for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    # the the string words for returned word IDs and display the results
    print([tgt_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

print('\nTarget data batch (non-first time)')
u_data, u_labels, _ = dg.unroll_batches(None)

for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    # the the string words for returned word IDs and display the results
    print([tgt_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])


Source data
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '</s>', '</s>', '</s>']
['</s>', '</s>', '.', '</s>', '</s>']
['</s>', '</s>', 'bietet', '.', '</s>']
['</s>', '</s>', 'Dateiformat', 'nutzen', '</s>']
['</s>', '</s>', '##AT##-##AT##', 'optimal', '</s>']
['</s>', '</s>', 'PSD', 'Bilder', '</s>']
['</s>', '</s>', 'das', 'Ihre', '.']
['</s>', '</s>', 'über', 'für', 'werden']
['.', '</s>', 'Photoshop', 'es', 'ausgewählt']
['können', '.', 'mit', 'Sie', 'Verwendungszweck']
['nutzen', 'lässt', 'Integration', 'wie', 'nach']
['QuarkXPress', 'erschließen', 'beste', 'und', 'je']
['mit', 'Software', 'die', 'sollten

## Attention-Based NMT System

Here we define the attention based NMT system. Unlike the standard NMT attention based NMT has the ability to refer to any of the encoder states during any step of the decoding. This is achieved through the attention layer.

### Defining hyperparameters
Here we define various hyperparameters we use to define our model.

In [9]:
input_size = embeddings_size

num_nodes = 128
batch_size = 10

# We unroll the full length at one go
# both source and target sentences
enc_num_unrollings = 40
dec_num_unrollings = 60


### Loading the Pretrained Embeddings
Here we load the pretrained word embedding matrix we created and have them as TensorFlow variables

In [10]:
tgt_word_embeddings = tf.get_variable(
    'target_embeddings',shape=[vocabulary_size, embeddings_size],
    dtype=tf.float32, initializer = tf.constant_initializer(en_embeddings)
)
src_word_embeddings = tf.get_variable(
    'source_embeddings',shape=[vocabulary_size, embeddings_size], 
    dtype=tf.float32, initializer = tf.constant_initializer(de_embeddings)
)    

### Defining Input/Output Placeholders
Here we define the placeholder to feed in inputs/outputs. Additionally we define a mask placeholder that can mask certain outputs from the loss calculation.

In [11]:
# Training Input data.
enc_train_inputs = []
# Embedding lookup for training input data
enc_train_input_embeds = []

# Defining unrolled training inputs as well as embedding lookup (Encoder)
for ui in range(enc_num_unrollings):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    enc_train_input_embeds.append(tf.nn.embedding_lookup(src_word_embeddings,enc_train_inputs[ui]))

# Training Input data and the respective embeddings vectors
dec_train_inputs, dec_train_input_embeds = [],[]
# Training output data (used for optimization)
dec_train_labels = []
# Used to mask irrelevant words during loss computation
dec_train_masks = []

# Defining unrolled training inputs, embeddings,  outputs, and masks (Decoder)
for ui in range(dec_num_unrollings):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='dec_train_inputs_%d'%ui))
    dec_train_input_embeds.append(tf.nn.embedding_lookup(tgt_word_embeddings, dec_train_inputs[ui]))
    dec_train_labels.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size], name = 'dec_train_labels_%d'%ui))
    dec_train_masks.append(tf.placeholder(tf.float32, shape=[batch_size,1],name='dec_train_masks_%d'%ui))

# Testing related placeholders and tensors
enc_test_input = [tf.placeholder(tf.int32, shape=[batch_size]) for _ in range(enc_num_unrollings)]
enc_test_input_embeds = [tf.nn.embedding_lookup(src_word_embeddings,test_enc_ui) for test_enc_ui in enc_test_input]

dec_test_input = tf.nn.embedding_lookup(tgt_word_embeddings,[tgt_dictionary['<s>']])

### Defining the Encoder Model

We define the encoder model. The encoder model is a single LSTM cell with TensorFlow variables for the state and output variables.

In [12]:
print('Defining Encoder Parameters')
with tf.variable_scope('Encoder'):
    
    # Input gate (i_t) - How much memory to write to cell state
    enc_ix = tf.get_variable('ix',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_im = tf.get_variable('im',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='ib')
    
    # Forget gate (f_t) - How much memory to discard from cell state
    enc_fx = tf.get_variable('fx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_fm = tf.get_variable('fm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='fb')
    
    # Candidate value (c~_t) - Used to compute the current cell state                    
    enc_cx = tf.get_variable('cx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_cm = tf.get_variable('cm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='cb') 
    
    # Output gate (o_t) - How much memory to output from the cell state
    enc_ox = tf.get_variable('ox',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_om = tf.get_variable('om',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    enc_ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='ob') 
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name='train_output')
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name = 'train_cell')
    
    # Variables for saving state for testing
    saved_test_output = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_output')
    saved_test_state = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_cell')

print('\tDone')

Defining Encoder Parameters
	Done


### Defining the Decoder Model

Decoder is a single LSTM cell with an additional softmax layer that can predict words.

In [13]:
print('Defining Decoder Parameters')
with tf.variable_scope('Decoder'):
    
    # Input gate (i_t) - How much memory to write to cell state
    dec_ix = tf.get_variable('ix',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_im = tf.get_variable('im',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ic = tf.get_variable('ic',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='ib')    
    
    # Forget gate (f_t) - How much memory to discard from cell state
    dec_fx = tf.get_variable('fx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fm = tf.get_variable('fm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fc = tf.get_variable('fc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05, 0.05),name='fb')    
    
    # Candidate value (c~_t) - Used to compute the current cell state                           
    dec_cx = tf.get_variable('cx',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cm = tf.get_variable('cm',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cc = tf.get_variable('cc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='cb')     
    
    # Output gate (o_t) - How much memory to output from the cell state
    dec_ox = tf.get_variable('ox',shape=[input_size, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_om = tf.get_variable('om',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_oc = tf.get_variable('oc',shape=[num_nodes, num_nodes],
                             initializer = tf.contrib.layers.xavier_initializer())
    dec_ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.05,0.05),name='ob') 
    
    # Softmax Classifier weights and biases.
    # If we are using sampled softmax loss, the weights dims shouldbe [50000, 64]
    # If not, then [64, 50000]
    w = tf.get_variable('softmax_weights',shape=[num_nodes*2, vocabulary_size], 
                        initializer = tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.random_uniform([vocabulary_size],-0.05,-0.05),name='softmax_bias')

print('\tDone')

Defining Decoder Parameters
	Done


### Attention Layer Related Variables

We define the weights used to compute the energy ($e_{ij}$) in the attention layer.

In [14]:
print('Defining Attention Variables ...')
with tf.variable_scope('Attention'):
    
    # Used to calculate e_{ij} as
    # e_{ij} = v_a' tanh(W_a . dec_output + U_a . enc_output)
    # Then alpha_{ij} is the softmax output (normalized) of e_{ij}
    W_a = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],stddev=0.05),name='W_a')
    U_a = tf.Variable(tf.truncated_normal([num_nodes,num_nodes],stddev=0.05),name='U_a')
    v_a = tf.Variable(tf.truncated_normal([num_nodes,1],stddev=0.05),name='v_a')
    
print('\tDone')

Defining Attention Variables ...
	Done


### Defining Cell and Layer Computational Functions

We define several functions below:
* Encoder LSTM cell computations
* Decoder LSTM cell computations
* Attention layer computations.

In [15]:
# Definition of the cell computation (Encoder)
def enc_lstm_cell(i, o, state):
    """Create a LSTM cell"""
    input_gate = tf.sigmoid(tf.matmul(i, enc_ix) + tf.matmul(o, enc_im) + enc_ib)
    forget_gate = tf.sigmoid(tf.matmul(i, enc_fx) + tf.matmul(o, enc_fm) + enc_fb)
    update = tf.matmul(i, enc_cx) + tf.matmul(o, enc_cm) + enc_cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, enc_ox) + tf.matmul(o, enc_om) + enc_ob)
    return output_gate * tf.tanh(state), state

# Definition of the cell computation (Decoder)
def dec_lstm_cell(i, o, state, c):
    """Create a LSTM cell"""
    input_gate = tf.sigmoid(tf.matmul(i, dec_ix) + tf.matmul(o, dec_im) + tf.matmul(c, dec_ic) + dec_ib)
    forget_gate = tf.sigmoid(tf.matmul(i, dec_fx) + tf.matmul(o, dec_fm) + tf.matmul(c, dec_fc) + dec_fb)
    update = tf.matmul(i, dec_cx) + tf.matmul(o, dec_cm) + tf.matmul(c, dec_cc) +dec_cb 
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, dec_ox) + tf.matmul(o, dec_om) + tf.matmul(o, dec_oc) + dec_ob)
    return output_gate * tf.tanh(state), state
                    
def attn_layer(h_j_unrolled, s_i_minus_1):
    '''
    Computes attention values for a given decoding position
    h_j_unrolled : all the unrolled encoder outputs [[batch_size, num_nodes], [batch_size, num_nodes], ....] => enc_num_unrolling-many
    s_i_minus_1 : the previous decoder output [batch_size, num_nodes]
    '''
    # For the following four calculations we calculate by concatenating all encoder outputs (enc_num_unrollings)
    # get the encoder logits
    enc_logits = tf.concat(axis=0,values=h_j_unrolled)
    
    # W_a . encoder_output
    w_a_mul_s_i_minus_1 = tf.matmul(enc_logits,W_a) # of size [enc_num_unroll x batch_size, num_nodes]
    
    # U_a . decoder_output
    u_a_mul_h_j = tf.matmul(tf.tile(s_i_minus_1,[enc_num_unrollings,1]), U_a) # of size [enc_num_unroll x batch_size, num_nodes]   
    
    # calculate "energy"
    e_j = tf.matmul(tf.nn.tanh(w_a_mul_s_i_minus_1 + u_a_mul_h_j),v_a) # of size [enc_num_unroll x batch_size ,1]
    
    # we split the e_j s again into enc_num_unrollings batches
    batched_e_j = tf.split(axis=0,num_or_size_splits=enc_num_unrollings,value=e_j) # list of enc_num_unroll elements, each element [batch_size, 1]
    reshaped_e_j = tf.concat(axis=1,values=batched_e_j) # of size [batch_size, enc_num_unroll]

    # Now we calculate alpha_i for all the enc_num_unrollings time steps
    alpha_i = tf.nn.softmax(reshaped_e_j) # of size [batch_size, enc_num_unroll]
    # break alpha_i into list of enc_num_unroll elemtns, each of size [batch_size,1]
    alpha_i_list = tf.unstack(alpha_i,axis=1) 
    
    # list of enc_num_unroll elements, each of size [batch_size,num_nodes]
    c_i_list =  [tf.reshape(alpha_i_list[e_i],[-1,1])*h_j_unrolled[e_i] for e_i in range(enc_num_unrollings)] 
    
    # add_n batches all together
    c_i = tf.add_n(c_i_list) # of size [batch_size, num_nodes]
    
    return c_i,alpha_i
        

### Defining LSTM Computations

Here we define the computations to compute the final state variables of the encoder, feeding that into the decoder as the intial state, computing attention and finally computing the LSTM output, logit values and the predictions.

In [16]:
# ================================================
# Training related inference logic

# Store encoder outputs and decoder outputs across the unrolling
enc_outputs, dec_outputs = list(),list()

# Context vecs are the c_i values in the attention computation
context_vecs = list()

# These variables are initialized with saved_output and saved_sate
# values and then iteratively updated during unrollings
output = saved_output
state = saved_state
                             

print('Calculating Encoder Output')
# update the output and state values for all the inputs we have
for i in enc_train_input_embeds:
    output, state = enc_lstm_cell(i, output,state)
    # Accumulate all the output values in to a list
    enc_outputs.append(output)
    
print('Calculating Decoder Output with Attention')
# Before starting decoder computations, we make sure that
# the encoder outputs are computed
with tf.control_dependencies([saved_output.assign(output),
                             saved_state.assign(state)]):

    # Iterate through the decoder unrollings
    for ii,i in enumerate(dec_train_input_embeds):
        
        # Compute attention value for each decode position
        c_i,_ = attn_layer(enc_outputs, output)
        
        # Accumulate c_i in a list
        context_vecs.append(c_i)
        
        output, state = dec_lstm_cell(i, output, state, c_i)

        # Accumulate decoder outputs in a list
        dec_outputs.append(output)

    # Compute the logit values
    logits = tf.matmul(tf.concat(axis=1, values=[
        tf.concat(axis=0, values=dec_outputs),
        tf.concat(axis=0, values=context_vecs)]
                                ), 
                       w) + b
    
    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
# ================================================
# Testing related inference logic

# Initialize iteratively updated states with 
# saved_test_output and saved_test_state
test_output  = saved_test_output
test_state = saved_test_state

print("Calculations for test data")
test_predictions = []
test_enc_outputs = []

# Compute the encoder output iteratively
for i in enc_test_input_embeds:
    
    test_output, test_state = enc_lstm_cell(i, test_output, test_state)
    test_enc_outputs.append(test_output)

# This is used for visualization purposes
# To build the attention matrix discussed in the chapter
test_alpha_i_unrolled = []

# Make sure the encoder computations are done
with tf.control_dependencies([saved_test_output.assign(test_output),
                             saved_test_state.assign(test_state)]):

    for i in range(dec_num_unrollings):
        
        test_c_i,test_alpha = attn_layer(test_enc_outputs, test_output)
        
        # Used for visualization purposes
        test_alpha_i_unrolled.append(test_alpha)
        
        test_output, test_state = dec_lstm_cell(dec_test_input, test_output, test_state, test_c_i)

        # Compute predictions for each decoding step
        test_prediction = tf.nn.softmax(
            tf.nn.xw_plus_b(
                tf.concat(axis=1,values=[test_output,test_c_i]), w, b
            )
        )

        dec_test_input = tf.nn.embedding_lookup(tgt_word_embeddings,tf.argmax(test_prediction,axis=1))
        test_predictions.append(tf.argmax(test_prediction,axis=1))

print('\tDone')    

Calculating Encoder Output
Calculating Decoder Output with Attention
Calculations for test data
	Done


### Calculating the Loss

Here we calculate the loss. Loss is calculated by summing all the losses obtained across the time axis and averaging over the batch axis. You can see how the `dec_train_masks` is used to mask out irrelevant words from influencing loss

In [17]:
# Defining loss, cross-entropy loss summed across time axis averaged over batch axis
print('Calculating Softmax output and Loss')
loss_batch = tf.concat(axis=0,values=dec_train_masks)*tf.nn.softmax_cross_entropy_with_logits_v2(
    logits=logits, labels=tf.concat(axis=0, values=dec_train_labels))

loss = tf.reduce_mean(loss_batch)
print('\tDone')

Calculating Softmax output and Loss
	Done


### Optimizer
We define the model optimization specific operations. We use two optimizers here; Adam and SGD. I observed that using Adam only cause the model to exhibit some undesired behaviors in the long run. Therefore we use Adam to get a good initial estimate for the SGD and use SGD from that point onwards.

In [18]:
print('Defining Optimizer')

# These are used to decay learning rate over time
global_step = tf.Variable(0, trainable=False)

# We use two optimizers, when the optimizer changes
# we reset the global step
inc_gstep = tf.assign(global_step,global_step + 1)
reset_gstep = tf.assign(global_step,0)

# Calculate decaying learning rate
learning_rate = tf.maximum(
    tf.train.exponential_decay(
        0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True
    ), 0.0001)

sgd_learning_rate = tf.maximum(
    tf.train.exponential_decay(
        0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True
    ), 0.0001)

# We use two optimizers: Adam and naive SGD
# using Adam in the long run produced undesirable results 
# (e.g.) sudden fluctuations in BLEU
# Therefore we use Adam to get a good starting point for optimizing
# and then switch to SGD from that point onwards
with tf.variable_scope('Adam'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
with tf.variable_scope('SGD'):
    sgd_optimizer = tf.train.GradientDescentOptimizer(sgd_learning_rate)

# Calculates gradients with clipping for Adam
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 25.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

# Calculates gradients with clipping for SGD
sgd_gradients, v = zip(*sgd_optimizer.compute_gradients(loss))
sgd_gradients, _ = tf.clip_by_global_norm(sgd_gradients, 25.0)
sgd_optimize = optimizer.apply_gradients(zip(sgd_gradients, v))

# Make sure gradients exist flowing from decoder to encoder
print('Checking gradient flow from encoder-to-decoder')
for (g_i,v_i) in zip(gradients,v):
    assert g_i is not None, 'Gradient none for %s'%(v_i.name)
print('\tDone')

Defining Optimizer
Checking gradient flow from encoder-to-decoder
	Done


### Resetting Train and Test States
We here define the state resetting functions

In [19]:
# Reset state
reset_train_state = tf.group(
    tf.assign(saved_output, tf.zeros([batch_size, num_nodes])),
    tf.assign(saved_state, tf.zeros([batch_size, num_nodes]))
                            )

reset_test_state = tf.group(
    saved_test_output.assign(tf.zeros([batch_size, num_nodes])),
    saved_test_state.assign(tf.zeros([batch_size, num_nodes]))
                             )


 ## Running the Neural Machine Translator with Attention
 
 With all the relevant TensorFlow operations defined we move on to defining several functions related to executing our NMT model as well as runnning the model to obtain translations for previously unseen source sentences.

### Functions for Evaulating and Printing Results

Next we define two functions to print and save the prediction results for training data as well as testing data, and finally define a function to obtain candidate and reference data to calculate the BLEU score.

In [24]:
def print_and_save_train_predictions(du_labels, tr_pred, rand_idx, train_prediction_text_fname):
    '''
    Use this to print some predicted training samples and save it to file
    du_labels: Decoder's unrolled labels (this is a list of dec_num_unrollings 
    where each item is [batch_size, vocabulary_size])
    tr_pred: This is an array [dec_num_unrollings*batch_size, vocabulary_size] array
    rand_idx: Some random index we use to pick a data point to print
    train_prediction_text_fname: The file we save the prediction results into
    '''

    # This print_str will be written to the text file as well as printed here
    print_str = 'Actual: ' 
    
    # We can get each label corresponding to some sentence by traversing the
    # concatenated labels array ([dec_num_unrollings*batch_size, vocabulary_size])
    # with a batch_size stride
    for w in np.argmax(np.concatenate(du_labels,axis=0)[rand_idx::batch_size],axis=1).tolist():
        # Update the print_str
        print_str += tgt_reverse_dictionary[w] + ' '
        # When we encounter the end of sentence </s> we stop printing
        if tgt_reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    
    # Write to file
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str+'\n')  

    # Now print the predicted data by following the same procedure as above
    print()
    print_str = 'Predicted: '
    for w in np.argmax(tr_pred[rand_idx::batch_size],axis=1).tolist():
        print_str += tgt_reverse_dictionary[w] + ' '
        # When we encounter the end of sentence </s> we stop printing
        if tgt_reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str+'\n')    
    
    
def print_and_save_test_predictions(test_du_labels, test_pred_unrolled, batch_id, test_rand_idx, test_prediction_text_fname):
    '''
    Use this to print some predicted training samples and save it to file
    test_du_labels: Decoder's unrolled labels (this is a list of dec_num_unrollings 
    where each item is [batch_size, vocabulary_size])
    test_pred_unrolled: This is an array [dec_num_unrollings*batch_size, vocabulary_size] array
    batch_id: We need this to retrieve the actual sentence for the predicted 
    test_rand_idx: Some random index we use to pick a data point to print
    test_prediction_text_fname: The file we save the prediction results into
    '''
    
    # Print the actual sentence
    print('DE: ',test_source_sent[(batch_id*batch_size)+test_rand_idx])
    # print_str is the string we display as results and write to a file
    print_str = '\t EN (TRUE):' + test_target_sent[(batch_id*batch_size)+test_rand_idx]
    print(print_str + '\n')

    # Printing predictions
    print_str = '\t EN (Predicted): ' 
    
    for test_pred in test_pred_unrolled:                            
        print_str += tgt_reverse_dictionary[test_pred[test_rand_idx]] + ' '
        if tgt_reverse_dictionary[test_pred[test_rand_idx]] == '</s>':
            break
    print(print_str + '\n')

    # Write the results to text file
    with open(os.path.join(log_dir, test_prediction_text_fname),'a',encoding='utf-8') as fa:                                
        fa.write(print_str+'\n') 
        
def create_bleu_ref_candidate_lists(all_preds, all_labels):
    '''
    Creates two lists (candidate list and reference list) for calcluating BLEU
    all_preds: All the predictions
    all_labels: Correspondign all the actual labels
    Returns
    cand_list: List (sentences) of lists (words in a sentence)
    ref_list: List (sentences) of lists (words in a sentence)
    '''
    bleu_labels, bleu_preds = [],[]
    
    # calculate bleu score:        
    # We iterate batch_size times as i=0,1,2,...,batch_size while grabbing 
    # i, i+batch_size, i+2*batch_size, i+3*batch_size elements from all_labels and all_preds
    # This because the labels/predicitons belonging to same sentence are interleaved by batch_size 
    # due to the way concatenate labels and predictions
    # Taking elements interleaved by batch_size gives the sequence of words belonging to the same sentence
    ref_list, cand_list = [],[]
    for b_i in range(batch_size):
        tmp_lbl = all_labels[b_i::batch_size]            
        tmp_lbl = tmp_lbl[np.where(tmp_lbl != tgt_dictionary['</s>'])]            
        ref_str = ' '.join([tgt_reverse_dictionary[lbl] for lbl in tmp_lbl])
        ref_list.append([ref_str])

        tmp_pred = all_preds[b_i::batch_size]
        tmp_pred = tmp_pred[np.where(tmp_pred != tgt_dictionary['</s>'])]
        cand_str = ' '.join([tgt_reverse_dictionary[pre] for pre in tmp_pred])
        cand_list.append(cand_str)

    return cand_list, ref_list

### Defining a Single Step of Training

We now define a function to train the NMT model for a single step. It takes in encoder inputs, decoder inputs and decoder labels and train the NMT for a single step.

In [21]:
def train_single_step(eu_data, du_data, du_labels):
    '''
    Define a single training step
    eu_data: Unrolled encoder inputs (word IDs)
    du_data: Unrolled decoder inputs (word IDs)
    du_labels: Unrolled decoder outputs (one hot encoded words)
    '''
    # Fill the feed dict (Encoder)
    feed_dict = {}
    for ui,dat in enumerate(eu_data):            
        feed_dict[enc_train_inputs[ui]] = dat    
    
    
    # Fill the feed dict (Decoder) 
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl
        # The mask masks the </s> items from being part of the loss
        d_msk = (np.logical_not(np.argmax(lbl,axis=1)==tgt_dictionary['</s>'])).astype(np.int32).reshape(-1,1)
        feed_dict[dec_train_masks[ui]] = d_msk
    
    # ======================= OPTIMIZATION ==========================
    # Apparently using Adam in long term gives very weird behaviors in loss
    # so after 20000 iterations we change the optimizer to SGD
    if (step+1)<20000:
        _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    else:
        _,l,tr_pred = sess.run([sgd_optimize,loss,train_prediction], feed_dict=feed_dict)
        
    return l, tr_pred

### Defining Data Generators and Other Related Variables

In [22]:
# This is where all the results will be logged into
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

# Some configuration for the TensorFlow session
config = tf.ConfigProto()
# Not dedicate the whole GPU memory but grow memory as required
config.gpu_options.allow_growth = True
# Place Graph nodes on CPU or GPU as per availability
config.allow_soft_placement=True 
sess = tf.InteractiveSession(config=config)

# Filenames of the logs
train_prediction_text_fname = 'train_predictions_pret.txt'
test_prediction_text_fname = 'test_predictions_pret.txt'

# Initialize global variables
print('Intializing Global Variables')
tf.global_variables_initializer().run()

# Defining data generators
def define_data_generators(batch_size, enc_num_unrollings, dec_num_unrollings):
    # Training data generators (Encoder and Decoder)
    enc_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=enc_num_unrollings,is_source=True, is_train=True)
    dec_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=dec_num_unrollings,is_source=False, is_train=True)

    # Testing data generators (Encoder and Decoder)
    test_enc_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=enc_num_unrollings,is_source=True, is_train=False)
    test_dec_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=dec_num_unrollings,is_source=False, is_train=False)

    return enc_data_generator, dec_data_generator, test_enc_data_generator, test_dec_data_generator

print('\tDone')

Intializing Global Variables
	Done


### Running Training and Testing for NMT

With all the TensorFlow operations, helper functions defined we train and test the NMT system.

In [None]:
# Training and test BLEU scores
attn_train_bleu_scores_over_time,attn_test_bleu_scores_over_time = [],[]
loss_over_time = []

# Labels and predictions required to calculate the BLEU scores
# for both train and test data
train_bleu_refs, train_bleu_cands = [],[]
test_bleu_refs, test_bleu_cands = [],[]

# Training and test BLEU scores
num_steps = 100001
avg_loss = 0

enc_data_generator, dec_data_generator, test_enc_data_generator, test_dec_data_generator = define_data_generators(batch_size, enc_num_unrollings, dec_num_unrollings)
print('Started Training')

for step in range(num_steps):

    # input (encoder) unrolling length: 40
    # output (decoder) unrolling length: 60
    if (step+1)%100==0:
        print('.',end='')

    # Sample a random batch of IDs from training data
    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))
    
    # Getting an unrolled set of data batches for the encoder
    eu_data, _, _ = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Getting an unrolled set of data batches for the decoder
    du_data, du_labels, _ = dec_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Train for single step
    l, tr_pred = train_single_step(eu_data, du_data, du_labels)
    
    # We don't calculate BLEU scores all the time as this is expensive, 
    # it slows down the code
    if np.random.random()<0.1:
        
        # all_labels are labels obtained by concatinating all the labels in batches
        all_labels = np.argmax(np.concatenate(du_labels,axis=0),axis=1)
        # all_preds are predictions for all unrolled steps
        all_preds = np.argmax(tr_pred,axis=1)
        
        # Get training BLEU candidates and references
        batch_cands, batch_refs = create_bleu_ref_candidate_lists(all_preds, all_labels)
        
        # Accumulate training candidates/references for calculating
        # BLEU later
        train_bleu_refs.extend(batch_refs)
        train_bleu_cands.extend(batch_cands)

    # Train BLEU calculations     
    # And printing training labels/predictions
    if (step+1)%500==0:  
        
        # Writing actual and predicte data to train_prediction.txt file for some random sentence
        print('Step ',step+1)
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                                
            fa.write('============= Step ' +  str(step+1) + ' =============\n') 
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_and_save_train_predictions(du_labels, tr_pred, rand_idx, train_prediction_text_fname)                
        
        # Calculating the BLEU score for the accumulated candidates/references
        bscore = 0.0
        bscore = corpus_bleu(train_bleu_refs,train_bleu_cands,smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method4)
        attn_train_bleu_scores_over_time.append(bscore)
        print('(Train) BLEU (%d elements): '%(len(train_bleu_refs)),bscore)
        
        # Reset the candidate/reference accumulators
        train_bleu_refs, train_bleu_cands = [],[]
        
        # Write BLEU score to file
        with open(log_dir + os.sep +'blue_scores_pret.txt','a') as fa_bleu:
            fa_bleu.write(str(step+1) +','+str(bscore)+'\n')
        
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                
            fa.write('(Train) BLEU: %.5f\n'%bscore)        
        
    avg_loss += l # Update average loss
    
    sess.run(reset_train_state) # resetting hidden state for each batch
    
    # ============================= TEST PHASE ==================================
    if (step+1)%1000==0:
        # calculate average loss
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/1000.0)
        loss_over_time.append(avg_loss/1000.0)
        
        # write losses to file
        with open(log_dir + os.sep + 'losses_pret.txt','a') as fa_loss:
            fa_loss.write(str(step+1) +','+str(avg_loss/1000.0)+'\n')
        
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:                                
            fa.write('============= Step ' +  str(step+1) + ' =============\n') 
            fa.write('\t Loss: %.5f\n'%(avg_loss/1000.0))
            
        avg_loss = 0.0 # Reset loss
        
        # Increase gstep to decay learning rate
        sess.run(inc_gstep)
        
        # reset global step when we change the optimizer
        if (step+1)==20000: # reset global step when we change the optimizer
            sess.run(reset_gstep)
        
        print('=====================================================')
        print('(Test) Translating test sentences ...')

        print('Processing test data ... ')
        
        # ===================================================================================
        # Predictions for Test data
        for in_i in range(test_inputs.shape[0]//batch_size):
            
            # Generate encoder / decoder data for testing data
            test_eu_data, test_eu_labels, _ = test_enc_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))
            test_du_data, test_du_labels, _ = test_dec_data_generator.unroll_batches(sent_ids=np.arange(in_i*batch_size,(in_i+1)*batch_size))
            
            # fill the feed dict
            feed_dict = {}
            for ui,(dat,lbl) in enumerate(zip(test_eu_data,test_eu_labels)):            
                feed_dict[enc_test_input[ui]] = dat             

            # Get predictions out with decoder          
            # run prediction calculation this returns a list of prediction dec_num_unrollings long
            test_pred_unrolled = sess.run(test_predictions, feed_dict=feed_dict)
            
            # We print a randomly selected sample from each batch
            test_rand_idx = np.random.randint(0,batch_size) # used for printing test output
            
            print_and_save_test_predictions(test_du_labels, test_pred_unrolled, in_i, test_rand_idx, test_prediction_text_fname)
            
            # Things require dto calculate test BLEU score
            all_labels = np.argmax(np.concatenate(test_du_labels,axis=0),axis=1)
            all_preds = np.concatenate(test_pred_unrolled, axis=0)
            batch_cands, batch_refs = create_bleu_ref_candidate_lists(all_preds, all_labels)
            test_bleu_refs.extend(batch_refs)
            test_bleu_cands.extend(batch_cands)
            
            # Reset the test state
            sess.run(reset_test_state)
        
        # Calculate test BLEU score
        test_bleu_score = 0.0
        test_bleu_score = corpus_bleu(test_bleu_refs,test_bleu_cands,
                                      smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method4)
        attn_test_bleu_scores_over_time.append(test_bleu_score)
        print('(Test) BLEU (%d elements): '%(len(test_bleu_refs)),test_bleu_score)
        
        test_bleu_refs, test_bleu_cands = [],[]        
        print('=====================================================')

Started Training
.....Step  500
Actual: Room Notes : Circus <unk> is a charming <unk> house in an amazing location : just behind the famous Circus and next door to The Royal Crescent , in the heart of Georgian Bath . </s> 

Predicted: Room Notes : 13 was , a good location , , the hotel and . <unk> to the hotel city , <unk> to to the <unk> . . the the hotel of the . . </s> 
(Train) BLEU (630 elements):  0.2124202059721479
.....Step  1000
Actual: One child under 12 years is charged EUR 30 <unk> per night and person in an extra bed . </s> 

Predicted: The of in 15 years are charged . 15 <unk> per night . person . the room beds . </s> 
(Train) BLEU (520 elements):  0.2538571172836747
	 Loss:  1.040665469557047
(Test) Translating test sentences ...
Processing test data ... 
DE:  Das Hotel Sempione verfügt über eine ideale , ruhige Lage in einem geschäftigen Viertel mit guter Verkehrsanbindung . Der Bahnhof und eine U ##AT##-##AT## Bahnstation liegen in der Nähe .

	 EN (TRUE):Hotel Sempione

DE:  Das „ Ladino di Fassa “ ist jedoch mehr als ein Dialekt – es ist eine richtige Sprache .

	 EN (TRUE):This is Ladin from Fassa which is more than a dialect : it is a language in its own right .


	 EN (Predicted): The <unk> is a new <unk> . </s> 

DE:  Booking.com : Best Western Hotell SöderH , Söderhamn , Schweden - 29 Gästebewertungen .

	 EN (TRUE):Booking.com : Best Western Hotell SöderH , Söderhamn , Sweden - 29 Guest reviews .


	 EN (Predicted): Booking .com : Radisson Hotel <unk> , United Kingdom - Guest reviews . </s> 

DE:  Wünschen Sie Unterstützung bei der der Zentrensuche ?

	 EN (TRUE):Would you like being assisted in searching a specialised centre ?


	 EN (Predicted): How to the <unk> <unk> <unk> ? </s> 

DE:  With a unique location in the heart of Peneda / Gerês National Park , this Pousada has a breathking view over the river Cávado and the peaceful Caniçada dam .

	 EN (TRUE):Located in the heart of Peneda ##AT##-##AT## Gerês National Park , this guest house boa

DE:  „ Für uns junge slowenische Architekten ist prägend , wie Plečnik den öffentlichen Raum in Ljubljana zu gliedern verstand .

	 EN (TRUE):“ For us young Slovene architects the way in which Plečnik was able to shape public space in Ljubljana is highly influential .


	 EN (Predicted): <unk> is a <unk> of the <unk> , and <unk> , and the <unk> <unk> . </s> 

DE:  Die schlanke , einfache Oberfläche und die gute Performance machen es zum idealen Werkzeug , um dein Netbook ( oder normales Notebook ) in einen e ##AT##-##AT## Book Reader zu verwandeln .

	 EN (TRUE):Its low resource use , simple interface and fast performance makes it the ideal tool to turn your netbook ( or regular laptop ) into an e ##AT##-##AT## book reader .


	 EN (Predicted): The <unk> is a result of the <unk> , and <unk> , and <unk> , and <unk> , and <unk> , and <unk> . </s> 

DE:  aufgerufen wird , fügt Sie die Flash Nachricht &quot; Eintrag gespeichert !

	 EN (TRUE):is called , it adds the flash message &quot; Re

DE:  Das Personal war immer hilfsbereit und freundlich .

	 EN (TRUE):The location and helpfulness of staff was excellent .


	 EN (Predicted): The staff were friendly and friendly and friendly . </s> 

(Test) BLEU (100 elements):  0.16871871200915653
.....Step  5500
Actual: Our guests will feel right at home thanks to the pleasant and cordial atmosphere , the attention to detail and the personal service . </s> 

Predicted: The <unk> can find the in the , to the <unk> and comfortable , . and hotel of the and the <unk> atmosphere . </s> 
(Train) BLEU (460 elements):  0.32528087341897727
.....Step  6000
Actual: You agree that you have reviewed , agree to and understand our Privacy Policy and the terms and conditions thereof . </s> 

Predicted: The can that the can to the or that be the that <unk> Policy , the <unk> of the of , </s> 
(Train) BLEU (340 elements):  0.33781066321661035
	 Loss:  0.8552386335581541
(Test) Translating test sentences ...
Processing test data ... 
DE:  34 Diese a

DE:  Im Allgemeinen basieren sie auf Datenbanken , Templates und Skripts .

	 EN (TRUE):In general they are based on databases , template and scripts .


	 EN (Predicted): In the case of the <unk> , the <unk> will be used in the <unk> . </s> 

DE:  Nach einigen Wanderwochen erreichten ich und Celina Warschau . Auf dem Weg zum jüdischen Komitee begegnete ich auf der Straße meinem Bruder !

	 EN (TRUE):It turned out that Marek had jumped from the window of a train moving to Majdanek .


	 EN (Predicted): After the <unk> , I was not afraid of the <unk> of the <unk> . </s> 

DE:  Standort war sehr praktisch . In 5 Minuten ist man am Hauptbahnhof , in 10 Minuten im Bankenviertel .

	 EN (TRUE):very central only a few minutes walk from Bohr / Ryanair bus stop and main train station.Generally cheap and cheerful .


	 EN (Predicted): The hotel is a 5 minutes from the city centre . </s> 

DE:  Hotelparkplätze sind gegen eine kleine Gebühr vorhanden .

	 EN (TRUE):Car Parking is available at the

(Train) BLEU (510 elements):  0.3454244897105529
.....Step  9000
Actual: Note : PHP allows shortcuts for bit values , including K ( kilo ) , M ( mega ) and G ( <unk> ) . </s> 

Predicted: Note : <unk> is you , the of , and the . <unk> ) , and <unk> <unk> ) , <unk> <unk> <unk> ) . </s> 
(Train) BLEU (580 elements):  0.350077642651091
	 Loss:  0.8267974787205458
(Test) Translating test sentences ...
Processing test data ... 
DE:  Je mehr Zeit wir mit Gilad und dem Rest des Teams in Israel verbracht haben ( um nicht den lauten Hahn zu erwähnen der schreiend bei denen über den Campus rennt ) desto überzeugter waren wir – zusammen können wir mehr bewegen .

	 EN (TRUE):The more time we spent with Gilad as well as the rest of the team in Israel ( not to mention the very loud rooster that runs around in their campus ) , the more convinced we all became - we ’ ll be better off together .


	 EN (Predicted): The <unk> was the first time of the <unk> , and the <unk> of the <unk> , the <unk> of t

DE:  Naturreservat auf aufgeschüttetem Gelände am Río de la Plata .

	 EN (TRUE):Wide selection of main courses including fresh pasta in homemade sauces ( $ 15 ##AT##-##AT## 30AR ) , traditional chicken dishes incuding Chicken Marsala ( $ 20AR- $ 30AR ) , and a variety of meats including Argentine parilla style steaks ( $ 35AR ) . The menu of seafood is worth considering with rareties such as fresh Yellow ##AT##-##AT## fin Tuna steak in a pesto sauce ( $ 30AR ) .


	 EN (Predicted): The hotel is located on the beach . </s> 

DE:  Ein älteres Kind oder Erwachsener zahlt USD 23,40 pro Übernachtung in einem der vorhandenen Betten .

	 EN (TRUE):One older child or adult is charged USD 23.40 per night when using existing bedding .


	 EN (Predicted): One older child or adult is charged USD 23 <unk> per night and person when using existing bedding . </s> 

DE:  Bei einer digitalen Bildkette wird das Intensitätssignal für jedes Pixel ohne analoge Zwischenschritte direkt in der Detektoreinheit

DE:  Das Personal war immer hilfsbereit und freundlich .

	 EN (TRUE):The location and helpfulness of staff was excellent .


	 EN (Predicted): The staff were very friendly and helpful . </s> 

(Test) BLEU (100 elements):  0.2147230680205952
.....Step  11500
Actual: Even so , there are a couple of annual events in the winter , starting with a furniture and interior decorating trade fair called Maison &amp; Object &#91; 40 &#93; in January . </s> 

Predicted: The though that it is a great of <unk> <unk> , the city , the out the large , a , , , , <unk> , <unk> , 4 &#93; . the . </s> 
(Train) BLEU (450 elements):  0.36368477563732515
.....Step  12000
Actual: proc ##UNDERSCORE## nice ( ) will only exist if your system has &apos; nice &apos; capabilities . </s> 

Predicted: <unk> ##UNDERSCORE## <unk> ( ) &apos; be be in you use will been <unk> &apos; . . </s> 
(Train) BLEU (480 elements):  0.3591657216073991
	 Loss:  0.802608942553401
(Test) Translating test sentences ...
Processing test da

(Train) BLEU (500 elements):  0.36175999464744213
	 Loss:  0.7782670557051897
(Test) Translating test sentences ...
Processing test data ... 
DE:  Ideale Lage für Exkursionen in die Stadt und Nähe zur Promenade .

	 EN (TRUE):There was plenty of space in the room and a nice garden to sit and have a drink and smoke .


	 EN (Predicted): The hotel is ideally located in the city centre . </s> 

DE:  Tarbet Gast ist Haus im ersten Nationalpark von Schottland aufgestellt und hat eine gehobene Position hoch über dem Dorf von Tarbet und genießt spektakuläre südliche Blicke Bucht Lomond hinunter und nach der westlichen Seite von Ben Lomond .

	 EN (TRUE):Tarbet Guest House is situated In Scotland ’ s first National Park and has an elevated position high above the village of Tarbet and enjoys spectacular southerly views down Loch Lomond and towards the western side of Ben Lomond .


	 EN (Predicted): <unk> is a great place to visit the city , and the <unk> of the <unk> , the <unk> and the <unk>

DE:  Jedes Stück Information kann eigene Eigenschaften und Aktionen besitzen .

	 EN (TRUE):Every bit of information and code can be given their own properties and actions .


	 EN (Predicted): Each time can be used in the <unk> and <unk> . </s> 

DE:  Alle älteren Kinder oder Erwachsene zahlen EUR 32,00 pro Übernachtung und Person für Zustellbetten .

	 EN (TRUE):All older children or adults are charged EUR 32.00 per night and person for extra beds .


	 EN (Predicted): All older children or adults are charged EUR 32 <unk> per night and person for extra beds . </s> 

DE:  Zimmerbeschreibung : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fabrics and furnishings , many with excellent views over the Castle grounds .

	 EN (TRUE):Room Notes : Our Castle Deluxe Rooms are traditionally themed with rich luxurious fabrics and furnishings , many with excellent views over the Castle grounds .


	 EN (Predicted): Room Notes : The <unk> <unk> <unk> , <unk> <unk> , <unk> <u

(Test) BLEU (100 elements):  0.19542589803608468
.....Step  15500
Actual: You can see past them far into the countryside and across the <unk> . </s> 

Predicted: The can also the the , from the country , the the country . </s> 
(Train) BLEU (510 elements):  0.3733506409253079
.....Step  16000
Actual: Built originally in the 1850 &apos;s , this elegant Victorian listed building is ideally situated within a minutes walk from Paddington train and underground station , therefore all the top attractions are in very easy reach of our hotel . Main attractions on our door step include Madam Tussauds , Buckingham Palace , Kensington Palace , Albert Hall , the 

Predicted: The in built the heart of city and hotel hotel building building is a located for walking short from from the Station station bus station . and a the main of are just the quiet . . the city . </s> 
(Train) BLEU (580 elements):  0.36284241577789117
	 Loss:  0.7719490717053413
(Test) Translating test sentences ...
Processing tes

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes there are problems trying to get it to work .


	 EN (Predicted): The <unk> is the first time , but it is a bit of <unk> . </s> 

DE:  in dieser Option ermöglicht , Dateien relativ zum aktuellen Verzeichnis einzubinden .

	 EN (TRUE):in the include path allows for relative includes as it means the current directory . However , it is more efficient to explicitly use include &apos; . / file &apos; than having PHP always check the current directory for every include .


	 EN (Predicted): The option option allows you to be <unk> to the <unk> . </s> 

(Test) BLEU (100 elements):  0.24591078203837485
.....Step  17500
Actual: The guest reviews are submitted by our customers after their stay at Pratt &apos;s Hotel . </s> 

Predicted: The guest reviews are submitted by our customers after their stay at Hotel Hotel Hotel . </s> 
(Train) BLEU

DE:  shower was ok but leaked needed updating .

	 EN (TRUE):the response to to requests was poor , phone 3 time for milk in the room over 4 hours .


	 EN (Predicted): The staff were very helpful and friendly . </s> 

DE:  Das „ Ladino di Fassa “ ist jedoch mehr als ein Dialekt – es ist eine richtige Sprache .

	 EN (TRUE):This is Ladin from Fassa which is more than a dialect : it is a language in its own right .


	 EN (Predicted): The <unk> is a more than one of the most popular cities in the city . </s> 

DE:  Die Bewohner des Nordens sind ein buntes Völkergemisch aus den verschiedensten Bergstämmen und den Nord ##AT##-##AT## Thais oder kon mueang ; die traditionell in den fruchtbaren Tiefebenen Nordthailands siedeln . In vielerlei Hinsicht halten sich die Nord Thais für die &quot; wahren &quot; Thais , die die Thai ##AT##-##AT## Kultur noch am besten über die Zeit gerettet haben .

	 EN (TRUE):From Pratu Chiang Mai market , songthaews also travel to Hang Dong ( 20 baht ) and San P

DE:  Ziel von 50 ##AT##-##AT## Hand Video Poker ist ähnlich zu dem von Video Poker : eine Pokerhand mit fünf Karten , die mindestens die niedrigste Kombination von dem Spieltisch , an dem Sie sich gerade befinden , zu erlangen .

	 EN (TRUE):The object of 50 ##AT##-##AT## Hand Video Poker is similar to Video Poker , to obtain a five ##AT##-##AT## card poker hand that contains at least the lowest combination on the pay table for the version you are playing .


	 EN (Predicted): The poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker poker 

(Test) BLEU (100 elements):  0.23488802137857362
.....Step  20500
Actual: <unk> was given the task of dealing with Ishka , Quark &apos; s mother , in 2371 aft

DE:  Booking.com : Best Western Hotell SöderH , Söderhamn , Schweden - 29 Gästebewertungen .

	 EN (TRUE):Booking.com : Best Western Hotell SöderH , Söderhamn , Sweden - 29 Guest reviews .


	 EN (Predicted): Booking .com : Best Western <unk> Hotell <unk> , <unk> , Netherlands - <unk> Guest reviews . </s> 

DE:  Naturreservat auf aufgeschüttetem Gelände am Río de la Plata .

	 EN (TRUE):Wide selection of main courses including fresh pasta in homemade sauces ( $ 15 ##AT##-##AT## 30AR ) , traditional chicken dishes incuding Chicken Marsala ( $ 20AR- $ 30AR ) , and a variety of meats including Argentine parilla style steaks ( $ 35AR ) . The menu of seafood is worth considering with rareties such as fresh Yellow ##AT##-##AT## fin Tuna steak in a pesto sauce ( $ 30AR ) .


	 EN (Predicted): <unk> de la Frontera is a good option for the hotel . </s> 

DE:  Das ist eine Metapher , wird jemand von der Propagandaabteilung entgegnen .

	 EN (TRUE):It &apos;s only a metaphor , people from the pro

(Test) BLEU (100 elements):  0.23680062307687
.....Step  23500
Actual: Centrally located just 5 minutes away from the city centre and the main train station , this tranquil and tastefully decorated hotel offers comfort and elegance for business and leisure travellers . . . </s> 

Predicted: The located in a minutes walk from the city centre , the city shopping station , the hotel hotel new decorated hotel is a and a . a travellers leisure travellers . </s> 
(Train) BLEU (410 elements):  0.37747557616247335
.....Step  24000
Actual: Infineon EBIT in the 2008 fiscal year also included Euro 41 million , mostly for the amortization of acquisition ##AT##-##AT## related intangible assets related mainly to the business acquired from LSI . </s> 

Predicted: The EBIT of the <unk> fiscal year , is the , ( to and in the <unk> of the , based figures values to to of the <unk> sector by the . </s> 
(Train) BLEU (540 elements):  0.3487552798022022
	 Loss:  0.7754492957592011
(Test) Translating test se

DE:  Slimline ICE ist in einer Vielzahl von Geschmacksrichtungen sowohl als Eis am Stiel als auch im Becher erhältlich .

	 EN (TRUE):Palatinose ™ is a disaccharide derived from beet sugar .


	 EN (Predicted): <unk> is a small <unk> in <unk> , and also are also available in the <unk> . </s> 

DE:  Naturreservat auf aufgeschüttetem Gelände am Río de la Plata .

	 EN (TRUE):Wide selection of main courses including fresh pasta in homemade sauces ( $ 15 ##AT##-##AT## 30AR ) , traditional chicken dishes incuding Chicken Marsala ( $ 20AR- $ 30AR ) , and a variety of meats including Argentine parilla style steaks ( $ 35AR ) . The menu of seafood is worth considering with rareties such as fresh Yellow ##AT##-##AT## fin Tuna steak in a pesto sauce ( $ 30AR ) .


	 EN (Predicted): The hotel is located on the beach . </s> 

DE:  Kosten Sie mediterrane Gerichte im preisgekrönten Restaurant Molyvos .

	 EN (TRUE):Enjoy award winning Mediterranean cuisine at Molyvos .


	 EN (Predicted): Enjoy the 

(Train) BLEU (450 elements):  0.368505099778133
	 Loss:  0.7574514121562242
(Test) Translating test sentences ...
Processing test data ... 
DE:  Das Hotel Sempione verfügt über eine ideale , ruhige Lage in einem geschäftigen Viertel mit guter Verkehrsanbindung . Der Bahnhof und eine U ##AT##-##AT## Bahnstation liegen in der Nähe .

	 EN (TRUE):Hotel Sempione welcomes you to a busy yet quiet area of Milan , within walking distance of excellent transport links , including the central railway station and the Repubblica metro station .


	 EN (Predicted): The hotel is located in a quiet location , close to the city centre and close to the city centre . </s> 

DE:  In raschem Tempo werden die Modelle angepasst und erneuert .

	 EN (TRUE):The models are quickly being improved and renewed .


	 EN (Predicted): In the world , the best and most popular areas of the world . </s> 

DE:  Niedrigere Preise durch mehr Wettbewerb . Die Kosten für Kapital können durch Währungsstabilität , niedrigere Z

DE:  Das ist eine Metapher , wird jemand von der Propagandaabteilung entgegnen .

	 EN (TRUE):It &apos;s only a metaphor , people from the propaganda department will say .


	 EN (Predicted): The <unk> is a <unk> of the <unk> of the <unk> . </s> 

DE:  &quot; Die Letzte Droge &quot; wird , wie auch Route 66 , unter einer Creative Commons ##AT##-##AT## Lizenz veröffentlicht - Kopieren , Aufführen und Verändern ist diesmal auch zu kommerziellen Zwecken gestattet und erwünscht !

	 EN (TRUE):We will release The Last Drug under a Creative Commons BY SA License , making it the first free HD feature film . All footage , project files , sounds and special effects will be available for those of you that are eager to get hands on experience on the first Open Source feature film project ever or for those that are able to turn it into something different .


	 EN (Predicted): &quot; <unk> &quot; , &quot; <unk> &quot; , &quot; <unk> &quot; , &quot; <unk> &quot; , &quot; <unk> &quot; , &quot; <unk>

(Test) BLEU (100 elements):  0.22122936924910774
.....Step  29500
Actual: We offer sperm freezing for men who must undergo <unk> or radiotherapy and want to protect their sperm for future use . </s> 

Predicted: The &apos;ve a , , the , are see the , <unk> , <unk> to see the <unk> , the , . </s> 
(Train) BLEU (500 elements):  0.37973085072864554
.....Step  30000
Actual: 229 00 : 14 : 00 <unk> -- &gt; 00 : 14 : 01 <unk> L . Ron describes it as , 230 00 : 14 : 01 <unk> -- &gt; 00 : 14 : 04 <unk> &quot; The actual cycle of action is as follows : 231 00 : 14 : 04 <unk> -- &gt; 00 : 14 : 

Predicted: - 00 : 14 : 14 : 14 &gt; 00 : 14 : 14 : : <unk> <unk> Hubbard the : a I 00 : 14 : 00 <unk> -- &gt; 00 : 14 : 14 <unk> . <unk> <unk> &quot; &quot; <unk> &quot; a a : 00 to : 14 : 14 <unk> -- &gt; 00 : 14 : 
(Train) BLEU (630 elements):  0.37314327580579465
	 Loss:  0.770045813947916
(Test) Translating test sentences ...
Processing test data ... 
DE:  34 Diese a Worte sind wahr und treu ; darum ü

DE:  Das Haus liegt in der CCZ ##AT##-##AT## Umweltzone und bietet eine sehr gute Anbindung an das Bus- und U ##AT##-##AT## Bahnnetz .

	 EN (TRUE):Set inside the central London congestion ##AT##-##AT## charging zone , this modern hotel has superb transport links , with access to the Tube and the bus network practically on the doorstep .


	 EN (Predicted): The hotel is located in the heart of the city centre , close to the metro station . </s> 

DE:  Mitglieder geniessen viele zus � tzliche Leistungen wie optimierter Sicherheit , schnelleren Auszahlungszeiten und der Aufhebung von Kreditkarteneinzahlungslimits .

	 EN (TRUE):Members enjoy a range of perks including enhanced security and protection , faster withdrawals and increased credit card deposit limits .


	 EN (Predicted): The <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

(Train) BLEU (380 elements):  0.38574443542114156
.....Step  33000
Actual: One of the cornerstones of Fanatic the <unk> Center lay in the superb equipment that you will find in all destinations , where quality matches quantity with the highest of standards . </s> 

Predicted: A of the most of the ’ <unk> , is the the <unk> range , are can be the the the in and you , with and the <unk> standard the . </s> 
(Train) BLEU (510 elements):  0.38207628358629925
	 Loss:  0.7422782608270645
(Test) Translating test sentences ...
Processing test data ... 
DE:  &#124; Ferienwohnungen 1 Zi &#124; Ferienhäuser &#124; Landhäuser &#124; Autovermietung &#124; Last Minute Angebote ! !

	 EN (TRUE):&#124; 1 Bedroom Apts &#124; Holiday houses &#124; Rural Homes &#124; Car Rental &#124; Last Minute Offers !


	 EN (Predicted): Holiday Houses &#124; Rural Homes &#124; Hotels &#124; Last Minute Offers ! </s> 

DE:  Tarbet Gast ist Haus im ersten Nationalpark von Schottland aufgestellt und hat eine gehobene P

DE:  Jedes Stück Information kann eigene Eigenschaften und Aktionen besitzen .

	 EN (TRUE):Every bit of information and code can be given their own properties and actions .


	 EN (Predicted): Each information can be found in the same . </s> 

DE:  Das ist eine Metapher , wird jemand von der Propagandaabteilung entgegnen .

	 EN (TRUE):It &apos;s only a metaphor , people from the propaganda department will say .


	 EN (Predicted): The man is a man <unk> , but that the <unk> is the <unk> of the <unk> . </s> 

DE:  Softwaretools wie zum Beispiel der NI Analog Waveform Editor , das NI Modulation Toolkit und LabVIEW helfen Anwendern , die Entwicklungszeit von Prüfsystemen zu verringern und gleichzeitig flexibel auf die sich ändernden Anwendungsanforderungen zu reagieren .

	 EN (TRUE):Software tools such as the NI Analog Waveform Editor , Modulation Toolkit and LabVIEW , help reduce your test system development time while also having the flexibility to meet your changing application requ

DE:  &#124; Ferienwohnungen 1 Zi &#124; Ferienhäuser &#124; Landhäuser &#124; Autovermietung &#124; Last Minute Angebote ! !

	 EN (TRUE):&#124; 1 Bedroom Apts &#124; Holiday houses &#124; Rural Homes &#124; Car Rental &#124; Last Minute Offers !


	 EN (Predicted): Apartments &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; Hotels &#124; 

DE:  1 Nun ließ der Herr , Gott , die Hungersnot im Land Ur so schwer werden , daß a Haran , mein Bruder , starb ; aber b Terach , mein Vater , lebte noch im Land Ur der Chaldäer .

	 EN (TRUE):1 Now the Lord God caused the a famine to wax sore in the land of Ur , insomuch that b Haran , my brother , died 

DE:  Das Athens Gate Hotel liegt unterhalb der Akropolis nur 100 m vom neuen Akropolis ##AT##-##AT## Museum entfernt .

	 EN (TRUE):The Athens Gate Hotel rests under the Acropolis , just 100 metres from the new Acropolis museum .


	 EN (Predicted): The hotel is located in the heart of the city centre , just 100 metres away . </s> 

DE:  Obwohl das Nazi ##AT##-##AT## Regime die Buddhistische Gemeinde in Berlin , die seit 1936 aktiv gewesen war , schloss und kurzzeitig deren Begründer Martin Steinke 1941 inhaftierte , verfolgte es die Buddhisten nicht generell .

	 EN (TRUE):Although the Nazi regime closed the Buddhistische Gemeinde ( Buddhist Society ) in Berlin , which had been active from 1936 , and briefly arrested its founder Martin Steinke in 1941 , they generally did not persecute Buddhists .


	 EN (Predicted): Although the <unk> , the <unk> of the Nazi occult father was the <unk> of the Nazi occult affairs of the Nazi regime , and the <unk> of the Nazi regime , the <unk> of the

.....Step  38500
Actual: Structure steel , high ##AT##-##AT## strength , high ##AT##-##AT## <unk> , Wear ##AT##-##AT## resistant steel , Steel wear <unk> ( <unk> ) , High ##AT##-##AT## strength steels . </s> 

Predicted: The , , <unk> ##AT##-##AT## <unk> <unk> <unk> ##AT##-##AT## resolution <unk> <unk> , colored waist , <unk> , , , <unk> ) , <unk> ##AT##-##AT## <unk> composition , </s> 
(Train) BLEU (510 elements):  0.3873660151634252
.....Step  39000
Actual: We are still trying to convince the mother that the <unk> is causing the seizures . Every time we get someone off of aspartame , the seizures stop . </s> 

Predicted: <unk> have a used to get the <unk> of the <unk> was a the <unk> of </s> 
(Train) BLEU (490 elements):  0.39106601394322477
	 Loss:  0.7147494323849678
(Test) Translating test sentences ...
Processing test data ... 
DE:  &#124; Ferienwohnungen 1 Zi &#124; Ferienhäuser &#124; Landhäuser &#124; Autovermietung &#124; Last Minute Angebote ! !

	 EN (TRUE):&#124; 1 Bedroom

DE:  Die schlanke , einfache Oberfläche und die gute Performance machen es zum idealen Werkzeug , um dein Netbook ( oder normales Notebook ) in einen e ##AT##-##AT## Book Reader zu verwandeln .

	 EN (TRUE):Its low resource use , simple interface and fast performance makes it the ideal tool to turn your netbook ( or regular laptop ) into an e ##AT##-##AT## book reader .


	 EN (Predicted): The <unk> is a <unk> , or the <unk> , you can use the <unk> and the <unk> , or the <unk> ##AT##-##AT## <unk> ##AT##-##AT## <unk> ##AT##-##AT## <unk> . </s> 

DE:  aufgerufen wird , fügt Sie die Flash Nachricht &quot; Eintrag gespeichert !

	 EN (TRUE):is called , it adds the flash message &quot; Record Saved !


	 EN (Predicted): The default message will be displayed in the default . </s> 

DE:  Die Bewohner des Nordens sind ein buntes Völkergemisch aus den verschiedensten Bergstämmen und den Nord ##AT##-##AT## Thais oder kon mueang ; die traditionell in den fruchtbaren Tiefebenen Nordthailands siede

DE:  in dieser Option ermöglicht , Dateien relativ zum aktuellen Verzeichnis einzubinden .

	 EN (TRUE):in the include path allows for relative includes as it means the current directory . However , it is more efficient to explicitly use include &apos; . / file &apos; than having PHP always check the current directory for every include .


	 EN (Predicted): The option of the option of the release of the download directory . </s> 

(Test) BLEU (100 elements):  0.263338043392497
.....Step  41500
Actual: Dr . Jones , however , wished to avoid the struggle of Human emotion and therefore could not love <unk> the way he wanted her to . </s> 

Predicted: The . Jones , <unk> , he to be the <unk> to the illnesses , <unk> he have be the , <unk> to was to to the </s> 
(Train) BLEU (560 elements):  0.3905624657116184
.....Step  42000
Actual: All children under 2 years are charged THB 150 <unk> per night and person when using existing bedding . </s> 

Predicted: All children under 2 <unk> are charg

DE:  Ferienwohnungen erste Strandlinie . Dachwohnung in Conil de la Frontera , Cadiz .

	 EN (TRUE):Located at the foot of the beach , this Conil beach apartment rentals , Spain is perfect for your summer vacation in Conil de la Frontera .


	 EN (Predicted): In the Cadiz Province , Spain , Conil de la Frontera , Cadiz , Spain . </s> 

DE:  Die Prüfgeräte von Olympus erweitern den Bereich des menschlichen Auges bei der industriellen Sichtprüfung . Mit unseren Industrieendoskopen werden verdeckte Bereiche mit beschränktem Zugang sichtbar gemacht , wie z.

	 EN (TRUE):Olympus test equipment expands the range of the human eye in industrial visual inspection .


	 EN (Predicted): The <unk> of the Olympus E330 is a <unk> of the Olympus E330 , and the <unk> of the Olympus E330 is a <unk> of the <unk> . </s> 

DE:  Slimline ICE ist in einer Vielzahl von Geschmacksrichtungen sowohl als Eis am Stiel als auch im Becher erhältlich .

	 EN (TRUE):Palatinose ™ is a disaccharide derived from beet su

DE:  Bei den romanischen Völkern paart sich die effektive Ohnmacht mit lächerlicher Anmaßung .

	 EN (TRUE):To material weakness the Latin countries add a quite fantastic pretentiousness .


	 EN (Predicted): <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 

(Test) BLEU (100 elements):  0.2560304770176548
.....Step  44500
Actual: Airport Tegel is located in the southwest of <unk> , 15 minutes distance from the center of Tegel . </s> 

Predicted: Berlin Berlin is the in Berlin Berlin , Berlin , Berlin <unk> from from the Berlin of Berlin . </s> 
(Train) BLEU (500 elements):  0.4046938595586291
.....Step  45000
Actual: You can also easily export the diagram to the following formats

DE:  Genießen Sie hier in gemütlicher Atmosphäre ein kühles Kölsch oder einen erfrischenden Cocktail .

	 EN (TRUE):Enjoy a local beer or refreshing cocktail in a cosy atmosphere .


	 EN (Predicted): Enjoy a relaxing atmosphere , a cosy atmosphere , a cosy atmosphere . </s> 

DE:  Die Prüfgeräte von Olympus erweitern den Bereich des menschlichen Auges bei der industriellen Sichtprüfung . Mit unseren Industrieendoskopen werden verdeckte Bereiche mit beschränktem Zugang sichtbar gemacht , wie z.

	 EN (TRUE):Olympus test equipment expands the range of the human eye in industrial visual inspection .


	 EN (Predicted): The <unk> ##AT##-##AT## <unk> ##AT##-##AT## <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> 

DE:  Standort war sehr praktisch . In 5 Minuten ist man am Hauptbahnhof , in 10 Minuten im Bankenviertel .


DE:  Obwohl das Nazi ##AT##-##AT## Regime die Buddhistische Gemeinde in Berlin , die seit 1936 aktiv gewesen war , schloss und kurzzeitig deren Begründer Martin Steinke 1941 inhaftierte , verfolgte es die Buddhisten nicht generell .

	 EN (TRUE):Although the Nazi regime closed the Buddhistische Gemeinde ( Buddhist Society ) in Berlin , which had been active from 1936 , and briefly arrested its founder Martin Steinke in 1941 , they generally did not persecute Buddhists .


	 EN (Predicted): Although the <unk> was the <unk> of the Nazi regime , and the <unk> of the Nazi regime , the <unk> was the <unk> of the Nazi regime , and the <unk> of the <unk> , the <unk> was the <unk> of the <unk> . </s> 

DE:  Bamberg , die &quot; Traumstadt der Deutschen &quot; , seine aufgeschlossenen Menschen und seine romantische Umgebung wird auch Sie begeistern , denn sie bietet für jeden etwas .

	 EN (TRUE):The beauty and rich cultural life of this town can be enjoyed at any time of year . Soak up the sum

DE:  Es existieren Busverbindungen in nahezu jeden Ort der Provence ( eventuell mit Umsteigen in Aix ##AT##-##AT## en ##AT##-##AT## Provence ) , allerdings sollte beachtet werden , dass die letzten Busse abends ca. um 19 Uhr fahren .

	 EN (TRUE):As always in France those highways are expensive but practical , comfortable and fast .


	 EN (Predicted): It is a bit of the most important tourist in the city center . </s> 

DE:  Ferienwohnungen erste Strandlinie . Dachwohnung in Conil de la Frontera , Cadiz .

	 EN (TRUE):Located at the foot of the beach , this Conil beach apartment rentals , Spain is perfect for your summer vacation in Conil de la Frontera .


	 EN (Predicted): In the summer , the apartment is located in the Cadiz Province of Cadiz . </s> 

DE:  Nach einigen Wanderwochen erreichten ich und Celina Warschau . Auf dem Weg zum jüdischen Komitee begegnete ich auf der Straße meinem Bruder !

	 EN (TRUE):It turned out that Marek had jumped from the window of a train moving to M

DE:  Das Personal war immer hilfsbereit und freundlich .

	 EN (TRUE):The location and helpfulness of staff was excellent .


	 EN (Predicted): The staff were very helpful and helpful . </s> 

(Test) BLEU (100 elements):  0.26491997595948624
.....Step  50500
Actual: With powerful functions and user ##AT##-##AT## <unk> , all needs are covered from simple network analyzes to advanced troubleshooting of complex problems . </s> 

Predicted: With the and , the ##AT##-##AT## defined , you the to required by the and and to the data for the . . </s> 
(Train) BLEU (430 elements):  0.41874023304477176
.....Step  51000
Actual: Clarion Hotel Royal <unk> will be refurbished between 1 October and 31 December but guests are not expected to be <unk> , as current rooms are housed in a separate building . </s> 

Predicted: The Hotel Royal is is be a to the <unk> 1st the December , the are not allowed to be a , but a and , available in a marble air , </s> 
(Train) BLEU (380 elements):  0.3983126002987014

DE:  Es war staubig , das Bad schmutzig . Sogar die Beleuchtung an der Wand im Flur ( Seitengebäude ) war richtig verstaubt .

	 EN (TRUE):It was rather old fashioned in the decoration .


	 EN (Predicted): The room was very well ##AT##-##AT## being in the bathroom . </s> 

DE:  Booking.com : Best Western Hotell SöderH , Söderhamn , Schweden - 29 Gästebewertungen .

	 EN (TRUE):Booking.com : Best Western Hotell SöderH , Söderhamn , Sweden - 29 Guest reviews .


	 EN (Predicted): Booking .com : Best Western Best Western Hotell <unk> , Sweden - 29 Guest reviews . </s> 

DE:  Zusätzlich enthält TBarCode / SAPwin eine Menge neuer Strichcode ##AT##-##AT## Symbologien .

	 EN (TRUE):In addition TBarCode / SAPwin comes with a bunch of new bar code symbologies .


	 EN (Predicted): In addition , the TBarCode / Direct is available for the TBarCode / Direct . </s> 

DE:  Das Athens Gate Hotel liegt unterhalb der Akropolis nur 100 m vom neuen Akropolis ##AT##-##AT## Museum entfernt .

	 EN (TRUE)

(Train) BLEU (480 elements):  0.40792167020918296
	 Loss:  0.6822498368918896
(Test) Translating test sentences ...
Processing test data ... 
DE:  Mag sein , dass du deine ersten Gehversuche in einem rostigen , undichten Kahn beginnst - aber mit der Zeit wirst du dich zum schnittigen Speedboat oder edlen Katamaran vorarbeiten .

	 EN (TRUE):You may be starting in a ramshackle old tub of a boat , but in no time at all you &apos;ll be able to buy a fancy speedboat , or a classy catamaran . Turn your newfound fame into money , and spend it to buy lavish new homes .


	 EN (Predicted): <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 

DE:  Das Hotel Opera befindet sich in der Nähe de

DE:  Das Personal war immer hilfsbereit und freundlich .

	 EN (TRUE):The location and helpfulness of staff was excellent .


	 EN (Predicted): The staff were very helpful and helpful . </s> 

(Test) BLEU (100 elements):  0.25678237441063384
.....Step  55500
Actual: Booking .com : La Cour Des <unk> Boutique Gallery Design Hotel , Geneva , Switzerland - 153 Guest reviews . </s> 

Predicted: Booking .com : hotel Cour Des <unk> Boutique Hotel Hotel Hotel , Geneva , Switzerland - 10 Guest reviews . </s> 
(Train) BLEU (550 elements):  0.4221859109277922
.....Step  56000
Actual: In 1989 LOT became the first carrier in Eastern Europe to fly western ##AT##-##AT## made Boeing <unk> . </s> 

Predicted: Since 1989 , Polish a first <unk> in 1989 Europe , the in flights the flights <unk> . </s> 
(Train) BLEU (570 elements):  0.43239924948605174
	 Loss:  0.6740336436480284
(Test) Translating test sentences ...
Processing test data ... 
DE:  Ideale Lage für Exkursionen in die Stadt und Nähe zur Prome

DE:  Standort war sehr praktisch . In 5 Minuten ist man am Hauptbahnhof , in 10 Minuten im Bankenviertel .

	 EN (TRUE):very central only a few minutes walk from Bohr / Ryanair bus stop and main train station.Generally cheap and cheerful .


	 EN (Predicted): The hotel is situated in the heart of Frankfurt , within easy reach of the city centre . </s> 

DE:  Hotelparkplätze sind gegen eine kleine Gebühr vorhanden .

	 EN (TRUE):Car Parking is available at the hotel at a small charge . ( check in advance as spaces are limited and certain conditions apply ) .


	 EN (Predicted): <unk> is a small , small and medium ##AT##-##AT## sized car park . </s> 

DE:  Alle älteren Kinder oder Erwachsene zahlen EUR 32,00 pro Übernachtung und Person für Zustellbetten .

	 EN (TRUE):All older children or adults are charged EUR 32.00 per night and person for extra beds .


	 EN (Predicted): All older children or adults are charged EUR 32 <unk> per night and person for extra beds . </s> 

DE:  Obwohl das

DE:  18 Denn siehe , er richtet , und sein Richterspruch ist gerecht ; und das Kleinkind , das im Kindesalter stirbt , geht nicht zugrunde ; aber die Menschen trinken Verdammnis für ihre eigene Seele , außer sie demütigen sich und a werden so wie kleine Kinder und glauben daran , daß die Errettung im b sühnenden Blut Christi , des Herrn , des Allmächtigen , und durch dasselbe war und ist und sein wird .

	 EN (TRUE):18 For behold he judgeth , and his judgment is just ; and the infant perisheth not that dieth in his infancy ; but men drink a damnation to their own souls except they humble themselves and b become as little children , and believe that c salvation was , and is , and is to come , in and through the d atoning blood of Christ , the Lord Omnipotent .


	 EN (Predicted): 18 For behold , he is not a man , and he had not a man , and his wife , <unk> , and <unk> , and the <unk> of the <unk> , and <unk> , and <unk> , and <unk> , and <unk> , and <unk> , and <unk> , and <unk> , and <

DE:  Tux Racer wird Ihnen helfen , die Zeit totzuschlagen und sie können OpenOffice zum Arbeiten verwenden .

	 EN (TRUE):Tux Racer will help you pass the time while you wait , and you can use OpenOffice for work .


	 EN (Predicted): The <unk> <unk> is the time of the time of the time of time . </s> 

DE:  Es handelt sich um ein ziemlich einfaches Protokoll ; TFTP macht aber manchmal Probleme .

	 EN (TRUE):This is a fairly simple protocol , but sometimes there are problems trying to get it to work .


	 EN (Predicted): It is a bit of the protocol protocol . </s> 

DE:  Bamberg , die &quot; Traumstadt der Deutschen &quot; , seine aufgeschlossenen Menschen und seine romantische Umgebung wird auch Sie begeistern , denn sie bietet für jeden etwas .

	 EN (TRUE):The beauty and rich cultural life of this town can be enjoyed at any time of year . Soak up the summer sun whilst relaxing at one of the many sidewalk cafés in the historic old town or savour a cool beer beneath a shady chestnut t

.....Step  61500
Actual: Around this statue are all manner of fortune tellers , street performers and puppet shows for children . </s> 

Predicted: The the time , the the of the tellers , and performers and children , . children . </s> 
(Train) BLEU (510 elements):  0.4178895409184141
.....Step  62000
Actual: If it looks like ( <unk> ) with these options , your object is too small . </s> 

Predicted: If you is like the <unk> ) , the image . and object is the small . </s> 
(Train) BLEU (550 elements):  0.41863655719354115
	 Loss:  0.6627434747517109
(Test) Translating test sentences ...
Processing test data ... 
DE:  Mag sein , dass du deine ersten Gehversuche in einem rostigen , undichten Kahn beginnst - aber mit der Zeit wirst du dich zum schnittigen Speedboat oder edlen Katamaran vorarbeiten .

	 EN (TRUE):You may be starting in a ramshackle old tub of a boat , but in no time at all you &apos;ll be able to buy a fancy speedboat , or a classy catamaran . Turn your newfound fame into m

DE:  Es war staubig , das Bad schmutzig . Sogar die Beleuchtung an der Wand im Flur ( Seitengebäude ) war richtig verstaubt .

	 EN (TRUE):It was rather old fashioned in the decoration .


	 EN (Predicted): The room was very well ##AT##-##AT## being in the middle of the <unk> . </s> 

DE:  Baustelle zwischen See und Hotel . Altmodische Einrichtung .

	 EN (TRUE):Shared lobby with campsite next door , apparently , and hotel check in / out were not handled by lobby staff but by restaurant staff .


	 EN (Predicted): The hotel is situated between the lake and the lake . </s> 

DE:  Auch ist , so denkt Dr. Gutherz , bereits die erste Seite sehr viel versprechend , da sie eine Definition des klinischen Psychotrauma ##AT##-##AT## Begriffes enthält , der er gänzlich zustimmen kann .

	 EN (TRUE):At the rhetorical climax of this summary , Dr Goodheart comes across some sentences expressed with great pathos .


	 EN (Predicted): The <unk> is the first thing that the &quot; <unk> &quot; , &quot;