# 1. Clean Text

In [17]:
import string
import re
from unicodedata import normalize
from pickle import dump
from numpy import array

In [18]:
# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single string.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [19]:
# splitting a loaded document into sentences
def to_pairs(doc):
    """Break a tab-separated corpus into one list of fields per line.

    Surrounding whitespace is stripped from the document first; each
    remaining line is then split on tab characters.
    """
    return [row.split('\t') for row in doc.strip().split('\n')]

Now, we are ready to clean each sentence. The specific cleaning operations we
will perform are as follows:

1. Remove all non-printable characters.
2. Remove all punctuation characters.
3. Normalize all Unicode characters to ASCII.
4. Normalize the case to lowercase.
5. Remove any remaining tokens that are not alphabetic.

In [20]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and finally
    filtered down to purely alphabetic tokens.

    Args:
        lines: Iterable of rows, each row an iterable of sentence strings.

    Returns:
        A numpy array holding the cleaned sentences, row for row.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(text):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        tokens = (
            non_printable.sub('', token.lower().translate(strip_punct))
            for token in ascii_text.split()
        )
        # keep alphabetic tokens only (drops numbers and emptied tokens)
        return ' '.join(token for token in tokens if token.isalpha())

    return array([[scrub(text) for text in pair] for pair in lines])

In [21]:
# save a list of clean sentences to file
def save_clean_data(sentences,filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialise (the cleaned sentence pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # close (and therefore flush) the file deterministically instead of
    # leaving the handle to the garbage collector
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [22]:
# load dataset
# forward slashes resolve on every platform; the previous literal
# './deu-eng\deu.txt' relied on the invalid '\d' escape and a Windows separator
filename = './deu-eng/deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so the clean_pairs()
# function is not overwritten and this cell can be run again
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-german.pkl')
# spot check the first 100 pairs (fewer when the corpus is smaller)
for i in range(min(100, len(cleaned_pairs))):
    print('[%s] => [%s]' % (cleaned_pairs[i,0],cleaned_pairs[i,1]))

Saved: english-german.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[go on] => [mach weiter]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[attack] => [angriff]
[attack] => [attacke]
[cheers] => [zum wohl]
[eat it] => [iss es]
[eat up] => [iss auf]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [aha]
[got it] => [ich habs]
[got it] => [kapiert]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me

# 2. Split Text

In [23]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

In [24]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled dataset from ``filename`` and return it.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source
    with open(filename, 'rb') as handle:
        return load(handle)

In [25]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialise (a dataset or one of its splits).
        filename: Destination path; an existing file is overwritten.
    """
    # the with-block guarantees the file is flushed and closed on exit
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [26]:
# load the cleaned sentence pairs that the cleaning section saved to disk
raw_dataset = load_clean_sentences('english-german.pkl')

In [27]:
# reduce dataset size
n_sentences = 10000
# copy the slice: a plain slice is a view, so shuffling it in place would
# also reorder raw_dataset and make this cell depend on earlier runs
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle (rows are permuted in place; no seed is set, so the order
# differs from run to run)
shuffle(dataset)

In [28]:
# split into train/test: the first 90% of the shuffled rows train, the rest test
# (9000/1000 for the 10000-row sample; stays valid if the sample size changes)
n_train = len(dataset) * 9 // 10
train, test = dataset[:n_train], dataset[n_train:]

In [29]:
# persist the shuffled sample and both of its splits
for subset, target in ((dataset, 'english-german-both.pkl'),
                       (train, 'english-german-train.pkl'),
                       (test, 'english-german-test.pkl')):
    save_clean_data(subset, target)

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


# 3. Train Neural Translation Model

In [52]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [30]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled dataset from ``filename`` and return it.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source
    with open(filename, 'rb') as handle:
        return load(handle)

In [32]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Tokenizer that has been fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [33]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``."""
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [38]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    Padding is appended after each sequence (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

In [42]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every id of every sequence.

    Args:
        sequences: 2-D array of integer ids (its first two shape entries are
            used for the output shape).
        vocab_size: Number of classes of the one-hot encoding.

    Returns:
        Array shaped (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [40]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder translation model.

    Layer stack, in order: embedding over the source vocabulary, an LSTM
    that emits a single vector, a RepeatVector that repeats it once per
    target timestep, an LSTM returning sequences, and a time-distributed
    softmax over the target vocabulary.
    """
    stack = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [31]:
# load the full sample together with its train and test splits
dataset, train, test = (
    load_clean_sentences(path)
    for path in ('english-german-both.pkl',
                 'english-german-train.pkl',
                 'english-german-test.pkl')
)

In [36]:
# prepare english tokenizer (column 0 holds the English sentences)
eng_sentences = dataset[:, 0]
eng_tokenizer = create_tokenizer(eng_sentences)
# one extra slot leaves room for id 0, which encode_sequences uses for padding
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

English Vocabulary Size: 2214
English Max Length: 5


In [37]:
# prepare german tokenizer (column 1 holds the German sentences)
ger_sentences = dataset[:, 1]
ger_tokenizer = create_tokenizer(ger_sentences)
# one extra slot leaves room for id 0, which encode_sequences uses for padding
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(ger_sentences)
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

German Vocabulary Size: 3526
German Max Length: 9


In [43]:
# prepare training data: German ids as input, one-hot English ids as target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

In [44]:
# prepare test data: German ids as input, one-hot English ids as target
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [45]:
# define model: German is the source language, English the target
model = define_model(
    src_vocab=ger_vocab_size,
    tar_vocab=eng_vocab_size,
    src_timesteps=ger_length,
    tar_timesteps=eng_length,
    n_units=256,
)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [135]:
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output
model.summary()
# plot_model(model,to_file='model.png',show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 256)            902656    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2214)           568998    
Total params: 2,522,278
Trainable params: 2,522,278
Non-trainable params: 0
_________________________________________________________________
None


In [55]:
# fit model
# The checkpoint rewrites 'model.h5' only when the validation loss reaches a
# new minimum, so the file ends up holding the best epoch seen.
# NOTE(review): validation_data is the test split, so the saved model is
# selected on the same rows that are later reported as "test" results.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX,testY),callbacks=[checkpoint],verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 32s - loss: 4.1548 - accuracy: 0.4446 - val_loss: 3.3701 - val_accuracy: 0.4800

Epoch 00001: val_loss improved from inf to 3.37014, saving model to model.h5
Epoch 2/30
 - 28s - loss: 3.2441 - accuracy: 0.4759 - val_loss: 3.2219 - val_accuracy: 0.4866

Epoch 00002: val_loss improved from 3.37014 to 3.22192, saving model to model.h5
Epoch 3/30
 - 27s - loss: 3.1131 - accuracy: 0.4852 - val_loss: 3.1547 - val_accuracy: 0.4936

Epoch 00003: val_loss improved from 3.22192 to 3.15467, saving model to model.h5
Epoch 4/30
 - 28s - loss: 2.9995 - accuracy: 0.4962 - val_loss: 3.0356 - val_accuracy: 0.5122

Epoch 00004: val_loss improved from 3.15467 to 3.03562, saving model to model.h5
Epoch 5/30
 - 28s - loss: 2.8465 - accuracy: 0.5148 - val_loss: 2.9006 - val_accuracy: 0.5266

Epoch 00005: val_loss improved from 3.03562 to 2.90062, saving model to model.h5
Epoch 6/30
 - 28s - loss: 2.6931 - accuracy: 0.5318 - val_loss: 2.7931 - val

<keras.callbacks.callbacks.History at 0x1ff8de03488>

# 4. Evaluate Neural Translation Model

In [100]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [101]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled dataset from ``filename`` and return it.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source
    with open(filename, 'rb') as handle:
        return load(handle)

In [102]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Tokenizer that has been fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [103]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``."""
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [104]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    Padding is appended after each sequence (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

In [105]:
# map an integer to word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns the first matching word, or None when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [106]:
# generate target given source sequence
def predict_sequence(model,tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    The first prediction of the batch is decoded step by step: the
    highest-scoring id of each step is mapped back to a word, and decoding
    stops at the first id that has no word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        word = word_for_id(argmax(scores), tokenizer)
        if word is None:
            # no word for this id: treat it as the end of the sentence
            break
        words.append(word)
    return ' '.join(words)

In [129]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print BLEU scores for the result.

    Args:
        model: Trained model handed to predict_sequence.
        tokenizer: Tokenizer used to map predicted ids back to words.
        sources: Encoded source sequences, one row per sentence.
        raw_dataset: Clean text rows aligned with ``sources``; column 0 is
            the reference translation and column 1 the source sentence.
            Any further columns are ignored.

    Prints the first ten translations, then BLEU-1 through BLEU-4.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # predict_sequence reads the first item of a batch, so present the
        # single sequence as a batch of one
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer that was passed in; this used to be
        # hard-wired to the global eng_tokenizer, ignoring the argument
        translation = predict_sequence(model, tokenizer, source)
        # only the first two columns are needed, whatever the row width
        raw_target, raw_src = raw_dataset[i][:2]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target,translation))
        # corpus_bleu expects a list of references for every hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    # calculate BLEU score
    # NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1 - confirm
    # whether (1/3, 1/3, 1/3, 0) was intended
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0,0,0,0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5,0.5,0,0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3,0.3,0.3,0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25,0.25,0.25,0.25)))

In [108]:
# load the full sample together with its train and test splits
dataset, train, test = (
    load_clean_sentences(path)
    for path in ('english-german-both.pkl',
                 'english-german-train.pkl',
                 'english-german-test.pkl')
)

In [109]:
# prepare english tokenizer (column 0 holds the English sentences)
eng_sentences = dataset[:, 0]
eng_tokenizer = create_tokenizer(eng_sentences)
# one extra slot leaves room for id 0, which encode_sequences uses for padding
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(eng_sentences)

In [110]:
# prepare german tokenizer (column 1 holds the German sentences)
ger_sentences = dataset[:, 1]
ger_tokenizer = create_tokenizer(ger_sentences)
# one extra slot leaves room for id 0, which encode_sequences uses for padding
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_length = max_length(ger_sentences)

In [111]:
# encode the German side of both splits as padded integer sequences
trainX, testX = (
    encode_sequences(ger_tokenizer, ger_length, split[:, 1])
    for split in (train, test)
)

In [112]:
# load the checkpoint that training wrote to 'model.h5'
model = load_model('model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [130]:
# evaluate on the training sequences: prints ten sample translations, then BLEU-1..4
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)

train
src=[du bist dumm], target=[you are stupid], predicted=[you are stupid]
src=[sie hat ihm geholfen], target=[she helped him], predicted=[she called him]
src=[tom ist wutend], target=[tom is mad], predicted=[tom is]
src=[in keinster weise], target=[no way], predicted=[no way]
src=[tschuss], target=[goodbye], predicted=[see]
src=[tom heulte], target=[tom was crying], predicted=[tom was crying]
src=[frag einen fachmann], target=[ask an expert], predicted=[ask an expert]
src=[das ist ein bissiger hund], target=[this dog bites], predicted=[thats dog bites]
src=[das ist nicht meins], target=[its not mine], predicted=[its not mine]
src=[ich zuchte vieh], target=[i raise cattle], predicted=[i raise cattle]
BLEU-1: 0.857145
BLEU-2: 0.801516
BLEU-3: 0.700765
BLEU-4: 0.380523


In [131]:
# evaluate on the test sequences: prints ten sample translations, then BLEU-1..4
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

test
src=[sie haben mir geholfen], target=[they helped me], predicted=[they know me]
src=[tom tanzt gerade], target=[tom is dancing], predicted=[tom is fasting]
src=[geh in deckung], target=[take cover], predicted=[get cover]
src=[ich habe den schlussel], target=[i have the key], predicted=[i have a]
src=[tom hat einen im tee], target=[tom is tipsy], predicted=[tom has a gun]
src=[ich kenne kein erbarmen], target=[im ruthless], predicted=[i saw a bus]
src=[tom vertraute mir], target=[tom trusted me], predicted=[tom trusts me]
src=[wir waren glucklich], target=[we were happy], predicted=[we were happy]
src=[nehmt meinen], target=[take mine], predicted=[take mine]
src=[tom gab auf], target=[tom gave in], predicted=[tom stood up]
BLEU-1: 0.547208
BLEU-2: 0.411397
BLEU-3: 0.316968
BLEU-4: 0.126257


In [118]:
# sanity check: (rows, columns) of the training split
print(train.shape)

(9000, 3)


In [124]:
# sanity check: (rows, columns) of the test split
print(test.shape)

(1000, 3)
