In [1]:
# Clean Text

In [2]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array


In [3]:
# load doc into memory
def load_doc(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full decoded text of the file.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [4]:
# split a loaded document into sentences
def to_pairs(doc):
    """Break raw corpus text into one list of tab-separated fields per line.

    Whitespace around the whole document is stripped first, the text is then
    cut on newline characters and every resulting line is cut on tabs.

    Returns:
        A list of lists of strings, one inner list per line.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

In [5]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every string of every row and return the result as an array.

    Each string is NFD-decomposed and reduced to ASCII (characters that cannot
    be encoded are dropped), split on whitespace, lower-cased, stripped of
    ASCII punctuation and non-printable characters, and finally filtered so
    that only purely alphabetic tokens remain. Surviving tokens are re-joined
    with single spaces.

    Args:
        lines: Iterable of rows, each row an iterable of strings.

    Returns:
        A numpy array built from the list of cleaned rows.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every ASCII punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean(text):
        # decompose accents, then discard everything outside ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # drops empty tokens and tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(text) for text in pair] for pair in lines])

In [6]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here, the cleaned sentence array).
        filename: Destination path; an existing file is overwritten.
    """
    # close the handle deterministically so the pickle is flushed to disk
    # before any later cell tries to read it back
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

In [7]:
# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE(review): this assignment rebinds the name `clean_pairs` from the
# function to its result, so running this cell a second time without first
# re-running the cell that defines clean_pairs() would call the array
# instead of the function.
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check

Saved: english-german.pkl


In [8]:
# print columns 0 and 1 of the first 100 cleaned rows as a visual sanity check
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[go on] => [mach weiter]
[hello] => [hallo]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[eat up] => [iss auf]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [kapiert]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[i paid] => [ich habe bezahlt]


In [9]:
# number of rows in the cleaned array
len(clean_pairs)

200519

In [10]:
# Split Text

In [25]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this notebook (or another trusted source).
    """
    # the context manager releases the file handle as soon as loading is done
    with open(filename, 'rb') as file:
        return load(file)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here, a slice of the sentence array).
        filename: Destination path; an existing file is overwritten.
    """
    # close the handle deterministically so the pickle is flushed to disk
    # before any later cell tries to read it back
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 20000
n_train = 18000
# copy the slice: basic slicing returns a view, so shuffling it in place
# would also reorder the first n_sentences rows of raw_dataset
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle (seeded so the saved train/test split is reproducible
# across kernel restarts)
from numpy.random import seed
seed(1)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [26]:
# Train Neural Translation Model

In [27]:
from pickle import load
from numpy import array
import tensorflow as tf

In [28]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.utils  import plot_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import RepeatVector, Dense, LSTM, Embedding,TimeDistributed
from tensorflow.python.keras.callbacks import ModelCheckpoint

In [29]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this notebook (or another trusted source).
    """
    # the context manager releases the file handle as soon as loading is done
    with open(filename, 'rb') as file:
        return load(file)

In [30]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a new Tokenizer, fit it on ``lines`` and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [31]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-delimited token count among ``lines``."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [32]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and post-pad to ``length``.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: Texts to encode.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    # padding='post' appends the fill values after each sequence
    return pad_sequences(encoded, maxlen=length, padding='post')

In [33]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence against ``vocab_size`` classes.

    Args:
        sequences: 2-D array of integer ids (its first two ``shape`` entries
            are read to size the result).
        vocab_size: Number of classes handed to ``to_categorical``.

    Returns:
        Array reshaped to (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [34]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) Sequential translation network.

    Layer stack, in order: a zero-masking Embedding over the source
    vocabulary, an LSTM, a RepeatVector that repeats the LSTM output
    ``tar_timesteps`` times, a sequence-returning LSTM, and a TimeDistributed
    softmax Dense layer over the target vocabulary.

    Args:
        src_vocab: Input dimension of the embedding.
        tar_vocab: Width of the softmax output layer.
        src_timesteps: ``input_length`` of the embedding.
        tar_timesteps: Repeat count for RepeatVector.
        n_units: Embedding size and unit count of both LSTM layers.

    Returns:
        The Sequential model; compiling is left to the caller.
    """
    stack = (
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [35]:
# load datasets
# `dataset` is the combined file; `train` and `test` are the two split files
# written by the "Split Text" cell
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [36]:
# prepare english tokenizer
# fitted on column 0 of the combined dataset (train + test rows)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 presumably leaves room for index 0, which encode_sequences uses as the
# padding value — confirm against the Tokenizer documentation
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# eng_vocab_size = 6000
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 3653
English Max Length: 5


In [37]:
# prepare german tokenizer
# fitted on column 1 of the combined dataset (train + test rows)
ger_tokenizer = create_tokenizer(dataset[:, 1])
# +1 presumably leaves room for index 0, which encode_sequences uses as the
# padding value — confirm against the Tokenizer documentation
ger_vocab_size = len(ger_tokenizer.word_index) + 1
# ger_vocab_size = 6000
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

German Vocabulary Size: 5690
German Max Length: 10


In [38]:
# prepare training data
# model input: German text (column 1), integer-encoded and padded
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
# model target: English text (column 0), integer-encoded and padded ...
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# ... then expanded to one-hot vectors to match the categorical_crossentropy loss
trainY = encode_output(trainY, eng_vocab_size)

In [39]:
# prepare validation data
# same encoding as the training cell, applied to the test split
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [40]:
# define model
# German is the source side and English the target side; 256 is passed as
# n_units (embedding size and LSTM width inside define_model)
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [41]:
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output
model.summary()
# plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           1456640   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 256)         525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 3653)        938821    
Total params: 3,446,085
Trainable params: 3,446,085
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
# fit model
filename = 'model.h5'
# write the model to `filename` only on epochs where val_loss reaches a new minimum
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split is used as validation data here, so the
# checkpoint that gets saved is selected using the same rows that are later
# reported as "test" results.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Train on 18000 samples, validate on 2000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 3.85774, saving model to model.h5
 - 63s - loss: 4.1553 - val_loss: 3.8577
Epoch 2/30

Epoch 00002: val_loss improved from 3.85774 to 3.63825, saving model to model.h5
 - 62s - loss: 3.6991 - val_loss: 3.6382
Epoch 3/30

Epoch 00003: val_loss improved from 3.63825 to 3.41706, saving model to model.h5
 - 61s - loss: 3.4218 - val_loss: 3.4171
Epoch 4/30

Epoch 00004: val_loss improved from 3.41706 to 3.23192, saving model to model.h5
 - 63s - loss: 3.1896 - val_loss: 3.2319
Epoch 5/30

Epoch 00005: val_loss improved from 3.23192 to 3.05903, saving model to model.h5
 - 63s - loss: 2.9798 - val_loss: 3.0590
Epoch 6/30

Epoch 00006: val_loss improved from 3.05903 to 2.86902, saving model to model.h5
 - 61s - loss: 2.7526 - val_loss: 2.8690
Epoch 7/30

Epoch 00007: val_loss improved from 2.86902 to 2.70950, saving model to model.h5
 - 61s - loss: 2.5319 - val_loss: 2.7095
Epoch 8/30

Epoch

<tensorflow.python.keras._impl.keras.callbacks.History at 0x21abb91b828>

In [43]:
# Evaluate Neural Translation Model

In [44]:
from pickle import load
from numpy import array
from numpy import argmax
from tensorflow.python.keras.models import load_model
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [45]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this notebook (or another trusted source).
    """
    # the context manager releases the file handle as soon as loading is done
    with open(filename, 'rb') as file:
        return load(file)

In [46]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a new Tokenizer, fit it on ``lines`` and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [47]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-delimited token count among ``lines``."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [48]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and post-pad to ``length``.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: Texts to encode.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    # padding='post' appends the fill values after each sequence
    return pad_sequences(encoded, maxlen=length, padding='post')

In [49]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Reverse-lookup ``integer`` in ``tokenizer.word_index``.

    Returns:
        The first word whose index equals ``integer``, or None when no entry
        matches.
    """
    matches = (token for token, idx in tokenizer.word_index.items() if idx == integer)
    return next(matches, None)

In [50]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    The first prediction returned by ``model.predict`` is decoded step by
    step: the arg-max id of each step is mapped to a word via word_for_id,
    and decoding stops at the first id that has no word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        token = word_for_id(argmax(scores), tokenizer)
        if token is None:
            # an id without a vocabulary entry ends the sentence
            break
        words.append(token)
    return ' '.join(words)

In [51]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence, print samples and corpus BLEU scores.

    Args:
        model: Trained model, forwarded to predict_sequence.
        tokenizer: Tokenizer used to map predicted ids back to words.
        sources: Iterable of 1-D encoded source sequences.
        raw_dataset: Rows aligned with ``sources``; field 0 is the reference
            (target) text and field 1 the source text. Any further fields
            are ignored.

    Returns:
        None. The first 50 translations and BLEU-1..4 are printed.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer passed by the caller, not a notebook global
        translation = predict_sequence(model, tokenizer, source)
        # index explicitly so rows with or without extra columns both work
        row = raw_dataset[i]
        raw_target, raw_src = row[0], row[1]
        if i < 50:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference token lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    # NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1; kept as-is so
    # reported scores stay comparable — confirm whether thirds were intended.
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [52]:
# load datasets
# reload the same three pickles that the training section used
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [53]:
# prepare english tokenizer
# refit on column 0 of the combined dataset, as in the training section
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

In [54]:
# prepare german tokenizer
# refit on column 1 of the combined dataset, as in the training section
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

In [55]:
# prepare data
# only the German inputs are encoded; evaluate_model reads the reference
# text straight from the raw rows
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [56]:
# load model
# 'model.h5' is the path the ModelCheckpoint callback wrote during training
model = load_model('model.h5')

In [57]:
# test on some training sequences
# scores on rows the model was fitted on, for comparison with the test split
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)

train
src=[habe ich gewonnen], target=[did i win], predicted=[did i have]
src=[tom ist ein verlierer], target=[tom is a loser], predicted=[tom is a loser]
src=[es ist dieses buch], target=[its this book], predicted=[its this book]
src=[tom verfolgte maria], target=[tom chased mary], predicted=[tom chased mary]
src=[komm schnell], target=[come quickly], predicted=[come quickly]
src=[tom hat mich dazu gebracht zu gehen], target=[tom made me go], predicted=[tom made me go]
src=[du bist besturzt], target=[youre upset], predicted=[youre upset]
src=[er hat dich hintergangen], target=[he betrayed you], predicted=[he betrayed you]
src=[ich kann fahrrad fahren], target=[i can ride a bike], predicted=[i can get a bike]
src=[ich hab die fernstrae genommen], target=[i took highway], predicted=[i took highway]
src=[jetzt nicht bewegen], target=[now dont move], predicted=[you dont move]
src=[es ist eine regel], target=[its a rule], predicted=[its a rule]
src=[das sah ich kommen], target=[i saw that 

In [58]:
# same evaluation on the held-out test split
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

test
src=[uns ist hei], target=[were hot], predicted=[its hot]
src=[guten tag], target=[good afternoon], predicted=[a a nice]
src=[ich bade jeden tag], target=[i bathe every day], predicted=[i bathe every day day]
src=[er gab nach], target=[he relented], predicted=[he gave himself]
src=[bist du kunstlerisch veranlagt], target=[are you artistic], predicted=[are you dangerous]
src=[tom applaudierte], target=[tom applauded], predicted=[tom is]
src=[tom schien hart zu sein], target=[tom seemed tough], predicted=[tom seemed smart]
src=[tom schrieb weiter], target=[tom kept writing], predicted=[tom kept yelling]
src=[kommt esst mit uns], target=[come eat with us], predicted=[come and with us]
src=[die farben gefallen mir], target=[i like the colors], predicted=[i like me]
src=[ich wei es jetzt], target=[i know now], predicted=[i know it again]
src=[ich mochte zuschauen], target=[i want to watch], predicted=[i want to]
src=[ich bin sehr schnell], target=[im really fast], predicted=[im very fa

In [66]:
# inspect one raw row to see how many fields it carries
print(train[1])

['is that blue' 'ist das blau'
 'ccby france attribution tatoebaorg shekitten esperantostern']


In [67]:
  # rows carry a third (attribution) field; bind it to a throwaway name so the
  # `test` dataset that evaluate_model needs is not overwritten by a string
  raw_target, raw_src, _attribution = test[1]

In [68]:
# record the installed NLTK version alongside the BLEU results
print('The nltk version is {}.'.format(nltk.__version__))

The nltk version is 3.4.5.


In [None]:
# re-run the evaluation on the test inputs and rows
evaluate_model(model, eng_tokenizer, testX, test)