In [39]:
import re
import string
import numpy as np
from pickle import dump,load
from unicodedata import normalize

In [40]:
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full contents of the file as a single string.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [41]:
def to_pairs(doc):
    """Split a tab-separated document into one list of fields per line.

    Leading and trailing whitespace is stripped from the whole document
    first; every remaining line is then cut on tab characters.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

In [42]:
def remove_garbage(lines):
    """Keep only the first two fields of every row.

    Anything after the second field of a row is discarded.
    """
    return [[fields[0], fields[1]] for fields in lines]

In [43]:
def clean_pairs(lines):
    """Normalise every sentence of every pair and return a NumPy array.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and filtered down
    to purely alphabetic tokens before being re-joined with single spaces.
    """
    # character filters, compiled once for the whole run
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    non_printable_re = re.compile('[^%s]' % re.escape(string.printable))

    def _clean_sentence(sentence):
        # decompose accented characters, then drop everything that is not ASCII
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = punctuation_re.sub('', token.lower())
            token = non_printable_re.sub('', token)
            # discard tokens that are empty or contain anything but letters
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return np.array([[_clean_sentence(sentence) for sentence in pair] for pair in lines])

In [44]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object.
        filename: Destination path; the file is opened in 'wb' mode, so an
            existing file is overwritten.
    """
    # Close the handle deterministically so the pickle is fully written
    # before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [45]:
filename = 'deu.txt'

In [46]:
doc = load_doc(filename)

In [47]:
pairs = to_pairs(doc)

In [48]:
pairs = remove_garbage(pairs)

In [50]:
clean_pairs = clean_pairs(pairs)

In [53]:
save_clean_data(clean_pairs, 'english_german.pkl')

Saved: english_german.pkl


In [55]:
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object the pickle contains.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted source.
    with open(filename, 'rb') as handle:
        return load(handle)

In [56]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object.
        filename: Destination path; the file is opened in 'wb' mode, so an
            existing file is overwritten.
    """
    # The context manager guarantees the file is closed (and therefore fully
    # written) before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print(f'Saved: {filename}')

In [57]:
raw_dataset = load_clean_sentences('english_german.pkl')

In [58]:
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]

In [59]:
np.random.shuffle(dataset)

In [62]:
train, test = dataset[:9000], dataset[9000:]

In [65]:
save_clean_data(dataset, 'english_german-both.pkl')

Saved: english_german-both.pkl


In [66]:
save_clean_data(train, 'english_german-train.pkl')
# Persist the test split too: later cells load 'english_german-test.pkl',
# and no other cell writes that file.
save_clean_data(test, 'english_german-test.pkl')

Saved: english_german-train.pkl


In [69]:
# Reload the pickled data into the names used by the rest of the notebook.
dataset = load_clean_sentences('english_german-both.pkl')

In [70]:
train = load_clean_sentences('english_german-train.pkl')

In [71]:
test = load_clean_sentences('english_german-test.pkl')

### Tokenization

In [72]:
import tensorflow as tf
from tensorflow import keras

In [73]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [74]:
tokenizer = Tokenizer()

In [75]:
def create_tokenizer(lines):
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [76]:
def max_length(lines):
    """Return the word count of the longest sentence in ``lines``.

    Words are whitespace-separated. An empty ``lines`` raises ValueError,
    because max() has nothing to compare.
    """
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [87]:
# prepare english tokenizer
# Column 0 of the dataset is treated as the English side.
eng_tokenizer = create_tokenizer(dataset[:,0])
# One more than the number of distinct words; presumably index 0 is kept
# free for padding — confirm against the Tokenizer documentation.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English sentence, counted in whitespace-separated words.
eng_length = max_length(dataset[:, 0])
print(f'English Vocab size {eng_vocab_size}')

English Vocab size 5651


In [88]:
# prepare german tokenizer
# Column 1 of the dataset is treated as the German side.
ger_tokenizer = create_tokenizer(dataset[:,1])
# One more than the number of distinct words; presumably index 0 is kept
# free for padding — confirm against the Tokenizer documentation.
ger_vocab_size = len(ger_tokenizer.word_index) + 1
# Longest German sentence, counted in whitespace-separated words.
ger_length = max_length(dataset[:, 1])
print(f'German Vocab size {ger_vocab_size}')

German Vocab size 5651


In [89]:
# Display the longest English sentence length, in words.
max_length(dataset[:,0])

5

In [90]:
# Display the longest German sentence length, in words.
max_length(dataset[:,1])

9

In [91]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into equal-length rows of word indices.

    Every sentence in ``lines`` is mapped to its integer sequence by
    ``tokenizer``; the sequences are then padded at the end ('post') so that
    each has ``length`` entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [92]:
from tensorflow.keras.utils import to_categorical

In [93]:
def encode_output(sequences, vocab_size):
    """One-hot encode every integer of a 2-D array of index sequences.

    Returns an array shaped
    (sequences.shape[0], sequences.shape[1], vocab_size).
    """
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    one_hot = np.array(
        [to_categorical(row, num_classes=vocab_size) for row in sequences]
    )
    return one_hot.reshape(n_rows, n_steps, vocab_size)

In [94]:
# Model input: the German sentences (column 1) as padded index sequences.
trainX = encode_sequences(ger_tokenizer ,ger_length, train[:, 1])
# Model target: the English sentences (column 0), first index-encoded and
# then one-hot encoded over the English vocabulary.
trainY = encode_sequences(eng_tokenizer ,eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

In [97]:
# Display the shape of the encoded inputs.
trainX.shape

(9000, 9)

In [98]:
# Display the shape of the one-hot encoded training targets.
trainY.shape

(9000, 5, 5651)

In [99]:
# Encode the test split the same way as the training split: German inputs
# as padded index sequences, English targets one-hot encoded.
testX = encode_sequences(ger_tokenizer, ger_length, test[: ,1])
testY = encode_sequences(eng_tokenizer, eng_length, test[: ,0])
testY = encode_output(testY, eng_vocab_size)

In [100]:
# Display the shape of the one-hot encoded test targets.
testY.shape

(1000, 5, 5651)

#### MODEL

In [118]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint


In [119]:
def Model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build, compile and summarise the encoder-decoder network.

    The stack is: a zero-masking embedding of the source vocabulary, an LSTM
    whose final output is repeated ``tar_timesteps`` times, a second LSTM
    that returns the full sequence, and a per-timestep softmax over the
    target vocabulary. The model is compiled with Adam and categorical
    cross-entropy, its summary is printed, and the model is returned.
    """
    layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(optimizer='adam', loss='categorical_crossentropy')
    network.summary()
    return network

    

In [120]:
# German is the source language and English the target; 256 is passed as
# n_units, which sizes both the embedding and the two LSTM layers.
model = Model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 9, 256)            1446656   
_________________________________________________________________
lstm_10 (LSTM)               (None, 256)               525312    
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 5, 5651)           1452307   
Total params: 3,949,587
Trainable params: 3,949,587
Non-trainable params: 0
_________________________________________________________________


In [121]:
# Checkpoint callback targeting 'model.h5': it monitors val_loss (lower is
# better) and is configured to keep only the best model.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [122]:
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint is selected on that same split.
model.fit(trainX, trainY, epochs=30, batch_size = 32, validation_data=(testX, testY), callbacks=[checkpoint], verbose=1)

Train on 9000 samples, validate on 1000 samples
Epoch 1/30
Epoch 00001: val_loss improved from inf to 3.31287, saving model to model.h5
Epoch 2/30
Epoch 00002: val_loss improved from 3.31287 to 3.16415, saving model to model.h5
Epoch 3/30
Epoch 00003: val_loss improved from 3.16415 to 3.00219, saving model to model.h5
Epoch 4/30
Epoch 00004: val_loss improved from 3.00219 to 2.87523, saving model to model.h5
Epoch 5/30
Epoch 00005: val_loss improved from 2.87523 to 2.76291, saving model to model.h5
Epoch 6/30
Epoch 00006: val_loss improved from 2.76291 to 2.64196, saving model to model.h5
Epoch 7/30
Epoch 00007: val_loss improved from 2.64196 to 2.52446, saving model to model.h5
Epoch 8/30
Epoch 00008: val_loss improved from 2.52446 to 2.41269, saving model to model.h5
Epoch 9/30
Epoch 00009: val_loss improved from 2.41269 to 2.32699, saving model to model.h5
Epoch 10/30
Epoch 00010: val_loss improved from 2.32699 to 2.23633, saving model to model.h5
Epoch 11/30
Epoch 00011: val_loss i

<tensorflow.python.keras.callbacks.History at 0x1a831873888>

In [124]:
from nltk.translate.bleu_score import corpus_bleu

In [126]:
def load_clean_sentence(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object the pickle contains.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if load() raises.
    with open(filename, 'rb') as handle:
        return load(handle)

In [127]:
def create_tokenizer(lines):
    """Return a brand-new Tokenizer fitted on ``lines``."""
    fresh = Tokenizer()
    fresh.fit_on_texts(lines)
    return fresh

In [128]:
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``."""
    return max(map(lambda sentence: len(sentence.split()), lines))

In [129]:
def encode_sequences(tokenizer ,length, lines):
    """Index-encode ``lines`` with ``tokenizer`` and post-pad to ``length``."""
    index_sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(index_sequences, maxlen=length, padding='post')
    return padded

In [130]:
def word_for_id(integer, tokenizer):
    """Map a word index back to its word.

    Args:
        integer: Word index to look up.
        tokenizer: Object exposing a ``word_index`` mapping (word -> index)
            and, optionally, the reverse ``index_word`` mapping.

    Returns:
        The matching word, or None when no word has that index.
    """
    # Use the reverse mapping when the tokenizer carries a populated one: a
    # single dict lookup instead of scanning the whole vocabulary per word.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizers that only provide word_index.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [131]:
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    The first prediction returned by ``model.predict(source)`` is decoded
    step by step: the highest-scoring index of each step is mapped to a word
    via word_for_id, and decoding stops at the first index without a word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        word = word_for_id(np.argmax(scores), tokenizer)
        if word is None:
            # an index with no word ends the sentence
            break
        words.append(word)
    return ' '.join(words)

In [142]:
def evaluate_model(model, sources, raw_dataset, target_tokenizer=None):
    """Translate every source sequence and print corpus-level BLEU scores.

    The first ten (source, target, translation) triples are printed as a
    sanity check, followed by BLEU-1 to BLEU-4.

    Args:
        model: Trained model handed to predict_sequence.
        sources: Array of encoded source sequences, one row per sentence.
        raw_dataset: Rows of (target sentence, source sentence) text, aligned
            with ``sources`` by position.
        target_tokenizer: Tokenizer used to turn predicted indices back into
            words. Defaults to the module-level ``tokenizer`` so existing
            three-argument calls keep working.

    Raises:
        ValueError: If the tokenizer in use has an empty vocabulary, in which
            case every translation would silently come out empty.
    """
    if target_tokenizer is None:
        target_tokenizer = tokenizer
    if not target_tokenizer.word_index:
        raise ValueError('evaluate_model needs a fitted tokenizer; its word_index is empty')
    actual = []
    predicted = []
    for i, source in enumerate(sources):
        # predict() expects a batch, so give the single sequence a batch axis
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, target_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print(f'{raw_src}, {raw_target}, {translation}')
        # corpus_bleu takes a LIST of reference translations per hypothesis;
        # without the extra list each word would be treated as a reference.
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [134]:
# Reload the pickled data for the evaluation part of the notebook.
dataset = load_clean_sentences('english_german-both.pkl')
train = load_clean_sentences('english_german-train.pkl')
test = load_clean_sentences('english_german-test.pkl')

In [135]:
# English side (column 0): tokenizer, vocabulary size and longest sentence.
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

In [136]:
# German side (column 1): tokenizer, vocabulary size and longest sentence.
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

In [137]:
# Encode the German sentences of both splits as padded index sequences.
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [139]:
from tensorflow.keras.models import load_model

In [140]:
# Load the model from 'model.h5', the file the ModelCheckpoint callback
# was configured to write.
model = load_model('model.h5')