In [None]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
% matplotlib inline
pd.set_option('display.max_colwidth',200)

In [None]:
# FUNCTION TO READ RAW TEXT FILE

def read_text(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), which a manual open()/close()
    # pair does not guarantee.
    with open(filename, mode = 'rt', encoding = 'utf-8') as file:
        return file.read()

In [None]:
# SPLIT A TEXT INTO SENTENCES

def to_lines(text):
    """Break tab-separated text into a list of per-line field lists.

    Leading and trailing whitespace is stripped from the text as a whole,
    the remainder is cut at newline characters, and every resulting line is
    cut at tab characters.

    Args:
        text: The raw text to split.

    Returns:
        A list with one entry per line; each entry is the list of
        tab-separated fields found on that line.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [None]:
# Read the raw corpus text, split it into per-line field lists and wrap the
# result in a numpy array.
data = read_text("/content/deu.txt")
deu_eng = array(to_lines(data))

In [None]:
# Keep only the first 50,000 rows (all columns) of the loaded array.
deu_eng = deu_eng[:50000,:]

In [None]:
# TAKING A LOOK AT OUR DATA:
# (a bare expression on the last line of a cell is rendered as the cell output)

deu_eng

In [None]:
# WORD COUNTS PER SENTENCE (WHITESPACE-SEPARATED TOKENS)

# Column 0 of deu_eng (the side treated as English throughout this notebook)
eng_l = [len(i.split()) for i in deu_eng[:,0]]

# Column 1 of deu_eng (the side treated as German throughout this notebook)
deu_l = [len(i.split()) for i in deu_eng[:,1]]

In [None]:
# Gather both word-count lists in one frame, one column per language.
length_df = pd.DataFrame(dict(eng = eng_l, deu = deu_l))

In [None]:
# Draw a 30-bin histogram for every column of length_df, then render it.
length_df.hist(bins=30)
plt.show()

In [None]:
# FUNCTION TO BUILD A TOKENIZER

def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# PREPARE ENGLISH TOKENIZER

# The builder defined in this notebook is `tokenization`; no callable named
# `tokenizer` exists at this point, so calling it raised a NameError.
eng_tokenizer = tokenization(deu_eng[:,0])
# +1 leaves room for index 0, which the padding step uses as its fill value.
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Fixed sequence length used when English sentences are padded/truncated.
eng_length = 8
print('English Vocabulary Size: %d' % eng_vocab_size)

In [None]:
# PREPARE GERMAN (DEUTSCH) TOKENIZER

# The builder defined in this notebook is `tokenization`; no callable named
# `tokenizer` exists at this point, so calling it raised a NameError.
deu_tokenizer = tokenization(deu_eng[:,1])
# +1 leaves room for index 0, which the padding step uses as its fill value.
deu_vocab_size = len(deu_tokenizer.word_index) + 1

# Fixed sequence length used when German sentences are padded/truncated.
deu_length = 8
print('Deutch Vocabulary Size: %d' % deu_vocab_size)

In [None]:
# ENCODE AND PAD SEQUENCES

def encode_sequences(tokenizer, length, lines):
    """Convert texts into fixed-length integer sequences.

    Args:
        tokenizer: Object whose ``texts_to_sequences`` method maps the texts
            to lists of integers.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The output of ``pad_sequences`` called with ``padding='post'``.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen = length, padding = 'post')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs; the fixed random_state keeps the
# split repeatable between runs.
train, test = train_test_split(
    deu_eng,
    test_size = 0.2,
    random_state = 12,
)

In [None]:
# PREPARE TRAINING DATA

# Column 1 (German) is the model input, column 0 (English) is the target.
trainX = encode_sequences(tokenizer = deu_tokenizer, length = deu_length, lines = train[:,1])
trainY = encode_sequences(tokenizer = eng_tokenizer, length = eng_length, lines = train[:,0])

In [None]:
# PREPARE VALIDATION DATA

# Encode the held-out `test` split. Encoding `train` a second time made
# testX/testY copies of the training data, so predictions were produced for
# training sentences and no longer lined up with `test[:,0]` when the two
# are placed side by side later in the notebook.
testX = encode_sequences(deu_tokenizer, deu_length, test[:,1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:,0])

In [None]:
# BUILD NMT MODEL

def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the (uncompiled) sequence-to-sequence translation model.

    Layers, in order: Embedding -> LSTM -> RepeatVector -> LSTM
    (returning sequences) -> Dense softmax.

    Args:
        in_vocab: Size of the input vocabulary (Embedding input dimension).
        out_vocab: Size of the output vocabulary (width of the softmax).
        in_timesteps: Length of each input sequence.
        out_timesteps: Number of output timesteps to generate.
        units: Embedding size and number of units in both LSTM layers.

    Returns:
        The ``Sequential`` model; the caller is expected to compile it.
    """
    model = Sequential()
    # mask_zero=True makes index 0 (the padding value) a masked timestep.
    model.add(Embedding(in_vocab, units, input_length = in_timesteps, mask_zero = True))
    model.add(LSTM(units))
    # Repeat the encoder output once per output timestep. The parameter is
    # `out_timesteps`; the previous `out_timestep` was an undefined name.
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences = True))
    model.add(Dense(out_vocab, activation = 'softmax'))
    return model

In [None]:
# German sequences in, English sequences out, 512 units per layer.
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
# The module imported at the top of the notebook is `optimizers`;
# `optimizer` was an undefined name.
rms = optimizers.RMSprop(lr = 0.001)
# Sparse loss: the targets are integer word indices, not one-hot vectors.
model.compile(optimizer = rms, loss = 'sparse_categorical_crossentropy')

In [None]:
# Write the model to `filename` each time the validation loss improves.
filename = 'model.h1.deepra'
checkpoint = ModelCheckpoint(
    filename,
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True,
    mode = 'min',
)

# The targets are reshaped to carry a trailing axis of size 1; 20% of the
# training rows are set aside by Keras for validation.
history = model.fit(
    trainX,
    trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
    epochs = 5,
    batch_size = 512,
    validation_split = 0.2,
    callbacks = [checkpoint],
    verbose = 1,
)

In [None]:
# Training loss first, validation loss second, matching the legend order.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.legend(['train','validation'])
plt.show()

In [None]:
# Reload the checkpoint written during training.
model = load_model('model.h1.deepra')
# Sequential.predict_classes has been removed from Keras; taking the argmax
# over the last (vocabulary) axis of the softmax output yields the same
# class indices and works on both old and new releases.
preds = argmax(model.predict(testX.reshape((testX.shape[0],testX.shape[1]))), axis = -1)

In [None]:
def get_word(n, tokenizer):
    """Look up the word that a tokenizer assigned to integer index ``n``.

    Args:
        n: The integer index to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The matching word, or ``None`` when no word has that index
        (for example the padding index 0).
    """
    for word, index in tokenizer.word_index.items():
        # The original line ended in ';' instead of ':', a syntax error.
        if index == n:
            return word
    return None

In [None]:
# CONVERT PREDICTIONS INTO TEXT (ENGLISH)

# Invert the vocabulary once so each token is a dictionary lookup instead of
# a scan over the whole vocabulary. Indices with no word (such as the
# padding index 0) are absent and therefore map to None.
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}

preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = index_to_word.get(i[j])
        # Blank out unknown/padding tokens and immediate repeats of the
        # previous word; the empty placeholder keeps one slot per timestep.
        if t is None or (j > 0 and t == index_to_word.get(i[j-1])):
            temp.append('')
        else:
            temp.append(t)

    # One string per sentence: this must run after the token loop (it used
    # to run once per token), and the words are joined with a space so they
    # are not glued together.
    preds_text.append(' '.join(temp))

In [None]:
# Put the reference English sentences next to the decoded predictions.
pred_df = pd.DataFrame(dict(actual = test[:,0], predicted = preds_text))

In [None]:
# Allow up to 200 characters per cell so sentences are shown in full.
pd.options.display.max_colwidth = 200

In [None]:
# Show the first 15 actual/predicted sentence pairs.
pred_df.head(15)

In [None]:
# Show the last 15 actual/predicted sentence pairs.
pred_df.tail(15)