In [1]:
import os
import re
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pickle import dump
from pickle import load
from numpy.random import shuffle
from numpy import array
from unicodedata import normalize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
%matplotlib inline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#Loading the data in a way such that it preserve unicode german characters
def load_doc(filename):
    file = open(filename,mode="rt",encoding="utf-8")
    text = file.read()
    file.close()
    return text

#Splitting the document into the sentence pair
def to_pair(doc):
    lines = doc.strip().split("\n")
    pairs = [line.split("\t") for line in lines]
    return pairs

#Removing text other than pair of german and english
def removing_extra(group):
    pair_group = []
    for pair in group:
        pair = pair[0:2]
        pair_group.append(pair)
    return pair_group

#Cleaning the text(list of text)
def clean_text(text_list):
    #Creating a empty list in which all the cleaned text will be appended
    clean = list()
    #For removing the punctuation from the text
    re_punc = re.compile("[%s]" % re.escape(string.punctuation))
    #For removing all the non printable characters
    re_print = re.compile("[^%s]" % re.escape(string.printable))
    for pair in text_list:
        clean_pair = list()
        for line in pair:
            #Normalizing the unicode characters
            line = normalize("NFD", line).encode("ascii","ignore")
            line = line.decode("UTF-8")
            #Tokenizing on white space in text
            line = line.split()
            #Convert the text to lowercase
            line = [word.lower() for word in line]
            #Removing the punctuation from each token
            line = [re_punc.sub("", w) for w in line]
            #Removing the non-printable chars form each token
            line = [re_print.sub("", w) for w in line]
            #Removing the tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            #After cleaning the pair appending the pair to clean_pair list
            clean_pair.append(" ".join(line))
        #Now appending the cleaned pair to the clean list
        clean.append(clean_pair)
    return array(clean)

#Now saving the list of clean sentences to file in our storage
def save_clean(data_list,filename):
    #Dumping sentences in a file(file opened in writebyte mode)
    dump(data_list,open(filename,"wb"))
    print("Saved: %s"%filename)
    
#Checking the length of cleaned data(number of pairs)
def checking_length(data):
    length = len(data)
    return length

#As the number of cleaned pairs we have is very lagre around 208486 so insted of fitting the model to the dataset which will--
#--be a quite expensive task(huge computation) we will insted train our model on a subset of our data taking fewer pairs as compared to the number of pairs we have
#A function for loading the data and it will be helpful in loading the files which we save for purpose of later use in save_clean function
def loading_data(filename):
    return load(open(filename,"rb"))

#Now as our data cleaning,splitting the data into training and testing parts is over what we can do is start building our translation network
#Creating a tokenizer
def create_tokenizer(text_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    return tokenizer

#For finding the maximum sequence length
def max_length(data):
    return max(len(dat.split()) for dat in data)

#We will be now preparing the training dataset.Each input and output sequence must beencoded to integers and padded to the maximum phrase length
#--this is because we will use a word embedding for the input sequences and one hot encode the output sequences
# encode and pad sequences
def encode_sequences(tokenizer,length,data):
    #Converting the array of text to sequence of integer on basis of their word-index position
    X = tokenizer.texts_to_sequences(data)
    #Length returned by max_length function
    #Now padding the sequence created(padding will be post zeros filled at end for shorter sequence)
    X = pad_sequences(X, maxlen=length, padding="post")
    return X

#The output sequence will be one hot encoded as the model will predict the probability of each word in the vocabulary as output
def encode_output(sequences,vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence,num_classes=vocab_size)
        ylist.append(encoded)
        y = array(ylist)
        y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

#Creating the model(encoder decoder)
def define_model(source_vocab,target_vocab,source_len_pad,target_len,n_units):
    model = Sequential()
    model.add(Embedding(source_vocab,n_units,input_length=source_len_pad,mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(target_len))
    model.add(LSTM(n_units,return_sequences=True))
    model.add(TimeDistributed(Dense(target_vocab, activation="softmax")))
    model.compile(optimizer="adam", loss="categorical_crossentropy")
    model.summary()
    plot_model(model, to_file="model.png", show_shapes=True)
    return model

In [3]:
doc = load_doc("deu.txt")
pairs = to_pair(doc)
pair = removing_extra(pairs)
clean = clean_text(pair)
save_clean(clean,"english-german.pkl")
print(checking_length(clean))
#Generally we have a total of 208486 pairs to english,german pairs and towards the end the length of pairs are quite bigger
print(clean[1:208486])

Saved: english-german.pkl
208486
[['hi' 'hallo']
 ['hi' 'gru gott']
 ['run' 'lauf']
 ...
 ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker'
  'wenn jemand der deine herkunft nicht kennt sagt dass du wie ein muttersprachler sprichst bedeutet das dass man wahrscheinlich etwas an deiner sprechweise bemerkt hat das erkennen lie dass du kein muttersprachler bist mit anderen worten du horst dich nicht wirklich wie ein muttersprachler an']
 ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker'
  'wenn jemand fremdes dir sagt dass du dich wie ein muttersprachler anhorst bedeutet das wahr

In [4]:
#Loading the dataset which we actually created
data = loading_data("english-german.pkl")
#Reducing the data size
num_sentences = 15000
data = data[:num_sentences, :]
#Randomly shuffling the data
shuffle(data)
#Splitting the data into train and test data
train,test = data[:12000],data[12000:]
#Saving the training and testing data
save_clean(data,"english-german(combined).pkl")
save_clean(train,"english-german-train.pkl")
save_clean(test,"english-german-test.pkl")

Saved: english-german(combined).pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [56]:
#LOading the train and test data
data = loading_data("english-german(combined).pkl")
train = loading_data("english-german-train.pkl")
test = loading_data("english-german-test.pkl")

In [5]:
train

array([['whats tom done', 'was hat tom getan'],
       ['dont misbehave', 'benehmt euch nicht daneben'],
       ['he is my type', 'er ist mein typ'],
       ...,
       ['hes intelligent', 'er ist klug'],
       ['you can try', 'du kannst es versuchen'],
       ['youre arrogant', 'sie sind uberheblich']], dtype='<U527')

In [6]:
#Basically we are using the full data for creating the vocabulary of boyh english and german(alternately we can also use only the train data)
#Preparing the english tokenizer
eng_tokenizer = create_tokenizer(data[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(data[:, 0])
print("English Vocabulary Size: %d"%eng_vocab_size)
print("English Max Length: %d"%(eng_length))
#Preparing the german tokenizer
ger_tokenizer = create_tokenizer(data[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(data[:, 1])
print("German Vocabulary Size: %d"%ger_vocab_size)
print("German Max Length: %d"%(ger_length))

English Vocabulary Size: 2960
English Max Length: 5
German Vocabulary Size: 4648
German Max Length: 10


In [None]:
#Preparing the training and testing sequences
#First we encode both the input and output sequence of word_index value and then encode output by one hot
trainX = encode_sequences(ger_tokenizer,ger_length,train[:,1])
trainY = encode_sequences(eng_tokenizer,eng_length,train[:,0])
trainY = encode_output(trainY,eng_vocab_size)
testX = encode_sequences(ger_tokenizer,ger_length,test[:,1])
testY = encode_sequences(eng_tokenizer,eng_length,test[:,0])
testY = encode_output(testY,eng_vocab_size)

In [None]:
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
checkpoint = ModelCheckpoint("model.h5",monitor="val_loss",verbose=1,save_best_only=True, mode="min")
model.fit(trainX,trainY,epochs=30, batch_size=64,validation_data=(testX, testY),callbacks=[checkpoint],verbose=2)