In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file into memory.

	Args:
		filename: Path of the text file to read.

	Returns:
		The complete contents of the file as one string.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

def to_lines(doc):
	"""Split a document into a list of lines.

	Whitespace surrounding the whole document is stripped first, then the
	remaining text is split on newline characters.
	"""
	return doc.strip().split('\n')

def clean_data(lines, max_lines=70000):
	"""Normalise raw sentences into lowercase, ASCII-only, alphabetic tokens.

	Each processed line is unicode-normalised (accents stripped), split on
	whitespace, lowercased, stripped of punctuation and non-printable
	characters, and filtered so that only purely alphabetic tokens remain.

	Args:
		lines: Sequence of raw sentence strings.
		max_lines: Maximum number of leading lines to process. Defaults to
			70000, the cap that used to be hard-coded; pass None to process
			every line.

	Returns:
		A numpy array with one row per processed line; each row holds a
		single string made of the surviving tokens joined by one space.
	"""
	# matches every character that is NOT in string.printable
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# translation table that deletes all ASCII punctuation
	table = str.maketrans('', '', string.punctuation)
	cleaned = list()
	# lines[:None] is the whole sequence, so None disables the cap
	for line in lines[:max_lines]:
		# decompose accented characters, then drop the non-ASCII combining marks
		line = normalize('NFD', line).encode('ascii', 'ignore')
		line = line.decode('UTF-8')
		# tokenize on white space and convert to lowercase
		tokens = [word.lower() for word in line.split()]
		# remove punctuation from each token
		tokens = [word.translate(table) for word in tokens]
		# remove non-printable chars from each token
		tokens = [re_print.sub('', word) for word in tokens]
		# keep purely alphabetic tokens only (drops numbers, mixed and empty tokens)
		tokens = [word for word in tokens if word.isalpha()]
		# store as a one-element row so the result is a 2-D array
		cleaned.append([' '.join(tokens)])
	return array(cleaned)

def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: Any picklable object (here: the cleaned sentence array).
		filename: Destination path; an existing file is overwritten.
	"""
	# the context manager flushes and closes the file before we report success
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)
 
# load dataset
# (paths are relative to the notebook's working directory)
en_file = 'dataset/eng.txt'
fr_file='dataset/frn.txt'
doc_en = load_doc(en_file)
doc_fr= load_doc(fr_file)
# split each document (english, french) into a list of lines
en_line = to_lines(doc_en)
fr_line= to_lines(doc_fr)
# clean sentences (each language is cleaned independently)
clean_en = clean_data(en_line)
clean_fr= clean_data(fr_line)
# save each cleaned language to its own pickle file
save_clean_data(clean_en, 'english.pkl')
save_clean_data(clean_fr, 'french.pkl')
# spot check: print the first 30 cleaned sentences side by side
# NOTE(review): pairing by index assumes line i of both files are
# translations of each other, and that both hold at least 30 lines -- confirm
for i in range(30):
	print('[%s] => [%s]' % (clean_en[i,0], clean_fr[i,0]))

Saved: english.pkl
Saved: french.pkl
[this does not pose any problems within the agricultural sector] => [ceci n a pas pose le moindre probleme dans le secteur agricole]
[quite the opposite in fact there is no loss of harvest and it is better for the health of the workers within agriculture and horticulture] => [au contraire il n y a pas eu de perte de recolte et c est un bien pour la sante des travailleurs du secteur agricole et horticole]
[it must be possible to stop using methyl bromide as from] => [il doit etre possible pour le debut de l an de ne plus utiliser du tout de bromure de methyle dans l union europeenne]
[hence exceptions for emergency situations seem completely redundant] => [des exceptions en cas d urgente necessite me paraissent totalement superflues]
[mr president i think that in this final debate before voting on this regulation the following points must be stressed] => [monsieur le president je crois quil est necessaire dinsister dans ce dernier debat avant le vote

In [3]:
from pickle import load, dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename`` and return it.

	Only use this on files written by this notebook: unpickling data from an
	untrusted source can execute arbitrary code.
	"""
	# the context manager closes the handle once the object is loaded
	with open(filename, 'rb') as handle:
		return load(handle)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: Any picklable object (here: a split of the cleaned data).
		filename: Destination path; an existing file is overwritten.
	"""
	# the context manager flushes and closes the file before we report success
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)
 
# load dataset
raw_dataset_en = load_clean_sentences('english.pkl')
raw_dataset_fr = load_clean_sentences('french.pkl')

# Row i of the english data is the translation of row i of the french data,
# so both must be reordered with the SAME permutation. Shuffling each array
# on its own would pair every english sentence with a random french one.
if len(raw_dataset_en) != len(raw_dataset_fr):
	raise ValueError('english and french datasets differ in length: %d vs %d'
		% (len(raw_dataset_en), len(raw_dataset_fr)))
order = list(range(len(raw_dataset_en)))
shuffle(order)
raw_dataset_en = raw_dataset_en[order]
raw_dataset_fr = raw_dataset_fr[order]
# split into train/test (first 60000 rows train, remainder test)
train_en, test_en = raw_dataset_en[:60000], raw_dataset_en[60000:]
train_fr, test_fr = raw_dataset_fr[:60000], raw_dataset_fr[60000:]
# save
save_clean_data(train_en, 'english-train.pkl')
save_clean_data(test_en, 'english-test.pkl')
save_clean_data(train_fr, 'french-train.pkl')
save_clean_data(test_fr, 'french-test.pkl')

['the action plan does not however specify that an external legal expert should sit on the board']
Saved: english-train.pkl
Saved: english-test.pkl
Saved: french-train.pkl
Saved: french-test.pkl


In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed 
from keras.callbacks import ModelCheckpoint

def load_clean_sentences(filename):
	"""Load a pickled dataset from ``filename`` and return it.

	Only use this on files written by this notebook: unpickling data from an
	untrusted source can execute arbitrary code.
	"""
	# the context manager closes the handle once the object is loaded
	with open(filename, 'rb') as handle:
		return load(handle)

# load the four train/test splits, in the order the names are listed
english_train, english_test, french_train, french_test = (
	load_clean_sentences(split_path)
	for split_path in (
		"english-train.pkl",
		"english-test.pkl",
		"french-train.pkl",
		"french-test.pkl",
	)
)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a Keras ``Tokenizer`` and fit it on the given texts.

	Args:
		lines: Iterable of sentence strings used to build the vocabulary.

	Returns:
		The fitted ``Tokenizer`` instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated token count among ``lines``."""
	token_counts = [len(sentence.split()) for sentence in lines]
	return max(token_counts)

# prepare english tokenizer
# (fitted on the training split only; column 0 holds the sentence string)
eng_tokenizer = create_tokenizer(english_train[:, 0])
# +1 leaves room for index 0, which encode_sequences uses as the padding value
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# longest training sentence, in tokens; used as the padded sequence length
eng_length = max_length(english_train[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare french tokenizer
# (same procedure as for english, applied to the french training split)
frn_tokenizer = create_tokenizer(french_train[:, 0])
frn_vocab_size = len(frn_tokenizer.word_index) + 1
frn_length = max_length(french_train[:, 0])
print('French Vocabulary Size: %d' % frn_vocab_size)
print('French Max Length: %d' % (frn_length))

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` and pad every sequence to ``length``.

	Args:
		tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
		length: Target sequence length passed to ``pad_sequences`` as ``maxlen``.
		lines: Iterable of sentence strings to encode.

	Returns:
		The padded integer sequences; zeros are appended after the tokens
		(``padding='post'``).
	"""
	return pad_sequences(
		tokenizer.texts_to_sequences(lines),
		maxlen=length,
		padding='post',
	)

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer in a 2-D array of padded sequences.

	Args:
		sequences: 2-D integer array (rows are sequences, columns are timesteps).
		vocab_size: Number of classes for the one-hot encoding.

	Returns:
		Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
		Its size grows with rows * timesteps * vocab_size.
	"""
	one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	n_rows, n_steps = sequences.shape[0], sequences.shape[1]
	return one_hot.reshape(n_rows, n_steps, vocab_size)

# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, english_train[:, 0])
trainY = encode_sequences(frn_tokenizer, frn_length, french_train[:, 0])
# Keep the targets as integer word ids instead of one-hot vectors.
# One-hot targets would have shape (sentences, frn_length, frn_vocab_size);
# with the sizes printed above (60000 x 173 x 42769) that is ~4.4e11 values,
# far more than fits in memory. The sparse loss used below consumes the
# integer ids directly. The trailing axis of size 1 gives the targets the
# same rank as the model output.
trainY = trainY.reshape(trainY.shape[0], trainY.shape[1], 1)

# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, english_test[:, 0])
testY = encode_sequences(frn_tokenizer, frn_length, french_test[:, 0])
testY = testY.reshape(testY.shape[0], testY.shape[1], 1)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Build the (uncompiled) encoder-decoder LSTM translation model.

	Args:
		src_vocab: Source vocabulary size (input dimension of the embedding).
		tar_vocab: Target vocabulary size (width of the softmax output).
		src_timesteps: Length of the padded source sequences.
		tar_timesteps: Length of the padded target sequences.
		n_units: Embedding size and number of units in both LSTM layers.

	Returns:
		A ``Sequential`` model producing one softmax distribution over
		``tar_vocab`` classes for each of the ``tar_timesteps`` output steps.
	"""
	model = Sequential()
	# mask_zero=True makes downstream layers ignore the 0 padding value
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	# encoder: compresses the whole source sentence into a single vector
	model.add(LSTM(n_units))
	# RepeatVector copies that vector tar_timesteps times, turning the
	# (batch, n_units) encoding into the (batch, tar_timesteps, n_units)
	# sequence the decoder LSTM expects
	model.add(RepeatVector(tar_timesteps))
	# decoder: emits one hidden state per target timestep
	model.add(LSTM(n_units, return_sequences=True))
	# TimeDistributed applies the same Dense softmax layer to every timestep
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# define model
model = define_model(eng_vocab_size, frn_vocab_size, eng_length, frn_length, 256)
# sparse loss: targets are integer word ids, not one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# summarize defined model (summary() prints by itself and returns None)
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# fit model, keeping only the weights with the lowest validation loss
filename = 'model1.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Using TensorFlow backend.


English Vocabulary Size: 28956
English Max Length: 160
French Vocabulary Size: 42769
French Max Length: 173
