In [113]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, RepeatVector , TimeDistributed
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint
from keras.models import load_model

In [114]:
# load doc into memory
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text


In [115]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [116]:
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)
 
# load dataset
filename = '/content/sample_data/fra.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-french.pkl')
# spot check
for i in range(50):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-french.pkl
[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]
[stop] => [ca suffit]
[stop] => [stop]
[stop] => [arretetoi]
[wait] => [attends]
[wait] => [attendez]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[hello] => [bonjour]
[hello] => [salut]
[i see] => [je comprends]
[i try] => [jessaye]
[i won] => [jai gagne]
[i won] => [je lai emporte]
[i won] => [jai gagne]
[oh no] => [oh non]
[attack] => [attaque]
[attack] => [attaquez]
[cheers] => [sante]
[cheers] => [a votre sante]
[cheers] => [merci]
[cheers] => [tchintchin]
[get up] => [levetoi]
[go now] => [va maintenant]
[go now] => [allezy maintenant]
[go now] => [vasy maintenant]
[got it] => [jai pige]
[got it] => [compris]
[got it] => [pige]
[got it] => [compris]
[got it] => [tas capte]
[hop in] => [monte]
[hop in] => [montez]
[hug me] => [serremoi dans tes bras]
[hug me] => 

In [117]:
for i in range(10):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]


In [118]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)
 
# load dataset
raw_dataset = load_clean_sentences('english-french.pkl')
 
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

Saved: english-french-both.pkl
Saved: english-french-train.pkl
Saved: english-french-test.pkl


In [119]:
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# load datasets
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')

In [120]:
print(train)

[['we broke up' 'nous avons rompu'
  'ccby france attribution tatoebaorg hybrid sacredceltic']
 ['give me a hand' 'pretezmoi main forte'
  'ccby france attribution tatoebaorg ck sacredceltic']
 ['dont do it' 'ne le faites pas'
  'ccby france attribution tatoebaorg scott sacredceltic']
 ...
 ['i will testify' 'je temoignerai'
  'ccby france attribution tatoebaorg ck sacredceltic']
 ['im desperate' 'je suis desesperee'
  'ccby france attribution tatoebaorg erikspen micsmithel']
 ['are you joking' 'estce que tu blagues'
  'ccby france attribution tatoebaorg ck sacredceltic']]


In [121]:
train.shape

(9000, 3)

In [122]:
test.shape

(1000, 3)

In [123]:
print(test)

[['save tom' 'sauvez tom' 'ccby france attribution tatoebaorg ck aiji']
 ['youre crafty' 'vous etes rusees'
  'ccby france attribution tatoebaorg ck sacredceltic']
 ['i have doubts' 'jai des doutes'
  'ccby france attribution tatoebaorg ck aiji']
 ...
 ['i love animals' 'jadore les animaux'
  'ccby france attribution tatoebaorg hybrid sacredceltic']
 ['he is outgoing' 'il est extraverti'
  'ccby france attribution tatoebaorg ck sacredceltic']
 ['just ignore it' 'ignorele simplement'
  'ccby france attribution tatoebaorg hybrid sacredceltic']]


In [124]:
# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [125]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [126]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('French Vocabulary Size: %d' % ger_vocab_size)
print('French Max Length: %d' % (ger_length))

English Vocabulary Size: 2136
English Max Length: 5
French Vocabulary Size: 4380
French Max Length: 10


In [127]:
# encode and pad sequences
from keras.preprocessing.sequence import pad_sequences
# define sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [128]:
# one hot encode target sequence
from keras.utils import to_categorical
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [129]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [130]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#plt.plot(model, to_file='model.png', show_shapes=True)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 10, 256)           1121280   
_________________________________________________________________
lstm_19 (LSTM)               (None, 256)               525312    
_________________________________________________________________
repeat_vector_9 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_8 (TimeDist (None, 5, 2136)           548952    
Total params: 2,720,856
Trainable params: 2,720,856
Non-trainable params: 0
_________________________________________________________________
None


In [131]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 3.40430, saving model to model.h5
141/141 - 23s - loss: 4.1965 - val_loss: 3.4043
Epoch 2/30

Epoch 00002: val_loss improved from 3.40430 to 3.21930, saving model to model.h5
141/141 - 22s - loss: 3.2329 - val_loss: 3.2193
Epoch 3/30

Epoch 00003: val_loss improved from 3.21930 to 3.13313, saving model to model.h5
141/141 - 22s - loss: 3.0742 - val_loss: 3.1331
Epoch 4/30

Epoch 00004: val_loss improved from 3.13313 to 3.01463, saving model to model.h5
141/141 - 23s - loss: 2.9448 - val_loss: 3.0146
Epoch 5/30

Epoch 00005: val_loss improved from 3.01463 to 2.88308, saving model to model.h5
141/141 - 22s - loss: 2.7698 - val_loss: 2.8831
Epoch 6/30

Epoch 00006: val_loss improved from 2.88308 to 2.78203, saving model to model.h5
141/141 - 22s - loss: 2.6019 - val_loss: 2.7820
Epoch 7/30

Epoch 00007: val_loss improved from 2.78203 to 2.67989, saving model to model.h5
141/141 - 22s - loss: 2.4458 - val_loss: 2.6799
Epoch 8/30

Epoch

<tensorflow.python.keras.callbacks.History at 0x7f6863229f60>

In [133]:
print(train[0])

['we broke up' 'nous avons rompu'
 'ccby france attribution tatoebaorg hybrid sacredceltic']


In [134]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
    # translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target = str(raw_dataset[i])
		if i < 10:
			print('target=[%s], predicted=[%s]' % ( raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')
#prepare english tokenizer
eng_tokenizer=create_tokenizer(dataset[:, 0])
eng_vocab_size=len(eng_tokenizer.word_index)+1
eng_length=max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer=create_tokenizer(dataset[:,1])
ger_vocab_size=len(ger_tokenizer.word_index)+1
ger_length=max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
 target=[['we broke up' 'nous avons rompu'
 'ccby france attribution tatoebaorg hybrid sacredceltic']], predicted=[we broke up]
 target=[['give me a hand' 'pretezmoi main forte'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[give me a hand]
 target=[['dont do it' 'ne le faites pas'
 'ccby france attribution tatoebaorg scott sacredceltic']], predicted=[dont do it]
 target=[['take a shower' 'prenez une douche'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[take a shower]
 target=[['tom succeeded' 'tom a reussi'
 'ccby france attribution tatoebaorg ck julienpdc']], predicted=[tom succeeded]
 target=[['youre lazy' 'vous etes paresseux'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[youre lazy]
 target=[['it was night' 'cetait la nuit'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[it was night]
 target=[['they gave up' 'ils ont abandonne'
 'ccby france attribution tatoebaorg ck aiji']], predicted=[the

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.009759
BLEU-2: 0.004451
BLEU-3: 0.010459
BLEU-4: 0.012950
test
 target=[['save tom' 'sauvez tom' 'ccby france attribution tatoebaorg ck aiji']], predicted=[warn tom]
 target=[['youre crafty' 'vous etes rusees'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[youre amazing]
 target=[['i have doubts' 'jai des doutes'
 'ccby france attribution tatoebaorg ck aiji']], predicted=[i have lost]
 target=[['they all knew' 'elles savaient toutes'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[they all drank]
 target=[['youve tried' 'vous avez essaye'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[you tried]
 target=[['now we wait' 'maintenant nous attendons'
 'ccby france attribution tatoebaorg ck sacredceltic']], predicted=[well will it]
 target=[['look at my arm' 'regarde mon bras'
 'ccby france attribution tatoebaorg hybrid aiji']], predicted=[many my my]
 target=[['we must obey' 'il nous faut obeir'
 'ccby france attributi