In [12]:
##########-DATASET-##########
from google.colab import files
uploaded = files.upload() 

Saving spa.txt to spa (1).txt


In [13]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

def load_document(name):
	file = open(name, mode='rt', encoding='utf-8')
	text = file.read()
	file.close()
	return text

def sep_to_pairs(document):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

def clean_all_pairs(lines):
	cleaned = list()
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

def save_data(sentences, name):
	dump(sentences, open(name, 'wb'))
	print('Saved: %s' % name)

name = 'spa.txt'
doc = load_document(name)
pairs = sep_to_pairs(doc)
clean_pairs = clean_all_pairs(pairs)
save_data(clean_pairs, 'english-spanish.pkl')
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-spanish.pkl
[go] => [ve]
[go] => [vete]
[go] => [vaya]
[go] => [vayase]
[hi] => [hola]
[run] => [corre]
[run] => [corred]
[who] => [quien]
[fire] => [fuego]
[fire] => [incendio]
[fire] => [disparad]
[help] => [ayuda]
[help] => [socorro auxilio]
[help] => [auxilio]
[jump] => [salta]
[jump] => [salte]
[stop] => [parad]
[stop] => [para]
[stop] => [pare]
[wait] => [espera]
[wait] => [esperen]
[go on] => [continua]
[go on] => [continue]
[hello] => [hola]
[i ran] => [corri]
[i ran] => [corria]
[i try] => [lo intento]
[i won] => [he ganado]
[oh no] => [oh no]
[relax] => [tomatelo con soda]
[smile] => [sonrie]
[attack] => [al ataque]
[attack] => [atacad]
[get up] => [levanta]
[go now] => [ve ahora mismo]
[got it] => [lo tengo]
[got it] => [lo pillas]
[got it] => [entendiste]
[he ran] => [el corrio]
[hop in] => [metete adentro]
[hug me] => [abrazame]
[i fell] => [me cai]
[i know] => [yo lo se]
[i left] => [sali]
[i lied] => [menti]
[i lost] => [perdi]
[i quit] => [dimito]
[i quit

In [14]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl


In [15]:
##########-MODEL-##########
!apt-get -qq install -y graphviz && pip install -q pydot
import pydot

from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

def create_new_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

def max_length_in_list(lines):
	return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
	# encode sequences to integer 
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

dataset = load_clean_sentences('english-spanish-both.pkl')
trainset = load_clean_sentences('english-spanish-train.pkl')
testset = load_clean_sentences('english-spanish-test.pkl')

eng_tokenizer = create_new_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length_in_list(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
spa_tokenizer = create_new_tokenizer(dataset[:, 1])
spa_vocab_size = len(spa_tokenizer.word_index) + 1
spa_length = max_length_in_list(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % spa_vocab_size)
print('Spanish Max Length: %d' % (spa_length))

trainX = encode_sequences(spa_tokenizer, spa_length, trainset[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, trainset[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

testX = encode_sequences(spa_tokenizer, spa_length, testset[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, testset[:, 0])
testY = encode_output(testY, eng_vocab_size)

model = define_model(spa_vocab_size, eng_vocab_size, spa_length, eng_length, 256)
model.compile(optimizer='SGD', loss='mean_squared_error')
print(model.summary())
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=20)

English Vocabulary Size: 2343
English Max Length: 5
Spanish Vocabulary Size: 4520
Spanish Max Length: 8
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 8, 256)            1157120   
_________________________________________________________________
lstm_9 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 5, 2343)           602151    
Total params: 2,809,895
Trainable params: 2,809,895
Non-trainable params: 0
_________________________________________________________________
None
Train on 9000 samples, 

<keras.callbacks.History at 0x7fb7993b1978>

In [17]:
##########-BLEU-##########
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

def max_length(lines):
	return max(len(line.split()) for line in lines)

def encode_sequences(tokenizer, length, lines):
	X = tokenizer.texts_to_sequences(lines)
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append(raw_target.split())
		predicted.append(translation.split())
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

dataset = load_clean_sentences('english-spanish-both.pkl')
trainset = load_clean_sentences('english-spanish-train.pkl')
testset = load_clean_sentences('english-spanish-test.pkl')
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
spa_tokenizer = create_tokenizer(dataset[:, 1])
spa_vocab_size = len(spa_tokenizer.word_index) + 1
spa_length = max_length(dataset[:, 1])
trainX = encode_sequences(spa_tokenizer, spa_length, trainset[:, 1])
testX = encode_sequences(spa_tokenizer, spa_length, testset[:, 1])

model = load_model('model.h5')
print('train')
evaluate_model(model, eng_tokenizer, trainX, trainset)
print('test')
evaluate_model(model, eng_tokenizer, testX, testset)

train
src=[el cultiva arroz], target=[he grows rice], predicted=[honor honor honor honor honor]
src=[el se ha vuelto loco], target=[he has gone mad], predicted=[test behind behind behind behind]
src=[ellos lo construyeron], target=[they built it], predicted=[out out out out out]
src=[estas tu calvo], target=[are you bald], predicted=[dinnertime use use use use]
src=[quiero conducir], target=[i want to drive], predicted=[then then then then then]
src=[sabes nadar], target=[can you swim], predicted=[unreal unreal unreal unreal pass]
src=[si ciertamente], target=[yes of course], predicted=[despise despise despise despise sweating]
src=[para el coche], target=[stop the car], predicted=[mustnt mustnt mustnt mustnt mustnt]
src=[dejame explicar], target=[let me explain], predicted=[calling calling calling calling calling]
src=[yo hice eso], target=[did i do that], predicted=[yelled yelled yelled yelled yelled]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.000022
BLEU-2: 0.004714
BLEU-3: 0.040184
BLEU-4: 0.068662
test
src=[ella parecia triste], target=[she looked sad], predicted=[carefully carefully carefully carefully carefully]
src=[traemelo], target=[bring him to me], predicted=[answered answered sickens sickens sickens]
src=[me fui de excursion], target=[i went hiking], predicted=[biased biased biased biased biased]
src=[vamos a charlar], target=[lets chat], predicted=[black black black black black]
src=[lo he visto], target=[ive seen it], predicted=[turtles turtles turtles turtles turtles]
src=[ahora estas a salvo], target=[youre safe now], predicted=[cut cut cut cut cut]
src=[yo mire a otro lado], target=[i looked away], predicted=[tallest tallest tallest tallest tallest]
src=[hasta la vista], target=[goodbye], predicted=[kind kind useless useless useless]
src=[somos unas inutiles], target=[were useless], predicted=[thisll thisll thisll thisll thisll]
src=[puedo tomarlo por prestado], target=[can i borrow it], predicted=[