In [58]:
import numpy as np
import re
import string
from unicodedata import normalize 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical, plot_model

In [59]:
with open('/content/English to French.csv', mode = 'r', encoding = 'utf-8') as file:
  text = file.read()

# **Data Preprocessing**

In [60]:
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

In [61]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [62]:
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
            #line = line.decode('unicode-escape')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return np.array(cleaned)

In [63]:
pairs = to_pairs(text)
clean_pairs = clean_pairs(pairs)
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]
[stop] => [ca suffit]
[stop] => [stop]
[stop] => [arretetoi]
[wait] => [attends]
[wait] => [attendez]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[hello] => [bonjour]
[hello] => [salut]
[i see] => [je comprends]
[i try] => [jessaye]
[i won] => [jai gagne]
[i won] => [je lai emporte]
[i won] => [jai gagne]
[oh no] => [oh non]
[attack] => [attaque]
[attack] => [attaquez]
[cheers] => [sante]
[cheers] => [a votre sante]
[cheers] => [merci]
[cheers] => [tchintchin]
[get up] => [levetoi]
[go now] => [va maintenant]
[go now] => [allezy maintenant]
[go now] => [vasy maintenant]
[got it] => [jai pige]
[got it] => [compris]
[got it] => [pige]
[got it] => [compris]
[got it] => [tas capte]
[hop in] => [monte]
[hop in] => [montez]
[hug me] => [serremoi dans tes bras]
[hug me] => [serrezmoi dans vos bras]


In [64]:
dataset = clean_pairs[:15000, :]
len(dataset)

15000

# **Split Data**

In [65]:
np.random.shuffle(dataset)

In [66]:
from sklearn.model_selection import train_test_split
train, test =train_test_split(dataset, test_size=0.2, random_state=1)

In [67]:
len(train), len(test)

(12000, 3000)

In [68]:
train[3]

array(['youre shy', 'tu es timide',
       'ccby france attribution tatoebaorg ck sacredceltic'],
      dtype='<U339')

# **Build Model**

In [69]:
def tokenizer(x):   
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer

In [70]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [71]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	X = tokenizer.texts_to_sequences(lines)
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [72]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = np.array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [73]:
# prepare english tokenizer
eng_tokenizer = tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 2888
English Max Length: 5


In [74]:
# prepare french tokenizer
fre_tokenizer = tokenizer(dataset[:, 1])
fre_vocab_size = len(fre_tokenizer.word_index) + 1
fre_length = max_length(dataset[:, 1])
print('French Vocabulary Size: %d' % fre_vocab_size)
print('French Max Length: %d' % (fre_length))

French Vocabulary Size: 5797
French Max Length: 11


In [75]:
trainX = encode_sequences(fre_tokenizer, fre_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

In [76]:
testX = encode_sequences(fre_tokenizer, fre_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [77]:
model = Sequential([
	Embedding(fre_vocab_size, 256, input_length=fre_length, mask_zero=True),
	LSTM(256),
	RepeatVector(eng_length),
	LSTM(256, return_sequences=True),
	TimeDistributed(Dense(eng_vocab_size, activation='softmax'))
])

In [78]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 11, 256)           1484032   
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 repeat_vector_1 (RepeatVect  (None, 5, 256)           0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed_1 (TimeDis  (None, 5, 2888)          742216    
 tributed)                                                       
                                                                 
Total params: 3,276,872
Trainable params: 3,276,872
No

In [79]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [80]:
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), verbose=2)

Epoch 1/30
188/188 - 15s - loss: 4.3271 - val_loss: 3.6460 - 15s/epoch - 80ms/step
Epoch 2/30
188/188 - 2s - loss: 3.4893 - val_loss: 3.5039 - 2s/epoch - 13ms/step
Epoch 3/30
188/188 - 2s - loss: 3.3134 - val_loss: 3.3760 - 2s/epoch - 11ms/step
Epoch 4/30
188/188 - 2s - loss: 3.1211 - val_loss: 3.2480 - 2s/epoch - 12ms/step
Epoch 5/30
188/188 - 3s - loss: 2.9419 - val_loss: 3.1003 - 3s/epoch - 14ms/step
Epoch 6/30
188/188 - 3s - loss: 2.7562 - val_loss: 2.9743 - 3s/epoch - 14ms/step
Epoch 7/30
188/188 - 2s - loss: 2.5887 - val_loss: 2.8833 - 2s/epoch - 12ms/step
Epoch 8/30
188/188 - 3s - loss: 2.4307 - val_loss: 2.7761 - 3s/epoch - 15ms/step
Epoch 9/30
188/188 - 2s - loss: 2.2708 - val_loss: 2.6790 - 2s/epoch - 13ms/step
Epoch 10/30
188/188 - 3s - loss: 2.1193 - val_loss: 2.6019 - 3s/epoch - 15ms/step
Epoch 11/30
188/188 - 2s - loss: 1.9813 - val_loss: 2.5402 - 2s/epoch - 12ms/step
Epoch 12/30
188/188 - 2s - loss: 1.8517 - val_loss: 2.4716 - 2s/epoch - 11ms/step
Epoch 13/30
188/188 - 3

<keras.callbacks.History at 0x7fb1965f1760>