In [None]:
pip install tensorflow

In [7]:
import re
import string
from unicodedata import normalize

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout, Embedding
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
# Read the whole corpus file into memory as a single UTF-8 decoded string.
with open('English to French.csv', 'r', encoding='utf-8') as file:
    text = file.read()

# **Data Preprocessing**

In [2]:
# split a loaded document into sentences
def to_pairs(doc):
    """Split a loaded document into rows of tab-separated fields.

    Args:
        doc: Full document text; leading/trailing whitespace is stripped
            before the text is split on newline characters.

    Returns:
        A list with one entry per line, each entry being the list of
        fields obtained by splitting that line on tab characters.
    """
    rows = []
    for raw_line in doc.strip().split('\n'):
        rows.append(raw_line.split('\t'))
    return rows

In [3]:
# Display the punctuation characters that clean_pairs deletes from every token.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
def clean_pairs(lines):
    """Normalise every sentence in every row of ``lines``.

    Each sentence is decomposed with Unicode NFD and reduced to ASCII
    (non-ASCII code points are dropped), split on whitespace, lower-cased,
    stripped of ``string.punctuation`` and of any character outside
    ``string.printable``; tokens that are not purely alphabetic afterwards
    (including tokens containing digits and tokens left empty) are removed.
    The surviving tokens are re-joined with single spaces.

    Args:
        lines: Iterable of rows, each row an iterable of sentence strings.

    Returns:
        ``np.ndarray`` built from the list of cleaned rows (one cleaned
        string per input sentence).
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean_sentence(sentence):
        # Decompose accented characters, then keep only the ASCII part.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(punctuation_table)
            token = non_printable.sub('', token)
            # Keep purely alphabetic tokens only.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return np.array([[_clean_sentence(sentence) for sentence in pair] for pair in lines])

In [5]:
pairs = to_pairs(text)
# NOTE(review): this rebinds `clean_pairs` from the function to its result, so
# this cell cannot be re-run without first re-running the cell that defines the
# function. The name is kept because later cells read the array as `clean_pairs`.
clean_pairs = clean_pairs(pairs)
# Preview up to 100 cleaned rows; slicing (rather than indexing 0..99) avoids an
# IndexError when the corpus has fewer than 100 rows.
for row in clean_pairs[:100]:
    print('[%s] => [%s]' % (row[0], row[1]))

NameError: name 'text' is not defined

In [9]:
# Cap the working set at MAX_PAIRS rows. The slice is copied because basic
# slicing returns a view, and a later cell shuffles `dataset` in place — without
# the copy that shuffle would also reorder `clean_pairs`.
MAX_PAIRS = 150000
dataset = clean_pairs[:MAX_PAIRS, :].copy()
len(dataset)

133754

# **Split Data**

In [10]:
# Shuffle the rows of `dataset` in place using a seeded generator, so that a
# fresh Restart & Run All yields the same row order (and therefore the same
# train/test split downstream) every time.
SHUFFLE_SEED = 1
np.random.default_rng(SHUFFLE_SEED).shuffle(dataset)

In [11]:
# Hold out 20% of the rows for evaluation; random_state fixes the split's own
# shuffling. `train_test_split` is imported in the notebook's import cell.
train, test = train_test_split(dataset, test_size=0.2, random_state=1)

In [12]:
# Row counts of the training and test splits, in that order.
tuple(len(split) for split in (train, test))

(107003, 26751)

In [13]:
# Inspect a single row of the training split.
train[3]

array(['tom is much fatter than mary', 'tom est bien plus gros que marie',
       'ccby france attribution tatoebaorg ck'], dtype='<U81')

# **Build Model**

In [14]:
def tokenizer(x):
    """Create a ``Tokenizer`` and fit it on ``x``.

    Args:
        x: Texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    return fitted

In [15]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-delimited token count among ``lines``.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        The maximum of ``len(line.split())`` over ``lines``, or ``0`` when
        ``lines`` is empty (instead of letting ``max`` raise ``ValueError``).
    """
    return max((len(line.split()) for line in lines), default=0)

In [16]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer sequences padded to a fixed length.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'`` (padding is
        appended after each sequence).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [17]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence in ``sequences``.

    Args:
        sequences: 2-D array of integer ids (its first two ``shape`` entries
            are used to shape the result).
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        Array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.

    NOTE(review): the result holds
    ``sequences.shape[0] * sequences.shape[1] * vocab_size`` elements —
    confirm this fits in memory for the intended dataset and vocabulary.
    """
    one_hot = np.array(
        [to_categorical(seq, num_classes=vocab_size) for seq in sequences]
    )
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [18]:
# Fit the English tokenizer on column 0 of the dataset, then report the
# vocabulary size and the longest sentence (in tokens).
eng_sentences = dataset[:, 0]
eng_tokenizer = tokenizer(eng_sentences)
eng_length = max_length(eng_sentences)
# One more than the number of known words (presumably to leave room for the
# padding index 0 — confirm against the Tokenizer's indexing).
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 10686
English Max Length: 11


In [19]:
# Fit the French tokenizer on column 1 of the dataset, then report the
# vocabulary size and the longest sentence (in tokens).
fre_sentences = dataset[:, 1]
fre_tokenizer = tokenizer(fre_sentences)
fre_length = max_length(fre_sentences)
# One more than the number of known words (presumably to leave room for the
# padding index 0 — confirm against the Tokenizer's indexing).
fre_vocab_size = 1 + len(fre_tokenizer.word_index)
print('French Vocabulary Size: %d' % fre_vocab_size)
print('French Max Length: %d' % (fre_length))

French Vocabulary Size: 21813
French Max Length: 16


In [None]:
# Training inputs: column 1 encoded with the French tokenizer.
trainX = encode_sequences(fre_tokenizer, fre_length, train[:, 1])
# Training targets: column 0 encoded with the English tokenizer, then one-hot.
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

In [None]:
# Held-out inputs: column 1 encoded with the French tokenizer.
testX = encode_sequences(fre_tokenizer, fre_length, test[:, 1])
# Held-out targets: column 0 encoded with the English tokenizer, then one-hot.
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [None]:
# Model dimensions. The model consumes the French-encoded sequences (trainX)
# and predicts English tokens (trainY), so the source side uses the French
# tokenizer statistics and the target side the English ones.
src_vocab, src_timesteps = fre_vocab_size, fre_length
tar_vocab, tar_timesteps = eng_vocab_size, eng_length
# Width of the embedding and of both LSTM layers. NOTE(review): 256 is a
# starting value chosen for this fix (the notebook never defined n_units) — tune as needed.
n_units = 256

# Encoder–decoder network: embed the source tokens, compress the sequence into
# one vector with an LSTM, repeat that vector once per target time step, decode
# with a second LSTM and emit a softmax over the target vocabulary at each step.
model = Sequential([
    Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
    LSTM(n_units),
    RepeatVector(tar_timesteps),
    LSTM(n_units, return_sequences=True),
    TimeDistributed(Dense(tar_vocab, activation='softmax')),
])

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Configure training: Adam optimizer with categorical cross-entropy loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Path the checkpoint callback writes to. The notebook never defined
# `filename`; NOTE(review): 'model.h5' is a placeholder chosen here — adjust the
# path/format to suit the installed Keras version.
filename = 'model.h5'
# Save the model only when the validation loss improves on the best seen so far.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Train for 30 epochs in batches of 64, validating on the held-out split after each epoch.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)