In [None]:
# Shell escape: report total/used/free system memory in human-readable units.
!free -h

              total        used        free      shared  buff/cache   available
Mem:            12G        568M         11G        972K        606M         11G
Swap:            0B          0B          0B


In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [None]:
#load dataset

def load_clean_sentences(filename):
	"""Load and return the pickled object stored at ``filename``.

	The file is opened in binary mode inside a ``with`` block so the handle
	is closed as soon as unpickling finishes, instead of staying open until
	the garbage collector reclaims it.

	NOTE(review): unpickling can execute arbitrary code, so only call this
	on files from a trusted source.
	"""
	with open(filename, 'rb') as handle:
		return load(handle)

In [None]:
#tokenizer

def create_tokenizer(lines):
	"""Return a new ``Tokenizer`` that has been fitted on ``lines``.

	``lines`` is handed to ``Tokenizer.fit_on_texts`` unchanged.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [None]:
#max length

def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``.

	Raises ``ValueError`` (from ``max``) when ``lines`` is empty.
	"""
	word_counts = [len(text.split()) for text in lines]
	return max(word_counts)

In [None]:
# encoding and padding

def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` and pad the result to ``length`` columns.

	``tokenizer.texts_to_sequences`` turns each line into a list of integer
	ids; ``pad_sequences`` is then called with ``maxlen=length`` and
	``padding='post'`` (padding goes after the tokens).
	"""
	return pad_sequences(
		tokenizer.texts_to_sequences(lines),
		maxlen=length,
		padding='post',
	)

In [None]:
# one hot encode target sequence

def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence in ``sequences``.

	Each row is passed to ``to_categorical`` with ``num_classes=vocab_size``;
	the per-row results are stacked with ``array`` and reshaped to
	``(sequences.shape[0], sequences.shape[1], vocab_size)``.

	Note: the result is dense, so it holds rows * columns * vocab_size
	values in memory at once.
	"""
	one_hot = array([
		to_categorical(row, num_classes=vocab_size)
		for row in sequences
	])
	n_rows, n_steps = sequences.shape[0], sequences.shape[1]
	return one_hot.reshape(n_rows, n_steps, vocab_size)

In [None]:
# define NMT model

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Build the (uncompiled) NMT ``Sequential`` model.

	Layer stack, in order:
	  1. ``Embedding(src_vocab, n_units)`` over ``src_timesteps`` inputs,
	     with ``mask_zero=True``.
	  2. ``LSTM(n_units)``.
	  3. ``RepeatVector(tar_timesteps)``.
	  4. ``LSTM(n_units, return_sequences=True)``.
	  5. ``TimeDistributed(Dense(tar_vocab, activation='softmax'))``.

	The caller is responsible for compiling the returned model.
	"""
	encoder_layers = [
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		LSTM(n_units),
	]
	decoder_layers = [
		RepeatVector(tar_timesteps),
		LSTM(n_units, return_sequences=True),
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	]
	return Sequential(encoder_layers + decoder_layers)

In [None]:
# Notebook line magic requesting TensorFlow 2.x; it runs before the import below.
%tensorflow_version 2.x
import tensorflow as tf
# Name of the GPU device as reported by TensorFlow.
device_name = tf.test.gpu_device_name()
# Abort early unless the reported device is exactly GPU 0.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# load datasets
# `dataset` is the full corpus (used below to fit the tokenizers and to
# derive the padding lengths); `train` / `test` are the two splits.
dataset = load_clean_sentences('/content/drive/MyDrive/Marathi-English/Cleaned Dataset/qed_raw_dataset.pkl')
# Fix: this line previously loaded qed_test.pkl, the same file as `test`,
# so the model was fitted and validated on identical data.
# NOTE(review): assumes the training split is saved alongside it as
# qed_train.pkl -- confirm the file name on Drive.
train = load_clean_sentences('/content/drive/MyDrive/Marathi-English/Cleaned Dataset/qed_train.pkl')
test = load_clean_sentences('/content/drive/MyDrive/Marathi-English/Cleaned Dataset/qed_test.pkl')

In [None]:
# Remove the row(s) whose index label equals the first index label
# (presumably a header/junk row -- confirm against the raw pickle).
# Not idempotent: `dataset` is overwritten, so re-running this cell drops
# one more row each time.
dataset=dataset.drop(dataset.index[0])

In [None]:
# Plain-text dump of the corpus (columns "Marathi" and "English").
print(dataset)

                                                Marathi                                            English
1                एक शेतकरी ५३१ टमाटे उगवतो, तीन दिवसात.  a farmer grows 531 tomatoes and is able to sel...
2     आता ज़र सांगितले असेल कि टमाटान ची आवक १७६ नि क...  given that his supply of tomatoes decreases by...
3                                        (सारंगी संगीत)                                     (violin music)
4                                       (संगीत समाप्ति)                                       (music ends)
5     शुभ प्रभात. आज मी इकडे आलोय ते स्वयंचलित, उडणा...  good morning. i'm here today to talk about aut...
...                                                 ...                                                ...
1741  आपण मध्ययुगातून पाहू. आणि आपल्या पुढच्या भागात...  we'll see through the middle ages, and then in...
1742  मी कॅटलीन प्रिएम, धार्मिक अभ्यासात द्विपदवीधार...  caitlin priem, religious studies major, and i ...
1743  शिशिर ऋतूत मी पदवीधारक होण्यासा

In [None]:
# Marathi (source) side: tokenizer, vocabulary size and padding length,
# all derived from the full `dataset` column.

mar_tokenizer = create_tokenizer(dataset["Marathi"])
# Longest Marathi line, measured in whitespace-separated words.
mar_length = max_length(dataset["Marathi"])
# One more than the number of distinct words in the tokenizer's word_index.
mar_vocab_size = 1 + len(mar_tokenizer.word_index)
print('Marathi Vocabulary Size: %d' % mar_vocab_size)
print('Marathi Max Length: %d' % mar_length)

Marathi Vocabulary Size: 26497
Marathi Max Length: 1210


In [None]:
# English (target) side: tokenizer, vocabulary size and padding length,
# all derived from the full `dataset` column.

eng_tokenizer = create_tokenizer(dataset["English"])
# Longest English line, measured in whitespace-separated words.
eng_length = max_length(dataset["English"])
# One more than the number of distinct words in the tokenizer's word_index.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

English Vocabulary Size: 6286
English Max Length: 365


In [None]:
# Training tensors: padded Marathi ids as input, one-hot English as target.

trainX = encode_sequences(mar_tokenizer, mar_length, train["Marathi"])
# Integer-encode and pad the English side, then one-hot it in the same step.
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train["English"]),
    eng_vocab_size,
)

In [None]:
# Validation tensors: padded Marathi ids as input, one-hot English as target.

testX = encode_sequences(mar_tokenizer, mar_length, test["Marathi"])
# Integer-encode and pad the English side, then one-hot it in the same step.
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test["English"]),
    eng_vocab_size,
)

In [None]:
# define model
# The source language here is Marathi, so the encoder is sized with
# mar_vocab_size / mar_length. The previous call passed ger_vocab_size /
# ger_length, names that are not defined anywhere in the visible notebook,
# so the cell failed with NameError.
# 256 is the embedding width and the LSTM unit count (n_units).

model = define_model(mar_vocab_size, eng_vocab_size, mar_length, eng_length, 256)
# categorical_crossentropy matches the one-hot targets built by encode_output.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# summarize defined model
# model.summary() prints the table itself and returns None, so wrapping it
# in print() emitted a stray "None" line after the summary.

model.summary()
# Also writes the architecture diagram to model.png.
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
# Train the model, checkpointing to `filename` whenever val_loss improves.

filename = 'model.h5'
# save_best_only + mode='min': only overwrite the file on a new lowest val_loss.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
model.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    epochs=30,
    batch_size=64,
    callbacks=[checkpoint],
    verbose=2,
)

In [None]:
# Shell escape: print the log file at the path below into the cell output.
!cat /var/log/colab-jupyter.log