In [None]:
# Report the machine's RAM and swap usage in human-readable units
!free -h

              total        used        free      shared  buff/cache   available
Mem:            25G        602M         22G        960K        1.9G         24G
Swap:            0B          0B          0B


In [10]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [2]:
# load a clean dataset

def load_clean_sentences(filename):
	"""Load and return the object pickled in `filename`.

	Args:
		filename: path to a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Note:
		Unpickling can execute arbitrary code, so only load files that
		come from a trusted source.
	"""
	# The context manager closes the handle even if unpickling fails;
	# the previous bare open() left that to the garbage collector.
	with open(filename, 'rb') as handle:
		return load(handle)

In [3]:
# fit a tokenizer

def create_tokenizer(lines):
	"""Build a Tokenizer and fit it on `lines`.

	Args:
		lines: the texts passed to `Tokenizer.fit_on_texts`.

	Returns:
		The fitted Tokenizer instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [4]:
# max sentence length

def max_length(lines):
	"""Return the largest whitespace-separated token count among `lines`.

	Args:
		lines: iterable of strings.

	Returns:
		The token count of the longest line, or 0 when `lines` is empty
		(previously an empty input raised ValueError from max()).
	"""
	return max((len(line.split()) for line in lines), default=0)

In [5]:
# encode and pad sequences

def encode_sequences(tokenizer, length, lines):
	"""Integer-encode `lines` and pad every sequence to `length`.

	Args:
		tokenizer: object providing `texts_to_sequences`.
		length: value passed to `pad_sequences` as `maxlen`.
		lines: the texts to encode.

	Returns:
		The result of `pad_sequences`, with padding added at the end
		of each sequence (padding='post').
	"""
	encoded = tokenizer.texts_to_sequences(lines)
	return pad_sequences(encoded, maxlen=length, padding='post')

In [6]:
# map an integer to a word

def word_for_id(integer, tokenizer):
	"""Map an integer id back to its word.

	Args:
		integer: the word index to look up.
		tokenizer: object exposing a `word_index` dict (word -> index) and,
			optionally, an `index_word` dict (index -> word).

	Returns:
		The word whose index equals `integer`, or None if there is none.
	"""
	# Prefer the reverse mapping when the tokenizer provides one: a dict
	# lookup avoids scanning the whole vocabulary for every predicted token.
	reverse = getattr(tokenizer, 'index_word', None)
	if reverse:
		return reverse.get(integer)
	# Fallback for tokenizers without (or with an empty) reverse mapping.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [7]:
# generate target given source sequence

def predict_sequence(model, tokenizer, source):
	"""Generate the target sentence for one encoded source sequence.

	Args:
		model: object whose `predict(source, verbose=0)` returns a batch
			of per-position score vectors.
		tokenizer: target-side tokenizer handed to `word_for_id`.
		source: encoded input; only the first prediction in the batch
			is decoded.

	Returns:
		The predicted words joined by single spaces. Decoding stops at the
		first position whose best id has no word (for example an id that
		is absent from the vocabulary).
	"""
	step_scores = model.predict(source, verbose=0)[0]
	words = []
	for scores in step_scores:
		# Pick the highest-scoring id at this position and look it up.
		word = word_for_id(argmax(scores), tokenizer)
		if word is None:
			break
		words.append(word)
	return ' '.join(words)

In [26]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset, verbose=False):
  """Translate every encoded source and print corpus BLEU-1..4 scores.

  Args:
    model: translation model passed on to `predict_sequence`.
    tokenizer: target-language tokenizer used to decode predictions.
    sources: iterable of 1-D encoded source sequences (each is reshaped
      to a batch of one before prediction).
    raw_dataset: object supporting `.iloc[i]`, where row i unpacks into
      (raw source text, raw target text) and lines up with `sources[i]`.
    verbose: when True, print the comparison line for every sample
      instead of only the first 10.

  Returns:
    None. Results are printed.
  """
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    # add a leading batch axis so the model sees a batch of one
    source = source.reshape((1, source.shape[0]))
    # decode with the tokenizer that was passed in; the previous code
    # silently read the global `eng_tokenizer` instead
    translation = predict_sequence(model, tokenizer, source)
    raw_src, raw_target = raw_dataset.iloc[i]
    if verbose or i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    # corpus_bleu expects a list of reference token lists per hypothesis
    actual.append([raw_target.split()])
    predicted.append(translation.split())
  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  # NOTE(review): these weights sum to 0.9 rather than 1; (1/3, 1/3, 1/3, 0)
  # would be the uniform choice. Left unchanged so that scores stay
  # comparable with earlier runs.
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [11]:
# load datasets: the raw dataset plus the train and test pickles
# (presumably splits of the raw dataset -- confirm against the cleaning notebook).
# NOTE: these are pickle files; only load them from a trusted Drive folder.
dataset = load_clean_sentences('/content/drive/MyDrive/NLP/NMT/Marathi-English/Cleaned Dataset/qed_raw_dataset.pkl')
train = load_clean_sentences('/content/drive/MyDrive/NLP/NMT/Marathi-English/Cleaned Dataset/qed_train.pkl')
test = load_clean_sentences('/content/drive/MyDrive/NLP/NMT/Marathi-English/Cleaned Dataset/qed_test.pkl')

In [12]:
# prepare marathi tokenizer (fit on the raw dataset's "Marathi" column)

mar_tokenizer = create_tokenizer(dataset["Marathi"])
# +1 leaves room for index 0 (assumes word indices start at 1 -- TODO confirm)
mar_vocab_size = len(mar_tokenizer.word_index) + 1
# longest sentence, counted in whitespace-separated tokens
mar_length = max_length(dataset["Marathi"])
print('Marathi Vocabulary Size: %d' % mar_vocab_size)
print('Marathi Max Length: %d' % (mar_length))

Marathi Vocabulary Size: 26502
Marathi Max Length: 1210


In [13]:
# prepare english tokenizer (fit on the raw dataset's "English" column)

eng_tokenizer = create_tokenizer(dataset["English"])
# +1 leaves room for index 0 (assumes word indices start at 1 -- TODO confirm)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# longest sentence, counted in whitespace-separated tokens
eng_length = max_length(dataset["English"])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

English Vocabulary Size: 6288
English Max Length: 365


In [14]:
# inspect the Marathi column of the training data
train["Marathi"]

0       aata aaplya kade general tools aahet konchyahi...
1                  एक शेतकरी ५३१ टमाटे उगवतो, तीन दिवसात.
2       आता ज़र सांगितले असेल कि टमाटान ची आवक १७६ नि क...
3                                          (सारंगी संगीत)
4                                         (संगीत समाप्ति)
                              ...                        
1295    JAWS येथे एका आदर्श स्त्रीला भेटल्यावर तिने या...
1296       तर दुसऱ्या करवी दुसरा, त्याचा वेगळा ठसा असतो .
1297    सर्व मुद्दे मांडल्यानेच तुम्हाला वास्तव कळते. ...
1298                                       [The Guardian]
1299    मेगन कामेरिक : तुम्ही सर्व सहमत असाल पूर्ण माह...
Name: Marathi, Length: 1300, dtype: object

In [15]:
# inspect the Marathi column of the test data
test["Marathi"]

1300                                            (टाळ्या )
1301                                    Camera Never Lies
1302                            Aapala punha swagat aahe.
1303    Tar magil chitrafitit aapan jethe thamblo, te ...
1304    Ani, tumhala mahit aahe ki aaplyakade ek base ...
                              ...                        
1740    लोक एकमेकांना ज्या प्रकारच्या पुराणकथा सांगतात...
1741    आपण मध्ययुगातून पाहू. आणि आपल्या पुढच्या भागात...
1742    मी कॅटलीन प्रिएम, धार्मिक अभ्यासात द्विपदवीधार...
1743    शिशिर ऋतूत मी पदवीधारक होण्यासाठी तयार होत होत...
1744    MSID म्हणजे थोडं परदेशात शिक्षण आणि थोडी असं ह...
Name: Marathi, Length: 445, dtype: object

In [16]:
# Prepare data: integer-encode the Marathi sentences with mar_tokenizer
# and pad each sequence to mar_length

trainX = encode_sequences(mar_tokenizer, mar_length, train["Marathi"])
testX = encode_sequences(mar_tokenizer, mar_length, test["Marathi"])

In [17]:
# load the saved Keras model (.h5 file) from Drive
model = load_model('/content/drive/MyDrive/NLP/NMT/Marathi-English/model/qed/model.h5')

In [18]:
# show the training data; the evaluation itself runs in the following cell
print(train)

                                                Marathi                                            English
0     aata aaplya kade general tools aahet konchyahi...  we now have the general tools to really tackle...
1                एक शेतकरी ५३१ टमाटे उगवतो, तीन दिवसात.  a farmer grows 531 tomatoes and is able to sel...
2     आता ज़र सांगितले असेल कि टमाटान ची आवक १७६ नि क...  given that his supply of tomatoes decreases by...
3                                        (सारंगी संगीत)                                     (violin music)
4                                       (संगीत समाप्ति)                                       (music ends)
...                                                 ...                                                ...
1295  JAWS येथे एका आदर्श स्त्रीला भेटल्यावर तिने या...  one of my board members is an editor at a majo...
1296     तर दुसऱ्या करवी दुसरा, त्याचा वेगळा ठसा असतो .  seen from another point of view, it gives quit...
1297  सर्व मुद्दे मांडल्यानेच तुम्हाल

In [None]:
# evaluate on the training data: prints sample translations and BLEU-1..4
evaluate_model(model, eng_tokenizer, trainX, train)

In [None]:
# show the test data; the evaluation itself runs in the following cell
print(test)

                                                Marathi                                            English
1300                                          (टाळ्या )                                         (applause)
1301                                  Camera Never Lies  (the camera never lies - introduction) [emmett...
1302                          Aapala punha swagat aahe.                                      welcome back.
1303  Tar magil chitrafitit aapan jethe thamblo, te ...  so where we left off in the last video, i'd sh...
1304  Ani, tumhala mahit aahe ki aaplyakade ek base ...          and, you know, we could have some base a.
...                                                 ...                                                ...
1740  लोक एकमेकांना ज्या प्रकारच्या पुराणकथा सांगतात...  so, according to xenophanes, culture actually ...
1741  आपण मध्ययुगातून पाहू. आणि आपल्या पुढच्या भागात...  we'll see through the middle ages, and then in...
1742  मी कॅटलीन प्रिएम, धार्मिक अभ्या

In [None]:
# evaluate on the test data: prints sample translations and BLEU-1..4
evaluate_model(model, eng_tokenizer, testX, test)

src=[(टाळ्या )], target=[(applause)], predicted=[laughter]
laughter
src=[Camera Never Lies], target=[(the camera never lies - introduction) [emmett sullivan] hello.], predicted=[applause]
applause
src=[Aapala punha swagat aahe.], target=[welcome back.], predicted=[laughter laughter laughter]
laughter laughter laughter
src=[Tar magil chitrafitit aapan jethe thamblo, te mi tumhala dakhaato Hyala Geometric series mhanatat.], target=[so where we left off in the last video, i'd shown you this thing called the geometric series.], predicted=[]

src=[Ani, tumhala mahit aahe ki aaplyakade ek base a asu shakato.], target=[and, you know, we could have some base a.], predicted=[]

src=[To kontahi ek anka asel.], target=[it could be any number.], predicted=[laughter laughter]
laughter laughter
src=[To 1/2 asel kinva 10 asel.], target=[it could be 1/2, it could be 10.], predicted=[]

src=[Pan to kewal ek anka asel.], target=[but that's just-- but some number.], predicted=[]

src=[मी तुमच्यासाठी घेऊन

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
