In [5]:
import string
from pickle import dump
from numpy import array
from pickle import load
from unicodedata import normalize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file and return its contents as one string.

	Args:
		filename: path of the text file to read.

	Returns:
		The full file contents as a ``str``.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the file is not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_pairs(doc):
	"""Turn a tab-separated document into a list of per-line field lists.

	Leading/trailing whitespace of the whole document is stripped, the text
	is cut at every newline, and each resulting row is cut at every tab.

	Args:
		doc: the raw document text.

	Returns:
		A list with one entry per line; each entry is the list of
		tab-separated fields found on that line.
	"""
	pairs = []
	for row in doc.strip().split('\n'):
		pairs.append(row.split('\t'))
	return pairs

# clean a list of lines
def clean_pairs(lines):
	"""Lower-case and whitespace-normalise every sentence of every pair.

	Each sentence is split on whitespace, every word is lower-cased and the
	words are re-joined with single spaces. Punctuation is left untouched:
	the punctuation translation table that used to be built here was never
	applied to the text, so it has been removed.

	Args:
		lines: iterable of pairs, each pair an iterable of sentence strings.

	Returns:
		A numpy array built from the list of cleaned pairs.
	"""
	cleaned = list()
	for pair in lines:
		# split() followed by join collapses whitespace runs and trims the ends
		clean_pair = [' '.join(word.lower() for word in line.split()) for line in pair]
		cleaned.append(clean_pair)
	return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: the object to serialise.
		filename: destination path; it is opened in binary write mode, so an
			existing file is overwritten.
	"""
	# the context manager flushes and closes the file before we report success
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
filename = 'kan-eng.tsv'
doc = load_doc(filename)
# split into english-kannada pairs
pairs = to_pairs(doc)
# clean sentences
# the result gets its own name: rebinding `clean_pairs` would replace the
# function with an array and make this cell fail when it is run a second time
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-kannada36.pkl')
# preview at most the first 100 pairs (fewer when the dataset is smaller)
for i in range(min(100, len(cleaned_pairs))):
	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))


# load a clean dataset
def load_clean_sentences(filename):
	"""Load and return the object pickled in ``filename``.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was stored in the file.
	"""
	# NOTE(security): unpickling can execute arbitrary code; only load files
	# that were produced by a trusted source.
	with open(filename, 'rb') as handle:
		return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle ``sentences`` to ``filename`` and print a confirmation line.

	Args:
		sentences: the object to serialise.
		filename: destination path; it is opened in binary write mode, so an
			existing file is overwritten.
	"""
	# close the file deterministically instead of leaving the handle to the GC
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-kannada36.pkl')

# reduce dataset size
n_sentences = 36000
# number of (shuffled) rows used for training; the remaining rows of the
# n_sentences subset become the held-out test split
n_train = 34000
dataset = raw_dataset[:n_sentences, :]
# random shuffle (in place; no seed is set in this cell, so the split can
# differ between runs)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-kannada-both36.pkl')
save_clean_data(train, 'english-kannada-train36.pkl')
save_clean_data(test, 'english-kannada-test36.pkl')
# load a clean dataset
def load_clean_sentences(filename):
	"""Load and return the object pickled in ``filename``.

	Args:
		filename: path of a pickle file.

	Returns:
		Whatever object was stored in the file.
	"""
	# NOTE(security): unpickling can execute arbitrary code; only load files
	# that were produced by a trusted source.
	with open(filename, 'rb') as handle:
		return load(handle)

# load datasets
# Re-read the three pickles written by the split step above: the full
# shuffled subset, then its train and test parts.
dataset = load_clean_sentences('english-kannada-both36.pkl')
train = load_clean_sentences('english-kannada-train36.pkl')
test = load_clean_sentences('english-kannada-test36.pkl')
# fit a tokenizer
def create_tokenizer(lines):
	"""Create a ``Tokenizer`` with default settings and fit it on ``lines``.

	Args:
		lines: the texts passed to ``fit_on_texts``.

	Returns:
		The fitted ``Tokenizer`` instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted
# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count among ``lines``.

	Args:
		lines: iterable of sentence strings.

	Returns:
		The word count of the longest sentence, or 0 when ``lines`` is empty.
	"""
	# default=0 keeps an empty corpus from raising ValueError inside max()
	return max((len(line.split()) for line in lines), default=0)
# prepare english tokenizer
# column 0 of the dataset is treated as the English side throughout this cell
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 on top of the number of distinct words — presumably to reserve index 0,
# which encode_sequences uses as the padding value (confirm against Keras docs)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# longest English sentence in words; used as the padded target length
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare kannada tokenizer
# column 1 of the dataset is treated as the Kannada side
kan_tokenizer = create_tokenizer(dataset[:, 1])
kan_vocab_size = len(kan_tokenizer.word_index) + 1
# longest Kannada sentence in words; used as the padded source length
kan_length = max_length(dataset[:, 1])
print('Kannada Vocabulary Size: %d' % kan_vocab_size)
print('Kannada Max Length: %d' % (kan_length))
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode ``lines`` and post-pad every sequence to ``length``.

	Args:
		tokenizer: object providing ``texts_to_sequences``.
		length: value passed to ``pad_sequences`` as ``maxlen``.
		lines: the texts to encode.

	Returns:
		The result of ``pad_sequences`` for the encoded texts.
	"""
	# one list of integer word ids per input text
	encoded = tokenizer.texts_to_sequences(lines)
	# padding goes after the real tokens ('post'), with 0 values
	return pad_sequences(encoded, maxlen=length, padding='post')
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every row of an integer sequence matrix.

	Args:
		sequences: 2-D array-like with a ``shape`` attribute; each row is
			passed to ``to_categorical``.
		vocab_size: number of classes for the one-hot encoding.

	Returns:
		An array reshaped to
		``(sequences.shape[0], sequences.shape[1], vocab_size)``.
	"""
	one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
	n_samples, n_steps = sequences.shape[0], sequences.shape[1]
	return array(one_hot_rows).reshape(n_samples, n_steps, vocab_size)
# prepare training data
# the model input (X) is the Kannada column, the target (Y) the English column
trainX = encode_sequences(kan_tokenizer, kan_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# targets are one-hot encoded over the English vocabulary, matching the
# categorical_crossentropy loss the model is compiled with below
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
# same encoding as above, applied to the held-out split
testX = encode_sequences(kan_tokenizer, kan_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Assemble the (uncompiled) sequential translation model.

	Layer stack, in order: embedding over the source vocabulary (with
	``mask_zero=True``), an LSTM, a ``RepeatVector`` of the target length, an
	LSTM that returns sequences, and a time-distributed softmax ``Dense`` over
	the target vocabulary.

	Args:
		src_vocab: source vocabulary size (embedding input dimension).
		tar_vocab: target vocabulary size (width of the softmax layer).
		src_timesteps: ``input_length`` given to the embedding.
		tar_timesteps: repeat count for ``RepeatVector``.
		n_units: embedding dimension and LSTM unit count.

	Returns:
		The ``Sequential`` model holding those layers.
	"""
	layers = [
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		LSTM(n_units),
		RepeatVector(tar_timesteps),
		LSTM(n_units, return_sequences=True),
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	]
	return Sequential(layers)

# define model
# source = Kannada, target = English; 256 is the embedding/LSTM width
model = define_model(kan_vocab_size, eng_vocab_size, kan_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename='model32k.h5'
# keep only the weights with the lowest validation loss seen so far
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=1000, batch_size=128, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Saved: english-kannada36.pkl
[go.] => [ಹೋಗಿ.]
[run!] => [ಓಡು!]
[run!] => [ಓಡು!]
[run!] => [ಓಡು!]
[run!] => [ಓಡು!]
[who?] => [ಯಾರು?]
[wow!] => [ಅದ್ಭುತ!]
[fire!] => [ಬೆಂಕಿ!]
[fire!] => [ಬೆಂಕಿ!]
[help!] => [ಸಹಾಯ!]
[help!] => [ಸಹಾಯ!]
[jump!] => [ನೆಗೆಯುವುದನ್ನು!]
[jump!] => [ನೆಗೆಯುವುದನ್ನು!]
[jump.] => [ನೆಗೆಯುವುದನ್ನು.]
[jump.] => [ನೆಗೆಯುವುದನ್ನು.]
[stop!] => [ನಿಲ್ಲಿಸು!]
[stop!] => [ನಿಲ್ಲಿಸು!]
[wait!] => [ನಿರೀಕ್ಷಿಸಿ!]
[wait!] => [ನಿರೀಕ್ಷಿಸಿ!]
[hello!] => [ಹಲೋ!]
[hurry!] => [ಯದ್ವಾತದ್ವಾ!]
[hurry!] => [ಯದ್ವಾತದ್ವಾ!]
[hurry!] => [ಯದ್ವಾತದ್ವಾ!]
[i won!] => [ನಾನು ಗೆದ್ದೆ!]
[i won!] => [ನಾನು ಗೆದ್ದೆ!]
[get up.] => [ಎದ್ದೇಳು.]
[got it!] => [ಅರ್ಥವಾಯಿತು!]
[got it?] => [ಅರ್ಥವಾಯಿತು?]
[got it?] => [ಅರ್ಥವಾಯಿತು?]
[got it?] => [ಅರ್ಥವಾಯಿತು?]
[got it?] => [ಅರ್ಥವಾಯಿತು?]
[he ran.] => [ಅವನು ಓಡಿದ.]
[he ran.] => [ಅವನು ಓಡಿದ.]
[he ran.] => [ಅವನು ಓಡಿದ.]
[he ran.] => [ಅವನು ಓಡಿದ.]
[i fell.] => [ನಾನು ಬಿದ್ದೆ.]
[i fell.] => [ನಾನು ಬಿದ್ದೆ.]
[i fell.] => [ನಾನು ಬಿದ್ದೆ.]
[i fell.] => [ನಾನು ಬಿದ್ದೆ.]
[i know.] => [ನನಗೆ ಗೊತ್ತು.]
[i kno

<keras.callbacks.History at 0x7f3556ea62e8>

In [14]:
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu


# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of a pickle file.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by a trusted source.
    with open(filename, 'rb') as handle:
        return load(handle)


# fit a tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: the texts passed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Args:
        lines: iterable of sentence strings.

    Returns:
        The word count of the longest sentence, or 0 when ``lines`` is empty.
    """
    # default=0 keeps an empty corpus from raising ValueError inside max()
    return max((len(line.split()) for line in lines), default=0)


# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and post-pad every sequence to ``length``.

    Args:
        tokenizer: object providing ``texts_to_sequences``.
        length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts to encode.

    Returns:
        The result of ``pad_sequences`` for the encoded texts.
    """
    # one list of integer word ids per input text
    encoded = tokenizer.texts_to_sequences(lines)
    # padding goes after the real tokens ('post'), with 0 values
    return pad_sequences(encoded, maxlen=length, padding='post')


# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Args:
        integer: the index to search for.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word (in mapping order) whose index equals ``integer``,
        or ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)


# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a sentence.

    Only the first element of ``model.predict(source)`` is decoded. For each
    vector in it the arg-max index is mapped to a word with ``word_for_id``;
    decoding stops at the first index that has no word.

    Args:
        model: object with a ``predict(source, verbose=0)`` method.
        tokenizer: tokenizer handed to ``word_for_id`` for the lookup.
        source: the encoded input passed to ``model.predict``.

    Returns:
        The decoded words joined by single spaces.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        word = word_for_id(argmax(scores), tokenizer)
        # an index without a word ends the sentence
        if word is None:
            break
        words.append(word)
    return ' '.join(words)


# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus BLEU-1..4 scores.

    Changes from the previous version:
      * the ``tokenizer`` argument is now used for decoding (the global
        ``eng_tokenizer`` was used before and the parameter was ignored);
      * the reference/hypothesis collection and the BLEU printing are live
        code again (they had been swallowed by a stray triple-quoted string,
        so the function evaluated nothing);
      * rows of ``raw_dataset`` are indexed rather than unpacked into exactly
        three names, so rows with two or more columns are accepted.

    Args:
        model: trained model passed on to ``predict_sequence``.
        tokenizer: target-language tokenizer used to decode predictions.
        sources: iterable of 1-D encoded source sequences (each must support
            ``reshape`` and ``shape``).
        raw_dataset: indexable collection aligned with ``sources``; element 0
            of each row is used as the reference translation and element 1 as
            the source text.

    Returns:
        None. The first ten examples and the four BLEU scores are printed.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text (predict expects a batch dimension)
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu takes a list of reference lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


# load datasets
# the same three pickles that the training cell wrote
dataset = load_clean_sentences('english-kannada-both36.pkl')
train = load_clean_sentences('english-kannada-train36.pkl')
test = load_clean_sentences('english-kannada-test36.pkl')
# prepare english tokenizer
# tokenizers are re-fitted on the saved dataset rather than loaded from disk
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare kannada tokenizer
kan_tokenizer = create_tokenizer(dataset[:, 1])
kan_vocab_size = len(kan_tokenizer.word_index) + 1
kan_length = max_length(dataset[:, 1])
# prepare data
# only the Kannada (source) side is encoded here; it is the model input
trainX = encode_sequences(kan_tokenizer, kan_length, train[:, 1])
testX = encode_sequences(kan_tokenizer, kan_length, test[:, 1])


# load model
# 'model32k.h5' is the checkpoint path given to ModelCheckpoint in the training cell
model = load_model('model32k.h5')
# test on some training sequences
#print('train')
#evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
#print('test')
#evaluate_model(model, eng_tokenizer, testX, test)

def custom_pred(sentence):
    """Translate one raw Kannada sentence with the loaded model.

    Uses the module-level ``kan_tokenizer``, ``kan_length``, ``model`` and
    ``eng_tokenizer`` prepared earlier in this cell.

    Args:
        sentence: the source sentence as a plain string.

    Returns:
        The predicted translation as a space-separated string.
    """
    encoded = kan_tokenizer.texts_to_sequences([sentence])
    encoded = pad_sequences(encoded, maxlen=kan_length, padding='post')
    # predict_sequence already returns the words joined by spaces; joining
    # that string again would put a space between every single character
    return predict_sequence(model, eng_tokenizer, encoded)


Using TensorFlow backend.


AttributeError: module 'tensorflow' has no attribute 'name_scope'

In [15]:
# Translate a sample Kannada sentence with the helper defined in the previous cell.
# NOTE(review): this depends on kan_tokenizer / model created by that cell; if the
# cell aborted (as in the recorded output) this call fails with a NameError.
custom_pred("ಶ್ರೀನಿವಾಸ್ ಒಳ್ಳೆಯದು")

NameError: name 'kan_tokenizer' is not defined