In [1]:
# dependencies
import tensorflow as tf
import numpy as np
from sklearn.cross_validation import train_test_split
import time
import matplotlib.pyplot as plt
import pickle
import codecs

In [2]:
def read_dataset(filepath):
    """Unpickle and return the object stored in the file at ``filepath``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filepath, mode = 'rb') as pickled_file:
        dataset = pickle.load(pickled_file)
    return dataset

In [3]:
# Load the pickled dataset: the X/Y sequences plus, for each of the two
# languages (l1, l2), its word->index table, index->word table and vocabulary.
dataset_location = "D:/NLP Project/Hindi English/data.p"
(X, Y,
 l1_word2idx, l1_idx2word, l1_vocab,
 l2_word2idx, l2_idx2word, l2_vocab) = read_dataset(dataset_location)

In [4]:
# Number of time steps on the input side and on the output side.
input_seq_len, output_seq_len = 20, 22
# Vocabulary sizes: the special tokens are counted on top of each vocabulary.
l1_vocab_size = 2 + len(l1_vocab) # <pad>, <ukn>
l2_vocab_size = 4 + len(l2_vocab) # <pad>, <ukn>, <eos>, <go>

In [5]:
# let's define some helper functions

# numerically stable softmax
def softmax(x, axis = None):
    """Return the softmax of ``x``.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow ``np.exp``.

    Args:
        x: array-like of scores.
        axis: axis along which to normalise. ``None`` (the default) normalises
            over every element of ``x``, matching a plain ``softmax(x)`` call;
            pass e.g. ``-1`` to normalise each row of a batch independently.

    Returns:
        Array of the same shape as ``x`` whose entries sum to 1 along ``axis``.
    """
    scores = np.asarray(x)
    # keepdims lets the reduced max / sum broadcast back against `scores`.
    shifted = scores - np.max(scores, axis = axis, keepdims = True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis = axis, keepdims = True)

# build the feed for one randomly sampled batch
def feed_dict(x, y, batch_size = 64):
    """Sample ``batch_size`` rows of ``x``/``y`` and key them by tensor name.

    Reads the module-level ``encoder_inputs``, ``decoder_inputs``, ``targets``
    and ``target_weights`` lists together with ``input_seq_len``,
    ``output_seq_len`` and ``l2_word2idx``.

    Args:
        x: indexable collection of input sequences (at least ``input_seq_len`` long each).
        y: indexable collection of output sequences (at least ``output_seq_len`` long each).
        batch_size: number of distinct rows drawn without replacement.

    Returns:
        dict mapping tensor names to numpy arrays for the sampled batch.
    """
    batch_rows = np.random.choice(len(x), size = batch_size, replace = False)
    feed = {}

    # One array per time step, holding that step's token for every sampled row.
    for step in range(input_seq_len):
        feed[encoder_inputs[step].name] = np.array([x[row][step] for row in batch_rows])

    for step in range(output_seq_len):
        feed[decoder_inputs[step].name] = np.array([y[row][step] for row in batch_rows])

    # The last target is fed as all <pad>.
    pad_id = l2_word2idx['<pad>']
    feed[targets[len(targets) - 1].name] = np.full(shape = [batch_size], fill_value = pad_id)

    # Step `step` is weighted by the decoder input at `step + 1`:
    # weight 0 where that token is <pad>, 1 everywhere else.
    for step in range(output_seq_len - 1):
        next_tokens = feed[decoder_inputs[step + 1].name]
        step_weights = np.ones(batch_size, dtype = np.float32)
        step_weights[next_tokens == pad_id] = 0.0
        feed[target_weights[step].name] = step_weights

    # The final step always gets zero weight.
    feed[target_weights[output_seq_len - 1].name] = np.zeros(batch_size, dtype = np.float32)

    return feed

# map per-step score vectors to words
def decode_output(output_seq):
    """Return the most probable word for each of the first ``output_seq_len`` steps.

    ``output_seq[step]`` is passed through ``softmax`` and the index of its
    largest entry is looked up in the module-level ``l2_idx2word`` table.
    """
    return [
        l2_idx2word[np.argmax(softmax(output_seq[step]))]
        for step in range(output_seq_len)
    ]

In [6]:
def data_padding(x, l1_word2idx, length = 20):
    """Right-pad every sequence in ``x`` with the ``<pad>`` id up to ``length``.

    ``x`` is modified in place (each entry is replaced by its padded copy) and
    also returned. Sequences that already hold ``length`` or more items are
    left as they are; nothing is truncated.
    """
    for row, tokens in enumerate(x):
        missing = length - len(tokens)
        # A non-positive count yields an empty list, i.e. no padding.
        x[row] = tokens + [l1_word2idx['<pad>']] * missing
    return x

In [7]:
def translate_model(sentences,
                    dataset_location = "D:/NLP Project/Hindi English/data.p",
                    checkpoint_dir = 'D:/NLP Project/Hindi English/checkpoints/'):
    """Translate a batch of sentences with the trained attention seq2seq model.

    A fresh graph is built, its variables are restored from the latest
    checkpoint in ``checkpoint_dir``, and all sentences are run through it in
    a single ``sess.run`` call. Only the first decoder input is fed (``<go>``);
    the model is built with ``feed_previous = True`` so that the output at
    time t is fed as the input at time t+1.

    Relies on the module-level ``input_seq_len``, ``output_seq_len``,
    ``l1_vocab_size`` and ``l2_vocab_size`` and on the helpers
    ``read_dataset``, ``data_padding`` and ``decode_output``.

    Args:
        sentences: list of source sentences. Each is split on single spaces and
            surrounding punctuation is stripped from every word. Only the first
            ``input_seq_len`` tokens of a sentence are fed to the encoder.
        dataset_location: pickled dataset that provides the word->index tables.
        checkpoint_dir: directory searched for the latest checkpoint.

    Returns:
        list of str, one translation per input sentence, with the ``<eos>``,
        ``<pad>`` and ``<go>`` tokens removed.

    Raises:
        ValueError: if ``checkpoint_dir`` contains no checkpoint.
    """
    result = []

    # read dataset - only the two word->index tables are needed here
    _, _, l1_word2idx, _, _, l2_word2idx, _, _ = read_dataset(dataset_location)

    with tf.Graph().as_default():

        # placeholders
        encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
        decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]

        # output projection
        size = 512
        w_t = tf.get_variable('proj_w', [l2_vocab_size, size], tf.float32)
        b = tf.get_variable('proj_b', [l2_vocab_size], tf.float32)
        w = tf.transpose(w_t)
        output_projection = (w, b)

        # change the model so that output at time t can be fed as input at time t+1
        outputs, _states = tf.nn.seq2seq.embedding_attention_seq2seq(
                                                    encoder_inputs,
                                                    decoder_inputs,
                                                    tf.nn.rnn_cell.BasicLSTMCell(size),
                                                    num_encoder_symbols = l1_vocab_size,
                                                    num_decoder_symbols = l2_vocab_size,
                                                    embedding_size = 80,
                                                    feed_previous = True, # <-----this is changed----->
                                                    output_projection = output_projection,
                                                    dtype = tf.float32)

        # ops for projecting outputs
        outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

        # Words missing from the vocabulary fall back to id 0
        # (NOTE(review): presumably the <pad>/<ukn> id - confirm against the dataset).
        token_ids = [[l1_word2idx.get(word.strip(',." ;:)(|][?!<>'), 0) for word in sentence.split(' ')] for sentence in sentences]

        # Pad to the encoder length explicitly so the padding can never drift
        # away from the number of encoder placeholders.
        encoded_sentences = data_padding(token_ids, l1_word2idx, length = input_seq_len)

        # restore all variables - use the last checkpoint saved
        saver = tf.train.Saver()
        path = tf.train.latest_checkpoint(checkpoint_dir)
        if path is None:
            # Fail with a clear message instead of passing None to saver.restore.
            raise ValueError('No checkpoint found in {!r}'.format(checkpoint_dir))

        with tf.Session() as sess:
            # restore
            saver.restore(sess, path)

            # feed data into placeholders: one array per time step
            feed = {}
            for step in range(input_seq_len):
                feed[encoder_inputs[step].name] = np.array([encoded[step] for encoded in encoded_sentences])
            feed[decoder_inputs[0].name] = np.array([l2_word2idx['<go>']] * len(encoded_sentences))

            # translate
            output_sequences = sess.run(outputs_proj, feed_dict = feed)

            for sentence_idx in range(len(encoded_sentences)):
                # output_sequences is indexed [time step][sentence]; regroup per sentence
                output_seq = [output_sequences[step][sentence_idx] for step in range(output_seq_len)]

                # decode output sequence
                words = decode_output(output_seq)

                kept_words = [word for word in words if word not in ('<eos>', '<pad>', '<go>')]
                result.append(" ".join(kept_words).strip())

    return result

In [8]:
import codecs
import nltk
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
def _read_iitb_dict(IITB_DICTIONARY_FILE_PATH, english_hindi_dict, english_hindi_multiword):
    with codecs.open(IITB_DICTIONARY_FILE_PATH, 'r', 'utf-8') as dictionary_file:
        for index, line in enumerate(dictionary_file):
            end_index = line.find("]")
            if(end_index == -1):
                # print(line)
                continue

            hindi_phrase = line[1:end_index]
            quote_start_index = line.index('"')
            quote_end_index = line.find('"', quote_start_index+1)
            if(quote_end_index == -1):
                # print(line)
                continue

            english_phrase_with_meaning = line[quote_start_index+1:quote_end_index]
            meaning_index = english_phrase_with_meaning.find('(')
            if(meaning_index != -1):
                english_phrase = english_phrase_with_meaning[:meaning_index]
            else:
                english_phrase = english_phrase_with_meaning
            # print(hindi_phrase, english_phrase)
            if(english_phrase.find(' ') == -1):
                # Single Word Phrase
                english_hindi_dict[english_phrase] = hindi_phrase
            else:
                english_hindi_multiword[english_phrase] = hindi_phrase

            # if(index%100 == 0):
            #     print(index)

In [10]:
def _read_hindencorp_dict(HINDENCORP_PATH, english_hindi_dict, english_hindi_multiword):
    model = pickle.load(open(HINDENCORP_PATH, 'rb'))
    english = model["X"]
    hindi = model["Y"]
    for index, english_phrase in enumerate(english):
        hindi_phrase = hindi[index]
        if(english_phrase.find(' ') == -1):
            english_hindi_dict[english_phrase] = hindi_phrase
        else:
            english_hindi_multiword[english_phrase] = hindi_phrase

In [11]:
def _read_dictionaries(IITB_DICTIONARY_FILE_PATH, HINDENCORP_PATH, english_hindi_dict, english_hindi_multiword):
    """Load both dictionary sources into the given dicts and print the single-word count.

    The IITB file is read first and the HindEnCorp pickle second, so entries
    from the second source win when both define the same English phrase.
    """
    sources = (
        (_read_iitb_dict, IITB_DICTIONARY_FILE_PATH),
        (_read_hindencorp_dict, HINDENCORP_PATH),
    )
    for load_into, source_path in sources:
        load_into(source_path, english_hindi_dict, english_hindi_multiword)

    print("Dictionary Length:", len(english_hindi_dict))

In [12]:
def _run_on_test_data(english_hindi_dict, TEST_DATA_FILE = "test_data.p", OUTPUT_FILE = 'dictionary_test_data.p'):
    """Translate every entry of a pickled test set and pickle the result.

    The test set is a mapping whose values are dicts; each entry's ``"I"``
    sentence is translated with ``translate_sentence`` and the translation is
    stored under ``"X"``. Every input/translation pair is printed.

    Args:
        english_hindi_dict: word-level English -> Hindi dictionary.
        TEST_DATA_FILE: path of the pickled test set to read.
        OUTPUT_FILE: path the updated test set is pickled to.
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(TEST_DATA_FILE, "rb") as test_file:
        obj = pickle.load(test_file)

    for key in obj.keys():
        entry = obj[key]
        entry["X"] = translate_sentence(entry["I"], english_hindi_dict)
        print(entry["I"], entry["X"])

    # Closing the handle guarantees the dump is flushed to disk.
    with open(OUTPUT_FILE, 'wb') as output_file:
        pickle.dump(obj, output_file)

In [13]:
def _translate_sentences(sentences, english_hindi_dict):
    """Return the dictionary translation of each sentence, in input order."""
    return [translate_sentence(sentence, english_hindi_dict) for sentence in sentences]

In [14]:
def translate_sentence(sentence, english_hindi_dict):
    """Translate ``sentence`` token by token using ``english_hindi_dict``.

    Tokens found in the dictionary are replaced by their entry. Tokens that
    are not found are kept verbatim when they are not purely alphanumeric
    (e.g. punctuation) and dropped otherwise. The pieces are joined with
    single spaces.
    """
    pieces = []
    for token in word_tokenize(sentence):
        if token in english_hindi_dict:
            pieces.append(english_hindi_dict[token])
            continue
        if token.isalnum():
            # unknown word: no translation available, so it is left out
            continue
        pieces.append(token)

    return ' '.join(pieces)

In [15]:
def dictionary_translation(sentences, IITB_DICTIONARY_FILE_PATH = 'D:/NLP Project/Hindi English/rahul/UW-Hindi_Dict-20131003.txt', 
                           HINDENCORP_PATH = 'D:/NLP Project/Hindi English/hindencorp05.p'):
    """Baseline translator: word-by-word lookup in the merged dictionaries.

    Both dictionary sources are re-read on every call. Only the single-word
    dictionary is used for the translation; the multi-word dictionary is
    filled but not consulted.

    Returns:
        list of translated sentences, in input order.
    """
    single_word_entries, multiword_entries = {}, {}
    _read_dictionaries(IITB_DICTIONARY_FILE_PATH, HINDENCORP_PATH, single_word_entries, multiword_entries)
    return _translate_sentences(sentences, single_word_entries)

In [16]:
import codecs
import pandas as pd

# Read the test sentences and their reference translations, one per line.
# The context managers close both files as soon as they have been read.
with codecs.open("D:/NLP Project/Hindi English/test/input.txt", encoding = "utf-8", mode = "r") as input_file:
    inputs = input_file.read().split('\n')
with codecs.open("D:/NLP Project/Hindi English/test/output.txt", encoding = "utf-8", mode = "r") as output_file:
    actual_output = output_file.read().split('\n')

def attention_nn_function(x):
    """Translate the sentences ``x`` with ``translate_model``."""
    return translate_model(x)

def baseline_dict_function(x):
    """Translate the sentences ``x`` with ``dictionary_translation``."""
    return dictionary_translation(x)

predicted_output = attention_nn_function(inputs)
baseline_output = baseline_dict_function(inputs)

# Side-by-side table of all four columns; construction fails if their lengths differ.
d = {'input':inputs,'actual_output':actual_output,'predicted_output':predicted_output,'baseline_output':baseline_output}
df = pd.DataFrame(data=d)

for source, actual, baseline, predicted in zip(inputs, actual_output, baseline_output, predicted_output):
    print('==================================')
    print('Input - ', source)
    print('Actual - ', actual)
    print('Baseline - ', baseline)
    print('Predicted - ', predicted)
    print('----------------------------------')

Dictionary Length: 48903
Input -  there should be only one domain object
Actual -  एक ही <ukn> वस्तु होना चाहिए 
Baseline -  है। गा,गे,गीईना रहना केवल एक डोमेन वस्तु
Predicted -  केवल एक डोमेन को ऑब्जेक्ट होना चाहिए
----------------------------------
Input -  max <ukn>
Actual -  मैक्स पेन
Baseline -  अधिकतम < >
Predicted -  अधिकतम <ukn>
----------------------------------
Input -  video playback
Actual -  वीडियो प्लेबैक
Baseline -  वीडियो प्लेबैक
Predicted -  वीडियो प्लेबैक
----------------------------------
Input -  unknown time remaining
Actual -  अज्ञात समय शेष
Baseline -  अज्ञात समय(m) शेष
Predicted -  अनजान समय शेष
----------------------------------
Input -  pause
Actual -  ठहरें
Baseline -  ठहराना
Predicted -  रोकें
----------------------------------
Input -  enter password to unlock your login keyring
Actual -  लॉगिन कीरिंग को खोलने के लिये कूटशब्द डालें
Baseline -  प्रविष्ट करें पासवर्डों प्रति खोलें तुम्हारा लॉगिन करें
Predicted -  आपका लॉगिन कीरिंग को पासवर्ड के लिए कूटशब्द दा

In [17]:
# Close a leftover `session` object if one is defined in this namespace.
if locals().get('session') is not None:
    print('Close interactive session')
    session.close()

In [18]:
# Quick check of the model on a handful of short English sentences.
sentences = [
    "what is your name",
    "who is this",
    "not",
    "bad",
    "not bad",
    "water is life",
    "what do you do",
    "what is todays date",
    "dead animal",
    "how are you today",
    "tell me a good joke",
    "dont make me mad",
    "who is your dad",
]
result = translate_model(sentences)
print(result)

['आपका नाम क्या है', 'कौन कौन है।', 'नही', 'बुरा', 'बुरा नहीं', 'जल जीवनी है', 'आप क्या कर सकते हैं', 'दिनांक दिनांक क्या है', 'मृत जानवर', 'आज आपको कैसे हो', 'मुझे विश्वास का एक अच्छा मजाक', 'मुझे दो पागल बनाएं', 'कौन आपकी पिता है।']
