In [2]:
import os 
import re 
import string
import pickle
import numpy as np 
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Force CPU computation
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Data preparation

In [3]:
# Load files
with open("../data/task2/train.en", "r") as f:
    train_en = f.read().split("\n")
    
with open("../data/task2/train.hi", "r") as f:
    train_hi = f.read().split("\n")
    
with open("../data/task2/dev.en", "r") as f:
    dev_en = f.read().split("\n")
    
with open("../data/task2/dev.hi", "r") as f:
    dev_hi = f.read().split("\n")
    
with open("../saved_data/machine_translation/lstm/en_tokenizer.pkl", "rb") as f:
    en_tokenizer = pickle.load(f)
    
with open("../saved_data/machine_translation/lstm/hi_tokenizer.pkl", "rb") as f:
    hi_tokenizer = pickle.load(f)
    
puncts = string.punctuation + train_hi[3][-1]


def preprocess_line(line):
    line = line.translate(str.maketrans("", "", puncts))
    line = re.sub("\u200b", " ", line)
    line = re.sub("\u200d", " ", line)
    line = re.sub("\d+", " ", line)
    line = line.lower()
    line = " ".join(line.split())
    return line

train_en = [preprocess_line(line) for line in train_en]
train_hi = [preprocess_line(line) for line in train_hi]
dev_en = [preprocess_line(line) for line in dev_en]
dev_hi = [preprocess_line(line) for line in dev_hi]

# Evaluation functions

In [4]:
def load_models(enc_path, dec_path):
    encoder_model = load_model(enc_path)
    decoder_model = load_model(dec_path)
    return encoder_model, decoder_model


def translate(input_sentence):
    line = preprocess_line(input_sentence)
    tokens = en_tokenizer.texts_to_sequences([line])
    tokens = pad_sequences(tokens, maxlen=30, padding='post')
    tokens = np.flip(tokens)
    
    init_states = encoder_model(tokens)
    dec_in = tf.expand_dims([hi_tokenizer.word_index['startseq']], 1)
    
    words = []
    
    for t in range(29):
        preds, init_states = decoder_model([dec_in, init_states])
        pred_idx = np.argmax(preds, -1)[0]
        word = hi_tokenizer.index_word.get(pred_idx)
        
        if word is None or word == "endseq":
            break
            
        words.append(word)
        dec_in = tf.expand_dims([pred_idx], 1)
        
    return " ".join(words)


def evaluate_model(input_sentences, output_sentences):
    actual, predicted = [], []
    for in_line, out_line in tqdm(zip(input_sentences, output_sentences), 
                                  total=len(input_sentences)):
        hypothesis = translate(in_line)
        references = out_line
        actual.append([references.split()])
        predicted.append(hypothesis.split())
    
    print("BLEU-1: {:.3f}".format(corpus_bleu(actual, predicted, weights=(1., 0, 0, 0))))
    print("BLEU-2: {:.3f}".format(corpus_bleu(actual, predicted, weights=(.5, .5, 0, 0))))
    print("BLEU-3: {:.3f}".format(corpus_bleu(actual, predicted, weights=(.3, .3, .3, 0))))
    print("BLEU-4: {:.3f}".format(corpus_bleu(actual, predicted, weights=(.25, .25, .25, .25))))

# Model evaluation

In [6]:
# Load models
encoder_model, decoder_model = load_models(
    enc_path = "../saved_data/machine_translation/lstm/models/encoder_30",
    dec_path = "../saved_data/machine_translation/lstm/models/decoder_30"
)

# Train data evaluation
evaluate_model(train_en[:5000], train_hi[:5000])

# Development data evaluation
evaluate_model(dev_en, dev_hi)



HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


BLEU-1: 0.371
BLEU-2: 0.268
BLEU-3: 0.236
BLEU-4: 0.159


HBox(children=(FloatProgress(value=0.0, max=501.0), HTML(value='')))


BLEU-1: 0.207
BLEU-2: 0.085
BLEU-3: 0.053
BLEU-4: 0.017


# Examples

In [7]:
input_sentence = "What are we doing here?"

print("[ENGLISH]: {}".format(input_sentence))
print("\n[HINDI]: {}".format(translate(input_sentence)))

[ENGLISH]: What are we doing here?

[HINDI]: यहाँ क्या कर रहे हैं
