# Natural Language Processing
## Assignment 3 - Machine Translation
### Code 1 - No Attention
### Students
- M Maheeth Reddy (1801CS31)
- Nischal A (1801CS33)

### Import Block

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import string

import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import KeyedVectors
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from sklearn.model_selection import train_test_split

from tqdm import tqdm

%matplotlib inline


### Load Glove Vector

Using 300-dimensional GloVe word embeddings.


In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
GLOVE_DIM = 300
glove_emb = {}
# Be explicit about the encoding so the result does not depend on the
# platform's default locale.
with open('glove.840B.300d.txt', encoding='utf8') as f:
    for line in f:
        # Split from the right: the last GLOVE_DIM fields are always the
        # vector, whatever the word looks like. Some tokens in the 840B file
        # reportedly contain spaces themselves, which makes a plain
        # left-to-right split feed text into the float conversion.
        values = line.rstrip().rsplit(' ', GLOVE_DIM)
        if len(values) != GLOVE_DIM + 1:
            continue  # blank or malformed line
        # The first entry is the word
        # The rest are vectors representing the embedding for the word
        glove_emb[values[0]] = np.asarray(values[1:], dtype='float32')


Initialize vectors for the special tokens.

Words that are not found in GloVe are assigned the unknown-token (`UNK`) vector.


In [None]:
# Random vectors for the special tokens. A dedicated, seeded generator keeps
# them (and every embedding matrix built from them) identical across kernel
# restarts.
_special_rng = np.random.default_rng(42)
UNK = _special_rng.random(300)
SOS = _special_rng.random(300)
EOS = _special_rng.random(300)

glove_emb['<sos>'] = SOS
glove_emb['<eos>'] = EOS
# The Keras Tokenizer strips '<' and '>' with its default filters, so the
# markers enter the vocabulary as 'sos' / 'eos' (the decoding code looks them
# up under exactly those names). Register those spellings as well; otherwise
# the vectors above are never found by a vocabulary lookup.
glove_emb['sos'] = SOS
glove_emb['eos'] = EOS

print('GloVe data loaded')


### Get training dataset

In [None]:
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4" -O mt.zip && rm -rf /tmp/cookies.txt
# ! unzip mt.zip


--2022-04-07 08:43:14--  https://docs.google.com/uc?export=download&confirm=&id=1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4
Resolving docs.google.com (docs.google.com)... 142.251.111.113, 142.251.111.100, 142.251.111.138, ...
Connecting to docs.google.com (docs.google.com)|142.251.111.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1cepl2e8rspjc4toretfhv30f7kjp7bn/1649320950000/02653466601279893693/*/1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4?e=download [following]
--2022-04-07 08:43:16--  https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1cepl2e8rspjc4toretfhv30f7kjp7bn/1649320950000/02653466601279893693/*/1jvZxoMsfVDvupZMqTMx11aHmMQFPyWG4?e=download
Resolving doc-08-ak-docs.googleusercontent.com (doc-08-ak-docs.googleusercontent.com)... 172.217.164.129, 2607:f8b0:4004:814::2001
Connecting to doc-08-ak-docs.googleusercontent.com (doc

In [None]:
def file_parser(filename):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace."""
    stripped = []
    with open(filename, 'r', encoding="utf8") as handle:
        for raw in handle.readlines():
            stripped.append(raw.strip())
    return stripped


# Read both splits of the parallel corpus; the English and Hindi lists of a
# split are later paired row by row in a DataFrame.
eng_train, eng_test = (
    file_parser(path) for path in ('MT/english.train', 'MT/english.test'))
hindi_train, hindi_test = (
    file_parser(path) for path in ('MT/hindi.train', 'MT/hindi.test'))


In [None]:
# Training pairs side by side: one row per (English, Hindi) sentence pair.
train_df = pd.DataFrame(dict(eng_sent=eng_train, hindi_sent=hindi_train))
train_df.head()


Unnamed: 0,eng_sent,hindi_sent
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [None]:
# Test pairs side by side: one row per (English, Hindi) sentence pair.
test_df = pd.DataFrame(dict(eng_sent=eng_test, hindi_sent=hindi_test))
test_df.head()


Unnamed: 0,eng_sent,hindi_sent
0,A black box in your car?,आपकी कार में ब्लैक बॉक्स?
1,As America's road planners struggle to find th...,"जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए..."
2,"The devices, which track every mile a motorist...","यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्..."
3,The usually dull arena of highway planning has...,आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ...
4,Libertarians have joined environmental groups ...,"आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा..."


In [None]:
# Characters deleted by dataset_preprocessor, built once instead of per
# character: ASCII punctuation (this includes the apostrophe), ASCII digits,
# Devanagari digits, and the Devanagari danda / double danda, which are
# sentence punctuation that string.punctuation does not cover.
_PREPROCESS_DROP_TABLE = str.maketrans(
    '', '', string.punctuation + string.digits + '०१२३४५६७८९' + '।॥')


def dataset_preprocessor(sent):
    """Normalise one sentence and wrap it in start/end markers.

    Steps: strip and lower-case, delete punctuation and digits (ASCII and
    Devanagari), strip again, collapse runs of spaces.

    Args:
        sent: raw sentence (English or Hindi).

    Returns:
        The cleaned sentence as ``'<sos> ' + cleaned + ' <eos>'``.
    """
    sent = sent.strip().lower()  # lower() is a no-op for Devanagari text
    sent = sent.translate(_PREPROCESS_DROP_TABLE)
    # Deleting characters can leave leading/trailing or doubled spaces behind.
    sent = re.sub(" +", " ", sent.strip())
    return '<sos> ' + sent + ' <eos>'



In [None]:

# Clean every sentence column of both splits (the columns are overwritten).
# NOTE: this cell is not idempotent. A second run strips the angle brackets
# of the markers added by the first run and wraps the text again, so reload
# the data frames before re-running it.
for frame in (train_df, test_df):
    for column in ('eng_sent', 'hindi_sent'):
        frame[column] = frame[column].apply(dataset_preprocessor)


In [None]:
# Preview the training pairs after preprocessing (lower-cased, punctuation and digits removed, wrapped in <sos> ... <eos>).
train_df.head()


Unnamed: 0,eng_sent,hindi_sent
0,<sos> give your application an accessibility w...,<sos> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का ...
1,<sos> accerciser accessibility explorer <eos>,<sos> एक्सेर्साइसर पहुंचनीयता अन्वेषक <eos>
2,<sos> the default plugin layout for the bottom...,<sos> निचले पटल के लिए डिफोल्ट प्लगइन खाका <eos>
3,<sos> the default plugin layout for the top pa...,<sos> ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका <eos>
4,<sos> a list of plugins that are disabled by d...,<sos> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप ...


In [None]:
# Preview the test pairs after preprocessing (lower-cased, punctuation and digits removed, wrapped in <sos> ... <eos>).
test_df.head()


Unnamed: 0,eng_sent,hindi_sent
0,<sos> a black box in your car <eos>,<sos> आपकी कार में ब्लैक बॉक्स <eos>
1,<sos> as americas road planners struggle to fi...,<sos> जबकि अमेरिका के सड़क योजनाकार ध्वस्त होत...
2,<sos> the devices which track every mile a mot...,<sos> यह डिवाइस जो मोटरचालक द्वारा वाहन चलाए ग...
3,<sos> the usually dull arena of highway planni...,<sos> आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी ...
4,<sos> libertarians have joined environmental g...,<sos> आपने द्वारा ड्राइव किए गए मील तथा संभवतः...


### Create Tokenizer
Fit the tokenizer on the full text (training and test sentences) so that each distinct word is assigned a unique integer index; every sentence can then be represented as a sequence of these indices.


In [None]:
# Build the English vocabulary from every preprocessed sentence.
# NOTE(review): the test sentences are included in the fit, so the vocabulary
# is not learned from training data alone. Confirm this is intended.
eng_corpus = [*train_df['eng_sent'], *test_df['eng_sent']]
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(eng_corpus)

# Tokenizer word indices start at 1; the extra slot is index 0, the value
# pad_sequences fills padded positions with (no UNK token is configured).
eng_vocab_size = 1 + len(eng_tokenizer.word_index)


In [None]:
# Build the Hindi vocabulary from every preprocessed sentence.
# NOTE(review): the test sentences are included in the fit, so the vocabulary
# is not learned from training data alone. Confirm this is intended.
hindi_corpus = [*train_df['hindi_sent'], *test_df['hindi_sent']]
hindi_tokenizer = Tokenizer()
hindi_tokenizer.fit_on_texts(hindi_corpus)

# Tokenizer word indices start at 1; the extra slot is index 0, the value
# pad_sequences fills padded positions with (no UNK token is configured).
hindi_vocab_size = 1 + len(hindi_tokenizer.word_index)


Create a matrix that holds, for each word in our vocabulary, its corresponding embedding vector (row index = word index).

In [None]:
def _embedding_matrix_for(word_index, vocab_size):
    """Build a (vocab_size, 300) matrix whose row i is the vector of the word with index i.

    Words without an entry in glove_emb receive the shared UNK vector; rows
    that no word maps to stay all-zero.
    """
    matrix = np.zeros((vocab_size, 300))
    for word, row in tqdm(word_index.items()):
        matrix[row] = glove_emb.get(word, UNK)
    return matrix


eng_embedding_matrix = _embedding_matrix_for(
    eng_tokenizer.word_index, eng_vocab_size)

# NOTE(review): the vectors come from an English GloVe file, so presumably
# almost every Hindi word falls back to the same UNK vector. Confirm whether
# a Hindi embedding was meant to be used here.
hindi_embedding_matrix = _embedding_matrix_for(
    hindi_tokenizer.word_index, hindi_vocab_size)


100%|██████████| 10406/10406 [00:00<00:00, 39539.30it/s]
100%|██████████| 11973/11973 [00:00<00:00, 323638.91it/s]


Convert each sentence into a sequence of word indices and pad (or truncate) it to a fixed length.


In [None]:
SEQ_MAX_LEN = 300  # every sequence is padded / truncated to this many tokens
EMB_DIM = 300      # width of the GloVe vectors

# Shared padding policy: pad and truncate at the end of the sequence.
PAD_KWARGS = dict(maxlen=SEQ_MAX_LEN, padding='post', truncating='post')

# English, training split
eng_train_seq = eng_tokenizer.texts_to_sequences(train_df['eng_sent'])
pad_eng_train_seq = pad_sequences(eng_train_seq, **PAD_KWARGS)

# Hindi, training split
hindi_train_seq = hindi_tokenizer.texts_to_sequences(train_df['hindi_sent'])
pad_hindi_train_seq = pad_sequences(hindi_train_seq, **PAD_KWARGS)

# English, test split
eng_test_seq = eng_tokenizer.texts_to_sequences(test_df['eng_sent'])
pad_eng_test_seq = pad_sequences(eng_test_seq, **PAD_KWARGS)

# Hindi, test split
hindi_test_seq = hindi_tokenizer.texts_to_sequences(test_df['hindi_sent'])
pad_hindi_test_seq = pad_sequences(hindi_test_seq, **PAD_KWARGS)


### Neural Network

In [None]:
SEQ_MAX_LEN = 300
EMB_DIM = 300


def batch_generator(X, y, batch_size=128):
    """Endlessly yield ``([encoder_input, decoder_input], decoder_target)`` batches.

    Args:
        X: padded source sequences, one row of SEQ_MAX_LEN word indices per sample.
        y: padded target sequences, aligned with ``X``.
        batch_size: maximum number of samples per batch.

    Yields:
        ``[enc_ip, dec_ip]`` as integer arrays of shape (n, SEQ_MAX_LEN) and
        ``dec_target`` as a float32 array of shape (n, SEQ_MAX_LEN, EMB_DIM).
        ``dec_target[:, t]`` is the embedding of target token ``t + 1`` (the
        decoder input shifted one step left); the last step stays all-zero.
        ``n`` equals ``batch_size`` except for a final, shorter batch. That
        batch is no longer filled up with all-zero phantom samples.

    NOTE(review): the targets are embedding vectors, while the model built in
    this notebook ends in a softmax trained with categorical_crossentropy,
    which expects a probability distribution. Confirm this is intended.
    """
    while True:
        for start in range(0, len(X), batch_size):
            # np.array copies, so the caller's arrays are never aliased.
            enc_ip = np.array(X[start:start + batch_size], dtype='int')
            dec_ip = np.array(y[start:start + batch_size], dtype='int')

            dec_target = np.zeros(
                (len(dec_ip), SEQ_MAX_LEN, EMB_DIM), dtype='float32')
            # One vectorised lookup replaces the per-sample, per-timestep loop.
            dec_target[:, :-1] = hindi_embedding_matrix[dec_ip[:, 1:]]

            yield([enc_ip, dec_ip], dec_target)


### Encoder

In [None]:
# Encoder input: one padded sequence of SEQ_MAX_LEN English word indices per sample.
enc_ips = Input(shape=(SEQ_MAX_LEN,))
# Frozen (trainable=False) embedding lookup initialised from
# eng_embedding_matrix; mask_zero=True marks index 0, the padding value, as
# masked for the layers that follow.
enc_emb = Embedding(output_dim=EMB_DIM, input_dim=eng_vocab_size, weights=[
                    eng_embedding_matrix], input_length=SEQ_MAX_LEN, trainable=False, mask_zero=True)(enc_ips)

print(enc_emb.shape)
# NOTE(review): the LSTM width is set to SEQ_MAX_LEN (the sequence length).
# Presumably a separate latent-dimension constant was intended; it only works
# unchanged here because both values are 300.
enc_lstm = LSTM(SEQ_MAX_LEN, return_state=True)
# return_state=True makes the layer return its output plus the final hidden and cell states.
encoder_outputs, state_h, state_c = enc_lstm(enc_emb)


In [None]:
# Discard 'encoder_outputs' and retain only the final hidden and cell states;
# they become the decoder's initial state.
enc_states = [state_h, state_c]

(None, 300, 300)


Set up the decoder, using the encoder states as its initial state.


In [None]:
# Decoder input: one padded sequence of SEQ_MAX_LEN Hindi word indices per sample.
dec_ips = Input(shape=(SEQ_MAX_LEN,))
# Frozen embedding lookup initialised from hindi_embedding_matrix. The layer
# object is kept in its own variable because the inference decoder reuses it.
# NOTE(review): unlike the encoder embedding, mask_zero is not set here, so
# padded decoder positions are not masked. Confirm this is intended.
dec_emb_layer = Embedding(output_dim=EMB_DIM, input_dim=hindi_vocab_size, weights=[
                          hindi_embedding_matrix], input_length=SEQ_MAX_LEN, trainable=False)
dec_emb = dec_emb_layer(dec_ips)


Set up the decoder to return full output sequences as well as its internal states. The returned states are not used by the training model, but they are needed during inference.


In [None]:

# Decoder LSTM: emits an output for every time step (return_sequences=True)
# and starts from the encoder's final states. The returned states are
# discarded here; the inference model reads them from the same layer.
dec_lstm = LSTM(SEQ_MAX_LEN, return_sequences=True, return_state=True)
dec_ops, _, _ = dec_lstm(dec_emb,
                                     initial_state=enc_states)
print(dec_ops.shape)
# NOTE(review): this softmax has EMB_DIM outputs, not hindi_vocab_size, so it
# does not produce a distribution over the Hindi vocabulary. The decoding
# code nevertheless takes an argmax over it and treats the result as a word
# index. Confirm the intended output size before trusting the predictions.
dec_dense = Dense(EMB_DIM, activation='softmax')
dec_ops = dec_dense(dec_ops)


Define the model that will turn encoder input data & decoder input data into decoder target data


In [None]:
# Training model: (encoder input, decoder input) -> decoder output sequence.
model = Model([enc_ips, dec_ips], dec_ops)
# NOTE(review): categorical_crossentropy expects probability-distribution
# targets, but the batch generator in this notebook yields embedding vectors.
# Confirm that this loss/target pairing is intended.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()


In [None]:
train_samples = train_df.shape[0]
val_samples = test_df.shape[0]
batch_size = 512
epochs = 2

# Model.fit accepts Python generators directly; fit_generator is deprecated.
# Step counts use floor division (whole batches only) and are clamped to at
# least 1 so that a split smaller than one batch is not skipped with 0 steps.
# Note: the test split doubles as the validation set here.
history = model.fit(
    batch_generator(pad_eng_train_seq, pad_hindi_train_seq, batch_size),
    steps_per_epoch=max(1, train_samples // batch_size),
    epochs=epochs,
    validation_data=batch_generator(
        pad_eng_test_seq, pad_hindi_test_seq, batch_size),
    validation_steps=max(1, val_samples // batch_size))


  # Remove the CWD from sys.path while we load stuff.


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa25c9ee1d0>

# Evaluation

Encode the input sequence to get the "thought vectors"


In [None]:
# Inference encoder: maps an input sequence to the final [hidden, cell] states, reusing the trained encoder layers.
encoder_model = Model(enc_ips, enc_states)



### Decoder setup


Below tensors will hold the states of the previous time step


In [None]:
# Placeholders for the decoder's previous hidden and cell state. Their width
# must equal the LSTM's unit count, which this notebook sets to SEQ_MAX_LEN.
dec_ip_state_h = Input(shape=(SEQ_MAX_LEN,))
dec_ip_state_c = Input(shape=(SEQ_MAX_LEN,))
dec_ips_states = [dec_ip_state_h, dec_ip_state_c]

# Get the embeddings of the decoder sequence (same embedding layer as in training)
dec_emb2 = dec_emb_layer(dec_ips)



To predict the next word in the sequence, set the initial states to the states from the previous time step


In [None]:
# Run the trained decoder LSTM from externally supplied states and keep the
# new states so they can be fed back in at the next decoding step.
dec_ops2, state_h2, state_c2 = dec_lstm(
    dec_emb2, initial_state=dec_ips_states)
dec_states2 = [state_h2, state_c2]

# Apply the trained dense softmax layer.
# NOTE(review): dec_dense was created with EMB_DIM outputs, so this is not a
# distribution over the target vocabulary. Confirm the intended output size.
dec_ops2 = dec_dense(dec_ops2)



Final decoder model


In [None]:
# Inference decoder: (decoder tokens, previous h, previous c) -> (dense output, new h, new c).
dec_model = Model(
    [dec_ips] + dec_ips_states,
    [dec_ops2] + dec_states2)


In [None]:
# word -> index lookups (plain-dict copies of the tokenizers' vocabularies)
eng_token_index = dict(eng_tokenizer.word_index)
hindi_token_index = dict(hindi_tokenizer.word_index)

# index -> word lookups, used to turn ids back into text
eng_reverse_word_map = {
    idx: word for word, idx in eng_tokenizer.word_index.items()}
hin_reverse_word_map = {
    idx: word for word, idx in hindi_tokenizer.word_index.items()}


In [None]:
def sequence_decoder(input):
    """Greedily decode one encoder input into a list of Hindi tokens.

    Args:
        input: encoder input for a single sample, as accepted by
            ``encoder_model.predict``. The name shadows the builtin but is
            kept so existing callers keep working.

    Returns:
        The decoded tokens in order. Decoding stops after ``'eos'`` has been
        appended, after SEQ_MAX_LEN tokens, or when the predicted index has
        no vocabulary entry (e.g. the padding index 0). That last case used
        to raise KeyError.
    """
    # Encode the input as state vectors.
    value_states = encoder_model.predict(input)

    # Target sequence of length 1, seeded with the start marker.
    seq_target = np.zeros((1, 1))
    seq_target[0, 0] = hindi_token_index['sos']

    # Sampling loop, assuming a batch of size 1.
    decoded = []
    while len(decoded) < SEQ_MAX_LEN:
        output_tokens, h, c = dec_model.predict(
            [seq_target] + value_states)

        # Greedy choice: most likely index at the last time step.
        token_index_sampled = int(np.argmax(output_tokens[0, -1, :]))
        char_sampled = hin_reverse_word_map.get(token_index_sampled)
        if char_sampled is None:
            # No word is registered under this index, so nothing can be emitted.
            break
        decoded.append(char_sampled)
        if char_sampled == 'eos':
            break

        # Feed the prediction and the new states back in for the next step.
        seq_target = np.zeros((1, 1))
        seq_target[0, 0] = token_index_sampled
        value_states = [h, c]

    return decoded


In [None]:
def token_word_converter(tokens, lang='english'):
    """Turn a sequence of word indices back into a list of words.

    The start marker and padding (index 0) are skipped; conversion stops at
    the end marker.

    Args:
        tokens: iterable of integer word indices.
        lang: ``'english'`` or ``'hindi'``; selects the vocabulary.

    Returns:
        List of words. Empty for any other ``lang`` value.
    """
    if lang == 'english':
        reverse_map, token_index = eng_reverse_word_map, eng_token_index
    elif lang == 'hindi':
        reverse_map, token_index = hin_reverse_word_map, hindi_token_index
    else:
        return []

    # Look the marker ids up instead of assuming they are 1 and 2: the
    # tokenizer orders indices by frequency, so a more frequent word could
    # take either slot. The old constants remain as fallbacks.
    sos_id = token_index.get('sos', 1)
    eos_id = token_index.get('eos', 2)

    sent = []
    for i in tokens:
        if i == eos_id:
            break
        if i == sos_id or i == 0:  # 0 is padding and has no word
            continue
        sent.append(reverse_map[i])
    return sent


In [None]:
# One test pair per step (batch_size=1); k tracks the index of the pair most
# recently drawn from gen_test and starts before the first one.
gen_test = batch_generator(pad_eng_test_seq, pad_hindi_test_seq, batch_size=1)
k = -1


In [None]:
# Advance to the next test pair and translate it. Each run of this cell
# consumes one item from gen_test, keeping k in step with the generator.
k += 1
(seq_ip, op_actual), _ = next(gen_test)
sent_decoded = sequence_decoder(seq_ip)

# Source sentence and reference translation of pair k, as word lists.
eng_sent = token_word_converter(pad_eng_test_seq[k], 'english')
hindi_sent = token_word_converter(pad_hindi_test_seq[k], 'hindi')




In [None]:
print("Input English sentence:", eng_sent)
print("Actual Hindi translation:", hindi_sent)
# The prediction is cut to the reference length for display; the decoder may
# keep generating until it reaches SEQ_MAX_LEN tokens.
print("Predicted Hindi Translation:", sent_decoded[:len(hindi_sent)])


Input English sentence: ['a', 'black', 'box', 'in', 'your', 'car']
Actual Hindi translation: ['आपकी', 'कार', 'में', 'ब्लैक', 'बॉक्स']
Predicted Hindi Translation: ['साथ', 'साथ', 'साथ', 'साथ', 'साथ']


In [None]:
# Word counts of the source, the reference and the raw (untruncated) prediction for pair k.
print(len(token_word_converter(pad_eng_test_seq[k:k+1][0], 'english')))
print(len(token_word_converter(pad_hindi_test_seq[k:k+1][0], 'hindi')))
print(len(sent_decoded))


6
5
300


In [None]:
# Mean unigram BLEU over the first `max_num` test sentences.
max_num = 100
# Never index past the end of the test set.
num_eval = min(max_num, len(pad_eng_test_seq))

bleu_total = 0.0
# range() visits exactly num_eval sentences (the old `while k < max_num` loop
# ran max_num + 1 times); tqdm replaces the per-iteration progress prints.
for k in tqdm(range(num_eval)):
    # Slice with k:k+1 to keep the batch dimension the encoder expects.
    sent_decoded = sequence_decoder(pad_eng_test_seq[k:k+1])
    hindi_sent = token_word_converter(pad_hindi_test_seq[k], 'hindi')

    # The reference never contains the end marker, so drop it from the
    # hypothesis too. The hypothesis is then cut to the reference length, as
    # before, because the decoder may run on until SEQ_MAX_LEN.
    hypothesis = [w for w in sent_decoded if w != 'eos'][:len(hindi_sent)]

    reference = [hindi_sent]
    bleu_total += sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0))

# Report the average per-sentence score, not the running sum.
bleu_score = bleu_total / num_eval if num_eval else 0.0

print("==============================")
print("Bleu Score is", round(bleu_score,4))
print("==============================")


Test:  0 out of 100
Test:  1 out of 100
Test:  2 out of 100
Test:  3 out of 100
Test:  4 out of 100


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Test:  5 out of 100
Test:  6 out of 100
Test:  7 out of 100
Test:  8 out of 100
Test:  9 out of 100
Test:  10 out of 100
Test:  11 out of 100
Test:  12 out of 100
Test:  13 out of 100
Test:  14 out of 100
Test:  15 out of 100
Test:  16 out of 100
Test:  17 out of 100
Test:  18 out of 100
Test:  19 out of 100
Test:  20 out of 100
Test:  21 out of 100
Test:  22 out of 100
Test:  23 out of 100
Test:  24 out of 100
Test:  25 out of 100
Test:  26 out of 100
Test:  27 out of 100
Test:  28 out of 100
Test:  29 out of 100
Test:  30 out of 100
Test:  31 out of 100
Test:  32 out of 100
Test:  33 out of 100
Test:  34 out of 100
Test:  35 out of 100
Test:  36 out of 100
Test:  37 out of 100
Test:  38 out of 100
Test:  39 out of 100
Test:  40 out of 100
Test:  41 out of 100
Test:  42 out of 100
Test:  43 out of 100
Test:  44 out of 100
Test:  45 out of 100
Test:  46 out of 100
Test:  47 out of 100
Test:  48 out of 100
Test:  49 out of 100
Test:  50 out of 100
Test:  51 out of 100
Test:  52 out of 1