In [29]:
# import necessary files
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from string import digits
import re
import string
import unicodedata

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import GRU, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

## Load data

In [5]:
data = pd.read_csv("data/Hindi_English_Truncated_Corpus.csv")
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [6]:
# shape of the dataset
data.shape

(127607, 3)

In [7]:
# extract data from ted only
ted_data = data[data['source']=='ted']
ted_data.shape

(39881, 3)

## Preprocess

In [8]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD',
                                                   s))
def preprocess_sentence(s, hindi=False):
    punctuations = string.punctuation
    digits = string.digits
    remove_digits = str.maketrans('', '', digits)
    
    # convert to lowercase
    s = s.lower()
    if not hindi:
        
        # convert to ascii
        s = unicode_to_ascii(s.strip())
#     s = re.sub(r"([?.!,¿])", r" \1 ", s )
#     s = re.sub(r'[" "]+', " ", s)
#     s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
#     s = s.rstrip().strip()
        # remove digits
        s = s.translate(remove_digits)
    else:
        s = re.sub("[२३०८१५७९४६]", "", s)
        
    # remove punctuations
    s = "".join([c for c in s if c not in punctuations])
    
    # remove extra spaces
    s = s.strip()
    
    return s
    

In [9]:
preprocess_sentence("This is 100 dollars")

'this is  dollars'

In [12]:
def create_dataset(path):
    data = pd.read_csv(path)
    data = data.dropna()
    data = data[data["source"] == "ted"]
    
    english_sentences, hindi_sentences = [], []
    
    for i, j in zip(data['english_sentence'], data["hindi_sentence"]):
        eng_sent = preprocess_sentence(i).split()
        hin_sent = preprocess_sentence(j, hindi=True).split()
        
        eng_sent.append("<end>")
        eng_sent.insert(0, "<start>")
        hin_sent.append("<end>")
        hin_sent.insert(0, "<start>")
        english_sentences.append(eng_sent)
        hindi_sentences.append(hin_sent)
        
    return english_sentences, hindi_sentences

In [13]:
english_sentences, hindi_sentences = create_dataset('data/Hindi_English_Truncated_Corpus.csv')
len(english_sentences), len(hindi_sentences)

(39881, 39881)

In [14]:
print("Source: {}\nTarget: {}".format(english_sentences[0],
                                      hindi_sentences[0]))

Source: ['<start>', 'politicians', 'do', 'not', 'have', 'permission', 'to', 'do', 'what', 'needs', 'to', 'be', 'done', '<end>']
Target: ['<start>', 'राजनीतिज्ञों', 'के', 'पास', 'जो', 'कार्य', 'करना', 'चाहिए', 'वह', 'करने', 'कि', 'अनुमति', 'नहीं', 'है', '<end>']


## Tokenize the data

In [15]:
def tokenize(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = keras.preprocessing.sequence.pad_sequences(
        tensor, padding='post')
    return tensor, lang_tokenizer

In [16]:
def load_dataset(path):
    source_lang, target_lang = create_dataset(path)
    input_tensor, input_tokenizer = tokenize(source_lang)
    target_tensor, target_tokenizer = tokenize(target_lang)
    return input_tensor, target_tensor, input_tokenizer, target_tokenizer


In [17]:
# load dataset
input_tensor, target_tensor, input_tokenizer, target_tokenizer = load_dataset(
    'data/Hindi_English_Truncated_Corpus.csv')
max_input_len = max([len(x) for x in input_tensor])
max_target_len = max([len(x) for x in target_tensor])

max_input_len, max_target_len

(23, 32)

## Create train and test dataset

In [18]:
input_tensor_train, input_tensor_val, \
target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor,
                                                          test_size=0.2, random_state=42)
input_tensor_train.shape, input_tensor_val.shape, \
target_tensor_train.shape, target_tensor_val.shape

((31904, 23), (7977, 23), (31904, 32), (7977, 32))

## Create TensorFlow Dataset

In [19]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_size_input = len(input_tokenizer.word_index)+1
vocab_size_target = len(target_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train,
                                              target_tensor_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## Build the Model

### Encoder

In [20]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units,
                batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(self.enc_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer="glorot_uniform")
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))
    
encoder = Encoder(vocab_size_input, embedding_dim, units, BATCH_SIZE)

### Attention Mechanism (BahdaunauAttention)

In [21]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

### Decoder

In [22]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units,
                 batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(dec_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer="glorot_uniform")
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden,
                                                           enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights
    
decoder = Decoder(vocab_size_target, embedding_dim, units, BATCH_SIZE)

## Define optimizer and loss function

In [23]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                         reduction='none')

def loss_function(true, pred):
    mask = tf.math.logical_not(tf.math.equal(true, 0))
    loss_ = loss_object(true, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [24]:
# Create checkpoints
checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training Model

In [25]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']]*BATCH_SIZE, 1)
        
        # teacher forcing method
        for i in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, i], predictions)
            dec_input = tf.expand_dims(targ[:, i], 1)
            
        batch_loss = (loss / int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss
    

In [28]:
EPOCHS = 2

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(epoch+1,
                                                         batch,
                                                         batch_loss.numpy()))
            
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print("Epoch {} Loss {:.4f}".format(epoch+1,
                                        total_loss / steps_per_epoch))
    print("Time taken for 1 epoch {} sec\n".format(time.time() - start))

Epoch 1 Batch 0 Loss 1.8431
Epoch 1 Batch 100 Loss 2.0377
Epoch 1 Batch 200 Loss 1.7499
Epoch 1 Batch 300 Loss 1.8334
Epoch 1 Batch 400 Loss 1.9309
Epoch 1 Batch 500 Loss 1.8146
Epoch 1 Batch 600 Loss 2.0090
Epoch 1 Batch 700 Loss 1.7578
Epoch 1 Batch 800 Loss 1.9163
Epoch 1 Batch 900 Loss 1.9670
Epoch 1 Loss 1.9109
Time taken for 1 epoch 581.5377941131592 sec

Epoch 2 Batch 0 Loss 1.7961
Epoch 2 Batch 100 Loss 1.8769
Epoch 2 Batch 200 Loss 1.9683
Epoch 2 Batch 300 Loss 2.0691
Epoch 2 Batch 400 Loss 2.1273
Epoch 2 Batch 500 Loss 1.8904
Epoch 2 Batch 600 Loss 2.1055
Epoch 2 Batch 700 Loss 1.8803
Epoch 2 Batch 800 Loss 2.0204
Epoch 2 Batch 900 Loss 1.8793
Epoch 2 Loss 1.8916
Time taken for 1 epoch 915.207731962204 sec



In [43]:
def evaluate(sentence):
    attention_plot = np.zeros((max_target_len, max_input_len))
    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index[i] for i in sentence.split()]
    inputs = pad_sequences([inputs], maxlen=max_input_len, padding="post")
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_output, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    for i in range(max_target_len):
        predictions, decc_hidden, attention_weights = decoder(dec_input, dec_hidden,
                                                              enc_output)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_tokenizer.index_word[predicted_id] + ' '
        if target_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence
        

In [44]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    print("Input: {}".format(sentence))
    print("Translated {}".format(result))

In [45]:
translate(u'politicians do not have permission to do what needs to be done.')

Input: politicians do not have permission to do what needs to be done
Translated और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और और 
