In [2]:
# Download the dataset
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1GeIz3GMuWIRTi_BtW6cYy39O0Mu3ujRF' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1GeIz3GMuWIRTi_BtW6cYy39O0Mu3ujRF" -O english_hindi.csv && rm -rf /tmp/cookies.txt

--2021-06-15 05:54:37--  https://docs.google.com/uc?export=download&confirm=&id=1GeIz3GMuWIRTi_BtW6cYy39O0Mu3ujRF
Resolving docs.google.com (docs.google.com)... 142.250.141.100, 142.250.141.138, 142.250.141.101, ...
Connecting to docs.google.com (docs.google.com)|142.250.141.100|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0g-5g-docs.googleusercontent.com/docs/securesc/juaufom0bqjtru32rs3dp7i0omnm4d72/1shdldmq4nujajhtadbqkqvt73ncgtbb/1623736425000/04727882715947363558/03236179916224479565Z/1GeIz3GMuWIRTi_BtW6cYy39O0Mu3ujRF?e=download [following]
--2021-06-15 05:54:38--  https://doc-0g-5g-docs.googleusercontent.com/docs/securesc/juaufom0bqjtru32rs3dp7i0omnm4d72/1shdldmq4nujajhtadbqkqvt73ncgtbb/1623736425000/04727882715947363558/03236179916224479565Z/1GeIz3GMuWIRTi_BtW6cYy39O0Mu3ujRF?e=download
Resolving doc-0g-5g-docs.googleusercontent.com (doc-0g-5g-docs.googleusercontent.com)... 142.250.141.132, 2607:f8b0:4023:c0b::84
Connectin

In [3]:
# import necessary files
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from string import digits
import re
import string
import unicodedata

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import GRU, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

## Preprocess

In [5]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD',
                                                   s))
def preprocess_sentence(s, hindi=False):
    punctuations = string.punctuation
    digits = string.digits
    remove_digits = str.maketrans('', '', digits)
    
    # convert to lowercase
    s = s.lower()
    s = s.translate(remove_digits)
    
    if not hindi:
        # convert to ascii
        s = unicode_to_ascii(s.strip())
    else:
        s = re.sub("[२३०८१५७९४६]", "", s)
        
    # remove punctuations
    s = "".join([c for c in s if c not in punctuations])
    
    # remove extra spaces
    s = s.strip()
    
    return s
    

In [6]:
preprocess_sentence("This is 100 dollars")

'this is  dollars'

In [7]:
def create_dataset(path):
    data = pd.read_csv(path)
    data = data.dropna()
    data = data[data["source"].isin(["ted", "tides"])]
    data["inp_len"] = data["english_sentence"].apply(lambda x: len(x.split()))
    data = data[data["inp_len"] <= 50]

    
    english_sentences, hindi_sentences = [], []
    
    for i, j in zip(data['english_sentence'], data["hindi_sentence"]):
        eng_sent = preprocess_sentence(i).split()
        hin_sent = preprocess_sentence(j, hindi=True).split()
        
        eng_sent.append("<end>")
        eng_sent.insert(0, "<start>")
        hin_sent.append("<end>")
        hin_sent.insert(0, "<start>")
        english_sentences.append(eng_sent)
        hindi_sentences.append(hin_sent)
        
    return english_sentences, hindi_sentences

In [8]:
english_sentences, hindi_sentences = create_dataset(path)
len(english_sentences), len(hindi_sentences)

(87776, 87776)

In [9]:
print("Source: {}\nTarget: {}".format(english_sentences[0],
                                      hindi_sentences[0]))

Source: ['<start>', 'politicians', 'do', 'not', 'have', 'permission', 'to', 'do', 'what', 'needs', 'to', 'be', 'done', '<end>']
Target: ['<start>', 'राजनीतिज्ञों', 'के', 'पास', 'जो', 'कार्य', 'करना', 'चाहिए', 'वह', 'करने', 'कि', 'अनुमति', 'नहीं', 'है', '<end>']


## Tokenize the data

In [10]:
def tokenize(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = keras.preprocessing.sequence.pad_sequences(
        tensor, padding='post')
    return tensor, lang_tokenizer

In [11]:
def load_dataset(path):
    source_lang, target_lang = create_dataset(path)
    input_tensor, input_tokenizer = tokenize(source_lang)
    target_tensor, target_tokenizer = tokenize(target_lang)
    return input_tensor, target_tensor, input_tokenizer, target_tokenizer


In [12]:
# load dataset
input_tensor, target_tensor, input_tokenizer, target_tokenizer = load_dataset(path)
max_input_len = max([len(x) for x in input_tensor])
max_target_len = max([len(x) for x in target_tensor])

max_input_len, max_target_len

(51, 95)

## Create train and test dataset

In [13]:
input_tensor_train, input_tensor_val, \
target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor,
                                                          test_size=0.2, random_state=42)
input_tensor_train.shape, input_tensor_val.shape, \
target_tensor_train.shape, target_tensor_val.shape

((70220, 51), (17556, 51), (70220, 95), (17556, 95))

## Create TensorFlow Dataset

In [14]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_size_input = len(input_tokenizer.word_index)+1
vocab_size_target = len(target_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train,
                                              target_tensor_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## Build the Model

### Encoder

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units,
                batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(self.enc_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer="glorot_uniform")
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))
    
encoder = Encoder(vocab_size_input, embedding_dim, units, BATCH_SIZE)

### Attention Mechanism (BahdaunauAttention)

In [16]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

### Decoder

In [17]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units,
                 batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(dec_units,
                       return_sequences=True,
                       return_state=True,
                       recurrent_initializer="glorot_uniform")
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden,
                                                           enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights
    
decoder = Decoder(vocab_size_target, embedding_dim, units, BATCH_SIZE)

## Define optimizer and loss function

In [18]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                         reduction='none')

def loss_function(true, pred):
    mask = tf.math.logical_not(tf.math.equal(true, 0))
    loss_ = loss_object(true, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [19]:
# Create checkpoints
checkpoint_dir = "/content/drive/MyDrive/eng_hindi/checkpoint2"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training Model

In [20]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']]*BATCH_SIZE, 1)
        
        # teacher forcing method
        for i in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, i], predictions)
            dec_input = tf.expand_dims(targ[:, i], 1)
            
        batch_loss = (loss / int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss
    

In [None]:
EPOCHS = 20

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(epoch+1,
                                                         batch,
                                                         batch_loss.numpy()))
            
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print("Epoch {} Loss {:.4f}".format(epoch+1,
                                        total_loss / steps_per_epoch))
    print("Time taken for 1 epoch {} sec\n".format(time.time() - start))

Epoch 1 Batch 0 Loss 2.0136
Epoch 1 Batch 100 Loss 1.1946
Epoch 1 Batch 200 Loss 1.2330
Epoch 1 Batch 300 Loss 1.2124
Epoch 1 Batch 400 Loss 1.3548
Epoch 1 Batch 500 Loss 1.2309
Epoch 1 Batch 600 Loss 1.2322
Epoch 1 Batch 700 Loss 1.3354
Epoch 1 Batch 800 Loss 1.2038
Epoch 1 Batch 900 Loss 1.1221
Epoch 1 Batch 1000 Loss 1.2161
Epoch 1 Loss 1.2186
Time taken for 1 epoch 806.9351568222046 sec

Epoch 2 Batch 0 Loss 1.1897
Epoch 2 Batch 100 Loss 1.0392
Epoch 2 Batch 200 Loss 1.0479
Epoch 2 Batch 300 Loss 1.0433
Epoch 2 Batch 400 Loss 1.1533
Epoch 2 Batch 500 Loss 0.9413
Epoch 2 Batch 600 Loss 1.0643
Epoch 2 Batch 700 Loss 1.0075
Epoch 2 Batch 800 Loss 1.0305
Epoch 2 Batch 900 Loss 1.1263
Epoch 2 Batch 1000 Loss 0.9490
Epoch 2 Loss 1.0888
Time taken for 1 epoch 688.0440459251404 sec

Epoch 3 Batch 0 Loss 0.8660
Epoch 3 Batch 100 Loss 0.9653
Epoch 3 Batch 200 Loss 0.9599
Epoch 3 Batch 300 Loss 0.8560
Epoch 3 Batch 400 Loss 1.1662
Epoch 3 Batch 500 Loss 1.0913
Epoch 3 Batch 600 Loss 1.1185
Ep

In [21]:
def evaluate(sentence):
    attention_plot = np.zeros((max_target_len, max_input_len))
    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index[i] for i in sentence.split()]
    inputs = pad_sequences([inputs], maxlen=max_input_len, padding="post")
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_output, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    for i in range(max_target_len):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden,
                                                              enc_output)
        predicted_id = tf.argmax(predictions[0]).numpy()
        if target_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence
        result += target_tokenizer.index_word[predicted_id] + ' '
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence
        

In [22]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    print("Input: {}".format(sentence))
    print("Translated {}".format(result))

In [23]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1d8f8b4850>

In [24]:
translate(u'politicians do not have permission to do what needs to be done.')

Input: politicians do not have permission to do what needs to be done
Translated वे नेता नहीं हैं कि वे क्या करना चाहिए 


In [27]:
translate(u"where are you from?")

Input: where are you from
Translated जहां आप 
