In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
dataset_list = []

with open("hindencorp05.plaintext", encoding="utf-8") as f:
    for line in f:
        splits = line.strip().split("\t")
        if len(splits) == 5:
            dataset_list.append({
                "id": splits[0],
                "source": splits[1],
                "alignment_type": splits[2],
                "alignment_quality": splits[3],
                "translation": {"en": splits[3], "hi": splits[4]},
            })

print(dataset_list[0])  # Print the first 5 example
print("Hindi:", dataset_list[0]['translation']['hi'])
print(len(dataset_list))
dataset_list = dataset_list[:100]
print(len(dataset_list))

{'id': 'wikiner2013inflected', 'source': '1-1', 'alignment_type': '1.000', 'alignment_quality': 'Sharaabi', 'translation': {'en': 'Sharaabi', 'hi': 'शराबी'}}
Hindi: शराबी
273885
100


In [None]:
def preprocess_dataset(dataset):
    eng_sentences = []
    hin_sentences = []
    for entry in dataset:
        eng_sentences.append(entry['translation']['en'].lower())
        hin_sentences.append(entry['translation']['hi'])
    return eng_sentences, hin_sentences

# Preprocess the dataset
eng_sentences, hin_sentences = preprocess_dataset(dataset_list)

print(len(eng_sentences),"English Sentences:", eng_sentences[:5])
print(len(hin_sentences),"Hindi Sentences:", hin_sentences[:5])

100 English Sentences: ['sharaabi', 'politicians do not have permission to do what needs to be done.', "i'd like to tell you about one such child,", 'this percentage is even greater than the percentage in india.', '- john collins']
100 Hindi Sentences: ['शराबी', 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .', 'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,', 'यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।', '- जॉन कॉलिन्स']


In [None]:
eng_sentences = ["<start> " + s + " <end>" for s in eng_sentences]
hin_sentences = ["<start> " + s + " <end>" for s in hin_sentences]

# Tokenize and pad sequences
def tokenize(sentences):
    tokenizer = Tokenizer(filters="")
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    return sequences, tokenizer

eng_sequences, eng_tokenizer = tokenize(eng_sentences)
hin_sequences, hin_tokenizer = tokenize(hin_sentences)

eng_sequences = pad_sequences(eng_sequences, padding='post')
hin_sequences = pad_sequences(hin_sequences, padding='post')


In [None]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

# Model Parameters
embedding_dim = 256
units = 512

In [None]:
# Encoder
class Encoder(Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c

In [None]:
# Attention
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [None]:
# Decoder
class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = BahdanauAttention(units)

    def call(self, x, enc_output, state_h, state_c):
        context_vector, attention_weights = self.attention(state_h, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=[state_h, state_c])
        output = self.fc(output)
        return output, state_h, state_c, attention_weights

In [None]:
# Instantiate models
encoder = Encoder(eng_vocab_size, embedding_dim, units)
decoder = Decoder(hin_vocab_size, embedding_dim, units)

# Loss function and optimizer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.not_equal(real, 0)
    loss = loss_object(real, pred)
    return tf.reduce_mean(loss * tf.cast(mask, dtype=loss.dtype))

optimizer = tf.keras.optimizers.Adam()

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        # Encode input
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp)
        dec_hidden_h, dec_hidden_c = enc_hidden_h, enc_hidden_c
        
        # Initial decoder input (<start> tokens)
        dec_input = tf.expand_dims([hin_tokenizer.word_index['<start>']] * targ.shape[0], 1)
        
        # Iterate through each timestep
        for t in range(1, targ.shape[1]):
            # Forward pass
            predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, enc_output, dec_hidden_h, dec_hidden_c)
            
            # Adjust shape for loss calculation (remove time dimension)
            predictions = tf.squeeze(predictions, axis=1)  # Shape: (batch_size, vocab_size)
            
            # Calculate loss
            loss += loss_function(targ[:, t], predictions)
            
            # Use the true target as the next decoder input
            dec_input = tf.expand_dims(targ[:, t], 1)
    
    # Normalize the loss
    batch_loss = loss / int(targ.shape[1])
    
    # Backpropagation
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

In [None]:
EPOCHS = 10
BATCH_SIZE = 64

dataset = tf.data.Dataset.from_tensor_slices((eng_sequences, hin_sequences)).shuffle(len(eng_sequences))
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset):
        batch_loss = train_step(inp, targ, None)
        total_loss += batch_loss
    print(f'Epoch {epoch+1} Loss {total_loss:.4f}')

Epoch 1 Loss 1.1214
Epoch 2 Loss 0.9258
Epoch 3 Loss 1.0127
Epoch 4 Loss 0.9488
Epoch 5 Loss 0.8837
Epoch 6 Loss 0.9647
Epoch 7 Loss 0.9644
Epoch 8 Loss 0.9235
Epoch 9 Loss 1.0321
Epoch 10 Loss 1.0486


In [None]:
import os

save_path = "translator_model/"
os.makedirs(save_path, exist_ok=True)

# Save encoder and decoder weights
encoder.save_weights(os.path.join(save_path, "encoder.weights.h5"))
decoder.save_weights(os.path.join(save_path, "decoder.weights.h5"))


In [None]:
import re
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", r" ", sentence)
    sentence = re.sub(r"\s+", " ", sentence).strip()
    return sentence


In [None]:
def translate(sentence, encoder, decoder, eng_tokenizer, hin_tokenizer, max_length_targ):
    # Preprocess the input sentence
    sentence = preprocess_sentence(sentence)  # Ensure it's lowercased and cleaned
    inputs = eng_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_targ, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    # Encode the input sentence
    enc_out, enc_hidden_h, enc_hidden_c = encoder(inputs)
    
    # Prepare for decoding
    dec_hidden_h, dec_hidden_c = enc_hidden_h, enc_hidden_c
    dec_input = tf.expand_dims([hin_tokenizer.word_index['<start>']], 0)
    
    # Generate translation
    result = []
    for _ in range(max_length_targ):
        predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, enc_out, dec_hidden_h, dec_hidden_c)
        predictions = tf.argmax(predictions, axis=-1).numpy()
        
        # Get the predicted word index
        predicted_id = predictions[0][0]
        if hin_tokenizer.index_word[predicted_id] == '<end>':
            break
        
        # Append the word to the result
        result.append(hin_tokenizer.index_word[predicted_id])
        
        # Use the predicted word as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)
    
    return ' '.join(result)


In [None]:
# Example input
input_sentence = "How are you?"

# Translate
translated_sentence = translate(input_sentence, encoder, decoder, eng_tokenizer, hin_tokenizer, max_length_targ=20)
print(f'Input: {input_sentence}')
print(f'Translation: {translated_sentence}')


Input: How are you?
Translation: 
