## About Project
This is an English to Spanish Translator. The model is a LSTM based encoder-decoder with Bahdanau (Additive) Attention Mechanism. The model is trained on 45,000 sentence pairs.

## Importing Libraries

In [3]:
## Importing Libraries
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Layer
from tensorflow.keras.models import Model

## Loading the Dataset
The csv file contains 1.5 lakh entries. We train our model on subset of 60,000 entries randomly selected from it.


In [4]:

# 1. Load CSV & Set Columns
df = pd.read_csv("data.csv")
df.columns = ['English', 'Spanish']
df = df.dropna()
df = df.sample(n=60000, random_state=42).reset_index(drop=True)



## Visualising Dataset

In [5]:
df.head()

Unnamed: 0,English,Spanish
0,How boring!,¡Qué aburrimiento!
1,I love sports.,Adoro el deporte.
2,Would you like to swap jobs?,¿Te gustaría que intercambiemos los trabajos?
3,My mother did nothing but weep.,Mi madre no hizo nada sino llorar.
4,Croatia is in the southeastern part of Europe.,Croacia está en el sudeste de Europa.


## Preprocessing Text
Lowercasing,Punctuation Removal, Addition of <start> and <end> tokens.

In [None]:
# Preprocessing function
def preprocess_text(text, filters=''):
    text = text.lower().strip()
    if filters != '':
        text = re.sub(filters, ' ', text)
    text = re.sub(r"\s+", " ", text)
    return text

df['English'] = df['English'].apply(lambda x: preprocess_text(x, filters='[^a-zA-ZÀ-ÿ0-9?.!,¿]+'))
df['Spanish'] = df['Spanish'].apply(lambda x: "<start> " + preprocess_text(x, filters='[^a-zA-ZÀ-ÿ0-9?.!,¿]+') + " <end>")

## Tokenization and Padding of Sequences

In [None]:
# Tokenization & Padding
eng_tokenizer = Tokenizer(num_words=20000, oov_token='<unk>',filters='')
eng_tokenizer.fit_on_texts(df['English'])
X_enc = eng_tokenizer.texts_to_sequences(df['English'])

sp_tokenizer = Tokenizer(num_words=20000, oov_token='<unk>',filters='')
sp_tokenizer.fit_on_texts(df['Spanish'])
X_dec = sp_tokenizer.texts_to_sequences(df['Spanish'])

max_enc_len = max(len(seq) for seq in X_enc)
max_dec_len = max(len(seq) for seq in X_dec)

X_enc = pad_sequences(X_enc, maxlen=max_enc_len, padding='post')
X_dec = pad_sequences(X_dec, maxlen=max_dec_len, padding='post')

y_dec = np.zeros_like(X_dec)
y_dec[:, :-1] = X_dec[:, 1:]
y_dec[:, -1] = sp_tokenizer.word_index['<end>']

## Train and Validation Dataset Split

In [None]:
# Train-validation split
X_train_enc, X_val_enc, X_train_dec, X_val_dec, y_train, y_val = train_test_split(
    X_enc, X_dec, y_dec, test_size=0.2, random_state=42
)


## Bahdanau Attention Mechanism from scratch

In [None]:
# Bahdanau Attention Layer

class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query: decoder outputs (batch, T_dec, units)
        # values: encoder outputs (batch, T_enc, units)
        query_with_time_axis = tf.expand_dims(query, 2)  # (batch, T_dec, 1, units)
        values_with_time_axis = tf.expand_dims(values, 1)  # (batch, 1, T_enc, units)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=2)  # (batch, T_dec, T_enc, 1)
        context_vector = tf.matmul(tf.squeeze(attention_weights, -1), values)  # (batch, T_dec, units)
        return context_vector


## LSTM Based Encoder-Decoder with Attention

In [None]:
#  Build LSTM Encoder-Decoder with Attention
units = 128
embedding_dim = 128
enc_vocab_size = len(eng_tokenizer.word_index) + 1
dec_vocab_size = len(sp_tokenizer.word_index) + 1

# Encoder
enc_in = Input(shape=(None,))
enc_emb = Embedding(enc_vocab_size, embedding_dim)(enc_in)
enc_out, enc_state_h, enc_state_c = LSTM(units, return_sequences=True, return_state=True)(enc_emb)

# Decoder
dec_in = Input(shape=(None,))
dec_emb = Embedding(dec_vocab_size, embedding_dim)(dec_in)
dec_out, _, _ = LSTM(units, return_sequences=True, return_state=True)(dec_emb, initial_state=[enc_state_h, enc_state_c])

# Attention
attention_layer = BahdanauAttention(units)
context = attention_layer(dec_out, enc_out)
concat = Concatenate(axis=-1)([dec_out, context])

# Output
logits = Dense(dec_vocab_size, activation='softmax')(concat)

## Model Description

In [None]:
# Model
model = Model([enc_in, dec_in], logits)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


## Creating Tensorflow Dataset

In [None]:
# tf.data.Dataset

batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices(((X_train_enc, X_train_dec), np.expand_dims(y_train, -1)))
train_dataset = train_dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices(((X_val_enc, X_val_dec), np.expand_dims(y_val, -1)))
val_dataset = val_dataset.batch(batch_size, drop_remainder=True)


## Training the Model

In [None]:
# Train

history = model.fit(train_dataset, validation_data=val_dataset, epochs=10)

Epoch 1/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 212ms/step - accuracy: 0.8487 - loss: 1.8191 - val_accuracy: 0.8911 - val_loss: 0.7675
Epoch 2/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 212ms/step - accuracy: 0.8940 - loss: 0.7318 - val_accuracy: 0.8998 - val_loss: 0.6737
Epoch 3/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 212ms/step - accuracy: 0.9027 - loss: 0.6335 - val_accuracy: 0.9088 - val_loss: 0.5997
Epoch 4/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 212ms/step - accuracy: 0.9127 - loss: 0.5473 - val_accuracy: 0.9169 - val_loss: 0.5373
Epoch 5/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 212ms/step - accuracy: 0.9219 - loss: 0.4665 - val_accuracy: 0.9234 - val_loss: 0.4851
Epoch 6/10
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 212ms/step - accuracy: 0.9308 - loss: 0.3926 - val_accuracy: 0.9298 - val_loss: 0.4426
Epoc

## Translate a few Sentences

In [None]:
def translate(sentence, max_dec_len=max_dec_len):
    # Preprocess input
    sentence = preprocess_text(sentence)
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_enc_len, padding='post')

    # Decoder starts with <start> token
    start_id = sp_tokenizer.word_index['<start>']
    end_id = sp_tokenizer.word_index['<end>']

    dec_seq = np.zeros((1, max_dec_len))
    dec_seq[0, 0] = start_id  # put <start> at first position

    words = []
    for i in range(1, max_dec_len):
        pred = model.predict([seq, dec_seq], verbose=0)
        pred_id = np.argmax(pred[0, i-1])  # predict next word
        if pred_id == end_id:
            break
        if pred_id != 0:  # ignore padding
            words.append(sp_tokenizer.index_word.get(pred_id, ''))
        dec_seq[0, i] = pred_id  # feed predicted token back into decoder

    return ' '.join(words)

# Translate Some Example Sentences

for sentence in df['English'][:10]:
    translation = translate(sentence, max_dec_len)
    print(f"English: {sentence}")
    print(f"Predicted Spanish: {translation}")
    print("-" * 40)


English: how boring!
Predicted Spanish: qué <unk>
----------------------------------------
English: i love sports.
Predicted Spanish: me encanta los deportes.
----------------------------------------
English: would you like to swap jobs?
Predicted Spanish: ¿te gustaría que intercambiemos los trabajos?
----------------------------------------
English: my mother did nothing but weep.
Predicted Spanish: mi madre no hizo nada más que llorar.
----------------------------------------
English: croatia is in the southeastern part of europe.
Predicted Spanish: pekín es en el volumen de europa.
----------------------------------------
English: i have never eaten a mango before.
Predicted Spanish: nunca he comido un mango.
----------------------------------------
English: tell the taxi driver to drive faster.
Predicted Spanish: dile al que <unk> a un <unk> y a john.
----------------------------------------
English: tom and i work together.
Predicted Spanish: tom y yo trabajamos juntos.
----------

## Evaluating on Validation Set(Test Accuracy) 94%

In [None]:
#  Evaluate on Validation Set

val_loss, val_acc = model.evaluate(val_dataset)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")

[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 173ms/step - accuracy: 0.9400 - loss: 0.3787
Validation Loss: 0.3734
Validation Accuracy: 0.9404


## BLEU Score on 500 samples

In [None]:
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu(model, X_enc_samples, X_dec_samples, sp_tokenizer, max_enc_len, max_dec_len, n_samples=500):
    """
    Parameters:
    1.model: trained Keras seq2seq model
    2.X_enc_samples: encoder input sequences (English)
    3.X_dec_samples: decoder target sequences (Spanish)
    4.sp_tokenizer: Spanish tokenizer
    5.max_enc_len, max_dec_len: maximum sequence lengths
    6.n_samples: number of samples to evaluate (default 500)

    Returns:
    - BLEU score (corpus-level)
    """
    start_token = sp_tokenizer.word_index['<start>']
    end_token = sp_tokenizer.word_index['<end>']

    references = []
    predictions = []

    for i in range(min(n_samples, len(X_enc_samples))):
        enc_seq = X_enc_samples[i:i+1]  # batch of 1
        dec_seq_true = X_dec_samples[i]

        # Reference sequence (remove <start> and <end>)
        ref_tokens = [sp_tokenizer.index_word[tok]
                      for tok in dec_seq_true if tok > 0 and tok != start_token and tok != end_token]
        references.append([ref_tokens])

        # Generate translation
        dec_input = np.array([[start_token]])
        translated = []

        for _ in range(max_dec_len):
            preds = model.predict([enc_seq, dec_input], verbose=0)
            next_token = np.argmax(preds[0, -1, :])
            if next_token == end_token:
                break
            translated.append(next_token)
            dec_input = np.append(dec_input, [[next_token]], axis=1)

        pred_tokens = [sp_tokenizer.index_word.get(tok, '') for tok in translated]
        predictions.append(pred_tokens)

    # Compute corpus BLEU score
    bleu_score = corpus_bleu(references, predictions)
    return bleu_score


In [None]:
bleu_score = evaluate_bleu(
    model,
    X_val_enc,
    X_val_dec,
    sp_tokenizer,
    max_enc_len,
    max_dec_len,
    n_samples=500
)


Validation BLEU score (on 500 samples): 0.2201


In [None]:

print(f"Validation BLEU score (on 500 samples): {bleu_score*100:.4f}")

Validation BLEU score (on 500 samples): 22.0135


## Conclusion:
 Accuracy of my Model is 94% on validtion dataset indicating it is predicting the right words.It also shows that model has understood the vocabulary well and has achieved token level fluency. But,the BLEU score is not upto the mark showing grammatical inaccuracies and overfitting to frequent words.
 ## Possible Updates to Model:
 Use of Bidirectional Encoder, Tuning Embedding and hidden sizes. Trying different attention mechanisms like Luong. Adding Layer Normalisation.

 Use of Advanced models like Transformers with multihead attention.