In [None]:
# Cell 1: Import required libraries
import re
import string

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
# Configure GPU settings
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("✗ No GPU detected - will run on CPU")
else:
    try:
        # Memory growth keeps TensorFlow from allocating all GPU memory up front
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Report the failure and carry on instead of aborting the notebook
        print(err)
    else:
        print(f"✓ {len(gpus)} GPU(s) detected and configured")
        print(f"GPU Details: {gpus}")

# Verify CUDA: report the build flag and the installed TensorFlow version
print(f"TensorFlow built with CUDA: {tf.test.is_built_with_cuda()}")
print(f"TensorFlow version: {tf.__version__}")


✓ 1 GPU(s) detected and configured
GPU Details: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow built with CUDA: True
TensorFlow version: 2.19.0


In [None]:
# Cell 2: Set hyperparameters and configuration

# --- Data ---
DATASET_PATH = "hin.txt"   # tab-separated sentence-pair file read in Cell 3
MAX_SAMPLES = 15_000       # upper bound on the number of lines taken from the file

# --- Model ---
EMBED_SIZE = 128           # output dimension of both Embedding layers
LSTM_UNITS = 256           # units of the encoder and decoder LSTMs

# --- Training ---
BATCH_SIZE = 64            # batch size of the tf.data training pipeline
TRAINING_EPOCHS = 100      # epochs passed to translation_model.fit


In [None]:
# Cell 3: Load and preprocess the dataset
def clean_text(text):
    """Normalise a sentence and wrap it in ``[start]`` / ``[end]`` markers.

    Steps:
      1. lowercase the text;
      2. put spaces around ``? . ! , ¿`` so each becomes its own token;
      3. collapse every run of spaces and double quotes into one space;
      4. trim the result, then add the markers.

    Trimming happens *after* steps 2-3. Those steps can introduce leading or
    trailing spaces (e.g. for a sentence ending in "."), which previously
    produced a double space before ``[end]``.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned sentence in the form ``"[start] <tokens> [end]"``.
    """
    text = text.lower()
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # The character class matches spaces and double quotes, so quotes are dropped here.
    text = re.sub(r'[" ]+', " ", text)
    return "[start] " + text.strip() + " [end]"

# Parallel lists: source_sentences[i] (Hindi) pairs with target_sentences[i] (English).
source_sentences = []
target_sentences = []

with open(DATASET_PATH, "r", encoding="utf-8") as file:
    data_lines = file.read().split("\n")

# Cap the number of lines with a plain slice. The earlier `len(data_lines) - 1`
# bound dropped the final real sample whenever the file lacked a trailing
# newline; empty or malformed lines are already skipped by the column check.
for line in data_lines[:MAX_SAMPLES]:
    split_parts = line.split("\t")
    if len(split_parts) >= 2:
        # Column 0 is English, column 1 is Hindi; any further columns are ignored.
        english_sentence, hindi_sentence = split_parts[0], split_parts[1]
        source_sentences.append(hindi_sentence)
        target_sentences.append(clean_text(english_sentence))

# Split data into train and test sets (80-20 split)
# `train_test_split` is imported with the other dependencies in the imports cell.

# Fail early with an actionable message instead of an opaque error from the splitter.
if not source_sentences:
    raise ValueError(
        f"No sentence pairs were loaded from {DATASET_PATH!r}; "
        "expected tab-separated lines with at least two columns"
    )

# A fixed random_state keeps the split identical between runs.
train_source, test_source, train_target, test_target = train_test_split(
    source_sentences,
    target_sentences,
    test_size=0.2,
    random_state=42
)

print(f"Total samples: {len(source_sentences)}")
print(f"Training samples: {len(train_source)}")
print(f"Testing samples: {len(test_source)}")


Total samples: 3116
Training samples: 2492
Testing samples: 624


In [None]:
# Cell 4: Create text vectorization layers (using only training data)
# NOTE(review): both names are re-created and re-adapted in Cell 5 (there the
# target layer gets a custom `standardize`), so the objects built here are
# superseded once Cell 5 runs.
source_vectorization = layers.TextVectorization(
    output_sequence_length=20, output_mode="int", max_tokens=5000
)
target_vectorization = layers.TextVectorization(
    output_sequence_length=20, output_mode="int", max_tokens=5000
)

# Adapt only on training data to prevent data leakage
for vectorizer, corpus in (
    (source_vectorization, train_source),
    (target_vectorization, train_target),
):
    vectorizer.adapt(corpus)


In [None]:
# Cell 5: Define custom text standardization and prepare dataset
def standardize_text(text_input):
    """Lowercase the input and delete ASCII punctuation, keeping ``[`` and ``]``.

    Square brackets are left alone so the ``[start]`` / ``[end]`` markers
    survive as tokens. Used as the ``standardize`` callable of the target
    TextVectorization layer.

    Args:
        text_input: String tensor to normalise.

    Returns:
        The lowercased string tensor with the listed punctuation removed.
    """
    return tf.strings.regex_replace(
        tf.strings.lower(text_input),
        "[!\"#$%&'()*+,-./:;<=>?@\\^_`{|}~]",
        "",
    )

# Hindi side: relies on the layer's default standardization.
source_vectorization = layers.TextVectorization(
    output_sequence_length=20,
    output_mode="int",
    max_tokens=5000,
)
# English side: custom standardization so the [start]/[end] brackets are kept.
target_vectorization = layers.TextVectorization(
    standardize=standardize_text,
    output_sequence_length=20,
    output_mode="int",
    max_tokens=5000,
)

# Vocabularies are learned from the training split only
source_vectorization.adapt(train_source)
target_vectorization.adapt(train_target)

def prepare_training_batch(hindi, english):
    """Vectorise one batch and arrange it for teacher forcing.

    The decoder is fed the target tokens without the last position and is
    trained to predict the same tokens shifted one step to the left.

    Args:
        hindi: Batch of source sentences (mapped over a batched dataset).
        english: Batch of target sentences wrapped in [start]/[end].

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a dict keyed by the model's
        input names and ``labels`` are the target tokens from position 1 on.
    """
    source_tokens = source_vectorization(hindi)
    target_tokens = target_vectorization(english)
    model_inputs = {
        "encoder_inputs": source_tokens,
        "decoder_inputs": target_tokens[:, :-1],
    }
    labels = target_tokens[:, 1:]
    return model_inputs, labels

# Create training dataset only from training split.
# Shuffle individual sentence pairs *before* batching: shuffling after `batch`
# only reorders whole batches, so every epoch would see the same sentences
# grouped together. A buffer as large as the dataset gives a full shuffle.
training_data = tf.data.Dataset.from_tensor_slices((train_source, train_target))
training_data = (
    training_data
    .shuffle(max(len(train_source), 1))
    .batch(BATCH_SIZE)
    .map(prepare_training_batch)
    .prefetch(16)
)


In [None]:
# Cell 6: Build the encoder-decoder architecture
# NOTE(review): Cell 8 fetches a layer by position (`translation_model.layers[3]`),
# so the order in which layers are created here presumably matters — confirm
# before reordering any statement in this cell.

# --- Encoder ---
# The input name must match the "encoder_inputs" key produced by prepare_training_batch.
enc_input = keras.Input(shape=(None,), name="encoder_inputs")
# 5000 mirrors max_tokens of the source TextVectorization layer; mask_zero=True
# treats token id 0 as padding.
enc_embedding = layers.Embedding(5000, EMBED_SIZE, mask_zero=True)(enc_input)

# return_state=True also yields the final hidden and cell states; enc_output
# itself is not used by any later cell visible here.
enc_output, hidden_state, cell_state = layers.LSTM(
    LSTM_UNITS,
    return_state=True,
    dropout=0.2
)(enc_embedding)
# The encoder's final states seed the decoder LSTM below.
encoder_final_states = [hidden_state, cell_state]

# --- Decoder ---
# The input name must match the "decoder_inputs" key produced by prepare_training_batch.
dec_input = keras.Input(shape=(None,), name="decoder_inputs")
# 5000 mirrors max_tokens of the target TextVectorization layer.
dec_embedding = layers.Embedding(5000, EMBED_SIZE, mask_zero=True)(dec_input)
# The layer object is kept in a variable because Cell 8 calls it again for inference.
decoder_lstm_layer = layers.LSTM(
    LSTM_UNITS,
    return_sequences=True,
    return_state=True,
    dropout=0.2
)
# During training only the per-step outputs are needed; the returned states are discarded.
dec_output, _, _ = decoder_lstm_layer(dec_embedding, initial_state=encoder_final_states)
# Softmax over the 5000-entry target vocabulary; reused by the inference decoder in Cell 8.
output_layer = layers.Dense(5000, activation="softmax")
final_output = output_layer(dec_output)


In [None]:
# Cell 7: Compile and train the model
# Training model: (encoder tokens, decoder tokens) -> per-step vocabulary distribution.
translation_model = keras.Model(
    inputs=[enc_input, dec_input],
    outputs=final_output,
)
# Labels are integer token ids, hence the sparse variant of cross-entropy.
translation_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
translation_model.fit(training_data, epochs=TRAINING_EPOCHS)


Epoch 1/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.2224 - loss: 7.7398
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.4066 - loss: 5.1385
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1188 - loss: 4.7674
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1209 - loss: 4.6168
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.1231 - loss: 4.4800
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1248 - loss: 4.3926
Epoch 7/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.1272 - loss: 4.2924
Epoch 8/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.1302 - loss: 4.1878
Epoch 9/100
[1m39/39[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7dad2a6fd2e0>

In [None]:
# Cell 8: Build inference models for translation
# Encoder for inference: source tokens -> [hidden_state, cell_state] of the trained encoder.
inference_encoder = keras.Model(enc_input, encoder_final_states)

# Placeholders for the decoder's recurrent state, so it can be fed back in
# step by step during decoding.
dec_hidden_input = keras.Input(shape=(LSTM_UNITS,))
dec_cell_input = keras.Input(shape=(LSTM_UNITS,))
decoder_state_inputs = [dec_hidden_input, dec_cell_input]

# NOTE(review): layers[3] is presumably the decoder Embedding created in Cell 6;
# this positional lookup silently picks a different layer if the model's layer
# order ever changes — confirm, or keep a direct reference to the layer instead.
dec_embedded = translation_model.layers[3](dec_input)
# Reuse the trained decoder LSTM, now driven by externally supplied states.
dec_out, h_state, c_state = decoder_lstm_layer(
    dec_embedded,
    initial_state=decoder_state_inputs
)
decoder_final_states = [h_state, c_state]
# Reuse the trained softmax projection.
dec_predictions = output_layer(dec_out)
# Inference decoder: (tokens, h, c) -> (token probabilities, new h, new c).
inference_decoder = keras.Model(
    [dec_input] + decoder_state_inputs,
    [dec_predictions] + decoder_final_states
)

# Index -> word lookup table used to decode predicted token ids.
vocabulary = target_vectorization.get_vocabulary()

def translate_sentence(hindi_input, max_decode_steps=21):
    """Greedily translate one Hindi sentence with the inference encoder/decoder.

    The sentence is encoded once. The decoder is then run one token at a time,
    starting from ``[start]`` and feeding each predicted token and the updated
    LSTM states back in, until ``[end]`` is predicted or the step budget is spent.

    The loop is bounded by the number of decoder steps, not by the number of
    words produced. The earlier word-count check never advanced when the model
    kept predicting the empty padding token, so the loop could run forever.
    Empty tokens are now skipped in the output.

    Args:
        hindi_input: Source sentence as a plain string.
        max_decode_steps: Maximum number of decoder steps. The default of 21
            equals the longest output the earlier word-count limit allowed.

    Returns:
        The predicted words joined by single spaces (may be an empty string).
    """
    vectorized_input = source_vectorization([hindi_input])
    # The encoder's final [hidden, cell] states initialise the decoder.
    decoder_states = list(inference_encoder.predict(vectorized_input, verbose=0))

    # The decoder consumes exactly one token per step, so a (1, 1) buffer is reused.
    decoder_input_seq = np.zeros((1, 1))
    decoder_input_seq[0, 0] = vocabulary.index("[start]")

    translated_words = []
    for _ in range(max_decode_steps):
        predictions, h, c = inference_decoder.predict(
            [decoder_input_seq] + decoder_states,
            verbose=0
        )
        predicted_token = int(np.argmax(predictions[0, -1, :]))
        predicted_word = vocabulary[predicted_token]

        if predicted_word == "[end]":
            break

        # Vocabulary entry "" is the padding token: feed it back, but do not emit it.
        if predicted_word:
            translated_words.append(predicted_word)
        decoder_input_seq[0, 0] = predicted_token
        decoder_states = [h, c]

    return " ".join(translated_words)


In [None]:
# Cell 9: Test the model with random samples from TEST set
print("\n" + "=" * 50)
print("MODEL TRANSLATION RESULTS (TEST SET)")
print("=" * 50)

# A seeded permutation makes the showcase reproducible between runs and never
# shows the same sentence twice. Slicing also copes with a test set that has
# fewer than 10 entries (including an empty one).
sample_indices = np.random.default_rng(42).permutation(len(test_source))[:10]

for idx, rand_idx in enumerate(sample_indices):
    hindi_test = test_source[rand_idx]

    predicted_translation = translate_sentence(hindi_test)

    # Drop the [start]/[end] markers that clean_text added to every target.
    ground_truth = test_target[rand_idx].replace("[start]", "").replace("[end]", "").strip()

    print(f"\nTest Case {idx + 1}:")
    print(f" > Original (Hindi): {hindi_test}")
    print(f" > Expected (English): {ground_truth}")
    print(f" > Predicted (English): {predicted_translation}")



MODEL TRANSLATION RESULTS (TEST SET)

Test Case 1:
 > Original (Hindi): मेरे चाचा क्रिकेट के शौकिया खिलाड़ी हैं।
 > Expected (English): my uncle is an amateur cricket player .
 > Predicted (English): my uncle is an amateur cricket player

Test Case 2:
 > Original (Hindi): मैं अगले सोमवार आके ले जाऊंगा।
 > Expected (English): i'll come pick it up next monday .
 > Predicted (English): ill be back at a bank

Test Case 3:
 > Original (Hindi): उसका पर्स उससे चुरा लिया गया।
 > Expected (English): she was robbed of her purse .
 > Predicted (English): he went to india by her

Test Case 4:
 > Original (Hindi): चीनी गर्म कॉफी में घुल जाती है।
 > Expected (English): sugar dissolves in hot coffee .
 > Predicted (English): your sister is very fond of music

Test Case 5:
 > Original (Hindi): मैं अंग्रेज़ी पढ़ सकती हूँ।
 > Expected (English): i can read english .
 > Predicted (English): i will explain the manager

Test Case 6:
 > Original (Hindi): उसका जन्मदिन इक्कीस अगस्त को है।
 > Expected (Englis

In [None]:
# Cell 10: Calculate and display test accuracy on TEST SET
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "COMPUTING TEST SET ACCURACY", "=" * 50, sep="\n")

def calculate_word_accuracy(reference, hypothesis):
    """Return the fraction of reference words matched at the same position.

    Both strings are lowercased and split on whitespace. Words are compared
    pairwise up to the shorter length, and the number of matches is divided
    by the number of reference words.

    Args:
        reference: Expected sentence.
        hypothesis: Predicted sentence.

    Returns:
        A float in [0, 1]; 0.0 when the reference has no words.
    """
    expected_tokens = reference.lower().split()
    produced_tokens = hypothesis.lower().split()

    if not expected_tokens:
        return 0.0

    hits = 0
    for expected, produced in zip(expected_tokens, produced_tokens):
        if expected == produced:
            hits += 1
    return hits / len(expected_tokens)

# Evaluate on the entire test set (unseen data)

# The target vocabulary is built through standardize_text, which deletes these
# punctuation characters, so translate_sentence can never emit them. References
# are normalised the same way; otherwise tokens such as "." or "i'll" could
# never match, deflating word accuracy and ruling out exact matches.
_STRIPPED_PUNCTUATION = str.maketrans("", "", "!\"#$%&'()*+,-./:;<=>?@^_`{|}~")
PROGRESS_EVERY = 100  # print a progress line every N samples

test_sample_size = len(test_source)
total_accuracy = 0.0
perfect_matches = 0

print(f"\nEvaluating on {test_sample_size} TEST samples (unseen during training)...")

for i in range(test_sample_size):
    test_hindi = test_source[i]
    expected_english = test_target[i].replace("[start]", "").replace("[end]", "")
    # Strip punctuation and collapse whitespace to mirror the model's output form.
    expected_english = " ".join(expected_english.translate(_STRIPPED_PUNCTUATION).split())
    # Collapse whitespace in the prediction as well so the exact-match test is fair.
    predicted_english = " ".join(translate_sentence(test_hindi).split())

    # Calculate word-level accuracy
    accuracy = calculate_word_accuracy(expected_english, predicted_english)
    total_accuracy += accuracy

    # Count perfect translations
    if predicted_english == expected_english:
        perfect_matches += 1

    # Show progress every PROGRESS_EVERY samples
    if (i + 1) % PROGRESS_EVERY == 0:
        print(f"Processed {i + 1}/{test_sample_size} samples...")

# Guard against an empty test set instead of dividing by zero.
if test_sample_size:
    average_word_accuracy = (total_accuracy / test_sample_size) * 100
    perfect_match_rate = (perfect_matches / test_sample_size) * 100
else:
    average_word_accuracy = 0.0
    perfect_match_rate = 0.0

print(f"\n{'─' * 50}")
print(f"TEST SET RESULTS:")
print(f"{'─' * 50}")
print(f"Average Word-Level Accuracy: {average_word_accuracy:.2f}%")
print(f"Perfect Translations: {perfect_matches}/{test_sample_size} ({perfect_match_rate:.2f}%)")
print(f"{'─' * 50}")



COMPUTING TEST SET ACCURACY

Evaluating on 624 TEST samples (unseen during training)...
Processed 5/624 samples...
Processed 10/624 samples...
Processed 15/624 samples...
Processed 20/624 samples...
Processed 25/624 samples...
Processed 30/624 samples...
Processed 35/624 samples...
Processed 40/624 samples...
Processed 45/624 samples...
Processed 50/624 samples...
Processed 55/624 samples...
Processed 60/624 samples...
Processed 65/624 samples...
Processed 70/624 samples...
Processed 75/624 samples...
Processed 80/624 samples...
Processed 85/624 samples...
Processed 90/624 samples...
Processed 95/624 samples...
Processed 100/624 samples...
Processed 105/624 samples...
Processed 110/624 samples...
Processed 115/624 samples...
Processed 120/624 samples...
Processed 125/624 samples...
Processed 130/624 samples...
Processed 135/624 samples...
Processed 140/624 samples...
Processed 145/624 samples...
Processed 150/624 samples...
Processed 155/624 samples...
Processed 160/624 samples...
Pro