In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install openpyxl to read Excel files
!pip install openpyxl

# Step 1: Import Necessary Libraries

import numpy as np
import pandas as pd
import os
import string
from string import digits
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf

print("Libraries imported successfully.")


In [None]:
# Step 2: Set up TPU

# Probe for a TPU. A ValueError from the resolver is treated as
# "no TPU available" and leaves `tpu` set to None.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: fall back to whatever strategy is currently the default.
    strategy = tf.distribute.get_strategy()
else:
    # Attach to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)


In [None]:
# Step 3: Load the Dataset

# Show what is available under the input directory.
print("Files in the input directory:", os.listdir("../input"), sep="\n")

# Read the parallel corpus. The sheet's first row is skipped and the two
# columns are named explicitly. Adjust the dataset folder ('mlfinal') and the
# file name if your copy of the data lives elsewhere.
data = pd.read_excel(
    "/kaggle/input/mlfinal/FINAL_ML (1).xlsx",
    header=None,
    skiprows=1,
    names=['HINDI', 'TELUGU'],
)

# Peek at the first few rows.
print("\nFirst few rows of the dataset:", data.head(), sep="\n")

In [None]:
# Step 4: Preprocess the Data
#
# NOTE: this cell rewrites `data` and appends the START_/_END markers to the
# TELUGU column, so it is not idempotent -- re-run Step 3 before running it a
# second time.

# Check for missing values
print("\nNumber of missing values in HINDI column:", data['HINDI'].isnull().sum())
print("Number of missing values in TELUGU column:", data['TELUGU'].isnull().sum())

# Drop rows with missing values
data = data.dropna(subset=['HINDI', 'TELUGU'])
print("After dropping missing values, data shape:", data.shape)

# Characters removed from every sentence: ASCII punctuation (this already
# includes the single quote), ASCII digits, and the Devanagari danda / double
# danda (U+0964, U+0965), which string.punctuation does not cover and which
# would otherwise stay glued to the last word of a Hindi sentence.
_STRIP_CHARS = str.maketrans('', '', string.punctuation + digits + '\u0964\u0965')


def _clean_text(value):
    """Return `value` as a lower-cased string without punctuation or digits.

    Leading/trailing whitespace is stripped and runs of spaces are collapsed
    to a single space. Non-string cells (e.g. numbers read from Excel) are
    converted with str() first so they are cleaned like any other entry.
    """
    text = str(value).lower().translate(_STRIP_CHARS).strip()
    return re.sub(" +", " ", text)


data['HINDI'] = data['HINDI'].map(_clean_text)
data['TELUGU'] = data['TELUGU'].map(_clean_text)

# Remove empty sentences AFTER cleaning: a cell holding only digits or
# punctuation becomes '' at this point and must not reach training as a bare
# "START_  _END" pair.
non_empty = (data['HINDI'] != '') & (data['TELUGU'] != '')
print("Rows left empty by cleaning and dropped:", int((~non_empty).sum()))
data = data[non_empty].reset_index(drop=True)

# Add start and end tokens to target sequences. This happens after
# punctuation removal so the underscores in the markers survive.
data['TELUGU'] = 'START_ ' + data['TELUGU'] + ' _END'

# Display the preprocessed data
print("\nPreprocessed Data:")
print(data.head())

In [None]:
# Step 5: Create Vocabulary Dictionaries

# Every distinct whitespace-separated token seen in each language.
# The Telugu set also contains the START_ and _END markers added in Step 4.
all_hindi_words = {
    token
    for sentence in data['HINDI']
    for token in sentence.split()
}

all_telugu_words = {
    token
    for sentence in data['TELUGU']
    for token in sentence.split()
}

print("\nTotal unique Hindi words:", len(all_hindi_words))
print("Total unique Telugu words:", len(all_telugu_words))

In [None]:
# Step 6: Calculate Sentence Lengths and Filter Sentences

# Token counts per sentence (split on single spaces). The Telugu count
# includes the START_ and _END markers.
data['length_hindi_sentence'] = data['HINDI'].str.split(" ").str.len()
data['length_telugu_sentence'] = data['TELUGU'].str.split(" ").str.len()

length_columns = ['length_hindi_sentence', 'length_telugu_sentence']
print("\nSentence length statistics:")
print(data[length_columns].describe())

# Upper bounds on sentence length, in tokens (adjust as needed)
max_length_src = 20  # Hindi sentences
max_length_tar = 20  # Telugu sentences

# Keep only the pairs where both sides fit within their limit.
within_limits = (
    (data['length_hindi_sentence'] <= max_length_src)
    & (data['length_telugu_sentence'] <= max_length_tar)
)
data = data[within_limits].reset_index(drop=True)

print("\nData after filtering based on sentence lengths:")
print(data.head())
print("Total samples after filtering:", data.shape[0])

In [None]:
# Step 7: Prepare Data for Training

# Sorting the vocabularies makes the word -> id assignment deterministic.
input_words = sorted(all_hindi_words)
target_words = sorted(all_telugu_words)

# Vocabulary sizes; id 0 is reserved for padding, hence the extra slot.
num_encoder_tokens = 1 + len(input_words)
num_decoder_tokens = 1 + len(target_words)

# Hindi lookups: word -> id (ids start at 1) and the inverse id -> word.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
reverse_input_token_index = {idx: word for word, idx in input_token_index.items()}

# Telugu lookups, built the same way.
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}
reverse_target_token_index = {idx: word for word, idx in target_token_index.items()}

print("\nNumber of unique input tokens (Hindi):", num_encoder_tokens)
print("Number of unique output tokens (Telugu):", num_decoder_tokens)

In [None]:
# Step 8: Prepare Sequences and Split the Data

from tensorflow.keras.preprocessing.sequence import pad_sequences


def _encode_tokens(tokens, token_index):
    """Map each word in `tokens` to its integer id.

    Words missing from `token_index` map to 0, the id reserved for padding.
    """
    return [token_index.get(token, 0) for token in tokens]


# Prepare input sequences
encoder_input_data = [
    _encode_tokens(input_text.split(), input_token_index)
    for input_text in data['HINDI']
]

# Prepare decoder input and output sequences (teacher forcing: the target is
# the input shifted left by one token).
decoder_input_data = []
decoder_target_data = []
for target_text in data['TELUGU']:
    # Deliberately NOT named `target_words`: that name holds the sorted Telugu
    # vocabulary from Step 7 and must not be overwritten by one sentence's tokens.
    sentence_tokens = target_text.split()
    decoder_input_data.append(_encode_tokens(sentence_tokens[:-1], target_token_index))  # exclude _END token
    decoder_target_data.append(_encode_tokens(sentence_tokens[1:], target_token_index))  # exclude START_ token

# Pad sequences with trailing zeros up to the maximum lengths from Step 6
encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_length_src, padding='post')
decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_length_tar, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_length_tar, padding='post')

# Convert decoder_target_data to one-hot vectors.
# The result has shape (samples, max_length_tar, num_decoder_tokens).
decoder_target_data_onehot = tf.keras.utils.to_categorical(decoder_target_data, num_classes=num_decoder_tokens)

print("\nData prepared for training.")

# Split the data into training and test sets. The three arrays are split with
# the same row assignment, so row i of each test array belongs to the same pair.
X_train_enc, X_test_enc, X_train_dec_inp, X_test_dec_inp, y_train, y_test = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data_onehot, test_size=0.2, random_state=42)

print("\nTraining and test data sizes:")
print("Training data:", X_train_enc.shape)
print("Test data:", X_test_enc.shape)

In [None]:
# Step 9: Build the Encoder-Decoder Model

latent_dim = 256  # Size of both embeddings and both LSTM state vectors; adjust as needed

with strategy.scope():
    # ----- Encoder: token ids -> embeddings -> final LSTM states -----
    encoder_inputs = tf.keras.layers.Input(shape=(None,), name='encoder_inputs')
    enc_emb = tf.keras.layers.Embedding(
        input_dim=num_encoder_tokens,
        output_dim=latent_dim,
        mask_zero=True,  # id 0 is padding
        name='encoder_embedding',
    )(encoder_inputs)
    encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(
        units=latent_dim,
        return_state=True,
        name='encoder_lstm',
    )(enc_emb)
    # Only the final hidden/cell states are handed to the decoder.
    encoder_states = [state_h, state_c]

    # ----- Decoder: layers are kept in variables so Step 12 can reuse them -----
    decoder_inputs = tf.keras.layers.Input(shape=(None,), name='decoder_inputs')
    dec_emb_layer = tf.keras.layers.Embedding(
        input_dim=num_decoder_tokens,
        output_dim=latent_dim,
        mask_zero=True,
        name='decoder_embedding',
    )
    dec_emb = dec_emb_layer(decoder_inputs)
    decoder_lstm = tf.keras.layers.LSTM(
        units=latent_dim,
        return_sequences=True,
        return_state=True,
        name='decoder_lstm',
    )
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    decoder_dense = tf.keras.layers.Dense(
        units=num_decoder_tokens,
        activation='softmax',
        name='decoder_dense',
    )
    decoder_outputs = decoder_dense(decoder_outputs)

    # Training model: (encoder ids, decoder ids) -> per-step word distribution
    model = tf.keras.models.Model(
        inputs=[encoder_inputs, decoder_inputs],
        outputs=decoder_outputs,
    )

    # Targets are one-hot encoded, hence categorical cross-entropy.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )

print("\nModel Summary:")
model.summary()

In [None]:
# Step 10: Train the Model

batch_size = 64  # Adjust as needed (should be a multiple of 8 for TPU)
epochs = 12  # Adjust as needed

# The data are plain numpy arrays, so they go straight into model.fit.
# The held-out split from Step 8 is used as validation data.
history = model.fit(
    x=[X_train_enc, X_train_dec_inp],
    y=y_train,
    validation_data=([X_test_enc, X_test_dec_inp], y_test),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Step 12: Build the Inference Model

with strategy.scope():
    # Encoder inference model: source token ids -> [state_h, state_c]
    encoder_model = tf.keras.models.Model(inputs=encoder_inputs, outputs=encoder_states)

    # The decoder is run one step at a time at inference, so its LSTM states
    # become explicit inputs instead of coming from the encoder graph.
    decoder_state_input_h = tf.keras.layers.Input(shape=(latent_dim,), name='decoder_state_input_h')
    decoder_state_input_c = tf.keras.layers.Input(shape=(latent_dim,), name='decoder_state_input_c')
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Reuse the trained decoder layers from Step 9.
    dec_emb2 = dec_emb_layer(decoder_inputs)
    decoder_outputs2, state_h2, state_c2 = decoder_lstm(
        dec_emb2,
        initial_state=decoder_states_inputs,
    )
    decoder_states2 = [state_h2, state_c2]
    decoder_outputs2 = decoder_dense(decoder_outputs2)

    # Decoder inference model:
    # (token ids, h, c) -> (word distribution, new h, new c)
    decoder_model = tf.keras.models.Model(
        inputs=[decoder_inputs, *decoder_states_inputs],
        outputs=[decoder_outputs2, *decoder_states2],
    )

print("\nInference models are ready.")

In [None]:
# Step 13: Define a Function to Decode Sequences

def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence.

    Args:
        input_seq: encoder input for a single sentence, in the form accepted
            by ``encoder_model.predict`` (callers in this notebook pass a
            one-row slice of the padded encoder array).

    Returns:
        The predicted words joined by single spaces, without the START_/_END
        markers. Empty string if no word was produced.

    Decoding stops when the model emits ``_END`` or after
    ``max_length_tar + 1`` decoder steps, whichever comes first.
    """
    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Target sequence of length 1 holding only the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index.get('START_', 0)

    decoded_words = []
    # Bound the loop by the number of decoder STEPS rather than by the number
    # of words collected: ids with no vocabulary entry (e.g. the padding id 0)
    # add no word, so a word-count check alone could loop forever.
    for _ in range(max_length_tar + 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_token_index.get(sampled_token_index, '')

        if sampled_word == '_END':
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

print("\nSequence decoding function is ready.")

In [None]:
# Step 14: Test the Model with Sample Sentences

def _ids_to_text(token_ids, index_to_word):
    """Turn a sequence of token ids back into a sentence.

    Padding (id 0), ids without a vocabulary entry, and the START_/_END
    markers are dropped; the remaining words are joined with single spaces.
    """
    words = (index_to_word.get(int(token_id), '') for token_id in token_ids)
    return ' '.join(word for word in words if word not in ('', 'START_', '_END'))


for seq_index in range(min(5, len(X_test_enc))):
    # Get the input sentence (kept as a one-row batch for the encoder)
    input_seq = X_test_enc[seq_index: seq_index + 1]

    # Decode the sequence to get the translated sentence
    decoded_sentence = decode_sequence(input_seq)

    # train_test_split shuffles the rows, so position `seq_index` of the test
    # arrays does not correspond to any fixed row of `data`. Rebuild both
    # reference sentences from the encoded test arrays themselves.
    input_sentence = _ids_to_text(input_seq[0], reverse_input_token_index)
    actual_translation = _ids_to_text(
        np.argmax(y_test[seq_index], axis=-1),  # one-hot targets -> token ids
        reverse_target_token_index,
    )

    print(f"\nInput Hindi sentence: {input_sentence}")
    print(f"Actual Telugu translation: {actual_translation}")
    print(f"Predicted sentence: {decoded_sentence}")
   
