In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding,Input, LSTM, Embedding, Dense
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense
# ## Data Preprocessing
# This section covers how to preprocess text data for model training.

# ## Model Building
# This section involves creating a sequential model using LSTM layers.

In [3]:
# ## Reading the Dataset
train=pd.read_csv("engtamilTest.csv")
# ## Dropping Unnecessary Columns
train=train.drop(['Unnamed: 0'],axis=1)
 ## Selecting the First 500 Sentences in English and Tamil
english_sentences=train['en'].head(500)
tamil_sentences=train['ta'].head(500)
## Continue with further processing or model building...

In [4]:
def addSosEos(seriesSentences):  
   # Define <SOS> and <EOS> tokens
    sos_token="<SOS>"
    eos_token="<EOS>"
        # Adding <SOS> and <EOS> tokens to each sentence
    statements_with_tokens = [f"{sos_token} {sentences} {eos_token}" for sentences  in seriesSentences]
    # Initialize a list to store sentences with tokens
    english_sent=[]
    # Append each processed sentence to the list
    for statements in statements_with_tokens:
        english_sent.append(statements)
        print (statements)
    return english_sent
# Example Usage
# english_sentences_with_tokens = addSosEos(english_sentences)

In [5]:
# Adding <SOS> and <EOS> tokens to the English sentences
english_sent_SE=addSosEos(english_sentences)
# Adding <SOS> and <EOS> tokens to the Tamil sentences
tamil_sent_SE=addSosEos(tamil_sentences)

<SOS> Just hours before the incident, Kurdish nationalist leader and Iraqi president, Jalal Talibani, had welcomed the US president on his last visit to Iraq as a man "who helped us liberate our country and to reach this day in which we have democracy, human rights and prosperity gradually in our country."
 <EOS>
<SOS> Confronting a worsening foreign exchange crisis, the Sri Lankan government is seeking a $US1.9 billion loan from the International Monetary Fund (IMF) to bail out the country.
 <EOS>
<SOS> A room was arranged for him at Sun Towers Lodge.
 <EOS>
<SOS> The kidnapping and threatened execution of 63-year-old Australian citizen and US resident Douglas Wood is a further expression of the living hell that Iraq has become under the US-led occupation.
 <EOS>
<SOS> And Hilkiah, and they that the king had appointed, went to Huldah the prophetess, the wife of Shallum the son of Tikvath, the son of Hasrah, keeper of the wardrobe; (now she dwelled in Jerusalem in the college:) and the

In [6]:
# Tokenize the English and Tamil sentences
english_tokenizer = Tokenizer(filters="")
english_tokenizer.fit_on_texts(english_sent_SE)
english_vocab_size = len(english_tokenizer.word_index) + 1 # +1 is to account for the reserved index 0.
english_sequences = english_tokenizer.texts_to_sequences(english_sent_SE)

In [7]:
# Tokenize the English and Tamil sentences
tamil_tokenizer = Tokenizer(filters="")
tamil_tokenizer.fit_on_texts(english_sent_SE)
tamil_vocab_size = len(english_tokenizer.word_index) + 1# +1 is to account for the reserved index 0.
tamil_sequences = english_tokenizer.texts_to_sequences(tamil_sent_SE)

In [8]:
# Set the maximum input sequence length
max_input_seq_length=20
# Set the maximum output sequence length
max_output_seq_length=20

In [9]:
# Pad sequences to a fixed length
input_sequences = pad_sequences(english_sequences, maxlen=max_input_seq_length, padding='post')
# Pad Tamil sequences to a fixed length
output_sequences = pad_sequences(tamil_sequences, maxlen=max_output_seq_length, padding='post')

In [10]:
input_sequences[0]

array([ 107,    5,    6, 1094,   23,  115,    7,   37,   53,   20, 1095,
        560,  561,    5, 1096, 1097,    7,   75, 1098,    3])

In [11]:
# Prepare the decoder input and output sequences for teacher forcing
decoder_input_sequences = np.zeros_like(output_sequences) # generate matrix of same shape filled with zeros
decoder_input_sequences[:, 1:] = output_sequences[:, :-1]
decoder_input_sequences[:, 0] = tamil_tokenizer.word_index['<sos>']
# Prepare the decoder output sequences
decoder_output_sequences = np.eye(tamil_vocab_size)[output_sequences]

In [15]:
# Load pre-trained Word2Vec models for English and Tamil
eng_model = Word2Vec.load('engmodel.bin')
tam_model = Word2Vec.load('tammodel.bin')

In [16]:
def create_embedding_matrix(word2vec_model,tokenizer,vocab_size):
    # Initialize the embedding matrix with zeros
    embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
    # Populate the embedding matrix
    for word,i in tokenizer.word_index.items():
        try:
            # Retrieve the embedding vector for the word
            embedding_vector = word2vec_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            pass  # Words not found in the embedding index will be all zeros
    return embedding_matrix

In [17]:
# Create embedding matrices for English and Tamil using pre-trained Word2Vec models
eng_embedding_matrix = create_embedding_matrix(eng_model, english_tokenizer, english_vocab_size)
tam_embedding_matrix = create_embedding_matrix(tam_model, tamil_tokenizer, tamil_vocab_size)

In [18]:
eng_embedding_matrix.shape

(4234, 100)

In [19]:
tam_embedding_matrix.shape

(4234, 100)

In [20]:
# Convert target_sequences to one-hot encoded format
target_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=tamil_vocab_size)

In [22]:
def create_seq2seq_model(input_vocab_size, output_vocab_size, input_seq_length, output_seq_length, hidden_units, eng_embedding_matrix=None, tam_embedding_matrix=None):
    # Encoder
    encoder_inputs = Input(shape=(input_seq_length,))
    encoder_embedding_layer = Embedding(input_dim=input_vocab_size, output_dim=hidden_units, trainable=False)
    if eng_embedding_matrix is not None:
        encoder_embedding_layer.build((None,))  # Initialize the weights
        encoder_embedding_layer.set_weights([eng_embedding_matrix]) # Set weights to pre-trained embeddings
    encoder_embedding = encoder_embedding_layer(encoder_inputs)
    encoder_lstm, encoder_state_h, encoder_state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(output_seq_length,))
    decoder_embedding_layer = Embedding(input_dim=output_vocab_size, output_dim=hidden_units, trainable=False)
    if tam_embedding_matrix is not None:
        decoder_embedding_layer.build((None,))  # Initialize the weights
        decoder_embedding_layer.set_weights([tam_embedding_matrix])# Set weights to pre-trained embeddings
    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [23]:
# Create the Seq2Seq model with specified parameters and embedding matrices
model = create_seq2seq_model(english_vocab_size, tamil_vocab_size, max_input_seq_length, max_output_seq_length, 100, eng_embedding_matrix, tam_embedding_matrix)

In [24]:
# Convert target_sequences to one-hot encoded format
target_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=tamil_vocab_size)

In [25]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [26]:
# Fit the model to the data
batch_size = 32
epochs = 100
model.fit([input_sequences, output_sequences], decoder_output_sequences, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 84ms/step - accuracy: 0.6787 - loss: 8.3161 - val_accuracy: 0.8855 - val_loss: 7.5664
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.9057 - loss: 6.7531 - val_accuracy: 0.8855 - val_loss: 4.2218
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.8866 - loss: 3.2757 - val_accuracy: 0.8855 - val_loss: 1.0452
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - accuracy: 0.8823 - loss: 0.8377 - val_accuracy: 0.8855 - val_loss: 0.5257
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.8847 - loss: 0.4789 - val_accuracy: 0.8855 - val_loss: 0.4397
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - accuracy: 0.8869 - loss: 0.3863 - val_accuracy: 0.8855 - val_loss: 0.3992
Epoch 7/100
[1m13/13[0m [

<keras.src.callbacks.history.History at 0x177e31a0a10>

In [27]:
# Preprocessing the input

input_sentence="<sos> They lied to us <eos>"

#input_sentence = "<sos>Finally, the columnist fails to tell us who among the political leaders of the 
#bourgeoisie, past and present, he counts among the paragons of morality<eos>"

# Convert the input sentence to sequence
input_sequence = english_tokenizer.texts_to_sequences([input_sentence])

# Pad the statement to the maximum input sequence length
input_sequence = pad_sequences(input_sequence, maxlen=max_input_seq_length, padding='post')

# Generate predictions
predictions = model.predict([input_sequence, np.zeros((1, max_output_seq_length))])

# Convert predictions to tokens
predicted_tokens = np.argmax(predictions, axis=-1)[0]

# Create index to word mapping for Tamil vocabulary
tamil_index_word = {i: w for w, i in tamil_tokenizer.word_index.items()}


# Convert tokens to text
decoded_sentence = []
for token in predicted_tokens:
    if token == 0:  # Assuming 0 is the padding token
        continue
    word = tamil_index_word.get(token)
    if word == '<eos>':
        break
    if word is not None:
        decoded_sentence.append(word)
    else:
        decoded_sentence.append('<unk>')

# Join the words to form the decoded statement
decoded_statement = ' '.join(decoded_sentence)

# Print the decoded statement
print(decoded_statement)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step
<sos>
