In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding,Input, LSTM, Embedding, Dense
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
train=pd.read_csv("engtamilTrain.csv")
train=train.drop(['Unnamed: 0'],axis=1)
english_sentences=train['en'].head(500)
tamil_sentences=train['ta'].head(500)

In [3]:
def addSosEos(seriesSentences):
   
   # Define <SOS> and <EOS> tokens
    sos_token="<SOS>"
    eos_token="<EOS>"

    # adding <SOS>  <EOS> tokens
    statements_with_tokens = [f"{sos_token} {sentences} {eos_token}" for sentences  in seriesSentences]

    english_sent=[]
    for statements in statements_with_tokens:
        english_sent.append(statements)
        print (statements)

    return english_sent


In [4]:
english_sent_SE=addSosEos(english_sentences)
tamil_sent_SE=addSosEos(tamil_sentences)

<SOS> MMA vice president Qazi Hussain Ahmad declared last month: 'We are not extremists.
 <EOS>
<SOS> Information has surfaced in recent years suggesting that Julius Rosenberg was involved in passing some form of intelligence to Soviet officials during the Second World War.
 <EOS>
<SOS> And Azor begat Sadoc; and Sadoc begat Achim; and Achim begat Eliud;
 <EOS>
<SOS> She says she knows what is going on, but can do nothing about it.
 <EOS>
<SOS> And be it indeed that I have erred, my error remains with myself.
 <EOS>
<SOS> Finally, the columnist fails to tell us who among the political leaders of the bourgeoisie, past and present, he counts among the paragons of morality.
 <EOS>
<SOS> These include the British Tamil Forum, La Maison du Tamil Eelam (France), the Canadian Tamil Congress, and the Swiss Tamil Forum.
 <EOS>
<SOS> Vijay accompanied with his wife and daughter enjoyed the film 'Anjathey'.
 <EOS>
<SOS> Both Musharraf and Vajpayee have exploited the current war drive to divert pub

In [5]:
# Tokenize the English and Tamil sentences
english_tokenizer = Tokenizer(filters="")
english_tokenizer.fit_on_texts(english_sent_SE)
english_vocab_size = len(english_tokenizer.word_index) + 1 # +1 is to account for the reserved index 0.
english_sequences = english_tokenizer.texts_to_sequences(english_sent_SE)

In [6]:
tamil_tokenizer = Tokenizer(filters="")
tamil_tokenizer.fit_on_texts(english_sent_SE)
tamil_vocab_size = len(english_tokenizer.word_index) + 1
tamil_sequences = english_tokenizer.texts_to_sequences(tamil_sent_SE)

In [7]:
max_input_seq_length=20
max_output_seq_length=20

In [8]:
# Pad sequences to a fixed length
input_sequences = pad_sequences(english_sequences, maxlen=max_input_seq_length, padding='post')
output_sequences = pad_sequences(tamil_sequences, maxlen=max_output_seq_length, padding='post')

In [9]:
input_sequences[0]

array([   2, 1083, 1084,  264, 1085,  548, 1086,  211,   55, 1087,  265,
         24,   20, 1088,    3,    0,    0,    0,    0,    0])

In [10]:
# Prepare the decoder input and output sequences for teacher forcing
decoder_input_sequences = np.zeros_like(output_sequences) # generate matrix of same shape filled with zeros
decoder_input_sequences[:, 1:] = output_sequences[:, :-1]
decoder_input_sequences[:, 0] = tamil_tokenizer.word_index['<sos>']
decoder_output_sequences = np.eye(tamil_vocab_size)[output_sequences]

In [11]:
from gensim.models import Word2Vec

eng_model = Word2Vec.load('engmodel.bin')
tam_model = Word2Vec.load('tammodel.bin')

In [12]:
def create_embedding_matrix(word2vec_model,tokenizer,vocab_size):
    embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
    for word,i in tokenizer.word_index.items():
        try:
            embedding_vector = word2vec_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            pass  # Words not found in the embedding index will be all zeros
    return embedding_matrix

eng_embedding_matrix = create_embedding_matrix(eng_model, english_tokenizer, english_vocab_size)
tam_embedding_matrix = create_embedding_matrix(tam_model, tamil_tokenizer, tamil_vocab_size)

In [13]:
eng_embedding_matrix.shape

(4184, 100)

In [14]:
tam_embedding_matrix.shape

(4184, 100)

In [15]:
# Convert target_sequences to one-hot encoded format
target_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=tamil_vocab_size)

In [20]:
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense

def create_seq2seq_model(input_vocab_size, output_vocab_size, input_seq_length, output_seq_length, hidden_units, eng_embedding_matrix=None, tam_embedding_matrix=None):
    # Encoder
    encoder_inputs = Input(shape=(input_seq_length,))
    encoder_embedding_layer = Embedding(input_dim=input_vocab_size, output_dim=hidden_units, trainable=False)
    if eng_embedding_matrix is not None:
        encoder_embedding_layer.build((None,))  # Initialize the weights
        encoder_embedding_layer.set_weights([eng_embedding_matrix])
    encoder_embedding = encoder_embedding_layer(encoder_inputs)
    encoder_lstm, encoder_state_h, encoder_state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(output_seq_length,))
    decoder_embedding_layer = Embedding(input_dim=output_vocab_size, output_dim=hidden_units, trainable=False)
    if tam_embedding_matrix is not None:
        decoder_embedding_layer.build((None,))  # Initialize the weights
        decoder_embedding_layer.set_weights([tam_embedding_matrix])
    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model


In [21]:
model = create_seq2seq_model(english_vocab_size, tamil_vocab_size, max_input_seq_length, max_output_seq_length, 100, eng_embedding_matrix, tam_embedding_matrix)

In [22]:
# Convert target_sequences to one-hot encoded format
target_sequences = tf.keras.utils.to_categorical(output_sequences, num_classes=tamil_vocab_size)


In [23]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [25]:
# Fit the model to the data
batch_size = 32
epochs = 100
model.fit([input_sequences, output_sequences], decoder_output_sequences, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 630ms/step - accuracy: 0.9868 - loss: 0.0570 - val_accuracy: 0.9870 - val_loss: 0.1094
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 373ms/step - accuracy: 0.9853 - loss: 0.0657 - val_accuracy: 0.9870 - val_loss: 0.1095
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 336ms/step - accuracy: 0.9863 - loss: 0.0610 - val_accuracy: 0.9870 - val_loss: 0.1096
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 364ms/step - accuracy: 0.9866 - loss: 0.0583 - val_accuracy: 0.9870 - val_loss: 0.1097
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 437ms/step - accuracy: 0.9892 - loss: 0.0502 - val_accuracy: 0.9870 - val_loss: 0.1097
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 366ms/step - accuracy: 0.9865 - loss: 0.0583 - val_accuracy: 0.9870 - val_loss: 0.1099
Epoch 7/100
[1m13/13

<keras.src.callbacks.history.History at 0x2098ac4a590>

In [26]:
# Preprocessing the input

input_sentence="<sos> They lied to us <eos>"

#input_sentence = "<sos>Finally, the columnist fails to tell us who among the political leaders of the 
#bourgeoisie, past and present, he counts among the paragons of morality<eos>"

# Convert the input sentence to sequence
input_sequence = english_tokenizer.texts_to_sequences([input_sentence])

# Pad the statement to the maximum input sequence length
input_sequence = pad_sequences(input_sequence, maxlen=max_input_seq_length, padding='post')

# Generate predictions
predictions = model.predict([input_sequence, np.zeros((1, max_output_seq_length))])

# Convert predictions to tokens
predicted_tokens = np.argmax(predictions, axis=-1)[0]

# Create index to word mapping for Tamil vocabulary
tamil_index_word = {i: w for w, i in tamil_tokenizer.word_index.items()}


# Convert tokens to text
decoded_sentence = []
for token in predicted_tokens:
    if token == 0:  # Assuming 0 is the padding token
        continue
    word = tamil_index_word.get(token)
    if word == '<eos>':
        break
    if word is not None:
        decoded_sentence.append(word)
    else:
        decoded_sentence.append('<unk>')

# Join the words to form the decoded statement
decoded_statement = ' '.join(decoded_sentence)

# Print the decoded statement
print(decoded_statement)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
<sos>
