# Main Libraries

In [1]:
# Reading Data
import pandas as pd
import numpy as np
import re, string, nltk

# Data Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Building Deep Learning Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Input
from tensorflow.keras.models import Model, load_model
import tensorflow.keras as k

# Reading Data

In [2]:
# Tab-separated English/Marathi sentence pairs; the third column is discarded.
# NOTE(review): read_table treats the first line as a header by default. If
# mar.txt has no header row, the first sentence pair is consumed as column
# names before they are overwritten below -- confirm against the file.
MAR_ENG_PATH = r"D:\Courses language programming\9_Big Projects\14 - Machine Translation\mar-eng\mar.txt"

data = pd.read_table(MAR_ENG_PATH)
data.columns = ["english", "marathi", "x"]

# Keep only the two text columns and work on rows 10000..19999.
data = data.drop(columns="x").iloc[10000:20000]
data.head()

Unnamed: 0,english,marathi
10000,I just sold my car.,मी आत्ताच माझी गाडी विकून टाकली.
10001,I just took a bath.,मी आत्ताच आंघोळ केली.
10002,I just want to die.,मला फक्त मरायचं आहे.
10003,I knew you'd laugh.,मला माहीत होतं की तू हसशील.
10004,I knew you'd laugh.,तू हसशील हे मला माहीत होतं.


In [3]:
# Sanity check: (rows, columns) of the slice kept for training.
data.shape

(10000, 2)

# Building Encoder Input :: English Sentences

In [4]:
# Source sentences as a plain list of strings.
eng_lines = list(data.english)

# Fit a word-level tokenizer on the English side and encode every sentence
# as a list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(eng_lines)
eng_seq = tokenizer.texts_to_sequences(eng_lines)

# The longest encoded sentence (counted in tokens) fixes the padded width.
max_eng = max(map(len, eng_seq))
print("The Max Length of English Word is --> ", max_eng)

# Right-pad every sequence with zeros up to max_eng.
english_padded = pad_sequences(eng_seq, maxlen=max_eng, padding="post")
english_padded_arr = np.array(english_padded)
print("The Shape Of Encoder Input is --> ", english_padded_arr.shape)

# Word indices start at 1 and 0 is used for padding, hence the +1.
eng_word_dict = tokenizer.word_index
num_eng_words = len(eng_word_dict) + 1
print("The Number of English Word is --> ", num_eng_words)

The Max Length of English Word is -->  7
The Shape Of Encoder Input is -->  (10000, 7)
The Number of English Word is -->  2433


# Building Decoder Input :: Marathi Sentences

In [5]:
# Wrap each Marathi sentence in start/end markers. The Tokenizer's default
# filters turn "<" and ">" into spaces, so the markers end up in the
# vocabulary as the plain words "start" and "end".
mar_lines = ["<START>" + line + "<END>" for line in data.marathi]

# NOTE: `tokenizer` is rebound here; from this point on it is the Marathi
# tokenizer. The English vocabulary remains available via `eng_word_dict`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(mar_lines)
mar_seq = tokenizer.texts_to_sequences(mar_lines)

# The longest encoded sentence (markers included) fixes the padded width.
max_mar = max(len(seq) for seq in mar_seq)
print("The Max Length of Marathi Word is --> ", max_mar)

# Right-pad every sequence with zeros up to max_mar.
marathi_padded = pad_sequences(mar_seq, maxlen=max_mar, padding="post")
marathi_padded_arr = np.array(marathi_padded)

# This array feeds the decoder, so label it as such (the label was previously
# copied from the encoder cell and said "Encoder Input").
print("The Shape Of Decoder Input is --> ", marathi_padded_arr.shape)

# Word indices start at 1 and 0 is used for padding, hence the +1.
mar_word_dict = tokenizer.word_index
num_mar_words = len(mar_word_dict) + 1

print("The Number of Marathi Word is --> ", num_mar_words)

The Max Length of Marathi Word is -->  11
The Shape Of Encoder Input is -->  (10000, 11)
The Number of Marathi Word is -->  4833


# Building Decoder Output :: Marathi Sentences Shifted by One Token

In [6]:
# Decoder targets are the decoder inputs shifted left by one token: dropping
# the leading start marker makes position t of the target hold the word that
# should be predicted after seeing position t of the input.
shifted_mar_seq = [tokens[1:] for tokens in mar_seq]

marathi_padded_output = pad_sequences(shifted_mar_seq, maxlen=max_mar, padding="post")

# One-hot encode the targets for the categorical cross-entropy loss. The
# result has shape (samples, max_mar, num_mar_words) and is by far the
# largest array in the notebook.
onehot_mar_lines = to_categorical(marathi_padded_output, num_mar_words)

# np.asarray does not copy an existing ndarray, whereas np.array(...) would
# duplicate the whole one-hot tensor in memory for no benefit.
decoder_output_data = np.asarray(onehot_mar_lines)

# Building the LSTM Encoder-Decoder Model

In [7]:
# Sizes shared by the encoder and the decoder.
EMBEDDING_DIM = 256
LATENT_DIM = 128

# --- Encoder: embeds the English tokens and keeps only the final LSTM state.
encoder_input = Input(shape=(None,))
encoder_embeding = Embedding(
    input_dim=num_eng_words, output_dim=EMBEDDING_DIM, mask_zero=True
)(encoder_input)
encoder_lstm = LSTM(units=LATENT_DIM, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(encoder_embeding)
encoder_states = [state_h, state_c]

# --- Decoder: starts from the encoder state and emits one distribution over
# the Marathi vocabulary per time step.
decoder_input = Input(shape=(None,))
decoder_embeding = Embedding(
    input_dim=num_mar_words, output_dim=EMBEDDING_DIM, mask_zero=True
)(decoder_input)
decoder_lstm = LSTM(units=LATENT_DIM, return_state=True, return_sequences=True)
decoder_output, _, _ = decoder_lstm(decoder_embeding, initial_state=encoder_states)
decoder_dense = Dense(units=num_mar_words, activation="softmax")
output = decoder_dense(decoder_output)

# Training model: (English tokens, Marathi tokens) -> next-token probabilities.
model = Model(inputs=[encoder_input, decoder_input], outputs=output)
model.compile(
    optimizer=k.optimizers.RMSprop(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            622848    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            1237248   ['input_2[0][0]']             
                                                                                              

In [8]:
# Train with teacher forcing: the decoder is fed the reference Marathi tokens
# and is scored against the same sequence shifted by one position.
model.fit(
    x=[english_padded_arr, marathi_padded_arr],
    y=decoder_output_data,
    batch_size=300,
    epochs=200,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.src.callbacks.History at 0x248e083cbe0>

In [16]:
def make_refrece_model():
    """Build the encoder and decoder models used at inference time.

    Both models reuse the layers (and therefore the trained weights) of the
    training graph defined earlier in the notebook.

    Returns
    -------
    tuple
        ``(encoder_model, decoder_model)`` where

        * ``encoder_model`` maps ``encoder_input`` to ``[state_h, state_c]``;
        * ``decoder_model`` maps ``[decoder_input, state_h, state_c]`` to
          ``[token_probabilities, new_state_h, new_state_c]`` so it can be
          stepped one token at a time.
    """
    encoder_model_refrence = Model(encoder_input, encoder_states)

    # Take the state width from the trained layer instead of repeating the
    # literal 128, so the two cannot drift apart if the LSTM size is changed.
    latent_dim = decoder_lstm.units
    decoder_state_h = Input(shape=(latent_dim,))
    decoder_state_c = Input(shape=(latent_dim,))
    decoder_inputs_states = [decoder_state_h, decoder_state_c]

    # Same decoder LSTM as in training, but seeded from externally supplied
    # states rather than from the encoder output.
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embeding, initial_state=decoder_inputs_states
    )
    decoder_states = [state_h, state_c]

    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model_refrence = Model(
        [decoder_input] + decoder_inputs_states,
        [decoder_outputs] + decoder_states,
    )

    return encoder_model_refrence, decoder_model_refrence

# Transform Sentence To Tokens

In [17]:
def str_to_token(sen):
    """Convert an English sentence into a padded encoder input.

    The sentence is split with the same rules the Keras ``Tokenizer`` used
    when ``eng_word_dict`` was built (lower-casing, punctuation filtered out),
    so that e.g. "I just sold my car." matches the vocabulary entry "car"
    rather than failing on "car.".

    Words that are not in the English vocabulary are reported and skipped.
    Previously an unknown word restarted ``run()`` from inside this function,
    which nested a new interactive loop inside the current one.

    Parameters
    ----------
    sen : str
        Raw English sentence.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(1, max_eng)``, zero-padded on the right.
        Sentences with more than ``max_eng`` known words are truncated by
        ``pad_sequences`` (its default drops tokens from the front).
    """
    # Local import keeps this cell self-contained; the module is the same one
    # the notebook already imports ``Tokenizer`` from.
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    token_list = []
    unknown_words = []
    for word in text_to_word_sequence(sen):
        index = eng_word_dict.get(word)
        if index is None:
            unknown_words.append(word)
        else:
            token_list.append(index)

    if unknown_words:
        print("Skipping words not in the English vocabulary: " + ", ".join(unknown_words))

    return pad_sequences([token_list], maxlen=max_eng, padding="post")

In [25]:
def run():
    """Interactively translate English sentences into Marathi.

    Prompts for a sentence, encodes it, then decodes greedily (arg-max at
    every step) starting from the "start" token until the "end" token is
    produced or the length cap is reached, and prints the translation.

    The prompt repeats up to ``english_padded_arr.shape[0]`` times; entering
    a blank line ends the session early.
    """
    enc_model, dec_model = make_refrece_model()

    # Optional: persist the inference models for later use.
    # enc_model.save(r"D:\Courses language programming\9_Big Projects\14 - Machine Translation\mar-eng\encoder_model")
    # dec_model.save(r"D:\Courses language programming\9_Big Projects\14 - Machine Translation\mar-eng\decoder_model")

    # Reverse vocabulary built once, replacing a full dictionary scan per step.
    index_to_word = {index: word for word, index in mar_word_dict.items()}
    start_index = mar_word_dict["start"]
    end_index = mar_word_dict["end"]

    for _ in range(english_padded_arr.shape[0]):
        sentence = input("Enter an English Sentence:  ")
        if not sentence.strip():
            break

        states_values = enc_model.predict(str_to_token(sentence), verbose=0)

        target = np.zeros((1, 1))
        target[0, 0] = start_index
        decoded_words = []

        # Bounded loop: the step count, not the number of emitted words, caps
        # decoding, so it terminates even when the model keeps predicting an
        # index with no vocabulary word (e.g. the padding index 0).
        for _step in range(max_mar + 1):
            dec_output, h, c = dec_model.predict(
                [target] + list(states_values), verbose=0
            )
            sampled_word_index = int(np.argmax(dec_output[0, -1, :]))
            if sampled_word_index == end_index:
                break

            word = index_to_word.get(sampled_word_index)
            if word is not None:
                decoded_words.append(word)

            # Feed the prediction and the updated state back into the decoder.
            target = np.zeros((1, 1))
            target[0, 0] = sampled_word_index
            states_values = [h, c]

        # The end marker is never appended, so no slicing is needed to strip
        # it and length-capped translations keep their last word intact.
        print(" ".join(decoded_words))

In [27]:
# run()  # Uncomment to start the interactive translation loop.