In [2]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

In [3]:
train_df = pd.read_parquet("train-00000-of-00001.parquet")
test_df  = pd.read_parquet("test-00000-of-00001.parquet")
val_df   = pd.read_parquet("validation-00000-of-00001.parquet")

In [4]:
train_df.head()

Unnamed: 0,translation
0,{'en': 'Give your application an accessibility...
1,"{'en': 'Accerciser Accessibility Explorer', 'h..."
2,{'en': 'The default plugin layout for the bott...
3,{'en': 'The default plugin layout for the top ...
4,{'en': 'A list of plugins that are disabled by...


In [5]:
train_df.shape

(1659083, 1)

In [6]:
train_df.columns

Index(['translation'], dtype='object')

In [7]:
train_df['english'] = train_df['translation'].apply(lambda x: x['en'])
train_df['hindi']   = train_df['translation'].apply(lambda x: x['hi'])

val_df['english'] = val_df['translation'].apply(lambda x: x['en'])
val_df['hindi']   = val_df['translation'].apply(lambda x: x['hi'])

test_df['english'] = test_df['translation'].apply(lambda x: x['en'])
test_df['hindi']   = test_df['translation'].apply(lambda x: x['hi'])


In [8]:
train_df = train_df[['english', 'hindi']]
test_df  = test_df[['english', 'hindi']]
val_df   = val_df[['english', 'hindi']]

In [9]:
train_df.head()

Unnamed: 0,english,hindi
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [10]:
def clean_english(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_hindi(text):
    text = re.sub(r"[^\u0900-\u097F?.!,]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [11]:
train_df['english'] = train_df['english'].apply(clean_english)
train_df['hindi']   = train_df['hindi'].apply(clean_hindi)

val_df['english'] = val_df['english'].apply(clean_english)
val_df['hindi']   = val_df['hindi'].apply(clean_hindi)

test_df['english'] = test_df['english'].apply(clean_english)
test_df['hindi']   = test_df['hindi'].apply(clean_hindi)


In [12]:
train_df['hindi'] = train_df['hindi'].apply(lambda x: "<start> " + x + " <end>")
val_df['hindi']   = val_df['hindi'].apply(lambda x: "<start> " + x + " <end>")
test_df['hindi']  = test_df['hindi'].apply(lambda x: "<start> " + x + " <end>")

In [13]:
train_df.head(5)

Unnamed: 0,english,hindi
0,give your application an accessibility workout,<start> अपने अनुप्रयोग को पहुंचनीयता व्यायाम क...
1,accerciser accessibility explorer,<start> एक्सेर्साइसर पहुंचनीयता अन्वेषक <end>
2,the default plugin layout for the bottom panel,<start> निचले पटल के लिए डिफोल्ट प्लग इन खाका ...
3,the default plugin layout for the top panel,<start> ऊपरी पटल के लिए डिफोल्ट प्लग इन खाका <...
4,a list of plugins that are disabled by default,<start> उन प्लग इनों की सूची जिन्हें डिफोल्ट र...


In [14]:
MAX_ENG_VOCAB = 15000
MAX_HIN_VOCAB = 15000

eng_tokenizer = Tokenizer(num_words=MAX_ENG_VOCAB, filters='')
hin_tokenizer = Tokenizer(num_words=MAX_HIN_VOCAB, filters='')

eng_tokenizer.fit_on_texts(train_df['english'])
hin_tokenizer.fit_on_texts(train_df['hindi'])


In [15]:
train_eng_seq = eng_tokenizer.texts_to_sequences(train_df['english'])
train_hin_seq = hin_tokenizer.texts_to_sequences(train_df['hindi'])

val_eng_seq = eng_tokenizer.texts_to_sequences(val_df['english'])
val_hin_seq = hin_tokenizer.texts_to_sequences(val_df['hindi'])

In [16]:
MAX_ENG_LEN = 20
MAX_HIN_LEN = 20

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_eng_pad = pad_sequences(
    train_eng_seq,
    maxlen=MAX_ENG_LEN,
    padding='post',
    truncating='post'
)

train_hin_pad = pad_sequences(
    train_hin_seq,
    maxlen=MAX_HIN_LEN,
    padding='post',
    truncating='post'
)

val_eng_pad = pad_sequences(
    val_eng_seq,
    maxlen=MAX_ENG_LEN,
    padding='post',
    truncating='post'
)

val_hin_pad = pad_sequences(
    val_hin_seq,
    maxlen=MAX_HIN_LEN,
    padding='post',
    truncating='post'
)

In [18]:
print(train_eng_pad.shape)
print(train_hin_pad.shape)


(1659083, 20)
(1659083, 20)


In [19]:
SAMPLE_SIZE = 50000  

train_eng_pad = train_eng_pad[:SAMPLE_SIZE]
train_hin_pad = train_hin_pad[:SAMPLE_SIZE]

In [20]:
train_eng_pad.shape

(50000, 20)

In [21]:
decoder_input_train  = train_hin_pad[:, :-1]
decoder_output_train = train_hin_pad[:, 1:]

In [22]:
decoder_input_train.shape
decoder_output_train.shape

(50000, 19)

In [23]:
decoder_input_val  = val_hin_pad[:, :-1]
decoder_output_val = val_hin_pad[:, 1:]

In [24]:
print("Encoder input shape:", train_eng_pad.shape)
print("Decoder input shape:", decoder_input_train.shape)
print("Decoder output shape:", decoder_output_train.shape)

Encoder input shape: (50000, 20)
Decoder input shape: (50000, 19)
Decoder output shape: (50000, 19)


In [25]:
eng_vocab_size = MAX_ENG_VOCAB
hin_vocab_size = MAX_HIN_VOCAB

encoder_inputs = Input(shape=(MAX_ENG_LEN,))
enc_emb = Embedding(eng_vocab_size, 32, mask_zero=True)(encoder_inputs)
_, state_h, state_c = LSTM(32, return_state=True)(enc_emb)

decoder_inputs = Input(shape=(MAX_HIN_LEN - 1,))
dec_emb = Embedding(hin_vocab_size, 32, mask_zero=True)(decoder_inputs)
dec_outputs, _, _ = LSTM(
    32, return_sequences=True, return_state=True
)(dec_emb, initial_state=[state_h, state_c])

decoder_dense = Dense(hin_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(dec_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy'
)

model.summary()


In [29]:
history = model.fit(
    [train_eng_pad, decoder_input_train],
    decoder_output_train,
    batch_size=4,
    epochs=4
)

Epoch 1/4
[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 39ms/step - loss: 3.1656
Epoch 2/4
[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 32ms/step - loss: 2.2826
Epoch 3/4
[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 33ms/step - loss: 1.7130
Epoch 4/4
[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 38ms/step - loss: 1.3652


In [30]:
encoder_model = Model(
    encoder_inputs,
    [state_h, state_c]
)

In [31]:
decoder_state_input_h = Input(shape=(32,))
decoder_state_input_c = Input(shape=(32,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb

decoder_outputs2, state_h2, state_c2 = LSTM(
    32, return_sequences=True, return_state=True
)(
    dec_emb2,
    initial_state=decoder_states_inputs
)

decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

In [32]:
reverse_hin_index = {
    v: k for k, v in hin_tokenizer.word_index.items()
}

In [33]:
import numpy as np

def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_hin_index.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence) > MAX_HIN_LEN:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [34]:
def translate_sentence(sentence):
    sentence = clean_english(sentence)
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(
        seq, maxlen=MAX_ENG_LEN, padding='post', truncating='post'
    )
    return decode_sequence(seq)

In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index["<start>"]

    decoded_sentence = []
    used_words = set()

    for _ in range(10):   
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value,
            verbose=0
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_hin_index.get(sampled_token_index, "")

        if sampled_word in ["<end>", "", "<unk>"]:
            break

        if sampled_word in used_words:
            break

        used_words.add(sampled_word)
        decoded_sentence.append(sampled_word)

        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return " ".join(decoded_sentence).strip()


In [73]:
print(translate_sentence("Good"))

रिपोर्ट अवधि, साथ जायेगा


In [47]:
encoder_model.save("encoder_model.keras")
decoder_model.save("decoder_model.keras")