In [69]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention,Concatenate
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.gleu_score import corpus_gleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
df=pd.read_csv("D:/100devMl/Projects/Machine-Translation/experiments/english_to_spanish/en_es_45k.csv")
df.head()

Unnamed: 0,English,Spanish
0,I declare resumed the session of the European ...,Declaro reanudado el período de sesiones del P...
1,"Although, as you will have seen, the dreaded '...","Como todos han podido comprobar, el gran ""efec..."
2,You have requested a debate on this subject in...,Sus Señorías han solicitado un debate sobre el...
3,"In the meantime, I should like to observe a mi...","A la espera de que se produzca, de acuerdo con..."
4,"Please rise, then, for this minute' s silence.",Invito a todos a que nos pongamos de pie para ...


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149988 entries, 0 to 149987
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  149829 non-null  object
 1   Spanish  149559 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [48]:
print(df["English"].apply(type).value_counts())

English
<class 'str'>      149829
<class 'float'>       159
Name: count, dtype: int64


In [49]:
df.English.isnull().sum(),df.Spanish.isnull().sum()

(159, 429)

In [50]:
df.dropna(inplace=True)

In [51]:
print(df["Spanish"].apply(type).value_counts())


Spanish
<class 'str'>    149401
Name: count, dtype: int64


In [52]:
np.random.seed(42)
tf.random.set_seed(42)

In [53]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [54]:
def tokenize_with_special_tokens(data, lang_key, num_words=None):
    tokenizer = Tokenizer(num_words=num_words, filters='', lower=True, oov_token="<unk>")
    special_tokens = ["<sos>", "<eos>"]
    tokenizer.fit_on_texts(special_tokens + data[lang_key].tolist())
    return tokenizer

In [55]:
en_tokenizer = tokenize_with_special_tokens(train_data, "English")
sp_tokenizer = tokenize_with_special_tokens(train_data, "Spanish") 

In [56]:
def preprocess_with_special_tokens(data, en_tokenizer, sp_tokenizer, max_seq_len):
    en_sequences = en_tokenizer.texts_to_sequences(data['English'])
    sp_sequences = ["<sos> " + sent + " <eos>" for sent in data['Spanish']]
    sp_sequences = sp_tokenizer.texts_to_sequences(sp_sequences)

    en_sequences = pad_sequences(en_sequences, maxlen=max_seq_len, padding="post")
    sp_sequences = pad_sequences(sp_sequences, maxlen=max_seq_len + 2, padding="post")

    decoder_input = sp_sequences[:, :-1]
    decoder_target = sp_sequences[:, 1:]

    return en_sequences, decoder_input, decoder_target

In [57]:
max_seq_len = 20
en_train, dec_train_input, dec_train_target = preprocess_with_special_tokens(train_data, en_tokenizer, sp_tokenizer, max_seq_len)
en_val, dec_val_input, dec_val_target = preprocess_with_special_tokens(val_data, en_tokenizer, sp_tokenizer, max_seq_len)
en_test, _, _ = preprocess_with_special_tokens(test_data, en_tokenizer, sp_tokenizer, max_seq_len)

In [58]:
en_train.shape,dec_train_input.shape, dec_train_target.shape

((107568, 20), (107568, 21), (107568, 21))

In [59]:
embedding_dim = 128
hidden_units = 256
vocab_size_en = len(en_tokenizer.word_index) + 1
vocab_size_hi = len(sp_tokenizer.word_index) + 1

In [60]:
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size_en, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)

In [61]:
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size_hi, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])

In [62]:
attention_layer = Attention()
context_vector = attention_layer([decoder_outputs, encoder_outputs])

In [63]:
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])
decoder_dense = Dense(vocab_size_hi, activation="softmax")
decoder_outputs = decoder_dense(decoder_combined_context)

In [64]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 128)    7949440     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 128)    11752960    ['input_4[0][0]']                
                                                                                            

In [65]:
batch_size = 64
epochs = 1
history = model.fit(
    [en_train, dec_train_input], dec_train_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([en_val, dec_val_input], dec_val_target)
)

  22/1681 [..............................] - ETA: 3:26:47 - loss: 9.7711 - accuracy: 0.1572

KeyboardInterrupt: 

In [None]:
model.save("machineTranslation_1.h5")

In [66]:
def predict_translation(input_text, model, en_tokenizer, sp_tokenizer, max_seq_len):
    input_seq = en_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_len, padding="post")

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sp_tokenizer.word_index['<sos>']

    decoded_sentence = []

    for _ in range(max_seq_len + 2):
        predictions = model.predict([input_seq, target_seq], verbose=0)
        predicted_token = np.argmax(predictions[0, -1, :])

        sampled_word = sp_tokenizer.index_word.get(predicted_token, '<unk>')
        if sampled_word == '<eos>':
            break

        decoded_sentence.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = predicted_token

    return ' '.join(decoded_sentence)

In [67]:
input_text = "who are you"
predicted_translation = predict_translation(input_text, model, en_tokenizer, sp_tokenizer, max_seq_len)
print(f"Input: {input_text}")
print(f"Predicted Translation: {predicted_translation}")

Input: who are you
Predicted Translation: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [None]:
def evaluate_bleu_score(model, data, en_tokenizer, sp_tokenizer, max_seq_len):
    references = []
    hypotheses = []

    for _, row in data.iterrows():
        input_text = row['English']
        reference = row['Spanish'].split()

        predicted_translation = predict_translation(input_text, model, en_tokenizer, sp_tokenizer, max_seq_len)
        hypothesis = predicted_translation.split()

        references.append([reference])
        hypotheses.append(hypothesis)

    bleu_score = corpus_bleu(references, hypotheses)
    corpus_gleu_score = corpus_gleu(references, hypotheses)
    return bleu_score,corpus_gleu_score

In [71]:
test_test_data=test_data.head(100)

In [None]:
bleu_score,corpus_gleu_score = evaluate_bleu_score(model, test_test_data, en_tokenizer, sp_tokenizer, 20)
print(f"Test BLEU Score: {bleu_score}"),f"Glue score :{corpus_gleu_score}"

Test BLEU Score: 0
