# Project Name : Machine Translation Using Neural Language Model
---
## Author : Omar Mahmoud Abdel Rahman
---
## Date : 17/8/2024

In [44]:
# Import needed dependencies
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional, Concatenate, Dot, Activation, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow import argmax

from pathlib import Path


# Data Loading

In [2]:
# Load the raw parallel corpus and split it into one string per line
data_dir = './Data/fra.txt'
data_path = Path(data_dir)
lines = data_path.read_text(encoding="utf-8").split('\n')

# Number of sentence pairs the model is trained on
num_samples = 10000 

In [3]:
# Split every raw line into its source sentence and its target sentence.
# The loop variables deliberately avoid the name `input`, which would shadow
# the Python builtin for the rest of the notebook session.
inputs = []
targets = []

for line in lines[:num_samples]:
    # Skip blank lines instead of crashing on the tuple unpacking below
    if not line.strip():
        continue
    # Only the first two tab-separated fields are used; any further
    # fields on the line are ignored.
    source_text, target_text = line.split('\t')[:2]
    inputs.append(source_text)
    targets.append(target_text)

# NOTE: `lines` is rebound here from the list of raw strings to a DataFrame, so
# this cell can only be re-run after the data-loading cell has been re-run.
lines = pd.DataFrame({'input': inputs,
                      'target': targets})

print(f"data shape : {lines.shape}")
lines.head()

data shape : (10000, 2)


Unnamed: 0,input,target
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !


# Data preparation and cleaning

In [4]:
def clean_lines(lines:pd.DataFrame):
    """Normalise the `input` and `target` text columns of `lines` in place.

    Each sentence is lower-cased, single quotes are dropped, every comma is
    replaced by the literal token " COMMA", and finally all ASCII punctuation
    characters and digits are removed. Nothing is returned; the two columns
    of the given DataFrame are overwritten.
    """
    # A single translation table that deletes punctuation and digits together
    drop_chars = str.maketrans('', '', string.punctuation + string.digits)

    def _normalise(sentence):
        # Quotes go first, then commas become a word-like marker so they survive
        # the punctuation stripping that follows.
        sentence = sentence.lower().replace("'", "").replace(',', " COMMA")
        return sentence.translate(drop_chars)

    for column in ('input', 'target'):
        lines[column] = lines[column].map(_normalise)


# Marker tokens placed around every target sentence
start_tok = "START_"
end_tok = "_END"

def prepare_data(lines : pd.DataFrame):
    """Clean both text columns in place, then wrap each target sentence
    as "<start_tok> sentence <end_tok>" (space separated)."""
    clean_lines(lines)
    lines['target'] = lines['target'].map(
        lambda sentence: f"{start_tok} {sentence} {end_tok}")



In [5]:
# Clean the sentence pairs in place and wrap every target in the start/end tokens
prepare_data(lines)

lines.head()

Unnamed: 0,input,target
0,go,START_ va _END
1,go,START_ marche _END
2,go,START_ en route _END
3,go,START_ bouge _END
4,hi,START_ salut _END


# Word Level Model (Word2Word)

Here we build the vocabulary for both the inputs (English words) and the targets (French words).

In [6]:
def tok_split_word2word(data):
    """Split a sentence into word-level tokens on runs of whitespace."""
    return data.split()

# Tokenizer passed for both the input and the target side in the cells below
tok_split_fn = tok_split_word2word

In [7]:
# Reserved tokens. vocab() assigns them the lowest indices in this order,
# so the padding token gets index 0.
pad_tok = "PAD"
sep_tok = ' '
special_tokens = [pad_tok, sep_tok, start_tok, end_tok]

In [8]:
def data_stats(lines, input_tok_split_fn, target_tok_split_fn):
    """Collect vocabulary and sequence-length statistics for the corpus.

    Parameters
    ----------
    lines : object exposing `.input` and `.target` iterables of sentences
    input_tok_split_fn, target_tok_split_fn : callables turning a sentence
        into a list of tokens

    Returns
    -------
    tuple
        (english_tok, french_tok, num_decoder_tokens, num_encoder_tokens,
        maximum_decoder_len_seq, maximum_encoder_len_seq) -- note that the
        decoder values come before the encoder values. The token lists are
        sorted; the target list excludes everything in `special_tokens`.
    """
    # Tokenise every sentence once and reuse the result for both statistics
    input_token_lists = [input_tok_split_fn(sentence) for sentence in lines.input]
    target_token_lists = [target_tok_split_fn(sentence) for sentence in lines.target]

    english_tok = sorted({tok for toks in input_token_lists for tok in toks})
    french_tok = sorted({
        tok
        for toks in target_token_lists
        for tok in toks
        if tok not in special_tokens
    })

    num_encoder_tokens = len(english_tok)
    num_decoder_tokens = len(french_tok)

    maximum_encoder_len_seq = np.max([len(toks) for toks in input_token_lists])
    maximum_decoder_len_seq = np.max([len(toks) for toks in target_token_lists])

    return english_tok, french_tok, num_decoder_tokens, num_encoder_tokens, maximum_decoder_len_seq, maximum_encoder_len_seq

In [9]:
# Vocabulary lists, vocabulary sizes and longest sequence lengths of the corpus
(
    english_tok,
    french_tok,
    num_decoder_tokens,
    num_encoder_tokens,
    maximum_decoder_len_seq,
    maximum_encoder_len_seq,
) = data_stats(lines, tok_split_fn, tok_split_fn)

In [10]:
# Report the corpus statistics as label/value pairs
corpus_summary = [
    ('Number of samples:', len(lines)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', maximum_encoder_len_seq),
    ('Max sequence length for outputs:', maximum_decoder_len_seq),
]
for label, value in corpus_summary:
    print(label, value)

Number of samples: 10000
Number of unique input tokens: 2022
Number of unique output tokens: 4410
Max sequence length for inputs: 5
Max sequence length for outputs: 12


## Building the vocabulary index dictionaries, reserving four extra slots for the `<SOS>`, `<EOS>`, `<PAD>` and `<SEP>` tokens

In [11]:
# Vocabulary sizes including the reserved special tokens. They are derived from
# the token lists rather than incremented with `+=`, so re-running this cell
# does not keep inflating the counts.
num_encoder_tokens = len(english_tok) + len(special_tokens)
num_decoder_tokens = len(french_tok) + len(special_tokens)

In [79]:

def vocab(input_tokens, target_tokens):
  """Build token->index and index->token lookups for both languages.

  The entries of `special_tokens` take indices 0..len(special_tokens)-1 in
  both vocabularies; the regular tokens follow in the order given.

  Returns (input_token_index, target_token_index,
           reverse_input_tok_index, reverse_target_tok_index).
  """
  first_free_index = len(special_tokens)

  # Both vocabularies start from the same special-token prefix
  input_token_index = {tok: idx for idx, tok in enumerate(special_tokens)}
  target_token_index = dict(input_token_index)

  input_token_index.update(
      (tok, idx) for idx, tok in enumerate(input_tokens, start=first_free_index))
  target_token_index.update(
      (tok, idx) for idx, tok in enumerate(target_tokens, start=first_free_index))

  # Inverse maps, used to turn predicted indices back into readable tokens
  reverse_input_tok_index = {idx: tok for tok, idx in input_token_index.items()}
  reverse_target_tok_index = {idx: tok for tok, idx in target_token_index.items()}
  return input_token_index, target_token_index, reverse_input_tok_index, reverse_target_tok_index

In [80]:
# Token -> index lookups and their inverses for the English (input) and French (target) vocabularies
english_to_idx, french_to_idx, idx_to_english, idx_to_french = vocab(english_tok, french_tok)

# Now it's time to vectorize our data i.e. Converting our text data into numbers

In [17]:
# Fixed padded widths of the model input arrays (the saved statistics output
# above reports corpus maxima of 5 input tokens and 12 output tokens)
max_encoder_seq_length = 16
max_decoder_seq_length = 16

In [18]:
# tf.keras uses a static graph, so training needs three arrays: the encoder inputs
# (our data inputs), the decoder targets (our data targets), and the decoder inputs
# used for "teacher forcing", which are the targets shifted by one step.
def init_model_inputs(lines, max_encoder_seq_length, max_decoder_seq_length, num_decoder_tokens):
    """Allocate the zero-filled float32 arrays that `vectorize` fills in.

    Returns (encoder_input_data, decoder_input_data, decoder_target_data) with
    shapes (n_inputs, max_encoder_seq_length), (n_targets, max_decoder_seq_length)
    and (n_targets, max_decoder_seq_length, num_decoder_tokens), where n_inputs
    and n_targets are the lengths of `lines.input` and `lines.target`.
    The three shapes are printed as a side effect.
    """
    n_inputs = len(lines.input)
    n_targets = len(lines.target)

    encoder_input_data = np.zeros(shape=(n_inputs, max_encoder_seq_length), dtype=np.float32)
    decoder_input_data = np.zeros(shape=(n_targets, max_decoder_seq_length), dtype=np.float32)
    decoder_target_data = np.zeros(
        shape=(n_targets, max_decoder_seq_length, num_decoder_tokens),
        dtype=np.float32,
    )

    print(f"Encoder_Input_Data_Shape : {encoder_input_data.shape}")
    print(f"Decoder_Input_Data_Shape : {decoder_input_data.shape}")
    print(f"Decoder_Target_Data_Shape : {decoder_target_data.shape}")
    return encoder_input_data, decoder_input_data, decoder_target_data

In [19]:
def vectorize(lines, max_encoder_seq_length, max_decoder_seq_length, num_decoder_tokens, input_tok_split_fn, target_tok_split_fn, english_to_idx, french_to_idx):
    """Turn the sentence pairs in `lines` into the three training arrays.

    Parameters
    ----------
    lines : object exposing `.input` and `.target` iterables of sentences
    max_encoder_seq_length, max_decoder_seq_length : padded sequence widths
    num_decoder_tokens : size of the one-hot axis of the decoder targets
    input_tok_split_fn, target_tok_split_fn : sentence -> list of tokens
    english_to_idx, french_to_idx : token -> index dictionaries (must contain `pad_tok`)

    Returns
    -------
    (encoder_input_data, decoder_input_data, decoder_target_data)
        Index-encoded encoder inputs, index-encoded decoder inputs, and one-hot
        decoder targets. The targets are the decoder inputs shifted one step
        to the left, so they do not contain the first (start) token.

    Raises
    ------
    KeyError if a token is missing from its dictionary; IndexError if a
    sentence has more tokens than the corresponding padded width.
    """
    encoder_input_data, decoder_input_data, decoder_target_data = init_model_inputs(lines, max_encoder_seq_length, max_decoder_seq_length, num_decoder_tokens)

    input_pad_idx = english_to_idx[pad_tok]
    target_pad_idx = french_to_idx[pad_tok]

    for i, (input_text, target_text) in enumerate(zip(lines.input, lines.target)):

        input_toks = input_tok_split_fn(input_text)
        for t, tok in enumerate(input_toks):
            encoder_input_data[i, t] = english_to_idx[tok]

        # Pad from the sentence's own length. Using the leaked loop variable
        # here would be undefined (or stale from the previous row) whenever a
        # sentence tokenises to nothing.
        encoder_input_data[i, len(input_toks):] = input_pad_idx

        target_toks = target_tok_split_fn(target_text)
        for t, tok in enumerate(target_toks):
            decoder_input_data[i, t] = french_to_idx[tok]

            if t > 0:
                # decoder target data will not include start token
                decoder_target_data[i, t - 1, french_to_idx[tok]] = 1

        decoder_input_data[i, len(target_toks):] = target_pad_idx

        # The target row holds one token fewer than the input row (no start
        # token), so its padding begins one step earlier.
        first_pad_step = max(len(target_toks) - 1, 0)
        decoder_target_data[i, first_pad_step:, target_pad_idx] = 1

    return encoder_input_data, decoder_input_data, decoder_target_data

In [20]:
# Encode the whole corpus into the three arrays used for training
encoder_input_data, decoder_input_data, decoder_target_data = vectorize(
    lines,
    max_encoder_seq_length=max_encoder_seq_length,
    max_decoder_seq_length=max_decoder_seq_length,
    num_decoder_tokens=num_decoder_tokens,
    input_tok_split_fn=tok_split_fn,
    target_tok_split_fn=tok_split_fn,
    english_to_idx=english_to_idx,
    french_to_idx=french_to_idx,
)

Encoder_Input_Data_Shape : (10000, 16)
Decoder_Input_Data_Shape : (10000, 16)
Decoder_Target_Data_Shape : (10000, 16, 4414)


# Notice that the sentences at the very beginning of the data contain only a few words each, so the arrays are mostly padding
- We are going to build two distinct models:
    - one without zero-masking, i.e. the model also learns to predict the padding, which inflates the accuracy artificially
    - and another with `mask_zero = True` in the embedding layers; its accuracy is lower because the padded positions no longer count

In [21]:
# Embedding dimension; the seq2seq() calls below also pass it as the LSTM state size
emb_size = 50

In [22]:
def seq2seq(num_decoder_tokens, num_encoder_tokens, emb_sz, lstm_sz, mask_zero=False):
  """Build an LSTM encoder-decoder and its two inference sub-models.

  Parameters
  ----------
  num_decoder_tokens : target vocabulary size (decoder embedding rows and
      width of the softmax output)
  num_encoder_tokens : input vocabulary size (encoder embedding rows)
  emb_sz : embedding dimension used by both embedding layers
  lstm_sz : number of units of the encoder and decoder LSTMs
  mask_zero : forwarded to both Embedding layers. The default (False)
      keeps the unmasked behaviour; pass True to have index 0 treated as
      padding, so a masked variant does not need a second copy of this
      function.

  Returns
  -------
  (model, encoder_model, decoder_model)
      `model` is the compiled training model taking
      [encoder_inputs, decoder_inputs]; `encoder_model` maps encoder inputs
      to the LSTM states [h, c]; `decoder_model` takes
      [decoder_inputs, h, c] and returns [token probabilities, h, c].
      The three models share the same layers.
  """
  encoder_inputs = Input(shape=(None,))
  en_x = Embedding(num_encoder_tokens, emb_sz, mask_zero=mask_zero)(encoder_inputs)
  encoder = LSTM(lstm_sz, return_state=True)
  encoder_outputs, state_h, state_c = encoder(en_x)
  # We discard `encoder_outputs` and only keep the states.
  encoder_states = [state_h, state_c]

  # Encoder model
  encoder_model = Model(encoder_inputs, encoder_states)

  # Set up the decoder, using `encoder_states` as initial state.
  decoder_inputs = Input(shape=(None,))

  dex = Embedding(num_decoder_tokens, emb_sz, mask_zero=mask_zero)

  final_dex = dex(decoder_inputs)

  decoder_lstm = LSTM(lstm_sz, return_sequences=True, return_state=True)

  decoder_outputs, _, _ = decoder_lstm(final_dex,
                                      initial_state=encoder_states)

  decoder_dense = Dense(num_decoder_tokens, activation='softmax')

  decoder_outputs = decoder_dense(decoder_outputs)

  model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

  # Decoder model: Re-build based on explicit state inputs. Needed for step-by-step inference:
  decoder_state_input_h = Input(shape=(lstm_sz,))
  decoder_state_input_c = Input(shape=(lstm_sz,))
  decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

  # Reuse the trained decoder layers, but start from caller-supplied states
  decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex, initial_state=decoder_states_inputs)
  decoder_states2 = [state_h2, state_c2]
  decoder_outputs2 = decoder_dense(decoder_outputs2)
  decoder_model = Model(
  [decoder_inputs] + decoder_states_inputs,
  [decoder_outputs2] + decoder_states2)  

  return model, encoder_model, decoder_model


In [23]:
model, encoder_model, decoder_model = seq2seq(num_decoder_tokens, num_encoder_tokens, emb_size, emb_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
# plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             101300    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             220700    ['input_2[0][0]']             
                                                                                            

In [24]:
# Train with teacher forcing: the decoder is fed decoder_input_data and is
# trained to output decoder_target_data.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size = 64,
          epochs = 30,
          validation_split = 0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2ebcf756a90>

In [25]:
# Now let's build the same model, but with mask_zero = True on both embedding layers
def seq2seq(num_decoder_tokens, num_encoder_tokens, emb_sz, lstm_sz):
    """Build the zero-masked LSTM encoder-decoder and its inference sub-models.

    Both embedding layers are created with mask_zero=True.

    Returns (model, encoder_model, decoder_model): the compiled training
    model, the encoder that maps input ids to the LSTM states [h, c], and the
    step-wise decoder that maps [target ids, h, c] to
    [token probabilities, h, c]. All three share the same layers.
    """
    # ---- Encoder: embed the source ids and keep only the final LSTM states ----
    source_ids = Input(shape=(None,))
    source_emb = Embedding(num_encoder_tokens, emb_sz, mask_zero = True)(source_ids)
    encoder_lstm = LSTM(lstm_sz, return_state=True)
    _, enc_h, enc_c = encoder_lstm(source_emb)
    context_states = [enc_h, enc_c]

    encoder_model = Model(source_ids, context_states)

    # ---- Decoder (training graph): starts from the encoder states ----
    target_ids = Input(shape=(None,))
    target_embedding = Embedding(num_decoder_tokens, emb_sz, mask_zero = True)
    target_emb = target_embedding(target_ids)

    decoder_lstm = LSTM(lstm_sz, return_sequences=True, return_state=True)
    train_hidden, _, _ = decoder_lstm(target_emb, initial_state=context_states)

    token_softmax = Dense(num_decoder_tokens, activation='softmax')
    train_probs = token_softmax(train_hidden)

    model = Model([source_ids, target_ids], train_probs)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

    # ---- Decoder (inference graph): same layers, states supplied by the caller ----
    prev_h = Input(shape=(lstm_sz,))
    prev_c = Input(shape=(lstm_sz,))
    prev_states = [prev_h, prev_c]

    step_hidden, next_h, next_c = decoder_lstm(target_emb, initial_state=prev_states)
    step_probs = token_softmax(step_hidden)
    decoder_model = Model([target_ids] + prev_states,
                          [step_probs] + [next_h, next_c])

    return model, encoder_model, decoder_model


In [26]:
model, encoder_model, decoder_model = seq2seq(num_decoder_tokens, num_encoder_tokens, emb_size, emb_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
# plot_model(model, show_shapes=True, show_layer_names=True)
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size = 64,
          epochs = 30,
          validation_split = 0.2)

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, None, 50)             101300    ['input_5[0][0]']             
                                                                                                  
 embedding_3 (Embedding)     (None, None, 50)             220700    ['input_6[0][0]']             
                                                                                            

<keras.src.callbacks.History at 0x2ebd0041290>

### We can see that the accuracy with `mask_zero` went down, as predicted

- Now let's test our model 

In [27]:
# Inspect the step-wise decoder model that decode_sequence() calls at prediction time
decoder_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, None, 50)             220700    ['input_6[0][0]']             
                                                                                                  
 input_7 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                            

In [78]:
# NOTE(review): the saved output of this cell (2026) does not match the
# 4414-wide decoder target array printed earlier; the cells look like they
# were run out of order -- TODO confirm with Restart Kernel -> Run All.
len(idx_to_french)

2026

In [84]:
def decode_sequence(input_seq, sep=' ', max_tokens=20):
    """Greedily decode one encoded input sequence into a target sentence.

    Uses the module-level `encoder_model`, `decoder_model`, `french_to_idx`,
    `idx_to_french`, `start_tok` and `end_tok`.

    Parameters
    ----------
    input_seq : array holding a single encoded input sentence (batch of size 1)
    sep : string placed in front of every generated token in the result
    max_tokens : generation stops once more than this many tokens have been
        produced. The limit counts tokens rather than characters of the joined
        string, so word-level output is not cut off after about 20 characters.

    Returns
    -------
    str
        The generated tokens, each prefixed with `sep`; the end token is
        included when it was produced.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_to_idx[start_tok]

    decoded_tokens = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_tok = idx_to_french[sampled_token_index]
        decoded_tokens.append(sampled_tok)

        # Exit condition: either hit the token limit or find the end token.
        if sampled_tok == end_tok or len(decoded_tokens) > max_tokens:
            break

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(sep + tok for tok in decoded_tokens)

In [85]:
# Translate the first few training sentences as a quick sanity check
for seq_index in range(5):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    # .iloc[...] yields the sentence string itself; slicing the column would
    # print a one-row Series together with its index, name and dtype.
    print('Input sentence:', lines.input.iloc[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: 0    go
Name: input, dtype: object
Decoded sentence:  à la maison _END
-
Input sentence: 1    go
Name: input, dtype: object
Decoded sentence:  à la maison _END
-
Input sentence: 2    go
Name: input, dtype: object
Decoded sentence:  à la maison _END
-
Input sentence: 3    go
Name: input, dtype: object
Decoded sentence:  à la maison _END
-
Input sentence: 4    hi
Name: input, dtype: object
Decoded sentence:  du calme _END
