In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Read the parallel corpus, then pull each language column out as a plain Python list.
data = pd.read_csv("eng_-french.csv")
english_sentences, french_sentences = (
    data[column].tolist()
    for column in ("English words/sentences", "French words/sentences")
)

In [3]:
# Coerce every entry to str so both lists contain only strings,
# then print the two lengths, one per line.
english_sentences = list(map(str, english_sentences))
french_sentences = list(map(str, french_sentences))
print(len(english_sentences), len(french_sentences), sep="\n")

175621
175621


# Tokenizer in Natural Language Processing

A tokenizer is a crucial component in natural language processing (NLP) that converts text into sequences of tokens, which are typically words or subwords. These tokens can then be processed by machine learning models.

## How Tokenizers Work

1. **Text to Sequence Conversion**:
   - The tokenizer first reads the text and splits it into individual tokens.
   - Each token is then mapped to a unique integer value based on a vocabulary built from the text.

2. **Padding Sequences**:
   - After tokenizing, sequences of different lengths are padded to ensure uniform length, which is required for batch processing in machine learning models.

3. **Example Code in Python**:

```python
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data
sentences = [
    "I love machine learning.",
    "Tokenization is an important step in NLP."
]

# Initialize the Tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the text data
tokenizer.fit_on_texts(sentences)

# Convert sentences to sequences of integers
sequences = tokenizer.texts_to_sequences(sentences)

# Display the sequences
print("Sequences:", sequences)

# Pad the sequences to ensure uniform length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Display the padded sequences
print("Padded Sequences:", padded_sequences)
```

**Output:**

Sequences: `[[1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11]]`  
Padded Sequences: `[[ 1  2  3  4  0  0  0] [ 5  6  7  8  9 10 11]]`


In [4]:
# English side: build the vocabulary, then encode every sentence as a list of word ids.
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)

# French side: same two steps with an independent tokenizer.
tokenizer_fr = Tokenizer()
tokenizer_fr.fit_on_texts(french_sentences)
fr_seq = tokenizer_fr.texts_to_sequences(french_sentences)

# One extra slot on top of the vocabulary: id 0 is what pad_sequences fills with.
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_fr = 1 + len(tokenizer_fr.word_index)

# Both languages are padded (at the end) to the single longest sequence found in either one.
max_length = max(len(seq) for seqs in (eng_seq, fr_seq) for seq in seqs)
eng_seq_padded = pad_sequences(
    eng_seq,
    maxlen=max_length,
    padding='post',
)
fr_seq_padded = pad_sequences(
    fr_seq,
    maxlen=max_length,
    padding='post',
)

# Encoder and Decoder in Sequence-to-Sequence Models

## Encoder

The **encoder** is the first component of a sequence-to-sequence (seq2seq) model. It processes the input sequence and encodes it into a fixed-size context vector, which captures the essential information of the entire input sequence. The encoder typically consists of an embedding layer followed by a recurrent layer, such as LSTM or GRU. As the encoder processes each token in the sequence, it updates its hidden states, and the final hidden state (or a set of states) represents the context vector. This vector is then passed to the decoder.

## Decoder

The **decoder** is the second component of a seq2seq model. It takes the context vector from the encoder and generates the output sequence, typically one token at a time. The decoder is also usually built with an embedding layer and a recurrent layer. At each step, the decoder predicts the next token in the sequence based on the context vector and the previously generated tokens. The process continues until the entire output sequence is generated. The decoder's ability to generate the sequence relies heavily on the information provided by the encoder.


In [5]:
# Hyperparameters: size of the word vectors and of the LSTM state.
embedding_dim, units = 256, 512

# Encoder: embed the source ids and keep the LSTM's final hidden/cell states.
encoder_inputs = Input(shape=(max_length,))
enc_emb_layer = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)
enc_emb = enc_emb_layer(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, *encoder_states = encoder_lstm(enc_emb)
state_h, state_c = encoder_states

# Decoder: a second embedding + LSTM, started from the encoder states and
# emitting one vector per time step.
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_fr, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(dec_emb, initial_state=encoder_states)[0]

# Per-step softmax over the French vocabulary.
decoder_dense = Dense(vocab_size_fr, activation='softmax')
output = decoder_dense(decoder_outputs)

# Full training graph: two inputs (encoder ids, decoder ids) -> per-step token probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

# Sparse cross-entropy, so the targets can be passed as integer ids.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [6]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the layer graph (with tensor shapes and layer names) to a PNG.
# pydot and graphviz must be installed for this call to work.
plot_model(
    model,
    to_file='model_architecture.png',
    show_layer_names=True,
    show_shapes=True,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [7]:
# Hold out 20% of the sentence pairs for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    eng_seq_padded, fr_seq_padded, test_size=0.2, random_state=42
)

# Keep the History object returned by fit(): the plotting cell reads history.history.
# NOTE(review): the decoder input here is the English batch itself, not the French
# target shifted by one step, so the decoder is not teacher-forced. Changing this also
# requires matching changes to the evaluate and translate cells, so it is left as is.
history = model.fit(
    [X_train, X_train],
    y_train,
    validation_data=([X_val, X_val], y_val),
    epochs=6,
    batch_size=64,
)

Epoch 1/6
  14/2196 [..............................] - ETA: 2:53:11 - loss: 6.2426 - accuracy: 0.8144

KeyboardInterrupt: 

In [None]:
# Score the model on the held-out split; evaluate() yields the loss followed by the compiled metric.
loss, accuracy = model.evaluate(
    [X_val, X_val],
    y_val,
)

In [None]:
# Label on one line, value on the next.
print("The overall Accuracy: ", accuracy, sep="\n")

In [None]:
# Sanity check: (number of training pairs, padded sequence length).
X_train.shape

# Sentence Translation Function

The following Python code defines a function `translate_sentence` that translates an English sentence into French using the trained sequence-to-sequence model. The function uses the English tokenizer to convert the input sentence into a sequence of integers, pads it to `max_length`, feeds it to the model, and maps the most probable token index at each output position back to a French word with the French tokenizer.

## Code Explanation

In [None]:
def translate_sentence(sentence):
    """Translate one English sentence with the trained model.

    Args:
        sentence: English text; it is encoded with ``tokenizer_eng`` and
            padded at the end to ``max_length``.

    Returns:
        The predicted French words joined by single spaces. Output positions
        whose predicted id has no entry in ``tokenizer_fr.index_word`` (such
        as the padding id 0) are left out, so the result carries no filler
        whitespace.
    """
    seq = tokenizer_eng.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    # The same padded English batch is passed to both model inputs, matching how the model is fit.
    translated = np.argmax(model.predict([padded, padded]), axis=-1)

    # Keep only ids that map to a real word; previously each unmapped id became a ' ' token,
    # which produced long runs of spaces in the joined string.
    words = [
        tokenizer_fr.index_word[token_id]
        for token_id in translated[0]
        if token_id in tokenizer_fr.index_word
    ]
    return ' '.join(words)

# Quick check of the translator on a short phrase.
input_sentence = "hello where are you"
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated_sentence}", sep="\n")

In [None]:
val = "Hello subham kumar where.I really like you with my hear. Do you want to marry me. Hello"

# Split the paragraph on '.', trimming whitespace and dropping empty fragments
# (a trailing '.' would otherwise produce an empty sentence).
list_sen = [part.strip() for part in val.split('.') if part.strip()]
print(list_sen)

# Translate each sentence independently.
new_list = [translate_sentence(sentence) for sentence in list_sen]

# Print the translations themselves; the previous loop printed only the loop index.
for translation in new_list:
    print(translation)

In [None]:
# Example translation: run a single phrase through the model and show input and output.
input_sentence = "hello where are you"
translated_sentence = translate_sentence(input_sentence)
print(
    f"Input: {input_sentence}",
    f"Translated: {translated_sentence}",
    sep="\n",
)


In [None]:
# Training curves side by side: accuracy on the left, loss on the right.
# Expects `history` to hold the object returned by model.fit(...).
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (ax_acc, 'accuracy', 'Model Accuracy', 'Accuracy'),
    (ax_loss, 'loss', 'Model Loss', 'Loss'),
)
for ax, metric, title, ylabel in panels:
    # One line for the training split, one for the validation split.
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')

# Display the plots
plt.tight_layout()
plt.show()