<a href="https://colab.research.google.com/github/Sara-Esm/Natural-Language-Processing-NLP-and-generative-AI/blob/main/5_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sequence-to-Sequence (Seq2Seq) Model for Machine Translation



Build a Seq2Seq model to translate text from Italian to English using a subset of the "Tatoeba" dataset. The lab demonstrates preprocessing, tokenization, model training, and evaluation.

In [2]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.callbacks import ModelCheckpoint
# ! pip install keras_preprocessing
from keras_preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers

In [3]:
## Dataset Preparation

# Import required libraries
import os
import pandas as pd
from google.colab import drive

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/1293/Datasets/ita.txt"
file = open(file_path, mode='rt', encoding='utf-8')
# read all text
text = file.read()
file.close()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Get sentences
sents = text.strip().split('\n')
sents = [i.split('\t') for i in sents]

In [5]:
# English and Italian translations
sents

[['Hi.',
  'Ciao!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #607364 (Cero)'],
 ['Run!',
  'Corri!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906347 (Guybrush88)'],
 ['Run!',
  'Corra!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906348 (Guybrush88)'],
 ['Run!',
  'Correte!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906350 (Guybrush88)'],
 ['Who?',
  'Chi?',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #2126402 (Guybrush88)'],
 ['Wow!',
  'Wow!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #1922050 (Guybrush88)'],
 ['Jump!',
  'Salta!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #1102981 (jamessilver) & #1543215 (Guybrush88)'],
 ['Jump!',
  'Salti!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #1102981 (jamessilver) & #4356755 (Guybrush88)'],
 ['Jump!',
  'Saltate!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #1102981 (jamessilver) & #4

In [6]:
## Data Preprocessing

# Downsample the dataset to 25,000 samples for efficiency
import string
from  numpy import array
sents = array(sents)[:25000,:]
# Remove punctuation
sents[:,0] = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sents[:,0]]
sents[:,1] = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sents[:,1]]

In [7]:
sents

array([['hi', 'ciao',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #607364 (Cero)'],
       ['run', 'corri',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906347 (Guybrush88)'],
       ['run', 'corra',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906348 (Guybrush88)'],
       ...,
       ['everyone looked', 'tutti guardarono',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2111871 (CK) & #5348536 (Guybrush88)'],
       ['everyone looked', 'hanno guardato tutti',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2111871 (CK) & #5348537 (Guybrush88)'],
       ['everyone prayed', 'tutti hanno pregato',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2111870 (CK) & #5348529 (Guybrush88)']],
      dtype='<U537')

In [13]:
# Building two tokenizers that convert the text to numbers
!pip install tensorflow-text
from tensorflow.keras.preprocessing.text import Tokenizer
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(sents[:,0])

ita_tokenizer = Tokenizer()
ita_tokenizer.fit_on_texts(sents[:,1])



In [14]:
eng_tokenizer.word_index

{'i': 1,
 'tom': 2,
 'you': 3,
 'im': 4,
 'it': 5,
 'is': 6,
 'a': 7,
 'youre': 8,
 'were': 9,
 'we': 10,
 'me': 11,
 'was': 12,
 'are': 13,
 'he': 14,
 'do': 15,
 'dont': 16,
 'can': 17,
 'its': 18,
 'they': 19,
 'be': 20,
 'ill': 21,
 'go': 22,
 'that': 23,
 'this': 24,
 'to': 25,
 'have': 26,
 'now': 27,
 'the': 28,
 'not': 29,
 'here': 30,
 'up': 31,
 'toms': 32,
 'did': 33,
 'my': 34,
 'like': 35,
 'she': 36,
 'need': 37,
 'theyre': 38,
 'get': 39,
 'come': 40,
 'stop': 41,
 'love': 42,
 'got': 43,
 'stay': 44,
 'am': 45,
 'well': 46,
 'look': 47,
 'so': 48,
 'in': 49,
 'know': 50,
 'busy': 51,
 'him': 52,
 'see': 53,
 'help': 54,
 'who': 55,
 'us': 56,
 'want': 57,
 'thats': 58,
 'home': 59,
 'cant': 60,
 'lost': 61,
 'one': 62,
 'them': 63,
 'try': 64,
 'keep': 65,
 'hate': 66,
 'please': 67,
 'just': 68,
 'saw': 69,
 'let': 70,
 'take': 71,
 'will': 72,
 'hes': 73,
 'too': 74,
 'on': 75,
 'all': 76,
 'back': 77,
 'out': 78,
 'how': 79,
 'old': 80,
 'there': 81,
 'leave': 82,
 '

In [15]:
ita_tokenizer.word_index

{'tom': 1,
 'è': 2,
 'sono': 3,
 'io': 4,
 'non': 5,
 'a': 6,
 'mi': 7,
 'la': 8,
 'ho': 9,
 'un': 10,
 'di': 11,
 'ha': 12,
 'siamo': 13,
 'noi': 14,
 'lei': 15,
 'il': 16,
 'lo': 17,
 'sei': 18,
 'una': 19,
 'siete': 20,
 'voi': 21,
 'si': 22,
 'tu': 23,
 'loro': 24,
 'lui': 25,
 'era': 26,
 'in': 27,
 'ero': 28,
 'me': 29,
 'questo': 30,
 'ci': 31,
 'chi': 32,
 'le': 33,
 'posso': 34,
 'sta': 35,
 'qui': 36,
 'per': 37,
 'bisogno': 38,
 'così': 39,
 'abbiamo': 40,
 'casa': 41,
 'piace': 42,
 'bene': 43,
 'ora': 44,
 'che': 45,
 'da': 46,
 'ne': 47,
 'adesso': 48,
 'andare': 49,
 'sto': 50,
 'ti': 51,
 'i': 52,
 'vi': 53,
 'amo': 54,
 'lho': 55,
 'mio': 56,
 'odio': 57,
 'voglio': 58,
 'fatto': 59,
 'con': 60,
 'lha': 61,
 'può': 62,
 'e': 63,
 'molto': 64,
 'sembra': 65,
 'mia': 66,
 'troppo': 67,
 'del': 68,
 'al': 69,
 'quello': 70,
 'devo': 71,
 'hanno': 72,
 'avete': 73,
 'uno': 74,
 'hai': 75,
 'più': 76,
 'tutti': 77,
 'state': 78,
 'stai': 79,
 'lì': 80,
 'dei': 81,
 'favore'

In [16]:
# Encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    seq = tokenizer.texts_to_sequences(lines)
    # Pas sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [17]:
# Split data into X (Italian phrases) and y (English phrases)
from sklearn.model_selection import train_test_split
train, test = train_test_split(sents, test_size=0.2, random_state = 42)
# prepare training data
X_train = encode_sequences(ita_tokenizer, 8, train[:, 1])
y_train = encode_sequences(eng_tokenizer, 8, train[:, 0])

# prepare validation data
X_test = encode_sequences(ita_tokenizer, 8, test[:, 1])
y_test = encode_sequences(eng_tokenizer, 8, test[:, 0])

In [18]:
y_train[0]

array([901,  35,   2,   0,   0,   0,   0,   0], dtype=int32)

In [19]:
train[:, 0][0]

'women like tom'

In [20]:
# Define model
model = Sequential()
model.add(Embedding(len(ita_tokenizer.word_index) + 1, 512, input_length=8, mask_zero=True))
model.add(LSTM(512))
model.add(RepeatVector(8))
model.add(LSTM(512, return_sequences=True))
model.add(Dense(len(eng_tokenizer.word_index) + 1, activation='softmax'))



In [22]:
model.build(input_shape=(None, 50))
model.summary()

In [23]:
rms = optimizers.RMSprop(learning_rate=0.001) # Good choice for RNNs
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [25]:
# Fit model
model.fit(X_train, y_train.reshape(y_train.shape[0], y_train.shape[1], 1),
                    epochs=10, batch_size=512, validation_split = 0.2,
                    verbose=1)

Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 7s/step - loss: 2.8027 - val_loss: 2.5762
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 7s/step - loss: 2.5076 - val_loss: 2.3145
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 7s/step - loss: 2.2459 - val_loss: 2.1335
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 8s/step - loss: 2.1029 - val_loss: 2.0726
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 8s/step - loss: 2.0514 - val_loss: 2.0530
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 7s/step - loss: 2.0163 - val_loss: 2.0210
Epoch 7/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 7s/step - loss: 2.0023 - val_loss: 2.0063
Epoch 8/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 7s/step - loss: 1.9878 - val_loss: 2.0011
Epoch 9/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bc0ed1233a0>

In [26]:
# Predict on test data
import numpy as np
preds = np.argmax(model.predict(X_test.reshape((X_test.shape[0],X_test.shape[1]))), axis = 2)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 124ms/step


In [27]:
# Convert numbers to words
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None
preds_text = []

for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t)

    preds_text.append(' '.join(temp))

In [28]:
pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})

Unnamed: 0,actual,predicted
0,who are you,i you
1,youre unusual,tom is
2,take the bus,i you
3,im different,im is
4,is tom single,im is
...,...,...
4995,im discreet,im is
4996,dont push it,i you
4997,we want it,i you
4998,that was lucky,i you
