<a href="https://colab.research.google.com/github/Torikul385/NLP/blob/main/Eng_to_Spanish_%5Bkeras%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q --upgrade tensorflow
!pip install -q --upgrade keras

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.1 MB[0m [31m16.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import string
import re

import keras
from keras import layers, ops
import tensorflow as tf

In [3]:
!!wget -q http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

[]

In [4]:
!!unzip -q spa-eng.zip

[]

###Parsing the data

In [5]:
!!head -20 /content/spa-eng/spa.txt

['Go.\tVe.',
 'Go.\tVete.',
 'Go.\tVaya.',
 'Go.\tVáyase.',
 'Hi.\tHola.',
 'Run!\t¡Corre!',
 'Run.\tCorred.',
 'Who?\t¿Quién?',
 'Fire!\t¡Fuego!',
 'Fire!\t¡Incendio!',
 'Fire!\t¡Disparad!',
 'Help!\t¡Ayuda!',
 'Help!\t¡Socorro! ¡Auxilio!',
 'Help!\t¡Auxilio!',
 'Jump!\t¡Salta!',
 'Jump.\tSalte.',
 'Stop!\t¡Parad!',
 'Stop!\t¡Para!',
 'Stop!\t¡Pare!',
 'Wait!\t¡Espera!']

In [6]:
with open("/content/spa-eng/spa.txt") as f:
  lines = f.readlines()

print(lines[:5])

['Go.\tVe.\n', 'Go.\tVete.\n', 'Go.\tVaya.\n', 'Go.\tVáyase.\n', 'Hi.\tHola.\n']


In [7]:
pairs = []
for line in lines:
  eng, spa = line.replace("\n", "").strip().split("\t")
  spa = "[start] " + spa + " [end]"
  pairs.append([eng, spa])

In [8]:
for i in range(5):
  indx = np.random.randint(0, len(pairs), 1)[0]
  print(pairs[indx])

['Tom picked up the menu and looked at it.', '[start] Tom cogió el menú y lo observó. [end]']
["I've heard that you shouldn't eat red meat more than once a day.", '[start] He oído que no deberías comer carne roja más de una vez al día. [end]']
['He knows many amusing magic tricks.', '[start] Él se sabe muchos trucos de magia divertidos. [end]']
['I hate that idea.', '[start] No me gusta esa idea. [end]']
['She has plenty of work to do.', '[start] Ella tiene mucho trabajo que hacer. [end]']


In [9]:
random_index = np.random.permutation(len(pairs))
pairs = np.array(pairs)
pairs = pairs[random_index]

train_size = int(0.9 * len(pairs))
val_size = int(0.05 * len(pairs))
train_pairs = pairs[:train_size]
test_pairs = pairs[train_size: train_size + val_size]
val_pairs = pairs[train_size + val_size :]

print(f'Total Pairs : {len(pairs)}')
print(f'Train Pairs : {len(train_pairs)}')
print(f'Test Pairs : {len(test_pairs)}')
print(f'Val Pairs : {len(val_pairs)}')

Total Pairs : 118964
Train Pairs : 107067
Test Pairs : 5948
Val Pairs : 5949


###Vectorizing the text data

In [10]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
seq_len = 20
batch_size = 64

def custom_standardization(input_string):
  lowercase =  tf.strings.lower(input_string)
  return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")

eng_vect = layers.TextVectorization(
    max_tokens = vocab_size,
    output_sequence_length = seq_len
)

spa_vect = layers.TextVectorization(
    max_tokens = vocab_size,
    output_sequence_length = seq_len+1,
    standardize=custom_standardization
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

eng_vect.adapt(train_eng_texts)
spa_vect.adapt(train_spa_texts)

###Create Dataset

In [11]:
def format_dataset(eng, spa):
  eng = eng_vect(eng)
  spa = spa_vect(spa)

  return (
      {
          "encoder_inputs": eng,
          "decoder_inputs": spa[:, :-1]
      },
      spa[:, 1:]
  )


def make_ds(pairs):
  eng_texts, spa_texts = zip(*pairs)
  eng_texts = list(eng_texts)
  spa_texts = list(spa_texts)

  ds = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
  ds = ds.batch(batch_size)
  ds = ds.map(format_dataset)
  return ds.cache().shuffle(2048).prefetch(16)


train_ds = make_ds(train_pairs)
val_ds = make_ds(val_pairs)

In [12]:
for inputs, targets in train_ds.take(1):
  print(inputs['encoder_inputs'].shape)
  print(inputs['decoder_inputs'].shape)
  print(targets.shape)

(64, 20)
(64, 20)
(64, 20)


##Building the model

In [13]:
class Encoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads

    self.attention = layers.MultiHeadAttention(
        num_heads = num_heads, key_dim=embed_dim
    )

    self.dense_proj = keras.Sequential([
        layers.Dense(dense_dim, activation='relu'),
        layers.Dense(embed_dim)
    ])


    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.supports_masking = True

  def call(self, inputs, mask=None):
    if mask is not None:
      padding_mask = ops.cast(mask[:, None, :], dtype='int32')
    else:
      padding_mask = None

    attention_output = self.attention(inputs, inputs, attention_mask = padding_mask)
    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)

  def get_config(self):
    config = self.get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "dense_dim": self.dense_dim,
        "num_heads": self.num_heads
    })

    return config

In [14]:
class PositionalEmbedding(layers.Layer):
  def __init__(self, seq_len, vocab_size, embed_dim, **kwargs):
    super().__init__(**kwargs)
    self.token_embed = layers.Embedding(
        input_dim = vocab_size,
        output_dim = embed_dim
    )

    self.pos_embed = layers.Embedding(
        input_dim = vocab_size,
        output_dim = embed_dim
    )


  def call(self, inputs):
    length = ops.shape(inputs)[-1]

    positions = ops.arange(0,length, 1)
    embed_token = self.token_embed(inputs)
    embed_pos = self.pos_embed(positions)
    return embed_token + embed_pos

  def compute_mask(self, inputs, mask=None):
    if mask is None:
      return None
    else:
      return ops.not_equal(inputs, 0)

  def get_config(self):
    config = super().get_config()
    config.update({
        "seq_length": self.seq_len,
        "vocab_size": self.vocab_size,
        "embed_dim": self.embed_dim
    })

    return config

In [15]:
class Decoder(layers.Layer):
  def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads

    self.attention_1 = layers.MultiHeadAttention(
        num_heads = num_heads,
        key_dim = embed_dim
    )

    self.attention_2 = layers.MultiHeadAttention(
        num_heads = num_heads,
        key_dim = embed_dim
    )

    self.dense_proj = keras.Sequential([
        layers.Dense(latent_dim, activation='relu'),
        layers.Dense(embed_dim)
    ])

    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()

    self.suports_masking = True

  def call(self, inputs, encoder_outputs, mask =None):
    causal_mask = self.get_causal_attention_mask(inputs)

    if mask is not None:
      padding_mask = ops.cat(mask[:,None, :], dtype='int32')
      padding_mask = ops.minimum(padding_mask, causal_mask)
    else:
      padding_mask = None

    attention_output_1 = self.attention_1(
        query=inputs, key=inputs, value=inputs, attention_mask = causal_mask
    )

    out_1 = self.layernorm_1(inputs + attention_output_1)

    attention_output_2 = self.attention_2(
        query=out_1,
        key=encoder_outputs,
        value=encoder_outputs,
        attention_mask = padding_mask
    )
    out_2 = self.layernorm_2(out_1 + attention_output_2)
    proj_output = self.dense_proj(out_2)
    return self.layernorm_3(out_2 + proj_output)

  def get_causal_attention_mask(self, inputs):
    input_shape = ops.shape(inputs)
    batch_size, seq_len = input_shape[0], input_shape[1]

    i = ops.arange(seq_len)[:,None]
    j = ops.arange(seq_len)
    mask = ops.cast(i>=j, dtype='int32')
    mask = ops.reshape(mask, (1, input_shape[1], input_shape[1]))
    mult = ops.concatenate([
          ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1,1])
    ], axis=0)

    return ops.tile(mask, mult)

  def get_config(self):
    config = super().get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "latent_dim": self.latent_dim,
        "num_heads": self.num_heads
    })

    return config


In [16]:
embed_dim = 256
latent_dim = 2048
num_heads = 8


encoder_inputs = keras.Input(shape=(None,) , dtype='int64', name='encoder_inputs')
x = PositionalEmbedding(seq_len, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = Encoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype='int64', name='decoder_inputs')
encoder_seq_inputs = keras.Input(shape=(None,embed_dim), name='decoder_state_inputs')
x = PositionalEmbedding(seq_len, vocab_size, embed_dim)(decoder_inputs)
x = Decoder(embed_dim, latent_dim, num_heads)(x, encoder_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation='softmax')(x)
decoder = keras.Model([decoder_inputs, encoder_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name='transformer'
)

In [17]:
transformer.summary()

In [18]:
epochs = 10
transformer.compile(
    optimizer = keras.optimizers.RMSprop(),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

transformer.fit(train_ds, epochs=epochs, validation_data = val_ds)

Epoch 1/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 58ms/step - accuracy: 0.7040 - loss: 2.2033 - val_accuracy: 0.7735 - val_loss: 1.4379
Epoch 2/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 47ms/step - accuracy: 0.7807 - loss: 1.4236 - val_accuracy: 0.9613 - val_loss: 0.3210
Epoch 3/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 47ms/step - accuracy: 0.9587 - loss: 0.3363 - val_accuracy: 0.9883 - val_loss: 0.1177
Epoch 4/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 47ms/step - accuracy: 0.9824 - loss: 0.1496 - val_accuracy: 0.9890 - val_loss: 0.1411
Epoch 5/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 47ms/step - accuracy: 0.9924 - loss: 0.0690 - val_accuracy: 0.9987 - val_loss: 0.0297
Epoch 6/10
[1m1673/1673[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 47ms/step - accuracy: 0.9944 - loss: 0.0569 - val_accuracy: 0.9997 - val_loss: 0.0162
Ep

<keras.src.callbacks.history.History at 0x7d8df11cf9a0>

In [21]:
spa_vocab = spa_vect.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
  tokenized_input_sentence = eng_vect([input_sentence])
  decoded_sentence = "[start]"

  for i in range(max_decoded_sentence_length):
    tokenized_target_sentence = spa_vect([decoded_sentence])[:,:-1]
    preds = transformer([tokenized_input_sentence, tokenized_target_sentence])

    sample_token_index = ops.convert_to_numpy(
        ops.argmax(preds[0,i,:])
    ).item(0)
    sampled_token = spa_index_lookup[sample_token_index]
    decoded_sentence += " " + sampled_token

    if sampled_token == "[end]":
      break

  return decoded_sentence

In [22]:
test_eng_texts = [pair[0] for pair in test_pairs]
test_spa_texts = [pair[1] for pair in test_pairs]

for i in range(10):
  input_sentence = test_eng_texts[i]
  translated = decode_sequence(input_sentence)

  print(f"\nEng : {input_sentence}")
  print(f"Spa Original : {test_spa_texts[i]}")
  print(f"Spa Predicted : {translated}")


Eng : When did you start studying Latin?
Spa Original : [start] ¿Cuándo empezaste a estudiar latín? [end]
Spa Predicted : [start] casa que debo tienda almorzando               

Eng : Suddenly, the door opened and her father entered.
Spa Original : [start] De repente se abrió la puerta y entró su padre. [end]
Spa Predicted : [start] comer auto aconsejó yo esta comer anciano             

Eng : Form a straight line!
Spa Original : [start] ¡Formen una línea recta! [end]
Spa Predicted : [start] no tomé correo                 

Eng : I've been to the mall.
Spa Original : [start] He estado en el centro comercial. [end]
Spa Predicted : [start] trabajo de  amamos                

Eng : Your mother is anxious about your health.
Spa Original : [start] Tu madre está preocupada por tu salud. [end]
Spa Predicted : [start] hacerlo tom vendré todo te aire              

Eng : I don't want to walk home.
Spa Original : [start] No quiero caminar a casa. [end]
Spa Predicted : [start] está las de gato v