<a href="https://colab.research.google.com/github/VickkiMars/NLP_Mastery/blob/main/Seq2Seq_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install keras-nlp

Collecting keras-nlp
  Downloading keras_nlp-0.15.1-py3-none-any.whl.metadata (6.7 kB)
Collecting tensorflow-text (from keras-nlp)
  Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading keras_nlp-0.15.1-py3-none-any.whl (548 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m548.4/548.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text, keras-nlp
Successfully installed keras-nlp-0.15.1 tensorflow-text-2.17.0


In [2]:
import keras_nlp
import pathlib
import random
import keras
from keras import ops
import tensorflow.data as tf_data
from tensorflow_text.tools.wordpiece_vocab import (bert_vocab_from_dataset as bert_vocab,)

In [3]:
BATCH_SIZE = 64
EPOCHS = 1
MAX_SEQUENCE_LENGTH = 142
INPUT_VOCAB_SIZE = TARGET_VOCAB_SIZE = 10000
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

In [35]:
with open("/content/dataset_2.txt") as f:
  lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
  input, target = line.split("\t")
  input = input.lower()
  target = target.lower()
  text_pairs.append((input, target))

text_pairs = text_pairs[:15000]

In [36]:
for _ in range(5):
  print(random.choice(text_pairs))

('please send 53k to 4992205129, unity bank, at makanjuola lim.', '53000 unity bank 4992205129 makanjuola lim')
('please send exactly 53k to farahnaz romo at 6166294310, paystack-titan', '53000 paystack-titan 6166294310 farahnaz romo')
('please transfer 370k to imperial homes mortage bank, genette estephany, account number 2920792105. asap', '370000 imperial homes mortage bank 2920792105 genette estephany')
('can you transfer 486k to 4541269761', '486000 4541269761')
('kindly send 6k to 7003962819', '6000 7003962819')


In [37]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

15000 total pairs
10500 training pairs
2250 validation pairs
2 test pairs


In [38]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
  word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
  vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
      word_piece_ds.batch(1000).prefetch(2),
      vocabulary_size=vocab_size,
      reserved_tokens=reserved_tokens,
  )
  return vocab

In [39]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
input_samples = [text_pair[0] for text_pair in train_pairs]
input_vocab = train_word_piece(input_samples, INPUT_VOCAB_SIZE, reserved_tokens)

target_samples = [text_pair[1] for text_pair in train_pairs]
target_vocab = train_word_piece(target_samples, TARGET_VOCAB_SIZE, reserved_tokens)

In [40]:
print("Input Tokens: ", input_vocab[100:110])
print("Target Tokens: ", target_vocab[100:110])

Input Tokens:  ['ု', 'ọ', 'い', 'み', 'れ', '민', '수', '영', '유', '은']
Target Tokens:  ['み', 'れ', '민', '수', '영', '유', '은', '정', '주', '지']


In [41]:
input_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=input_vocab,
    lowercase=False,
)
target_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=target_vocab,
    lowercase=False
)

In [42]:
input_ex = text_pairs[0][0]
input_token_ex = input_tokenizer.tokenize(input_ex)
print("Input Sentence: ", input_ex)
print("Tokens: ", input_token_ex)
print(
    "Recovered Text After Detokenizing: ",
    input_tokenizer.detokenize(input_token_ex),
)

Input Sentence:  send 802m to 2697627589
Tokens:  tf.Tensor([116  18 627 115  12 317 331 314 379 143], shape=(10,), dtype=int32)
Recovered Text After Detokenizing:  send 802m to 2697627589


In [43]:
targ_ex = text_pairs[0][1]
targ_tokens= target_tokenizer.tokenize(targ_ex)
print("Target Sentence: ", targ_ex)
print("Tokens: ", targ_tokens)
print(
    "Recovered Text After Detokenizing: ",
    target_tokenizer.detokenize(targ_tokens),
)

Target Sentence:  802000000 2697627589
Tokens:  tf.Tensor([ 17 569  11 311 233 616 228 125], shape=(8,), dtype=int32)
Recovered Text After Detokenizing:  802000000 2697627589


In [44]:
def preprocess_batch(inp, targ):
  batch_size = ops.shape(targ)[0]
  inp = input_tokenizer(inp)
  targ = target_tokenizer(targ)

  # pad 'inp' to 'MAX_SEQUENCE_LENGTH'
  inp_start_end_packer = keras_nlp.layers.StartEndPacker(
      sequence_length=MAX_SEQUENCE_LENGTH,
      pad_value=input_tokenizer.token_to_id("[PAD]")
  )
  inp = inp_start_end_packer(inp)

  # add special tokens ("[START]" and "[END]") to 'inp'
  targ_start_end_packer = keras_nlp.layers.StartEndPacker(
      sequence_length=MAX_SEQUENCE_LENGTH+1,
      start_value=target_tokenizer.token_to_id("[START]"),
      end_value=target_tokenizer.token_to_id("[END]"),
      pad_value=target_tokenizer.token_to_id("[PAD]")
  )
  targ = targ_start_end_packer(targ)

  return(
      {
          "encoder_inputs": inp,
          "decoder_inputs": targ[:, :-1],
      },
      targ[:, 1:]
  )

In [45]:
def make_dataset(pairs):
  inp_texts, targ_texts = zip(*pairs)
  inp_texts = list(inp_texts)
  targ_texts = list(targ_texts)
  dataset = tf_data.Dataset.from_tensor_slices((inp_texts, targ_texts))
  dataset = dataset.batch(BATCH_SIZE)
  dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
  return dataset.shuffle(2048).prefetch(16).cache()

In [46]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [47]:
for inputs, targets in train_ds.take(1):
  print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
  print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
  print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 142)
inputs["decoder_inputs"].shape: (64, 142)
targets.shape: (64, 142)


In [48]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=INPUT_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero = True
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

In [49]:
# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=TARGET_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(TARGET_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs
    ],
    decoder_outputs
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer"
)



In [50]:
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=25, validation_data=val_ds)

Epoch 1/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 220ms/step - accuracy: 0.8763 - loss: 1.7820 - val_accuracy: 0.9085 - val_loss: 0.5172
Epoch 2/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 123ms/step - accuracy: 0.9116 - loss: 0.4812 - val_accuracy: 0.9072 - val_loss: 0.4811
Epoch 3/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 124ms/step - accuracy: 0.9145 - loss: 0.4418 - val_accuracy: 0.9146 - val_loss: 0.4288
Epoch 4/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 125ms/step - accuracy: 0.9184 - loss: 0.4091 - val_accuracy: 0.9206 - val_loss: 0.3952
Epoch 5/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 127ms/step - accuracy: 0.9224 - loss: 0.3822 - val_accuracy: 0.9281 - val_loss: 0.3570
Epoch 6/25
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 130ms/step - accuracy: 0.9413 - loss: 0.3020 - val_accuracy: 0.9762 - val_loss: 0.1399
Epoch 7/25

<keras.src.callbacks.history.History at 0x7b7dc137bf70>

In [53]:
def decode_sequence(input_sentences):
  batch_size = 1

  # Tokenize the input sentence
  encoder_input_tokens = ops.convert_to_tensor(input_tokenizer(input_sentences))
  if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
    pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), input_tokenizer.token_to_id("[PAD]"))
    encoder_input_tokens = ops.concatenate([encoder_input_tokens, pads], axis=1)

  def next(prompt, cache, index):
    logits = transformer([encoder_input_tokens, prompt])[:, index-1, :]
    hidden_states = None
    return logits, hidden_states, cache

  length = 140
  start = ops.full((batch_size, 1), target_tokenizer.token_to_id("[START]"))
  pad = ops.full((batch_size, length - 1), target_tokenizer.token_to_id("[PAD]"))
  prompt = ops.concatenate([start, pad], axis=1)

  generated_tokens = keras_nlp.samplers.GreedySampler()(
      next,
      prompt,
      stop_token_ids=[
          target_tokenizer.token_to_id("[END]")
      ],
      index=1
  )
  generated_sentences = target_tokenizer.detokenize(generated_tokens)
  return generated_sentences

In [54]:
test_inp_texts = test_pairs
for i in range(2):
    input_sentence = random.choice(test_inp_texts)
    translated = decode_sequence([input_sentence])
    translated = translated[0]
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    print(f"** Example {i} **")
    print(input_sentence)
    print(translated)
    print()


** Example 0 **
805000 7451244873
8000gbu 8 800000012 aella17

** Example 1 **
805000 7451244873
8000gbu 8 800000012 aella17



In [33]:
test_pairs

('please wire 32k at 9762510021 asap', '32000 9762510021')

In [32]:
print(test_inp_texts)

['p', '3']


In [58]:
!pip install rouge-score



In [60]:
import rouge_score
rouge_1 = keras_nlp.metrics.RougeN(order=1)
rouge_2 = keras_nlp.metrics.RougeN(order=2)

for test_pair in test_pairs[:30]:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequence([input_sentence])
    translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
    translated_sentence = (
        translated_sentence.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())

ImportError: RougeN requires the `rouge_score` package. Please install it with `pip install rouge-score`.