# SetUp

In [2]:
%%capture
!git clone https://github.com/Ryuksito/chatbot.git
!pip install --upgrade keras-nlp

In [2]:
from google.colab import drive
drive.mount('/content/drive')


import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import random
import string
import re
import numpy as np
import json

import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow as tf

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization
import keras_nlp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Consts

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

DATASET_PATH = '/content/chatbot/data/instructions.json'
CORPUS_PATH = '/content/chatbot/data/corpus.txt'
VOCAB_PATH = '/content/chatbot/weights/vocab.txt'
METADATA_PATH = '/content/chatbot/weights/metadata.json'
EMBEDDINGS_PATH = '/content/chatbot/weights/embedding.weights.npy'

MODEL_DIR = '/content/drive/MyDrive/Exposiciones/Chatbot/weights/chatbot_model/'

VOCAB_SIZE = 15000
SEQ_LENGTH = 2**8
EMBED_DIM = 2**8
BATCH_SIZE = 2**6
TAKE = 100
LATENT_DIM = 2**11
NUM_LAYERS = 4
NUM_HEADS = 8
EPOCHS = 60
NUM_WARMUP_STEPS = int((TAKE * EPOCHS) * 0.05)

with open(VOCAB_PATH, 'r', encoding='utf-8') as file:
  VOCAB = file.read().split('\n')

def replace_first_zero(tensor, scalar):
    mask = tf.equal(tensor, 0)

    indices = tf.where(mask)
    if tf.size(indices) == 0:
        raise ValueError("No se encontró un 0 en el tensor")

    first_zero_index = indices[0][0]

    updated_tensor = tf.tensor_scatter_nd_update(tensor, [[first_zero_index]], [scalar])

    return updated_tensor

# Preprocesar los datos

## Cargar la capa de embeddings

In [5]:
weights = np.load(EMBEDDINGS_PATH)
embedding_layer = layers.Embedding(VOCAB_SIZE,
                      SEQ_LENGTH,
                      name="w2v_embedding")

# paso de forward para inicializar la capa
dummy_target = tf.zeros((1,), dtype=tf.int64)
embedding_layer(dummy_target)

embedding_layer.set_weights([weights])

embedding_layer.trainable = False

ValueError: Layer w2v_embedding weight shape (15000, 256) is not compatible with provided weight shape (15000, 500).

## Cargar el dataset

In [5]:
# Load Dataset

with open(DATASET_PATH, 'r', encoding='utf-8') as f:
  dataset = json.load(f)

sequences = []

for i in range(len(dataset['data'])):
  sequences.append(
      '<bos> ' +
      dataset['data'][i]['instruction'].lower() +
      ' <sep> ' +
      dataset['data'][i]['answer'].lower() +
      ' <eos> '
  )

# Imprimir datos
for i in sequences[0:1]:
  print(i)

<bos> sugiera un eslogan para una campaña de reciclaje.
 <sep> 1. "reduce, reutiliza, recicla: juntos por un futuro más verde."
2. "recicla hoy, para un mañana mejor."
3. "¡convierte tu basura en tesoro - recicla!"
4. "recicla por el ciclo de vida."
5. "ahorra recursos, recicla más." <eos> 


## Cargar el tokenizer

In [38]:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
     vocabulary=VOCAB,
     lowercase=True,
     suffix_indicator='<pow>',
     oov_token='<unk>',
     sequence_length=SEQ_LENGTH + 1,
     special_tokens=['<bos>', '<eos>', '<sep>', '<mask>', '<unk>', '<pow>'],
     special_tokens_in_strings=True
)

# Dataset pipeline

In [7]:
# crear el dataset
def format_dataset(seq):
    sequence = tokenizer(seq)
    return (
        sequence[:, :-1],
        sequence[:, 1:],
    )


def make_dataset(sequences):
    dataset = tf_data.Dataset.from_tensor_slices(sequences)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)


train_ds = make_dataset(sequences)

In [8]:
# ver dimensiones del dataset
for inputs, targets in train_ds.take(1):
    print(f'inputs.shape: {inputs.shape}')
    print(f"targets.shape: {targets.shape}")

inputs.shape: (64, 255)
targets.shape: (64, 255)


# Model Clases

## Positional Embedding

In [9]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, features_embeddings=None, **kwargs):
        super().__init__(**kwargs)

        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.features_embeddings = features_embeddings

    def build(self):
        self.token_embeddings = layers.Embedding(
            input_dim=self.vocab_size, output_dim=self.embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=self.sequence_length, output_dim=self.embed_dim
        )

    def call(self, inputs):
        length = ops.shape(inputs)[-1]
        positions = ops.arange(0, length, 1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        if self.features_embeddings is None:
          return embedded_tokens + embedded_positions

        embedded_features = self.features_embeddings(inputs)
        return embedded_tokens + embedded_positions + embedded_features

    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        else:
            return ops.not_equal(inputs, 0)

## Decoder

In [10]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, ff_dim, num_heads, num_layers, vocab_size, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.supports_masking = True
        self.num_layers = num_layers
        self.vocab_size = vocab_size

    def build(self, input_shape):
        self.atentions = [
            layers.MultiHeadAttention(
                num_heads=self.num_heads, key_dim=self.embed_dim
            )
            for _ in range(self.num_layers)
        ]

        self.normalizations = [
            layers.LayerNormalization() for _ in range(self.num_layers)
        ]

        self.ffn_layer_1 = layers.Dense(self.ff_dim, activation="relu", name='Dense2')
        self.ffn_layer_2 = layers.Dense(self.embed_dim)

        self.dropout_1 = layers.Dropout(0.3, name='Dropout1')
        self.dropout_2 = layers.Dropout(0.5, name='Dropout2')

        self.out = layers.Dense(self.vocab_size, activation="softmax", name='Dense3')

        self.layernorm_2 = layers.LayerNormalization()

        super().build(input_shape)


    def call(self, inputs, mask=None, training=True):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
            padding_mask = ops.minimum(padding_mask, causal_mask)
        else:
            padding_mask = None

        out = inputs
        for i in range(self.num_layers):
            attention_output = self.atentions[i](
                query=inputs,
                value=inputs,
                key=inputs,
                attention_mask=padding_mask,
                training=training)
            out = self.normalizations[i](out + attention_output)

        ffn_out = self.ffn_layer_1(out)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_2(ffn_out + out, training=training)
        ffn_out = self.dropout_2(ffn_out, training=training)

        return self.out(ffn_out)

    def get_causal_attention_mask(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = ops.arange(sequence_length)[:, None]
        j = ops.arange(sequence_length)
        mask = ops.cast(i >= j, dtype="int32")
        mask = ops.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = ops.concatenate(
            [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])],
            axis=0,
        )
        return ops.tile(mask, mult)

## Chatbot

In [11]:
class Chatbot(keras.Model):
  def __init__(self, embedding_layer, decoder) -> None:
      super().__init__()
      self.embedding_layer = embedding_layer
      self.decoder = decoder

  def call(self, inputs, training=True):
      mask = self.embedding_layer.compute_mask(inputs, True)
      out = self.embedding_layer(inputs)
      out = self.decoder(out, mask=mask, training=training)
      return out

## Learning Rate

In [None]:
class LRSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, post_warmup_learning_rate, warmup_steps):
        super().__init__()
        self.post_warmup_learning_rate = post_warmup_learning_rate
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        global_step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        warmup_progress = global_step / warmup_steps
        warmup_learning_rate = self.post_warmup_learning_rate * warmup_progress
        return tf.cond(
            global_step < warmup_steps,
            lambda: warmup_learning_rate,
            lambda: self.post_warmup_learning_rate,
        )

    def get_config(self):
        config = {
            "post_warmup_learning_rate": self.post_warmup_learning_rate,
            "warmup_steps": self.warmup_steps
        }
        return config


# Compilar y Entrenar el modelo

## Construir el modelo

In [12]:
embeddings = PositionalEmbedding(
    sequence_length=SEQ_LENGTH,
    vocab_size=VOCAB_SIZE,
    embed_dim=SEQ_LENGTH,
    features_embeddings=embedding_layer,
)

decoder = TransformerDecoder(
    embed_dim=SEQ_LENGTH,
    ff_dim=SEQ_LENGTH,
    num_heads=NUM_HEADS,
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS
)


chatbot = Chatbot(embeddings, decoder)

## Dummy pass para inicializar el modelo & Compilar

In [13]:
dummy = tf.random.uniform(shape=(1, SEQ_LENGTH), minval=0, maxval=VOCAB_SIZE, dtype=tf.int32)
out = chatbot(dummy)

chatbot.summary()

num_train_steps = TAKE * EPOCHS
num_warmup_steps = num_train_steps // 15
lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps)

chatbot.compile(
    keras.optimizers.RMSprop(lr_schedule),
    loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

## Entrenar

In [None]:
early_stopping = keras.callbacks.EarlyStopping(monitor='loss',patience=5, restore_best_weights=True)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
check_point = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(MODEL_DIR, "weights" + "_epoch_{epoch}" + '.weights.h5'),
        monitor="loss",
        save_best_only=False,
        save_weights_only=True,
    )


chatbot.fit(train_ds, epochs=EPOCHS, callbacks=[early_stopping, tensorboard_callback, check_point])

# Cargar el modelo

In [None]:

chatbot.load_weights(f'/content/drive/MyDrive/Exposiciones/Chatbot/weights/chatbot_model/chatbot.weights.h5')

# Decodificar

In [63]:
seq_vocab = tokenizer.get_vocabulary()
index_lookup = dict(zip(range(len(seq_vocab)), seq_vocab))

def decode_sequence(input_sentence:str):
  decoded_sentence = '<bos> ' + input_sentence + ' <sep> '
  print('\n Chatbot:', end='', flush=True)

  tokenized_input_sentence = tokenizer(decoded_sentence)

  for i in range(tf.where(tf.not_equal(tokenized_input_sentence, 0)).shape[0], SEQ_LENGTH):
    predictions = chatbot(tokenized_input_sentence[None, :-1], training=False)

    sampled_token_index = ops.convert_to_numpy(
            ops.argmax(predictions[0, i, :])
        ).item(0)


    tokenized_input_sentence = replace_first_zero(tokenized_input_sentence, sampled_token_index)



    sampled_token = index_lookup[sampled_token_index]

    if '<pow>' in sampled_token:
      sampled_token = sampled_token.replace('<pow>', '')
    else:
      if i % 8 == 0:
        sampled_token = sampled_token + '\n'
      else:
          sampled_token = ' ' + sampled_token


    print(sampled_token, end=' ')


    if sampled_token == "<eos>":
      break

  return tokenized_input_sentence

In [64]:
input_sentence = input(' User: ').lower()

try:
  out_tokenized_text = decode_sequence(input_sentence)
except KeyboardInterrupt:
  print('\n End Chat')

 User: alskb

 Chatbot: orgwikipediacommonsthumbaa4mon 98684 78d4 294971  intr yz46 th99 onworks814  1774 d
 n88apoliticalc e42choos  ingl  18799s spectrógraf onworks814 ion28 intencion
 k8  ingl toformatfitcropw1414q8  árbo er12numb  d  d explorationhum 1234 n88apoliticalc  ilizó  infogotran  18799s toformatfitcropw1414q8  palindrómic 7yx3  áfrica addnumb 
 End Chat
