# Setup

In [None]:
%%capture
!git clone https://github.com/Ryuksito/chatbot.git
!pip install --upgrade keras-nlp

In [None]:
from google.colab import drive
drive.mount('/content/drive')


import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import random
import string
import re
import numpy as np
import json

import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow as tf

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization
import keras_nlp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Constants: seed, file locations and model/training hyperparameters.

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

# Data and pretrained artifacts under /content/chatbot
# (presumably the repository cloned in the setup cell — confirm).
DATASET_PATH = '/content/chatbot/data/instructions.json'
CORPUS_PATH = '/content/chatbot/data/corpus.txt'
VOCAB_PATH = '/content/chatbot/weights/vocab.txt'
METADATA_PATH = '/content/chatbot/weights/metadata.json'
EMBEDDINGS_PATH = '/content/chatbot/weights/embedding.weights.npy'

# Location under the Google Drive mount point (/content/drive).
MODEL_DIR = '/content/drive/MyDrive/Exposiciones/Chatbot/weights/chatbot_model/'

VOCAB_SIZE = 15000
SEQ_LENGTH = 2**8  # 256
EMBED_DIM = 2**8  # 256
BATCH_SIZE = 2**6  # 64
TAKE = 100  # NOTE(review): not referenced in the visible cells; presumably a dataset `take` size — confirm
LATENT_DIM = 2**11  # 2048
NUM_LAYERS = 4
NUM_HEADS = 8
EPOCHS = 60


# Load the vocabulary, one token per line, keeping line order so that
# token ids stay aligned with the file.
# Reading line by line (instead of `read().split('\n')`) avoids a spurious
# empty-string token at the end when the file finishes with a newline.
with open(VOCAB_PATH, 'r', encoding='utf-8') as file:
  VOCAB = [line.rstrip('\n') for line in file]

def replace_first_zero(tensor, scalar):
    """Return a copy of ``tensor`` whose first zero entry is set to ``scalar``.

    Args:
        tensor: Tensor to search for zeros. Assumed to be 1-D, since only the
            first coordinate of the first match is used — TODO confirm.
        scalar: Value written at the position of the first zero.

    Returns:
        The tensor produced by ``tf.tensor_scatter_nd_update``.

    Raises:
        ValueError: If ``tensor`` contains no zero (the tensor is printed
            first).
    """
    zero_positions = tf.where(tf.equal(tensor, 0))

    # Nothing to replace: dump the offending tensor, then fail.
    if tf.size(zero_positions) == 0:
        print(tensor)
        raise ValueError("No se encontró un 0 en el tensor")

    target_index = zero_positions[0][0]
    return tf.tensor_scatter_nd_update(tensor, [[target_index]], [scalar])

# Preprocesar los datos

## Cargar la capa de embeddings

In [None]:
# Load the saved embedding matrix and wrap it in a frozen Embedding layer.
weights = np.load(EMBEDDINGS_PATH)

# output_dim must be the embedding width (EMBED_DIM), not the sequence length;
# the two constants merely happen to share the same value.
# `set_weights` below raises if the saved matrix does not match
# (VOCAB_SIZE, EMBED_DIM).
embedding_layer = layers.Embedding(VOCAB_SIZE,
                      EMBED_DIM,
                      name="w2v_embedding")

# Forward pass on a dummy token id so the layer creates its weights
# before they are overwritten.
dummy_target = tf.zeros((1,), dtype=tf.int64)
embedding_layer(dummy_target)

embedding_layer.set_weights([weights])

# Keep the loaded embeddings fixed during training.
embedding_layer.trainable = False

## Cargar el dataset

## Cargar el tokenizer

In [None]:
# Build the WordPiece tokenizer from the in-memory vocabulary list (VOCAB).
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
     vocabulary=VOCAB,
     lowercase=True,
     # Custom marker for word-continuation pieces.
     suffix_indicator='<pow>',
     oov_token='<unk>',
     # One token longer than SEQ_LENGTH — presumably so that shifting the
     # sequence by one yields SEQ_LENGTH-long decoder inputs and targets
     # (TODO confirm against the dataset pipeline).
     sequence_length=SEQ_LENGTH + 1,
     special_tokens=['<bos>', '<eos>', '<sep>', '<mask>', '<unk>', '<pow>'],
     # Special tokens that appear literally in the input text are kept as
     # single tokens instead of being split.
     special_tokens_in_strings=True
)

# Dataset pipeline

In [None]:
# Sanity check: print the shapes of a single batch from the training dataset.
# NOTE(review): `train_ds` is not defined in the cells visible here; it must
# be created by an earlier cell before this one runs.
for inputs, targets in train_ds.take(1):
    # Each element is (inputs, targets), where `inputs` is indexed by
    # "encoder_inputs" and "decoder_inputs".
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 256)
inputs["decoder_inputs"].shape: (64, 256)
targets.shape: (64, 256)


# Model Classes

## Encoder

In [None]:
class TransformerEncoder(layers.Layer):
    """Encoder layer holding a multi-head attention sublayer, a two-layer
    dense projection and two layer normalizations.

    Args:
        embed_dim: Used as ``key_dim`` of the attention sublayer and as the
            output width of the dense projection.
        dense_dim: Width of the hidden (ReLU) dense layer.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True

        # Hyperparameters are kept so `get_config` can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        feed_forward = [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        # NOTE(review): the forward pass is a placeholder here (returns None).
        pass

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        }

## Positional Embedding

In [None]:
class PositionalEmbedding(layers.Layer):
    """Layer that owns a token embedding and a position embedding table.

    Args:
        sequence_length: Number of positions in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Output width of both embedding tables.
        features_embeddings: Stored on the instance unchanged. Presumably a
            pretrained embedding layer consumed by the forward pass — TODO
            confirm, the forward pass is not visible here.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, features_embeddings, **kwargs):
        super().__init__(**kwargs)

        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.features_embeddings = features_embeddings

    def build(self, input_shape=None):
        """Create the embedding sublayers.

        ``input_shape`` is accepted (and ignored) so the method follows the
        standard Keras ``build(input_shape)`` signature and can also be
        called explicitly with a shape.
        """
        self.token_embeddings = layers.Embedding(
            input_dim=self.vocab_size, output_dim=self.embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=self.sequence_length, output_dim=self.embed_dim
        )

    def call(self, inputs):
        # NOTE(review): the forward pass is a placeholder here (returns None).
        pass

    def compute_mask(self, inputs, mask=None):
        """Return None when no mask is passed in; otherwise flag non-zero ids."""
        if mask is None:
            return None
        else:
            return ops.not_equal(inputs, 0)

    def get_config(self):
        """Return a config containing every constructor argument.

        ``features_embeddings`` is a required constructor argument, so it is
        serialized too; without it ``from_config`` could not rebuild the layer.
        """
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
                "features_embeddings": keras.saving.serialize_keras_object(
                    self.features_embeddings
                ),
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer, restoring a serialized ``features_embeddings``."""
        config = dict(config)  # do not mutate the caller's dict
        serialized = config.get("features_embeddings")
        if isinstance(serialized, dict):
            config["features_embeddings"] = keras.saving.deserialize_keras_object(
                serialized
            )
        return cls(**config)

## Decoder

In [None]:

class TransformerDecoder(layers.Layer):
    """Decoder layer holding two multi-head attention sublayers, a two-layer
    dense projection and three layer normalizations.

    Args:
        embed_dim: Used as ``key_dim`` of both attention sublayers and as the
            output width of the dense projection.
        latent_dim: Width of the hidden (ReLU) dense layer.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True

        # Hyperparameters are kept so `get_config` can report them.
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        feed_forward = [
            layers.Dense(latent_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

    def call(self, inputs, encoder_outputs, mask=None):
        # NOTE(review): the forward pass is a placeholder here (returns None).
        pass

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask of shape (batch, seq, seq).

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise; batch and
        sequence sizes are read from the first two dimensions of ``inputs``.
        """
        shape = ops.shape(inputs)
        batch_size, sequence_length = shape[0], shape[1]

        rows = ops.arange(sequence_length)[:, None]
        cols = ops.arange(sequence_length)
        lower_triangular = ops.cast(rows >= cols, dtype="int32")
        lower_triangular = ops.reshape(
            lower_triangular, (1, shape[1], shape[1])
        )

        # Repeat the single (1, seq, seq) mask once per batch element.
        repeats = ops.concatenate(
            [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])],
            axis=0,
        )
        return ops.tile(lower_triangular, repeats)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "latent_dim": self.latent_dim,
            "num_heads": self.num_heads,
        }


# Crear, Compilar y Entrenar

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 58s/step - accuracy: 0.0000e+00 - loss: 9.6953


<keras.src.callbacks.history.History at 0x786e02d713c0>

# Decodificar

In [None]:

def decode_sequence(input_sentence):
    """Decode a reply for ``input_sentence``.

    NOTE(review): the body is a placeholder in this view, so as written the
    function returns None. The chat cell passes the lower-cased user input
    and stores the return value in ``out_tokenized_text``.
    """
    pass


In [None]:
# Read one user message and decode a reply; interrupting the cell ends the chat.
# Start from None so a later cell never sees an undefined or stale result
# when the chat is interrupted.
out_tokenized_text = None

try:
  # The prompt sits inside the try so that an interrupt while waiting for
  # input is handled the same way as one during decoding.
  input_sentence = input('User: ').lower()
  out_tokenized_text = decode_sequence(input_sentence)
except KeyboardInterrupt:
  print('\n End Chat')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
57 tf.Tensor(
[    5    83    49    30    47    49     5  8188  3134  1629  2428 12008
  7218  2111  2285  8188  2694  5803  2285 13861  8188  6795  3313  4971
  4404 11482  1629  3209 13861  1629 12008  3962  2285 13861  2345  3329
  9043 13861 10662   129  9702  2428 13861 13861 10303  2111 13861  2428
  2285 13861  5455  4409  7803  8870  4522  3781 12008     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0   

InvalidArgumentError: {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:CPU:0}} slice index 256 of dimension 1 out of bounds. [Op:StridedSlice] name: strided_slice/

In [None]:
# Display the value assigned to `out_tokenized_text` by the chat cell.
out_tokenized_text

ValueError: The rank of a RaggedTensor must be greater than 1, i.e., a list of scalars won't have ragged dimensions. Received argument `tensor` with rank 0.