# Desafío 4 - Bot QA con Seq2Seq

## Paola Cartalá

## Librerías e Importaciones

In [None]:
import os
import re
import json
import numpy as np
import gdown
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


## Configuración del Modelo

In [None]:
# Hyperparameters shared by the preprocessing, model and training cells.
MAX_VOCAB_SIZE = 8000  # passed as num_words to both Keras tokenizers
MAX_LENGTH = 15  # word-count limit when filtering pairs, and padded sequence length
EMBEDDING_DIM = 300  # output dimension of the encoder and decoder Embedding layers
HIDDEN_UNITS = 128  # units per encoder LSTM direction (the decoder LSTM uses twice as many)
EPOCHS = 30  # upper bound on training epochs given to fit()
BATCH_SIZE = 64  # mini-batch size given to fit()
DATA_SAMPLES = 40000  # NOTE(review): not referenced anywhere in the visible code


## Dataset ConvAI2

In [None]:
# Download the ConvAI2 volunteers dump once and reuse the local copy afterwards.
dataset_file = 'data_volunteers.json'
if not os.path.exists(dataset_file):
    print("Descargando el dataset ConvAI2...")
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    gdown.download(url, dataset_file, quiet=False)
else:
    print("El dataset ya se encuentra descargado.")

# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default encoding, which is not UTF-8 on every system.
with open(dataset_file, encoding='utf-8') as f:
    data = json.load(f)

## Preprocesamiento y Tokenización

In [None]:
# Typographic apostrophes are folded to the ASCII one so that the
# contraction rules below also match text such as "I’m" or "don’t".
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'", "\u02bc": "'"})

# Ordered (pattern, replacement) rules. Order matters: the specific
# "won't" / "can't" rules must run before the generic "n't" rule.
_CONTRACTION_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
    )
]

# Everything except ASCII letters, whitespace and . , ! ? is dropped.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z\s.,!?]')


def clean_text(text):
    """Clean and normalize an utterance.

    The text is lower-cased, typographic apostrophes are converted to ASCII
    apostrophes, common English contractions are expanded, and every
    character other than ASCII letters, whitespace and ``. , ! ?`` is removed.

    Args:
        text: Raw utterance.

    Returns:
        The normalized string with leading/trailing whitespace stripped.
        Inner whitespace is left untouched.
    """
    text = text.lower().translate(_APOSTROPHE_TABLE)
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return _DISALLOWED_CHARS.sub('', text).strip()


In [None]:
# Build (question, answer) training pairs: each utterance is paired with the
# one that immediately follows it in the same dialog.
questions = []
answers = []
for conversation in data:
    dialog = conversation['dialog']
    for current_turn, next_turn in zip(dialog, dialog[1:]):
        prompt = clean_text(current_turn['text'])
        reply = clean_text(next_turn['text'])
        # Discard the pair as soon as either side reaches MAX_LENGTH words.
        if len(prompt.split()) >= MAX_LENGTH or len(reply.split()) >= MAX_LENGTH:
            continue
        questions.append(prompt)
        answers.append(reply)

# Decoder input starts with the <sos> marker; decoder target ends with <eos>.
answers_input = ['<sos> ' + reply for reply in answers]
answers_target = [reply + ' <eos>' for reply in answers]

In [None]:
# Tokenization
# Question tokenizer: no `filters` argument is given, so the Keras defaults
# apply; '<unk>' is registered as the out-of-vocabulary token.
tokenizer_q = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<unk>')
tokenizer_q.fit_on_texts(questions)
encoder_sequences = tokenizer_q.texts_to_sequences(questions)

# Answer tokenizer: the custom filter string leaves out '<' and '>',
# presumably so the '<sos>' / '<eos>' markers survive as whole tokens.
tokenizer_a = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<unk>', filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
# Fitted on both answer variants so that '<sos>' and '<eos>' both enter the vocabulary.
tokenizer_a.fit_on_texts(answers_input + answers_target)
decoder_input_sequences = tokenizer_a.texts_to_sequences(answers_input)
decoder_target_sequences = tokenizer_a.texts_to_sequences(answers_target)

In [None]:
# Padding: every sequence is padded (with trailing zeros) or truncated to
# exactly MAX_LENGTH token ids.
encoder_input = pad_sequences(encoder_sequences, maxlen=MAX_LENGTH, padding='post')
decoder_input = pad_sequences(decoder_input_sequences, maxlen=MAX_LENGTH, padding='post')
decoder_target = pad_sequences(decoder_target_sequences, maxlen=MAX_LENGTH, padding='post')

# word_index lists every word seen while fitting, but both tokenizers were
# created with num_words=MAX_VOCAB_SIZE, so texts_to_sequences never emits an
# id >= MAX_VOCAB_SIZE. Clamping avoids embedding rows / softmax classes that
# can never occur in the data. The +1 accounts for the padding id 0.
vocab_size_q = min(len(tokenizer_q.word_index) + 1, MAX_VOCAB_SIZE)
vocab_size_a = min(len(tokenizer_a.word_index) + 1, MAX_VOCAB_SIZE)

## Modelo Seq2Seq

In [None]:
def crear_modelo_qa_bot(vocab_size_q, vocab_size_a, embedding_dim, hidden_units):
    """Build the Seq2Seq training model for the QA bot.

    The encoder is a bidirectional LSTM whose forward and backward final
    states are concatenated and used as the initial state of a
    unidirectional decoder LSTM of twice the width. The decoder output at
    every time step goes through a softmax over the answer vocabulary.

    Args:
        vocab_size_q: Input dimension of the question embedding.
        vocab_size_a: Input dimension of the answer embedding and size of
            the output softmax.
        embedding_dim: Dimension of both embedding layers.
        hidden_units: Units per encoder direction.

    Returns:
        An uncompiled Keras ``Model`` taking ``[question_ids, answer_ids]``
        and returning per-step token probabilities.
    """
    # ----- Encoder -----
    question_tokens = Input(shape=(None,))
    question_vectors = Embedding(vocab_size_q, embedding_dim, mask_zero=True)(question_tokens)
    bi_encoder = Bidirectional(LSTM(hidden_units, return_state=True, dropout=0.2))
    _, fwd_h, fwd_c, bwd_h, bwd_c = bi_encoder(question_vectors)
    # Merge both directions so the state width matches the decoder LSTM.
    merged_h = tf.keras.layers.Concatenate()([fwd_h, bwd_h])
    merged_c = tf.keras.layers.Concatenate()([fwd_c, bwd_c])

    # ----- Decoder -----
    answer_tokens = Input(shape=(None,))
    answer_vectors = Embedding(vocab_size_a, embedding_dim, mask_zero=True)(answer_tokens)
    decoder = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=0.2)
    hidden_sequence, _, _ = decoder(answer_vectors, initial_state=[merged_h, merged_c])
    token_probabilities = Dense(vocab_size_a, activation='softmax')(hidden_sequence)

    return Model([question_tokens, answer_tokens], token_probabilities)

# Instantiate the network from the vocabulary sizes and hyperparameters defined earlier.
modelo = crear_modelo_qa_bot(vocab_size_q, vocab_size_a, EMBEDDING_DIM, HIDDEN_UNITS)
# The targets are integer token ids, hence the sparse categorical cross-entropy loss.
modelo.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
modelo.summary()

## Entrenamiento

In [None]:
# Hold out 15% of the pairs for validation; the fixed seed keeps the split reproducible.
(
    encoder_train, encoder_val,
    dec_in_train, dec_in_val,
    dec_target_train, dec_target_val,
) = train_test_split(
    encoder_input,
    decoder_input,
    decoder_target,
    test_size=0.15,
    random_state=42,
)

# Stop once validation loss stalls for 8 epochs (restoring the best weights),
# and cut the learning rate to a fifth after 4 epochs without improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, min_lr=1e-6)
callbacks = [early_stopping, lr_on_plateau]

history = modelo.fit(
    x=[encoder_train, dec_in_train],
    y=dec_target_train,
    validation_data=([encoder_val, dec_in_val], dec_target_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
)

## Sistema de Inferencia

In [None]:
class QABotInference:
    """Inference helper for the QA bot using temperature and top-k sampling."""

    # Id that pad_sequences uses for padding; it never stands for a real word.
    PAD_TOKEN_ID = 0

    def __init__(self, model, tokenizer_q, tokenizer_a, max_length):
        """Store the model and tokenizers and cache the special token ids.

        Args:
            model: Trained model taking ``[question_ids, answer_ids]``.
            tokenizer_q: Tokenizer fitted on the questions.
            tokenizer_a: Tokenizer fitted on the answers; its vocabulary must
                contain ``<sos>``.
            max_length: Length every sequence is padded to.

        Raises:
            ValueError: If ``<sos>`` is missing from ``tokenizer_a``.
        """
        self.model = model
        self.tokenizer_q = tokenizer_q
        self.tokenizer_a = tokenizer_a
        self.max_length = max_length
        self.idx_to_word_a = {v: k for k, v in tokenizer_a.word_index.items()}
        self.sos_token_id = tokenizer_a.word_index.get('<sos>')
        self.eos_token_id = tokenizer_a.word_index.get('<eos>')
        if self.sos_token_id is None:
            # Without this check a missing marker would silently put NaN
            # into the decoder input array.
            raise ValueError("tokenizer_a has no '<sos>' token in its vocabulary")

    @staticmethod
    def _sample_token(probs, temperature, top_k):
        """Pick the next token id from a probability vector.

        Args:
            probs: 1-D sequence of probabilities indexed by token id.
            temperature: Softmax temperature; values <= 0 select the most
                likely token deterministically.
            top_k: Number of most likely tokens to sample from; ``None`` or
                a value <= 0 disables the truncation.

        Returns:
            The chosen token id as a plain ``int``. The padding id is never
            returned when any other id is available.
        """
        probs = np.asarray(probs, dtype='float64')
        # Clamp before the log so zero probabilities do not produce -inf
        # together with runtime warnings.
        logits = np.log(np.maximum(probs, 1e-12))
        # Padding is not a word: make it impossible to select.
        logits[QABotInference.PAD_TOKEN_ID] = -np.inf

        if temperature is None or temperature <= 0:
            return int(np.argmax(logits))

        logits = logits / temperature
        vocab = logits.shape[0]
        k = vocab if top_k is None or top_k <= 0 else min(int(top_k), vocab)
        top_indices = np.argsort(logits)[-k:]
        top_logits = logits[top_indices]
        # Subtract the maximum for a numerically stable softmax.
        weights = np.exp(top_logits - np.max(top_logits))
        weights = weights / np.sum(weights)
        return int(np.random.choice(top_indices, p=weights))

    def generar_respuesta(self, pregunta, temperature=0.7, top_k=10):
        """Generate an answer by sampling with temperature and top-k.

        The question is cleaned, tokenized and padded; the answer is then
        decoded one token at a time by re-running the full model on the
        tokens produced so far, until ``<eos>`` is sampled or
        ``max_length`` is reached.

        Args:
            pregunta: Raw question text.
            temperature: Sampling temperature (<= 0 means greedy decoding).
            top_k: How many of the most likely tokens to sample from.

        Returns:
            The generated words joined by spaces, with the first letter
            capitalized.
        """
        pregunta_limpia = clean_text(pregunta)
        input_seq = self.tokenizer_q.texts_to_sequences([pregunta_limpia])
        input_padded = pad_sequences(input_seq, maxlen=self.max_length, padding='post')

        decoder_input_seq = np.zeros((1, self.max_length))
        decoder_input_seq[0, 0] = self.sos_token_id
        respuesta_generada = []

        for i in range(1, self.max_length):
            output_tokens = self.model.predict([input_padded, decoder_input_seq], verbose=0)
            # Position i-1 holds the distribution for the token to place at i.
            predicted_token_id = self._sample_token(output_tokens[0, i - 1, :], temperature, top_k)

            if predicted_token_id == self.eos_token_id:
                break

            word = self.idx_to_word_a.get(predicted_token_id, '')
            if word:
                # Ids without a vocabulary entry would only add stray spaces.
                respuesta_generada.append(word)
            decoder_input_seq[0, i] = predicted_token_id

        return ' '.join(respuesta_generada).capitalize()

## Evaluación Final

In [None]:
# Wrap the trained model and both tokenizers for interactive generation.
bot = QABotInference(modelo, tokenizer_q, tokenizer_a, MAX_LENGTH)

print("🤖 EVALUACIÓN FINAL DEL BOT")

# Fixed set of sample questions used for a qualitative check of the bot.
preguntas_evaluacion = [
    "Do you read?",
    "Do you have any pet?",
    "Where are you from?",

    "How are you?",
    "What is your name?",
    "what are your hobbies?"
]

# Generation samples randomly, so the answers can differ between runs.
for pregunta in preguntas_evaluacion:
    respuesta = bot.generar_respuesta(pregunta)
    print(f"👤 USER: {pregunta}")
    print(f"🤖 BOT: {respuesta}\n" + "-"*50)
