# Evaluación Parcial 3 – Modelo Transformer


Este cuaderno desarrolla un **modelo Transformer** para la generación de respuestas en diálogos.

## 1 | Introducción

El objetivo es enseñar a un modelo a **predecir la respuesta** a una intervención dentro de un diálogo. Se utilizará la columna `dialog` de un conjunto de datos proporcionado.

In [None]:
# --- Librerías principales
import json
import math
import os
import pathlib
import pprint
import random
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
import tensorflow as tf
from tensorflow.keras import layers

# Reproducibility: seed every random number generator the notebook draws
# from. Python's built-in `random` is included because the train/val/test
# split is produced with `random.shuffle`.
SEED = 42
random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

## 2 | Carga y exploración de los datos

In [None]:
# Download the three CSV files (train / validation / test) of the dialogue
# dataset into the current working directory (IPython shell escapes).
!wget -q https://raw.githubusercontent.com/JaznaLaProfe/Deep-Learning/main/data/dialog/train.csv
!wget -q https://raw.githubusercontent.com/JaznaLaProfe/Deep-Learning/main/data/dialog/validation.csv
!wget -q https://raw.githubusercontent.com/JaznaLaProfe/Deep-Learning/main/data/dialog/test.csv

# Load each file into its own DataFrame.
# NOTE(review): in the cells visible here only `train_df` is read again;
# `valid_df` and `test_df` appear unused — confirm whether that is intended.
train_df = pd.read_csv("train.csv")
valid_df = pd.read_csv("validation.csv")
test_df  = pd.read_csv("test.csv")

## 3 | Preprocesamiento de diálogos

In [None]:
def extract_pairs(text: str):
    """Split a raw dialogue string into (question, answer) pairs.

    The text is broken on newlines; every line is stripped and empty lines
    are discarded. The remaining turns are paired without overlap:
    (turn 0, turn 1), (turn 2, turn 3), ... A trailing turn that has no
    partner is dropped.

    Returns:
        A list of ``(question, answer)`` string tuples, possibly empty.

    NOTE(review): the sample output printed by the notebook shows list
    brackets and quote characters surviving inside the turns — consider
    cleaning the raw text before pairing.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    turns = [line for line in stripped_lines if line]
    # Even-indexed turns are prompts, odd-indexed turns are the replies;
    # zip stops at the shorter slice, which discards an unpaired last turn.
    prompts, replies = turns[0::2], turns[1::2]
    return [(q, a) for q, a in zip(prompts, replies) if q and a]

# Collect the (question, answer) pairs of every training dialogue.
pairs = [
    pair
    for dialog_text in train_df['dialog'].astype(str)
    for pair in extract_pairs(dialog_text)
]

print(f'Pares totales extraídos: {len(pairs):,}')

# Show the first three pairs as a quick sanity check
separator = '-'*40
for question, answer in pairs[:3]:
    print('Q:', question)
    print('A:', answer)
    print(separator)

Pares totales extraídos: 35,450
Q: ['Say , Jim , how about going for a few beers after dinner ? '
A: ' You know that is tempting but is really not good for our fitness . '
----------------------------------------
Q: ' What do you mean ? It will help us to relax . '
A: " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? "
----------------------------------------
Q: " I guess you are right.But what shall we do ? I don't feel like sitting at home . "
A: ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . '
----------------------------------------


### 3.1 | División Train / Val / Test

In [None]:
# Shuffle in place, then carve the list into contiguous 80 % / 10 % / 10 %
# slices for training, validation and test.
random.shuffle(pairs)
total = len(pairs)
train_cut, val_cut = int(0.8*total), int(0.9*total)

train_pairs, val_pairs, test_pairs = (
    pairs[:train_cut],
    pairs[train_cut:val_cut],
    pairs[val_cut:],
)

print(f'Train: {len(train_pairs)} | Val: {len(val_pairs)} | Test: {len(test_pairs)}')

Train: 28360 | Val: 3545 | Test: 3545


## 4 | Vectorización de texto

In [None]:
# Vectorization settings and the shared TextVectorization layer used for
# both questions and answers.
MAX_LEN   = 50   # per the rubric and the exploratory analysis
# NOTE(review): MIN_FREQ is not read anywhere in the visible cells — confirm
# whether a minimum-frequency filter was meant to be applied.
MIN_FREQ  = 1
BATCH_SZ  = 64

special_new = ['<bos>', '<eos>']   # only the tokens added on top of the layer's own

# NOTE(review): `custom_standardization` is not defined in the visible part
# of the notebook; it must exist before this cell runs, and presumably must
# keep '<bos>' / '<eos>' intact — confirm.
vectorizer = layers.TextVectorization(
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=MAX_LEN
)

# 1) gather every text (questions and answers)
# NOTE(review): validation and test text also feed the vocabulary here —
# confirm that this is acceptable for the evaluation protocol.
all_text = [txt for q, a in (train_pairs + val_pairs + test_pairs) for txt in (q, a)]
print("Textos totales:", len(all_text))

# 2) adapt → builds the base vocabulary with '' and '[UNK]' at the front
vectorizer.adapt(all_text)

# 3) build the final vocabulary: reserved tokens, then the new special
#    tokens, then everything learned by adapt()
vocab_base = vectorizer.get_vocabulary()[2:]        # without '' and [UNK]
new_vocab  = ['', '[UNK]'] + special_new + vocab_base
vectorizer.set_vocabulary(new_vocab)

# 4) handy ids, matching the order of `new_vocab` above
PAD_ID, UNK_ID = 0, 1
BOS_ID, EOS_ID = 2, 3
VOCAB_SIZE = vectorizer.vocabulary_size()
print("VOCAB_SIZE:", VOCAB_SIZE)


### 4.1 | Creación de objetos `tf.data.Dataset`

In [None]:
def format_dataset(pairs):
    """Vectorize (question, answer) pairs into a teacher-forcing dataset.

    Every element of the returned ``tf.data.Dataset`` has the structure
    ``((encoder_ids, decoder_input_ids), decoder_target_ids)``. The decoder
    input is the answer prefixed with ``<bos>``; the target is the answer
    suffixed with ``<eos>``. All three tensors are cast to int32.
    """
    questions, decoder_inputs, decoder_targets = [], [], []
    for question, answer in pairs:
        questions.append(question)
        decoder_inputs.append('<bos> '+answer)
        decoder_targets.append(answer+' <eos>')

    def to_ids(texts):
        # Shared vectorizer, cast down to int32.
        return tf.cast(vectorizer(texts), tf.int32)

    features = (to_ids(questions), to_ids(decoder_inputs))
    targets = to_ids(decoder_targets)
    return tf.data.Dataset.from_tensor_slices((features, targets))


def prepare_tf_dataset(pairs):
    """Return the shuffled, batched and prefetched pipeline for ``pairs``.

    The pairs are vectorized with ``format_dataset``, shuffled with a
    10,000-element buffer seeded by ``SEED``, grouped into batches of
    ``BATCH_SZ`` and prefetched with ``tf.data.AUTOTUNE``.
    """
    dataset = format_dataset(pairs)
    dataset = dataset.shuffle(10_000, seed=SEED)
    dataset = dataset.batch(BATCH_SZ)
    return dataset.prefetch(tf.data.AUTOTUNE)


# Input pipelines for the training and validation pairs.
train_ds = prepare_tf_dataset(train_pairs)
val_ds   = prepare_tf_dataset(val_pairs)

## 5 | Componentes del modelo Transformer

In [None]:
def get_angles(pos, i, d_model):
    """Return the sinusoid arguments ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    ``pos`` (positions) and ``i`` (feature indices) are combined with
    ordinary NumPy broadcasting; consecutive feature indices ``2k`` and
    ``2k + 1`` share the same rate.
    """
    exponent = (2 * (i//2)) / np.float32(d_model)
    rate = 1 / np.power(10000, exponent)
    return pos * rate

def positional_encoding(position, d_model):
    """Build the sinusoidal position table as a float32 tensor.

    Args:
        position: Number of positions (rows) in the table.
        d_model: Number of features (columns) per position.

    Returns:
        A ``tf.float32`` tensor of shape ``(1, position, d_model)`` with
        sine values in the even columns and cosine values in the odd ones.
    """
    positions = np.arange(position)[:, np.newaxis]
    features = np.arange(d_model)[np.newaxis, :]
    table = get_angles(positions, features, d_model)
    # Overwrite the raw angles in place: even columns -> sin, odd -> cos.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    # Leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal position table to its input."""

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Computed once at construction; shape (1, max_len, d_model).
        self.pos_encoding = positional_encoding(max_len, d_model)

    def call(self, x):
        """Return ``x`` plus the first ``seq_len`` rows of the table."""
        # A SparseTensor input is densified before the addition.
        dense = tf.sparse.to_dense(x) if isinstance(x, tf.SparseTensor) else x
        length = tf.shape(dense)[1]
        table = self.pos_encoding[:, :length, :]
        return dense + table


In [None]:
def transformer_encoder(num_layers, d_model, num_heads, dff, input_vocab, maximum_position_encoding, rate=0.1):
    """Build the Transformer encoder as a functional Keras model.

    Args:
        num_layers: Number of stacked self-attention + feed-forward blocks.
        d_model: Embedding width and output width of every block.
        num_heads: Attention heads per ``MultiHeadAttention`` layer.
        dff: Hidden width of the position-wise feed-forward network.
        input_vocab: Size of the input vocabulary (embedding rows).
        maximum_position_encoding: Number of rows in the positional table.
        rate: Dropout rate applied after the embedding and inside attention.

    Returns:
        A ``tf.keras.Model`` named ``'encoder'`` that maps token ids of
        shape ``(batch, seq)`` to features of shape ``(batch, seq, d_model)``.
    """
    inputs = layers.Input(shape=(None,), name='enc_input')
    # Keras MultiHeadAttention reads `attention_mask` as 1 = "may attend",
    # 0 = "blocked". The mask must therefore flag the real tokens
    # (id != 0); flagging the padding would make every query attend to
    # padding only. Shape: (batch, 1, 1, seq), broadcast over heads/queries.
    padding_mask = layers.Lambda(
        lambda x: tf.cast(tf.math.not_equal(x, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :])(inputs)

    x = layers.Embedding(input_vocab, d_model)(inputs)
    # Scale the embeddings before adding the positional signal.
    x *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    x = PositionalEncoding(maximum_position_encoding, d_model)(x)
    x = layers.Dropout(rate)(x)

    for _ in range(num_layers):
        # Self-attention sub-layer with residual connection + layer norm.
        attn_out = layers.MultiHeadAttention(num_heads, key_dim=d_model, dropout=rate)(
            x, x, attention_mask=padding_mask)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn_out)

        # Position-wise feed-forward sub-layer with residual + layer norm.
        ffn_out = layers.Dense(dff, activation='relu')(x)
        ffn_out = layers.Dense(d_model)(ffn_out)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ffn_out)

    return tf.keras.Model(inputs=inputs, outputs=x, name='encoder')

def transformer_decoder(num_layers, d_model, num_heads, dff, target_vocab, maximum_position_encoding, rate=0.1):
    """Build the Transformer decoder as a functional Keras model.

    Args:
        num_layers: Number of stacked decoder blocks.
        d_model: Embedding width and output width of every block.
        num_heads: Attention heads per ``MultiHeadAttention`` layer.
        dff: Hidden width of the position-wise feed-forward network.
        target_vocab: Size of the target vocabulary (embedding rows and
            number of output logits).
        maximum_position_encoding: Number of rows in the positional table.
        rate: Dropout rate applied after the embedding and inside attention.

    Returns:
        A ``tf.keras.Model`` named ``'decoder'`` taking
        ``[decoder_token_ids, encoder_output]`` and returning logits of
        shape ``(batch, dec_seq, target_vocab)``.
    """
    inputs = layers.Input(shape=(None,), name='dec_input')
    enc_outs = layers.Input(shape=(None, d_model), name='enc_output')

    # Causal mask in the Keras convention (1 = "may attend", 0 = "blocked"):
    # the lower triangle lets position t see positions <= t only. Using the
    # complement would expose exactly the future tokens and leak the targets
    # under teacher forcing. Shape: (dec_seq, dec_seq).
    # No separate decoder padding mask is built: with padding appended at
    # the end of a sequence, the causal mask already hides it from every
    # real position.
    look_ahead_mask = layers.Lambda(
        lambda x: tf.linalg.band_part(tf.ones((tf.shape(x)[1], tf.shape(x)[1])), -1, 0))(inputs)

    embed = layers.Embedding(target_vocab, d_model)(inputs)
    # Scale the embeddings before adding the positional signal.
    embed *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embed = PositionalEncoding(maximum_position_encoding, d_model)(embed)
    x = layers.Dropout(rate)(embed)

    for _ in range(num_layers):
        # Masked self-attention over the tokens generated so far.
        attn1 = layers.MultiHeadAttention(num_heads, key_dim=d_model, dropout=rate)(
            x, x, attention_mask=look_ahead_mask)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn1)

        # Cross-attention: queries from the decoder, keys/values from the
        # encoder output.
        # NOTE(review): encoder padding is not masked here because this
        # model receives only the encoder output, not the encoder token ids.
        attn2 = layers.MultiHeadAttention(num_heads, key_dim=d_model, dropout=rate)(
            x, enc_outs, enc_outs)
        x = layers.LayerNormalization(epsilon=1e-6)(x + attn2)

        # Position-wise feed-forward sub-layer with residual + layer norm.
        ffn = layers.Dense(dff, activation='relu')(x)
        ffn = layers.Dense(d_model)(ffn)
        x = layers.LayerNormalization(epsilon=1e-6)(x + ffn)

    # Unnormalised logits over the target vocabulary.
    outputs = layers.Dense(target_vocab)(x)
    return tf.keras.Model([inputs, enc_outs], outputs, name='decoder')

In [None]:
# Model hyperparameters.
NUM_LAYERS = 4      # encoder blocks and decoder blocks
D_MODEL    = 128    # embedding / hidden width
NUM_HEADS  = 8      # attention heads per attention layer
DFF        = 512    # feed-forward hidden width
DROPOUT    = 0.1

# Both stacks share the vocabulary size and use MAX_LEN as the size of the
# positional-encoding table.
encoder = transformer_encoder(NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, VOCAB_SIZE, MAX_LEN, DROPOUT)
decoder = transformer_decoder(NUM_LAYERS, D_MODEL, NUM_HEADS, DFF, VOCAB_SIZE, MAX_LEN, DROPOUT)

# Token-id inputs of the end-to-end model (variable sequence length).
enc_inputs  = layers.Input(shape=(None,), name='encoder_inputs')
dec_inputs  = layers.Input(shape=(None,), name='decoder_inputs')

# Wire encoder -> decoder; the decoder returns one logit vector per position.
enc_outs = encoder(enc_inputs)
dec_outs = decoder([dec_inputs, enc_outs])

model = tf.keras.Model([enc_inputs, dec_inputs], dec_outs, name='seq2seq_transformer')

# Loss & metrics
# Per-token cross-entropy on raw logits; the reduction is left to
# masked_loss so that padding positions can be excluded first.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

def masked_loss(y_true, y_pred):
    """Return the mean cross-entropy over non-padding targets (id != 0)."""
    is_token = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    per_token = loss_object(y_true, y_pred) * is_token
    return tf.reduce_sum(per_token) / tf.reduce_sum(is_token)

def masked_accuracy(y_true, y_pred):
    """Return token-level accuracy over non-padding targets (id != 0).

    Args:
        y_true: Target token ids; cast to int32 here so the comparison works
            whichever numeric dtype the training loop hands over.
        y_pred: Logits whose last axis is the vocabulary.

    Returns:
        A scalar float tensor: correct non-padding predictions divided by
        the number of non-padding targets, or 0 when the batch holds no
        non-padding target at all (instead of NaN).
    """
    predicted_ids = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    true_ids = tf.cast(y_true, tf.int32)
    mask = tf.cast(tf.not_equal(true_ids, 0), tf.float32)
    match = tf.cast(tf.equal(true_ids, predicted_ids), tf.float32) * mask
    return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

# Adam with a fixed 1e-4 learning rate; loss and accuracy both ignore
# padding positions.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss=masked_loss, metrics=[masked_accuracy])
model.summary()

## 6 | Entrenamiento


In [None]:
# Train for a fixed number of epochs, evaluating on the validation pipeline
# after each one; the per-epoch metrics are kept in `history`.
EPOCHS = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS
)

Epoch 1/5
444/444 ━━━━━━━━━━━━━━━━━━━━ 140s 211ms/step - loss: 8.2271 - masked_accuracy: 0.1286 - val_loss: 5.5856 - val_masked_accuracy: 0.2358
Epoch 2/5
444/444 ━━━━━━━━━━━━━━━━━━━━ 96s 151ms/step - loss: 5.0350 - masked_accuracy: 0.3252 - val_loss: 3.2634 - val_masked_accuracy: 0.6081
Epoch 3/5
444/444 ━━━━━━━━━━━━━━━━━━━━ 66s 149ms/step - loss: 2.9385 - masked_accuracy: 0.6371 - val_loss: 2.0650 - val_masked_accuracy: 0.7478
Epoch 4/5
444/444 ━━━━━━━━━━━━━━━━━━━━ 83s 150ms/step - loss: 1.9620 - masked_accuracy: 0.7540 - val_loss: 1.5489 - val_masked_accuracy: 0.8087
Epoch 5/5
444/444 ━━━━━━━━━━━━━━━━━━━━ 66s 148ms/step - loss: 1.5037 - masked_accuracy: 0.8107 - val_loss: 1.2424 - val_masked_accuracy: 0.8473


## 7 | Decodificación y evaluación BLEU

In [None]:

# ─── Global ids and vocabulary ───────────────────────────────────────────
# Each id is obtained by vectorizing the literal token and taking the first
# element of the resulting id sequence.
# NOTE(review): this assumes the standardization leaves '<bos>' / '<eos>'
# unchanged — confirm.
BOS_ID = vectorizer('<bos>').numpy()[0]
EOS_ID = vectorizer('<eos>').numpy()[0]
VOCAB  = vectorizer.get_vocabulary()          # list: index → token

# ─── Generation (greedy) ─────────────────────────────────────────────────
def generate(model, src_text: str, max_len: int = 50) -> str:
    """Greedily decode the reply that ``model`` produces for ``src_text``.

    Args:
        model: Callable taking ``[encoder_ids, decoder_ids]`` and returning
            logits of shape ``(1, t, vocab)``.
        src_text: The prompt to answer.
        max_len: Maximum number of tokens to generate.
            NOTE(review): presumably must not exceed the model's
            positional-encoding table — confirm.

    Returns:
        The generated tokens joined by single spaces, without the
        ``<bos>`` / ``<eos>`` markers.
    """
    enc_in = tf.cast(vectorizer([src_text]), tf.int32)        # (1, enc_len)
    # Pin the dtype: BOS_ID may be a NumPy integer, whose inferred tensor
    # dtype would not concatenate with the int32 argmax output below.
    dec_in = tf.constant([[int(BOS_ID)]], dtype=tf.int32)     # (1, 1)

    for _ in range(max_len):
        logits = model([enc_in, dec_in], training=False)      # (1, t, vocab)
        next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
        dec_in = tf.concat([dec_in, next_id[:, tf.newaxis]], axis=-1)

        if int(next_id[0]) == EOS_ID:
            break

    # ids → tokens, dropping <bos>/<eos>. Index the single batch row rather
    # than squeeze() so the ids stay 1-D even when nothing was generated.
    ids = dec_in.numpy()[0]
    tokens = [VOCAB[i] for i in ids if i not in (BOS_ID, EOS_ID)]
    return " ".join(tokens)

# ─── BLEU evaluation ─────────────────────────────────────────────────────
def bleu_score(model,
               pairs,
               max_len: int = 50) -> float:
    """
    Compute corpus-level BLEU (as a percentage) over (source, reference) pairs.

    Every source is decoded with ``generate``; hypotheses and references are
    both tokenized on whitespace.

    NOTE(review): references are split from the raw text while hypotheses
    come from the model vocabulary — confirm both use the same normalization.
    """
    scored = [
        (generate(model, src, max_len=max_len).split(), [ref.split()])
        for src, ref in pairs
    ]
    candidates = [hypothesis for hypothesis, _ in scored]
    # corpus_bleu expects a list of reference lists, one per sentence.
    references = [refs for _, refs in scored]

    return corpus_bleu(references, candidates) * 100

# ─── Example usage ───────────────────────────────────────────────────────
# Scores the model on every test pair; each pair triggers one greedy decode.
score = bleu_score(model, test_pairs, max_len=MAX_LEN)
print(f"BLEU en test: {score:.2f}")


In [None]:
# --- ids of the special tokens
# Converted to plain Python ints so tensors built from them get TensorFlow's
# default int32 dtype, matching the int32 ids produced during decoding.
BOS_ID = int(vectorizer('<bos>').numpy()[0])
EOS_ID = int(vectorizer('<eos>').numpy()[0])
# Padding is the empty-string token at index 0 of the vocabulary. '<pad>' is
# not a vocabulary entry, so looking it up through the vectorizer would not
# return the padding id.
PAD_ID = 0

VOCAB  = vectorizer.get_vocabulary()      # list: index → token

def generate_response(model,prompt: str,
                      max_len: int = MAX_LEN,
                      temperature: float = 0.0) -> str:
    """Generate a reply to ``prompt``, greedily or by temperature sampling.

    Args:
        model: Callable taking ``[encoder_ids, decoder_ids]`` and returning
            logits of shape ``(1, t, vocab)``.
        prompt: The user text to answer.
        max_len: Maximum number of tokens to generate.
        temperature: ``0.0`` selects the most likely token at every step;
            any other value divides the logits before sampling.

    Returns:
        The generated tokens joined by single spaces, with the begin, end
        and padding markers removed.
    """
    enc_input = tf.cast(vectorizer([prompt]), tf.int32)
    # Pin the dtype: BOS_ID may be a NumPy integer, whose inferred tensor
    # dtype would not concatenate with the int32 ids produced below.
    dec_input = tf.constant([[int(BOS_ID)]], dtype=tf.int32)   # (1, 1)

    for _ in range(max_len):
        logits = model([enc_input, dec_input], training=False)[:, -1, :]

        if temperature == 0.0:
            # argmax yields shape (1,); add an axis to get (1, 1)
            next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)[:, tf.newaxis]
        else:
            # tf.random.categorical already returns shape (1, 1)
            next_id = tf.random.categorical(logits / temperature,
                                            num_samples=1,
                                            dtype=tf.int32)

        dec_input = tf.concat([dec_input, next_id], axis=-1)

        if int(next_id[0, 0]) == EOS_ID:
            break

    # Index the single batch row rather than squeeze() so the ids stay 1-D
    # even when nothing was generated.
    ids = dec_input.numpy()[0]
    skipped = {int(BOS_ID), int(EOS_ID), int(PAD_ID)}
    tokens = [VOCAB[i] for i in ids if int(i) not in skipped]
    return " ".join(tokens).strip()

def chat(model, temperature: float = 0.0):
    """
    Run an interactive console loop.

    Each line typed by the user is answered with ``generate_response``.
    Typing 'salir', 'exit' or 'quit' (in any letter case) ends the loop.
    """
    exit_words = {"salir", "exit", "quit"}
    print("=== Chat Transformer (escribe 'salir' para terminar) ===")
    while True:
        user = input("Tú: ").strip()
        if user.lower() not in exit_words:
            print("Bot:", generate_response(model, user, temperature=temperature))
            continue
        print("Hasta luego 👋")
        return

In [None]:
# The model is the first positional argument; the prompt comes second.
generate_response(model, "hello")

'also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also also'

In [None]:
# Start the interactive console session with greedy decoding (default temperature).
chat(model)

=== Chat Transformer (escribe 'salir' para terminar) ===
Tú: hello
Bot: when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when
Tú: bye bye
Bot: when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when when
Tú: salir
Hasta luego 👋
