<a href="https://colab.research.google.com/github/SILVIAIRENE/Data-Scientist-Machine-Learning-Engineer-Introductory-Course/blob/master/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
from tensorflow import keras

# Fetch the English-French archive into the Keras cache. Extraction is done
# by hand below rather than through get_file(extract=True).
path_to_zip = keras.utils.get_file(
    "fra-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
)

# Unpack the archive next to the downloaded zip file.
data_dir = os.path.split(path_to_zip)[0]
with zipfile.ZipFile(path_to_zip) as archive:
    archive.extractall(data_dir)

# The sentence pairs live in fra.txt inside that directory.
file_path = os.path.join(data_dir, "fra.txt")

print("Ruta correcta:", file_path)

# Sanity check: read the whole file and split it on newlines.
with open(file_path, encoding="utf-8") as corpus:
    lines = corpus.read().split("\n")

print("✅ Dataset cargado, ejemplos:", len(lines))
print("Ejemplo:", lines[0])


Ruta correcta: /root/.keras/datasets/fra.txt
✅ Dataset cargado, ejemplos: 167131
Ejemplo: Go.	Va !


In [None]:
# =======================
# Ejemplo Keras Seq2Seq (traducción char-level)
# =======================
# Fuente: keras.io / ejemplos oficiales
# Nota: Este ejemplo entrena un modelo pequeño.
# No requiere dataset pesado.

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters
batch_size = 64
epochs = 10
latent_dim = 256
num_samples = 100  # use only the first 100 sentence pairs for a quick demo

# Example dataset: small English-French corpus.
# The archive is extracted explicitly instead of relying on
# get_file(extract=True): with extract=True recent Keras releases return the
# extraction directory rather than the archive path, so looking for fra.txt
# in its parent directory only worked when another cell had already unpacked
# the archive there.
import os
import zipfile

data_path = keras.utils.get_file(
    "fra-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
)
data_dir = os.path.dirname(data_path)
file_path = os.path.join(data_dir, "fra.txt")
if not os.path.exists(file_path):
    # First use on this machine: unpack next to the downloaded zip.
    with zipfile.ZipFile(data_path) as archive:
        archive.extractall(data_dir)

# Read the corpus and collect sentence pairs plus the character set of each side.
input_texts, target_texts = [], []
input_chars, target_chars = set(), set()

with open(file_path, encoding="utf-8") as corpus:
    lines = corpus.read().split("\n")

for raw_line in lines[:num_samples]:
    fields = raw_line.split("\t")
    if len(fields) < 2:
        # No tab on this line, so it does not hold a sentence pair.
        continue
    source_sentence = fields[0]
    # "\t" is prepended as the start marker and "\n" appended as the end marker.
    target_sentence = f"\t{fields[1]}\n"
    input_texts.append(source_sentence)
    target_texts.append(target_sentence)
    input_chars.update(source_sentence)
    target_chars.update(target_sentence)

# From here on the vocabularies are sorted lists (position == token index).
input_chars = sorted(input_chars)
target_chars = sorted(target_chars)
num_encoder_tokens, num_decoder_tokens = len(input_chars), len(target_chars)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

print("Ejemplos cargados:", len(input_texts))

# Character -> integer index lookup tables.
input_token_index = {symbol: idx for idx, symbol in enumerate(input_chars)}
target_token_index = {symbol: idx for idx, symbol in enumerate(target_chars)}

# One-hot tensors of shape (pairs, max sequence length, vocabulary size).
num_pairs = len(input_texts)
encoder_input_data = np.zeros(
    (num_pairs, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (num_pairs, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (num_pairs, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for row, (source_sentence, target_sentence) in enumerate(zip(input_texts, target_texts)):
    for step, symbol in enumerate(source_sentence):
        encoder_input_data[row, step, input_token_index[symbol]] = 1.0
    for step, symbol in enumerate(target_sentence):
        column = target_token_index[symbol]
        decoder_input_data[row, step, column] = 1.0
        # The target is the decoder input shifted one step earlier, so the
        # first character of the target sentence never appears in it.
        if step > 0:
            decoder_target_data[row, step - 1, column] = 1.0

# Model definition
# Encoder: an LSTM over the one-hot source sequence. return_state=True makes
# the layer return its final hidden and cell states in addition to its
# output; only the two states are used below.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: an LSTM started from the encoder states that returns one output
# per time step, followed by a softmax over the target character vocabulary.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
decoder_lstm = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder's own final states are discarded during training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> per-step character distribution.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# One-hot targets, hence categorical cross-entropy. 20% of the samples are
# held out for validation.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=epochs, validation_split=0.2)

print("✅ Modelo entrenado (demo).")


Ejemplos cargados: 100
Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 602ms/step - loss: 1.9168 - val_loss: 2.0971
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - loss: 1.8962 - val_loss: 2.0832
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - loss: 1.8633 - val_loss: 2.0615
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 1.8716 - val_loss: 1.9949
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - loss: 1.8018 - val_loss: 1.9171
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - loss: 1.7226 - val_loss: 1.8568
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - loss: 1.6743 - val_loss: 1.8174
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step - loss: 1.6322 - val_loss: 1.7909
Epoch 9/10
[1m2/2[0m [32m━━━━━

#  Sección 2: Traducción Automática (Seq2Seq con LSTM)

### Enunciado
Ejecutar y analizar el código de traducción automática carácter a carácter (`lstm_seq2seq.py` de Keras).


In [None]:
# =======================
# Descarga y extracción del dataset fra-eng
# =======================
import os, zipfile
from tensorflow import keras

# Download the archive (or reuse the copy already in the Keras cache).
path_to_zip = keras.utils.get_file(
    "fra-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
)

# Manual extraction into the directory that holds the zip file.
data_dir = os.path.dirname(path_to_zip)
archive = zipfile.ZipFile(path_to_zip, "r")
with archive:
    archive.extractall(data_dir)

file_path = os.path.join(data_dir, "fra.txt")

# Load the whole corpus; `lines` is consumed by the data-preparation cell.
with open(file_path, mode="r", encoding="utf-8") as corpus:
    raw_text = corpus.read()
lines = raw_text.split("\n")

print("✅ Dataset cargado. Total líneas:", len(lines))
print("Ejemplo:", lines[0])


✅ Dataset cargado. Total líneas: 167131
Ejemplo: Go.	Va !


In [None]:
# =======================
# Preparación de datos
# =======================
import numpy as np

num_samples = 10000  # number of leading corpus lines to consider

input_texts = []
target_texts = []
input_chars = set()
target_chars = set()

for pair_line in lines[:num_samples]:
    if "\t" in pair_line:
        source_sentence, target_sentence = pair_line.split("\t")[:2]
        # Wrap the target with "\t" (start marker) and "\n" (end marker).
        target_sentence = "".join(("\t", target_sentence, "\n"))
        input_texts.append(source_sentence)
        target_texts.append(target_sentence)
        input_chars |= set(source_sentence)
        target_chars |= set(target_sentence)

# Sorted vocabularies: a character's position is its token index.
input_chars = sorted(input_chars)
target_chars = sorted(target_chars)

num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

print("Tokens entrada:", num_encoder_tokens)
print("Tokens salida:", num_decoder_tokens)

# Character -> index lookup tables.
input_token_index = dict(zip(input_chars, range(num_encoder_tokens)))
target_token_index = dict(zip(target_chars, range(num_decoder_tokens)))

# One-hot tensors of shape (pairs, max sequence length, vocabulary size).
pair_count = len(input_texts)
encoder_input_data = np.zeros(
    (pair_count, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (pair_count, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (pair_count, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for sample, source_sentence in enumerate(input_texts):
    for position, symbol in enumerate(source_sentence):
        encoder_input_data[sample, position, input_token_index[symbol]] = 1.0

for sample, target_sentence in enumerate(target_texts):
    for position, symbol in enumerate(target_sentence):
        token = target_token_index[symbol]
        decoder_input_data[sample, position, token] = 1.0
        # Targets are the decoder inputs shifted one step earlier, so the
        # first character of each target sentence is skipped.
        if position > 0:
            decoder_target_data[sample, position - 1, token] = 1.0


Tokens entrada: 70
Tokens salida: 93


In [None]:
# =======================
# Definición y entrenamiento del modelo Seq2Seq
# =======================
from tensorflow.keras import layers, Model, Input

# Size of the LSTM state shared by encoder and decoder.
latent_dim = 256

# Encoder: LSTM over the one-hot source sequence. Its per-step output is
# discarded; only the final hidden and cell states are kept.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = layers.LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: LSTM started from the encoder states, returning one output per
# time step, followed by a softmax over the target character vocabulary.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder's own final states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> per-step character distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

# Train against the one-hot targets; 20% of the samples are held out for
# validation. The returned History object is kept in `history`.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.2
)

print("✅ Modelo entrenado.")


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 396ms/step - loss: 1.1979 - val_loss: 1.1844
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 389ms/step - loss: 1.0383 - val_loss: 1.1603
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 419ms/step - loss: 1.0212 - val_loss: 1.1493
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 394ms/step - loss: 1.0010 - val_loss: 1.1611
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 403ms/step - loss: 0.9811 - val_loss: 1.1167
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 497ms/step - loss: 0.9721 - val_loss: 1.1048
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 438ms/step - loss: 0.9614 - val_loss: 1.0853
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 380ms/step - loss: 0.9489 - val_loss: 1.0837
Epoch 9/10
[1m1

### Teoría
- **Tokenización carácter a carácter** → cada carácter es un token.
- Pros: vocabulario pequeño.
- Contras: secuencias más largas.
- Alternativa moderna: subwords (SentencePiece, BPE).


#  Sección 3: Subtitulación de Imágenes (PyTorch)

### Enunciado
Ejecutar el modelo de subtitulado de imágenes preentrenado (tutorial de Yunjey).


In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import json

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL_PATH = "models/BEST_checkpoint.pth.tar"   # upload to Colab
WORDMAP_PATH = "data/WORDMAP_coco.json"         # upload to Colab
IMG_PATH = "/content/sample.jpg"                # upload your own image

# The checkpoint stores whole module objects (they are used directly via
# .to()/.eval() below), not plain state dicts. Recent PyTorch releases default
# torch.load to weights_only=True, which refuses to unpickle such objects, so
# full unpickling is requested explicitly.
# SECURITY: full unpickling can execute arbitrary code. Only load checkpoints
# obtained from a source you trust.
# NOTE(review): unpickling presumably also needs the original model classes to
# be importable in this kernel - confirm against the checkpoint's source repo.
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
decoder = checkpoint['decoder'].to(DEVICE).eval()
encoder = checkpoint['encoder'].to(DEVICE).eval()

# word -> id vocabulary and its inverse (id -> word) for decoding captions.
with open(WORDMAP_PATH, 'r', encoding='utf-8') as wordmap_file:
    word_map = json.load(wordmap_file)
rev_word_map = {v: k for k, v in word_map.items()}

# Preprocessing: resize to 256x256, centre-crop to 224x224, convert to a
# tensor and normalise each channel with the mean/std commonly used for
# ImageNet-pretrained torchvision models.
transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

def caption_image(image_path):
    """Generate a caption for the image stored at ``image_path``.

    The image is loaded as RGB, preprocessed with the module-level
    ``transform``, encoded with ``encoder`` and decoded with
    ``decoder.sample``. Tokens equal to the ``<start>`` / ``<end>`` ids are
    dropped and the remaining ids are mapped to words through
    ``rev_word_map``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The caption as one space-separated string.
    """
    # The context manager closes the underlying file once the pixels have
    # been converted into an independent RGB image.
    with Image.open(image_path) as img:
        rgb_image = img.convert('RGB')
    image = transform(rgb_image).unsqueeze(0).to(DEVICE)

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        encoder_out = encoder(image)
        # NOTE(review): assumes the unpickled decoder exposes
        # sample(encoder_out) returning (token ids, extra) and that the ids
        # are plain ints usable as rev_word_map keys - confirm against the
        # checkpoint's decoder class.
        seq, _ = decoder.sample(encoder_out)

    # Build the set of special ids once instead of once per token.
    special_ids = {word_map['<start>'], word_map['<end>']}
    words = [rev_word_map[i] for i in seq if i not in special_ids]
    return " ".join(words)

# Caption the image configured in IMG_PATH and print the resulting sentence.
print("Caption generado:", caption_image(IMG_PATH))


#  Sección 5: Reescritura del modelo de subtitulado en Keras


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.applications import ResNet50

# Default width of the encoder's image projection and of the token embeddings.
EMBED_DIM = 256
# Default number of units of the decoder LSTM.
LSTM_UNITS = 512
# Default vocabulary size (embedding input size and softmax width).
VOCAB_SIZE = 5000

def build_encoder(output_dim=EMBED_DIM):
    """Build the image encoder.

    A ResNet50 with ImageNet weights (no classification head, global average
    pooling) is frozen and followed by a ReLU ``Dense`` projection.

    Args:
        output_dim: Width of the projected image feature vector.

    Returns:
        A Keras ``Model`` named ``"encoder"`` mapping a 224x224x3 image to a
        vector of size ``output_dim``.
    """
    backbone = ResNet50(weights="imagenet", include_top=False, pooling="avg")
    backbone.trainable = False  # keep the pretrained weights fixed

    image_input = Input(shape=(224, 224, 3))
    pooled_features = backbone(image_input)
    projection = layers.Dense(output_dim, activation="relu")(pooled_features)
    return Model(inputs=image_input, outputs=projection, name="encoder")

def build_decoder(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM, lstm_units=LSTM_UNITS):
    """Build the caption decoder.

    Token ids are embedded (id 0 is masked), fed to an LSTM whose initial
    hidden and cell states are each a tanh ``Dense`` projection of the image
    feature vector, and every time step is mapped to a softmax over the
    vocabulary.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        embed_dim: Size of the token embeddings and of the image feature input.
        lstm_units: Number of LSTM units.

    Returns:
        A Keras ``Model`` named ``"decoder"`` with inputs
        ``[token ids, image features]`` and per-step token probabilities as
        output.
    """
    token_ids = Input(shape=(None,))
    image_features = Input(shape=(embed_dim,))

    embedded_tokens = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(token_ids)

    # The image conditions the decoder only through its initial LSTM state.
    initial_h = layers.Dense(lstm_units, activation="tanh")(image_features)
    initial_c = layers.Dense(lstm_units, activation="tanh")(image_features)

    recurrent = layers.LSTM(lstm_units, return_sequences=True)
    hidden_sequence = recurrent(embedded_tokens, initial_state=[initial_h, initial_c])

    classifier = layers.TimeDistributed(layers.Dense(vocab_size, activation="softmax"))
    token_probs = classifier(hidden_sequence)
    return Model(inputs=[token_ids, image_features], outputs=token_probs, name="decoder")

def build_model():
    """Assemble and compile the end-to-end captioning model.

    The encoder's image features are fed to the decoder together with the
    decoder's own token-id input.

    Returns:
        A compiled Keras ``Model`` with inputs ``[image, token ids]`` that
        outputs per-step token probabilities (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    image_encoder = build_encoder()
    caption_decoder = build_decoder()

    image_input = image_encoder.input
    token_input = caption_decoder.input[0]  # first decoder input: the token ids

    image_features = image_encoder(image_input)
    predictions = caption_decoder([token_input, image_features])

    captioner = Model(inputs=[image_input, token_input], outputs=predictions)
    captioner.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return captioner

# Build the compiled captioning model and print its layer summary.
# Note: this rebinds the global name `model`.
model = build_model()
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
#  Sección 6: Investigaciones avanzadas

### Respuestas
# 1. **Traducción japonés ↔ inglés**
#    - Dataset paralelo (JParaCrawl).
#    - Tokenización subwords (SentencePiece).
#    - Modelos preentrenados: mBART, mT5.

# 2. **Métodos avanzados de NMT**
#    - Atención (Bahdanau, Luong).
#    - Transformers (Transformer base, BERT, GPT, mT5).

# 3. **Texto → Imagen**
#    - GANs condicionados (StackGAN).
#    - Modelos de difusión (Stable Diffusion, Imagen, DALL·E 2).
#    - Usan un encoder de texto + generador en espacio latente.
