# Text Generation

### José Pablo Kiesling Lange - 21581

## Imports

In [1]:
import numpy as np
from datasets import load_dataset
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TheKi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Dataset Loading

In [3]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [4]:
dataset_train = dataset["train"]["text"]
dataset_test = dataset["test"]["text"]
dataset_validation = dataset["validation"]["text"]

## Text Normalization

In [5]:
def remove_non_alphabetic_chars(text):
    return ''.join(char for char in text if char.isalpha() or char.isspace())

def filter_ascii_words(text):
    words = text.split()
    ascii_words = [word for word in words if all(ord(char) < 128 for char in word)]
    return ' '.join(ascii_words)

def normalize_whitespace(text):
    return ' '.join(text.split())

def convert_to_lowercase(text):
    return text.lower()

def normalize_text(text):
    text = remove_non_alphabetic_chars(text)
    text = filter_ascii_words(text)
    text = normalize_whitespace(text)
    text = convert_to_lowercase(text)
    return text

In [6]:
def remove_empty_strings(text_list):
    return [text for text in text_list if text.strip() != '']

def add_special_tokens(text_list):
    return ['<sos> ' + text + ' <eos>' for text in text_list]

def create_token_sequences(text_list):
    return [text.split() for text in text_list]

In [7]:
def preprocess_dataset(raw_texts):
    normalized_texts = [normalize_text(text) for text in raw_texts]
    filtered_texts = remove_empty_strings(normalized_texts)
    texts_with_tokens = add_special_tokens(filtered_texts)
    return texts_with_tokens

In [8]:
dataset_train = preprocess_dataset(dataset_train)
dataset_test = preprocess_dataset(dataset_test)
dataset_validation = preprocess_dataset(dataset_validation)

In [9]:
sequences_train = create_token_sequences(dataset_train)
sequences_test = create_token_sequences(dataset_test)
sequences_validation = create_token_sequences(dataset_validation)

In [10]:
print(f"Train samples: {len(sequences_train)}")
print(f"Test samples: {len(sequences_test)}")
print(f"Validation samples: {len(sequences_validation)}")
print(f"Sample sequence: {sequences_train[0][:10]}")

Train samples: 23686
Test samples: 2889
Validation samples: 2454
Sample sequence: ['<sos>', 'valkyria', 'chronicles', 'iii', '<eos>']


## Ejercicio 1: Red Feedforward Neural Network

### Vocabulary Construction

In [11]:
SPECIALS = ["<pad>", "<unk>", "<sos>", "<eos>"]
CONTEXT_WINDOW = 5

In [12]:
def build_frequency_distribution(sequences):
    return FreqDist(token for sequence in sequences for token in sequence)

In [13]:
def create_vocabulary(freq_dist, special_tokens):
    vocab_tokens = [token for token, _ in freq_dist.most_common() if token not in special_tokens]
    return special_tokens + vocab_tokens

In [14]:
def create_token_mappings(vocabulary):
    index_to_token = vocabulary
    token_to_index = {token: idx for idx, token in enumerate(vocabulary)}
    return index_to_token, token_to_index

In [15]:
freq_dist = build_frequency_distribution(sequences_train)
vocabulary = create_vocabulary(freq_dist, SPECIALS)
itos, stoi = create_token_mappings(vocabulary)

print(f"Vocabulary size: {len(stoi)}")
print(f"Most common tokens: {list(stoi.keys())[:20]}")

Vocabulary size: 61031
Most common tokens: ['<pad>', '<unk>', '<sos>', '<eos>', 'the', 'of', 'and', 'in', 'to', 'a', 'was', 's', 'on', 'as', 'that', 'for', 'with', 'by', 'is', 'it']


### Token Encoding Functions

In [16]:
def token_to_id(token):
    return stoi.get(token, stoi["<unk>"])

def id_to_token(token_id):
    if 0 <= token_id < len(itos):
        return itos[token_id]
    return "<unk>" 

### a) Fixed Window Representation

In [17]:
def extract_ngrams(sequence, n):
    return list(ngrams(sequence, n))

In [18]:
def split_context_and_target(ngram):
    context = ngram[:-1]
    target = ngram[-1]
    return context, target

In [19]:
def encode_tokens(tokens):
    return [token_to_id(token) for token in tokens]

### GPU Configuration


In [20]:
def check_gpu_availability():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        print(f"GPUs disponibles: {len(gpus)}")
        for gpu in gpus:
            print(f"  - {gpu}")
        return True
    else:
        print("No se detectaron GPUs. Usando CPU.")
        return False

def configure_gpu_memory():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("Configuración de GPU exitosa: memory growth habilitado")
        except RuntimeError as e:
            print(f"Error configurando GPU: {e}")

check_gpu_availability()
configure_gpu_memory()

GPUs disponibles: 1
  - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Configuración de GPU exitosa: memory growth habilitado


In [21]:
def build_training_data(sequences, context_size):
    contexts = []
    targets = []
    
    for sequence in sequences:
        sequence_ngrams = extract_ngrams(sequence, context_size + 1)
        
        for ngram in sequence_ngrams:
            context, target = split_context_and_target(ngram)
            encoded_context = encode_tokens(context)
            encoded_target = token_to_id(target)
            
            contexts.append(encoded_context)
            targets.append(encoded_target)
    
    return np.array(contexts, dtype=np.int32), np.array(targets, dtype=np.int32)

In [22]:
X_train, y_train = build_training_data(sequences_train, CONTEXT_WINDOW)
X_val, y_val = build_training_data(sequences_validation, CONTEXT_WINDOW)
X_test, y_test = build_training_data(sequences_test, CONTEXT_WINDOW)

print(f"Vocabulary size: {len(stoi)}")
print(f"Train: X={X_train.shape}, y={y_train.shape}")
print(f"Val:   X={X_val.shape},   y={y_val.shape}")
print(f"Test:  X={X_test.shape},  y={y_test.shape}")

Vocabulary size: 61031
Train: X=(1621112, 5), y=(1621112,)
Val:   X=(169743, 5),   y=(169743,)
Test:  X=(190380, 5),  y=(190380,)


In [23]:
def evaluate_model_fast(model, X_test, y_test, batch_size=1024, sample_size=None):
    """
    Evalúa el modelo con optimizaciones para mejorar la velocidad.
    
    Args:
        model: Modelo entrenado
        X_test: Datos de entrada de prueba
        y_test: Etiquetas de prueba
        batch_size: Tamaño del lote para evaluación (por defecto 1024)
        sample_size: Si se especifica, evalúa solo una muestra aleatoria de este tamaño
    """
    if sample_size and sample_size < len(X_test):
        print(f"Evaluando en una muestra de {sample_size} ejemplos de {len(X_test)} total...")
        indices = np.random.choice(len(X_test), size=sample_size, replace=False)
        X_sample = X_test[indices]
        y_sample = y_test[indices]
    else:
        X_sample = X_test
        y_sample = y_test
    
    print(f"Iniciando evaluación con batch_size={batch_size}...")
    start_time = time.time()
    
    test_loss, test_accuracy = model.evaluate(
        X_sample, y_sample, 
        batch_size=batch_size, 
        verbose=1
    )
    
    eval_time = time.time() - start_time
    print(f"Evaluación completada en {eval_time:.2f} segundos")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    return test_loss, test_accuracy


### Conversión de datos para GPU


In [24]:
X_train = X_train.astype(np.int32)
y_train = y_train.astype(np.int32)
X_val = X_val.astype(np.int32)
y_val = y_val.astype(np.int32)
X_test = X_test.astype(np.int32)
y_test = y_test.astype(np.int32)

### b) Feedforward Neural Network Model

In [25]:
def create_embedding_layer(vocab_size, embedding_dim):
    return layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=False
    )

In [26]:
def create_hidden_layer(units, activation='relu'):
    return layers.Dense(units, activation=activation)

In [27]:
def create_output_layer(vocab_size):
    return layers.Dense(vocab_size, activation='softmax')

In [28]:
def build_feedforward_model(vocab_size, context_size, embedding_dim=128, hidden_units=256):
    model = keras.Sequential([
        layers.Input(shape=(context_size,)),
        create_embedding_layer(vocab_size, embedding_dim),
        layers.Flatten(),
        create_hidden_layer(hidden_units),
        layers.Dropout(0.3),
        create_hidden_layer(hidden_units // 2),
        layers.Dropout(0.3),
        create_output_layer(vocab_size)
    ])
    
    return model

In [29]:
EMBEDDING_DIM = 128
HIDDEN_UNITS = 256

ffnn_model = build_feedforward_model(
    vocab_size=len(stoi),
    context_size=CONTEXT_WINDOW,
    embedding_dim=EMBEDDING_DIM,
    hidden_units=HIDDEN_UNITS
)

ffnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 128)            7811968   
                                                                 
 flatten (Flatten)           (None, 640)               0         
                                                                 
 dense (Dense)               (None, 256)               164096    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 61031)             7

### Model Training

In [30]:
def compile_model(model, learning_rate=0.001):
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [31]:
def create_training_callbacks():
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )
    
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6
    )
    
    return [early_stopping, reduce_lr]

In [32]:
import os

MODEL_PATH = 'models/ffnn.keras'

if os.path.exists(MODEL_PATH):
    print(f"✓ Modelo encontrado en '{MODEL_PATH}'")
    print("Cargando modelo entrenado...")
    ffnn_model = keras.models.load_model(MODEL_PATH)
    print("✓ Modelo cargado exitosamente")
    training_time = 0
else:
    print(f"✗ No se encontró el modelo en '{MODEL_PATH}'")
    print("Entrenando nuevo modelo...\n")
    
    os.makedirs('models', exist_ok=True)
    
    ffnn_model = compile_model(ffnn_model, learning_rate=0.001)
    callbacks = create_training_callbacks()
    
    start_time = time.time()
    
    history = ffnn_model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=512,
        callbacks=callbacks,
        verbose=1
    )
    
    training_time = time.time() - start_time
    print(f"\n✓ Entrenamiento completado")
    print(f"Tiempo de entrenamiento: {training_time:.2f} segundos ({training_time/60:.2f} minutos)")
    
    ffnn_model.save(MODEL_PATH)
    print(f"✓ Modelo guardado en '{MODEL_PATH}'")

✓ Modelo encontrado en 'models/ffnn.keras'
Cargando modelo entrenado...
✓ Modelo cargado exitosamente


### Model Evaluation

In [33]:
print("=== EVALUACIÓN OPTIMIZADA ===")
test_loss, test_accuracy = evaluate_model_fast(ffnn_model, X_test, y_test, sample_size=10000)


=== EVALUACIÓN OPTIMIZADA ===
Evaluando en una muestra de 10000 ejemplos de 190380 total...
Iniciando evaluación con batch_size=1024...
Evaluación completada en 4.85 segundos
Test Loss: 6.8549
Test Accuracy: 0.1552


In [34]:
def calculate_perplexity(loss):
    return np.exp(loss)

perplexity = calculate_perplexity(test_loss)
print(f"Test Perplexity: {perplexity:.2f}")

Test Perplexity: 948.51


### c) Sequential Text Generation

In [35]:
def prepare_context(tokens, context_size):
    if len(tokens) < context_size:
        padding = ['<pad>'] * (context_size - len(tokens))
        tokens = padding + tokens
    else:
        tokens = tokens[-context_size:]
    
    return np.array([encode_tokens(tokens)])

In [36]:
def predict_next_token(model, context, temperature=1.0):
    predictions = model.predict(context, verbose=0)[0]
    predictions = np.log(predictions + 1e-10) / temperature
    predictions = np.exp(predictions)
    predictions = predictions / np.sum(predictions)
    
    return np.random.choice(len(predictions), p=predictions)

In [37]:
def generate_text(model, seed_text, max_length=50, context_size=5, temperature=1.0):
    tokens = seed_text.lower().split()
    generated_tokens = tokens.copy()
    
    for _ in range(max_length):
        context = prepare_context(tokens, context_size)
        next_token_id = predict_next_token(model, context, temperature)
        next_token = id_to_token(next_token_id)
        
        if next_token == '<eos>':
            break
        
        if next_token not in ['<pad>', '<unk>', '<sos>']:
            generated_tokens.append(next_token)
        
        tokens.append(next_token)
    
    return ' '.join(generated_tokens)

In [38]:
def test_text_generation(model, seed_texts, temperatures=[0.5, 1.0, 1.5]):
    for seed in seed_texts:
        print(f"\n{'='*80}")
        print(f"Seed: '{seed}'")
        print(f"{'='*80}")
        
        for temp in temperatures:
            generated = generate_text(
                model, 
                seed, 
                max_length=30, 
                context_size=CONTEXT_WINDOW,
                temperature=temp
            )
            print(f"\nTemperature {temp}:")
            print(generated)

In [39]:
seed_texts = [
    "the president of the",
    "in the year",
    "the first time",
    "he was born in"
]

test_text_generation(ffnn_model, seed_texts, temperatures=[0.7, 1.0, 1.3])


Seed: 'the president of the'



Temperature 0.7:
the president of the pdr and the water of the film s artistic underground systems and the time of the former in the peach state and the great led a variety of short network

Temperature 1.0:
the president of the recreational allen insists even under the party colleague macleod permeated dylan from one dam many guitar hero its letters was refused which knew they were services to then found based

Temperature 1.3:
the president of the mythical technology arched solar crake will address than his courtship sockeye base ignored on wisconsin and afterwards massachusetts fantasy from radar songs were dominant out of pusan ankle and ceased

Seed: 'in the year'

Temperature 0.7:
in the year after the fifa ny on the initial four nations had a m been a level of french combination and mini songs of on the school detachment of several years there

Temperature 1.0:
in the year after their bootleg was not larry pike s puma elements of the alternative journal inside a and hartford ga

### Results Summary - FFNN

In [40]:
print("\n" + "="*80)
print("FEEDFORWARD NEURAL NETWORK - SUMMARY")
print("="*80)
print(f"Architecture:")
print(f"  - Context Window: {CONTEXT_WINDOW} tokens")
print(f"  - Embedding Dimension: {EMBEDDING_DIM}")
print(f"  - Hidden Units: {HIDDEN_UNITS}")
print(f"  - Vocabulary Size: {len(stoi)}")
print(f"\nPerformance:")
print(f"  - Test Accuracy: {test_accuracy:.4f}")
print(f"  - Test Loss: {test_loss:.4f}")
print(f"  - Test Perplexity: {perplexity:.2f}")
print(f"\nTraining:")
print(f"  - Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print(f"  - Training Samples: {len(X_train)}")
print("="*80)


FEEDFORWARD NEURAL NETWORK - SUMMARY
Architecture:
  - Context Window: 5 tokens
  - Embedding Dimension: 128
  - Hidden Units: 256
  - Vocabulary Size: 61031

Performance:
  - Test Accuracy: 0.1552
  - Test Loss: 6.8549
  - Test Perplexity: 948.51

Training:
  - Training Time: 0.00 seconds (0.00 minutes)
  - Training Samples: 1621112


## Ejercicio 2: RNN


### RNN Sequences


In [41]:
def build_rnn_sequences(sequences, max_length=None):
    input_seqs = []
    target_seqs = []
    
    for sequence in sequences:
        if len(sequence) < 2:
            continue
            
        if max_length and len(sequence) > max_length:
            for i in range(0, len(sequence) - max_length + 1, max_length // 2):
                chunk = sequence[i:i + max_length + 1]
                if len(chunk) >= 2:
                    input_seqs.append(chunk[:-1])
                    target_seqs.append(chunk[1:])
        else:
            input_seqs.append(sequence[:-1])
            target_seqs.append(sequence[1:])
    
    return input_seqs, target_seqs


In [42]:
def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) >= max_length:
            padded.append(seq[:max_length])
        else:
            padding = [pad_token_id] * (max_length - len(seq))
            padded.append(seq + padding)
    return np.array(padded, dtype=np.int32)


In [43]:
def prepare_rnn_data(sequences, max_length=50):
    input_seqs, target_seqs = build_rnn_sequences(sequences, max_length=max_length)
    
    input_ids = [[token_to_id(token) for token in seq] for seq in input_seqs]
    target_ids = [[token_to_id(token) for token in seq] for seq in target_seqs]
    
    pad_token_id = stoi['<pad>']
    X = pad_sequences(input_ids, max_length, pad_token_id)
    y = pad_sequences(target_ids, max_length, pad_token_id)
    
    return X, y


In [44]:
RNN_MAX_LENGTH = 30

X_rnn_train, y_rnn_train = prepare_rnn_data(sequences_train, RNN_MAX_LENGTH)
X_rnn_val, y_rnn_val = prepare_rnn_data(sequences_validation, RNN_MAX_LENGTH)
X_rnn_test, y_rnn_test = prepare_rnn_data(sequences_test, RNN_MAX_LENGTH)


### SimpleRNN Model


In [45]:
def build_simple_rnn_model(vocab_size, embedding_dim=64, rnn_units=128, max_length=30):
    model = keras.Sequential([
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_length,
            mask_zero=True
        ),
        layers.SimpleRNN(
            units=rnn_units,
            return_sequences=True,
            dropout=0.2
        ),
        layers.Dropout(0.3),
        layers.TimeDistributed(
            layers.Dense(vocab_size, activation='softmax')
        )
    ])
    
    return model


In [46]:
rnn_model = build_simple_rnn_model(
    vocab_size=len(stoi),
    embedding_dim=64,
    rnn_units=128,
    max_length=RNN_MAX_LENGTH
)

rnn_model = compile_model(rnn_model, learning_rate=0.001)
rnn_model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 64)            3905984   
                                                                 
 simple_rnn (SimpleRNN)      (None, 30, 128)           24704     
                                                                 
 dropout_2 (Dropout)         (None, 30, 128)           0         
                                                                 
 time_distributed (TimeDistr  (None, 30, 61031)        7872999   
 ibuted)                                                         
                                                                 
Total params: 11,803,687
Trainable params: 11,803,687
Non-trainable params: 0
_________________________________________________________________


In [47]:
tf.keras.backend.clear_session()


### RNN Training


In [48]:
MODEL_PATH_RNN = 'models/rnn.keras'

if os.path.exists(MODEL_PATH_RNN):
    print("Cargando modelo RNN...")
    rnn_model = keras.models.load_model(MODEL_PATH_RNN)
    rnn_training_time = 0
else:
    print("Entrenando modelo RNN...")
    
    callbacks = create_training_callbacks()
    
    start_time = time.time()
    
    history_rnn = rnn_model.fit(
        X_rnn_train, y_rnn_train,
        validation_data=(X_rnn_val, y_rnn_val),
        epochs=15,
        batch_size=32,
        callbacks=callbacks,
        verbose=1
    )
    
    rnn_training_time = time.time() - start_time
    print(f"Entrenamiento completado en {rnn_training_time/60:.2f} minutos")
    
    rnn_model.save(MODEL_PATH_RNN)


Cargando modelo RNN...


### RNN Evaluation


In [49]:
rnn_test_loss, rnn_test_accuracy = evaluate_model_fast(rnn_model, X_rnn_test, y_rnn_test, sample_size=5000, batch_size=32)
rnn_perplexity = calculate_perplexity(rnn_test_loss)


Evaluando en una muestra de 5000 ejemplos de 11696 total...
Iniciando evaluación con batch_size=32...
Evaluación completada en 11.61 segundos
Test Loss: 5.9684
Test Accuracy: 0.1689


### RNN Text Generation


In [50]:
def prepare_rnn_context(tokens, max_length):
    token_ids = [token_to_id(token) for token in tokens]
    
    if len(token_ids) >= max_length:
        context = token_ids[-max_length:]
    else:
        padding = [stoi['<pad>']] * (max_length - len(token_ids))
        context = token_ids + padding
    
    return np.array([context], dtype=np.int32)


In [51]:
def predict_next_token_rnn(model, context, temperature=1.0):
    predictions = model.predict(context, verbose=0)[0]
    last_prediction = predictions[-1]
    
    predictions = np.log(last_prediction + 1e-10) / temperature
    predictions = np.exp(predictions)
    predictions = predictions / np.sum(predictions)
    
    return np.random.choice(len(predictions), p=predictions)


In [52]:
def generate_text_rnn(model, seed_text, max_length=30, temperature=1.0, max_generated=30):
    tokens = seed_text.lower().split()
    generated_tokens = tokens.copy()
    
    for _ in range(max_generated):
        context = prepare_rnn_context(tokens, max_length)
        next_token_id = predict_next_token_rnn(model, context, temperature)
        next_token = id_to_token(next_token_id)
        
        if next_token == '<eos>':
            break
        
        if next_token not in ['<pad>', '<unk>', '<sos>']:
            generated_tokens.append(next_token)
        
        tokens.append(next_token)
    
    return ' '.join(generated_tokens)


In [53]:
def test_rnn_text_generation(model, seed_texts, temperatures=[0.7, 1.0, 1.3]):
    for seed in seed_texts:
        print(f"\n{'='*80}")
        print(f"RNN - Seed: '{seed}'")
        print(f"{'='*80}")
        
        for temp in temperatures:
            generated = generate_text_rnn(
                model, 
                seed, 
                max_length=RNN_MAX_LENGTH,
                temperature=temp,
                max_generated=20
            )
            print(f"\nTemperature {temp}:")
            print(generated)


In [54]:
test_rnn_text_generation(rnn_model, seed_texts, temperatures=[0.7, 1.0, 1.3])



RNN - Seed: 'the president of the'

Temperature 0.7:
the president of the assembly of the democratic government was ill to be seen on the same day the film s first time in

Temperature 1.0:
the president of the union s side of the governor pope was a jewish court by prime minister heritage site the town to society

Temperature 1.3:
the president of the heretics certainly already been later prime sessions is to give these financial slump judged out of us anonymous minority dedicated

RNN - Seed: 'in the year'

Temperature 0.7:
in the year he began to gather the executive and director of the guardian s special simulations version of the song was published

Temperature 1.0:
in the year are located on bishop s complex as vice bishop of ajuran foliot signed by the british empire tudor war jin

Temperature 1.3:
in the year of french forge according to postpone a poetry on and period gordon was william aided by this with hindu ragnar

RNN - Seed: 'the first time'

Temperature 0.7:
the first 

### Comparison FFNN vs RNN


In [55]:
print("\n" + "="*80)
print("COMPARISON: FFNN vs RNN")
print("="*80)
print(f"{'Metric':<20} {'FFNN':<12} {'RNN':<12} {'Difference':<12}")
print("-" * 60)

acc_diff = rnn_test_accuracy - test_accuracy
print(f"{'Test Accuracy':<20} {test_accuracy:<12.4f} {rnn_test_accuracy:<12.4f} {acc_diff:+.4f}")

loss_diff = rnn_test_loss - test_loss
print(f"{'Test Loss':<20} {test_loss:<12.4f} {rnn_test_loss:<12.4f} {loss_diff:+.4f}")

perp_diff = rnn_perplexity - perplexity
print(f"{'Test Perplexity':<20} {perplexity:<12.2f} {rnn_perplexity:<12.2f} {perp_diff:+.2f}")

time_diff = rnn_training_time - training_time
print(f"{'Training Time (min)':<20} {training_time/60:<12.2f} {rnn_training_time/60:<12.2f} {time_diff/60:+.2f}")
print("="*80)



COMPARISON: FFNN vs RNN
Metric               FFNN         RNN          Difference  
------------------------------------------------------------
Test Accuracy        0.1552       0.1689       +0.0137
Test Loss            6.8549       5.9684       -0.8865
Test Perplexity      948.51       390.88       -557.62
Training Time (min)  0.00         0.00         +0.00


In [56]:
def compare_text_generation(ffnn_model, rnn_model, seed_texts, temperatures=[0.7, 1.0, 1.3]):
    for seed in seed_texts:
        print(f"\n{'='*100}")
        print(f"COMPARISON - Seed: '{seed}'")
        print(f"{'='*100}")
        
        for temp in temperatures:
            print(f"\n--- Temperature {temp} ---")
            
            ffnn_text = generate_text(
                ffnn_model, 
                seed, 
                max_length=30, 
                context_size=CONTEXT_WINDOW,
                temperature=temp
            )
            
            rnn_text = generate_text_rnn(
                rnn_model, 
                seed, 
                max_length=RNN_MAX_LENGTH,
                temperature=temp,
                max_generated=20
            )
            
            print(f"FFNN: {ffnn_text}")
            print(f"RNN:  {rnn_text}")
            print()


## Ejercicio 3: LSTM


### LSTM Model


In [57]:
def build_lstm_model(vocab_size, embedding_dim=64, lstm_units=128, max_length=30):
    model = keras.Sequential([
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_length,
            mask_zero=True
        ),
        layers.LSTM(
            units=lstm_units,
            return_sequences=True,
            dropout=0.2,
            recurrent_dropout=0.2
        ),
        layers.Dropout(0.3),
        layers.TimeDistributed(
            layers.Dense(vocab_size, activation='softmax')
        )
    ])
    
    return model


In [58]:
tf.keras.backend.clear_session()

lstm_model = build_lstm_model(
    vocab_size=len(stoi),
    embedding_dim=64,
    lstm_units=128,
    max_length=RNN_MAX_LENGTH
)

lstm_model = compile_model(lstm_model, learning_rate=0.001)
lstm_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 64)            3905984   
                                                                 
 lstm (LSTM)                 (None, 30, 128)           98816     
                                                                 
 dropout (Dropout)           (None, 30, 128)           0         
                                                                 
 time_distributed (TimeDistr  (None, 30, 61031)        7872999   
 ibuted)                                                         
                                                                 
Total params: 11,877,799
Trainable params: 11,877,799
Non-trainable params: 0
_________________________________________________________________


### LSTM Training


In [59]:
MODEL_PATH_LSTM = 'models/lstm.keras'

if os.path.exists(MODEL_PATH_LSTM):
    print("Cargando modelo LSTM...")
    lstm_model = keras.models.load_model(MODEL_PATH_LSTM)
    lstm_training_time = 0
else:
    print("Entrenando modelo LSTM...")
    
    callbacks = create_training_callbacks()
    
    start_time = time.time()
    
    history_lstm = lstm_model.fit(
        X_rnn_train, y_rnn_train,
        validation_data=(X_rnn_val, y_rnn_val),
        epochs=15,
        batch_size=32,
        callbacks=callbacks,
        verbose=1
    )
    
    lstm_training_time = time.time() - start_time
    print(f"Entrenamiento completado en {lstm_training_time/60:.2f} minutos")
    
    lstm_model.save(MODEL_PATH_LSTM)


Entrenando modelo LSTM...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Entrenamiento completado en 304.34 minutos


### LSTM Evaluation


In [60]:
lstm_test_loss, lstm_test_accuracy = evaluate_model_fast(lstm_model, X_rnn_test, y_rnn_test, sample_size=5000, batch_size=32)
lstm_perplexity = calculate_perplexity(lstm_test_loss)


Evaluando en una muestra de 5000 ejemplos de 11696 total...
Iniciando evaluación con batch_size=32...
Evaluación completada en 12.67 segundos
Test Loss: 5.9720
Test Accuracy: 0.1728


### LSTM Text Generation


In [61]:
def predict_next_token_lstm(model, context, temperature=1.0):
    predictions = model.predict(context, verbose=0)[0]
    last_prediction = predictions[-1]
    
    predictions = np.log(last_prediction + 1e-10) / temperature
    predictions = np.exp(predictions)
    predictions = predictions / np.sum(predictions)
    
    return np.random.choice(len(predictions), p=predictions)


In [62]:
def generate_text_lstm(model, seed_text, max_length=30, temperature=1.0, max_generated=20):
    tokens = seed_text.lower().split()
    generated_tokens = tokens.copy()
    
    for _ in range(max_generated):
        context = prepare_rnn_context(tokens, max_length)
        next_token_id = predict_next_token_lstm(model, context, temperature)
        next_token = id_to_token(next_token_id)
        
        if next_token == '<eos>':
            break
        
        if next_token not in ['<pad>', '<unk>', '<sos>']:
            generated_tokens.append(next_token)
        
        tokens.append(next_token)
    
    return ' '.join(generated_tokens)


In [63]:
def test_lstm_text_generation(model, seed_texts, temperatures=[0.7, 1.0, 1.3]):
    for seed in seed_texts:
        print(f"\n{'='*80}")
        print(f"LSTM - Seed: '{seed}'")
        print(f"{'='*80}")
        
        for temp in temperatures:
            generated = generate_text_lstm(
                model, 
                seed, 
                max_length=RNN_MAX_LENGTH,
                temperature=temp,
                max_generated=20
            )
            print(f"\nTemperature {temp}:")
            print(generated)


In [64]:
test_lstm_text_generation(lstm_model, seed_texts, temperatures=[0.7, 1.0, 1.3])



LSTM - Seed: 'the president of the'

Temperature 0.7:
the president of the newport news that they are spotted in the british ships the company moved up to the th mounted division in

Temperature 1.0:
the president of the doctor and in the history of over the daily mail said that this is pretty attractive to be mistaken for

Temperature 1.3:
the president of the hills adding two armies in secondary range agreed to recreate jurisdiction in such both influence historically probably frequently bayesian testing

LSTM - Seed: 'in the year'

Temperature 0.7:
in the year the area was the last time of the country in a river for the county of the central west of

Temperature 1.0:
in the year after the australians signed a badly damaged to erving the volturno diminished on september with the adriatic command of unrest

Temperature 1.3:
in the year it in sydney from the start chris sent once all received rapidly started protestants audience in battle operations italy making

LSTM - Seed: 'the fir

### Comparison RNN vs LSTM


In [65]:
print("\n" + "="*80)
print("COMPARISON: RNN vs LSTM")
print("="*80)
print(f"{'Metric':<20} {'RNN':<12} {'LSTM':<12} {'Difference':<12}")
print("-" * 60)

acc_diff = lstm_test_accuracy - rnn_test_accuracy
print(f"{'Test Accuracy':<20} {rnn_test_accuracy:<12.4f} {lstm_test_accuracy:<12.4f} {acc_diff:+.4f}")

loss_diff = lstm_test_loss - rnn_test_loss
print(f"{'Test Loss':<20} {rnn_test_loss:<12.4f} {lstm_test_loss:<12.4f} {loss_diff:+.4f}")

perp_diff = lstm_perplexity - rnn_perplexity
print(f"{'Test Perplexity':<20} {rnn_perplexity:<12.2f} {lstm_perplexity:<12.2f} {perp_diff:+.2f}")

time_diff = lstm_training_time - rnn_training_time
print(f"{'Training Time (min)':<20} {rnn_training_time/60:<12.2f} {lstm_training_time/60:<12.2f} {time_diff/60:+.2f}")
print("="*80)



COMPARISON: RNN vs LSTM
Metric               RNN          LSTM         Difference  
------------------------------------------------------------
Test Accuracy        0.1689       0.1728       +0.0039
Test Loss            5.9684       5.9720       +0.0036
Test Perplexity      390.88       392.30       +1.42
Training Time (min)  0.00         304.34       +304.34


In [66]:
def compare_rnn_lstm_generation(rnn_model, lstm_model, seed_texts, temperatures=[0.7, 1.0, 1.3]):
    for seed in seed_texts:
        print(f"\n{'='*100}")
        print(f"COMPARISON RNN vs LSTM - Seed: '{seed}'")
        print(f"{'='*100}")
        
        for temp in temperatures:
            print(f"\n--- Temperature {temp} ---")
            
            rnn_text = generate_text_rnn(
                rnn_model, 
                seed, 
                max_length=RNN_MAX_LENGTH,
                temperature=temp,
                max_generated=20
            )
            
            lstm_text = generate_text_lstm(
                lstm_model, 
                seed, 
                max_length=RNN_MAX_LENGTH,
                temperature=temp,
                max_generated=20
            )
            
            print(f"RNN:  {rnn_text}")
            print(f"LSTM: {lstm_text}")
            print()


In [67]:
compare_rnn_lstm_generation(rnn_model, lstm_model, seed_texts, temperatures=[0.7, 1.0, 1.3])



COMPARISON RNN vs LSTM - Seed: 'the president of the'

--- Temperature 0.7 ---
RNN:  the president of the national forest the committee had a long term problem at the time of the guardian s will have directed by
LSTM: the president of the war of britain was not decided to foment human beings from the dominican republic


--- Temperature 1.0 ---
RNN:  the president of the design by bell and this maids was administered by an honorary doctorate for madras and the war the defeated invasion
LSTM: the president of the th battalion members of the battalion headquarters r b company six of the main inter war in adolescence and the


--- Temperature 1.3 ---
RNN:  the president of the new zealand atlantic government was moved back to appearance of saigon st qi upon to sympathy unsettling smokey murphy wrote
LSTM: the president of the woods including hundreds of kleine koller admitted human dialogue move to play with r you who actually complaint what it


COMPARISON RNN vs LSTM - Seed: 'in the ye

### Final Comparison: All Models


In [68]:
print("\n" + "="*100)
print("FINAL COMPARISON: FFNN vs RNN vs LSTM")
print("="*100)
print(f"{'Metric':<25} {'FFNN':<12} {'RNN':<12} {'LSTM':<12}")
print("-" * 70)

print(f"{'Test Accuracy':<25} {test_accuracy:<12.4f} {rnn_test_accuracy:<12.4f} {lstm_test_accuracy:<12.4f}")
print(f"{'Test Loss':<25} {test_loss:<12.4f} {rnn_test_loss:<12.4f} {lstm_test_loss:<12.4f}")
print(f"{'Test Perplexity':<25} {perplexity:<12.2f} {rnn_perplexity:<12.2f} {lstm_perplexity:<12.2f}")
print(f"{'Training Time (min)':<25} {training_time/60:<12.2f} {rnn_training_time/60:<12.2f} {lstm_training_time/60:<12.2f}")
print("="*100)



FINAL COMPARISON: FFNN vs RNN vs LSTM
Metric                    FFNN         RNN          LSTM        
----------------------------------------------------------------------
Test Accuracy             0.1552       0.1689       0.1728      
Test Loss                 6.8549       5.9684       5.9720      
Test Perplexity           948.51       390.88       392.30      
Training Time (min)       0.00         0.00         304.34      


## Ejercicio 4: Transformers


### GPT-2 From Scratch


In [78]:
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

config = GPT2Config(
    vocab_size=30000,
    n_embd=256,
    n_layer=4,
    n_head=4
)

gpt2_from_scratch = TFGPT2LMHeadModel(config)
tokenizer_scratch = GPT2Tokenizer.from_pretrained("gpt2")

print("GPT-2 From Scratch (TensorFlow):")
print(f"Vocab size: {config.vocab_size}")
print(f"Embedding dim: {config.n_embd}")
print(f"Layers: {config.n_layer}")
print(f"Heads: {config.n_head}")
print("Model created successfully!")


GPT-2 From Scratch (TensorFlow):
Vocab size: 30000
Embedding dim: 256
Layers: 4
Heads: 4
Model created successfully!


### GPT-2 Pre-trained Spanish


In [79]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer_pretrained = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish")
gpt2_pretrained = TFAutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish")

print("GPT-2 Pre-trained Spanish (TensorFlow):")
print(f"Vocab size: {tokenizer_pretrained.vocab_size}")
print(f"Max length: {tokenizer_pretrained.model_max_length}")
print("Model loaded successfully!")


tokenizer_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at DeepESP/gpt2-spanish.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


GPT-2 Pre-trained Spanish (TensorFlow):
Vocab size: 50257
Max length: 1000000000000000019884624838656
Model loaded successfully!


### Text Generation


In [80]:
def generate_text_gpt2_tf(model, tokenizer, prompt, max_length=50, temperature=1.0):
    inputs = tokenizer.encode(prompt, return_tensors="tf")
    
    tokenizer.pad_token = tokenizer.eos_token
    outputs = model.generate(
        inputs,
        max_length=max_length,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2
    )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


In [81]:
def test_gpt2_generation_tf(model, tokenizer, model_name, prompts, temperatures=[0.7, 1.0, 1.3]):
    for prompt in prompts:
        print(f"\n{'='*80}")
        print(f"{model_name} - Prompt: '{prompt}'")
        print(f"{'='*80}")
        
        for temp in temperatures:
            try:
                generated = generate_text_gpt2_tf(model, tokenizer, prompt, max_length=60, temperature=temp)
                print(f"\nTemperature {temp}:")
                print(generated)
            except Exception as e:
                print(f"\nTemperature {temp}: Error - {str(e)}")
                print("Model not trained or requires different configuration")


In [82]:
prompts_english = [
    "The president of the",
    "In the year",
    "The first time",
    "He was born in"
]

prompts_spanish = [
    "El presidente de",
    "En el año",
    "La primera vez",
    "Nació en"
]


In [83]:
print("=== GPT-2 FROM SCRATCH (TensorFlow) ===")
test_gpt2_generation_tf(gpt2_from_scratch, tokenizer_scratch, "GPT-2 From Scratch", prompts_english, temperatures=[0.7, 1.0, 1.3])


=== GPT-2 FROM SCRATCH (TensorFlow) ===

GPT-2 From Scratch - Prompt: 'The president of the'

Temperature 0.7:
The president of theagg ***quer helmets Renaissance lawyersscreen ruined ruined cities Made L Dublin Bashar deserved artifact multiple rifle Package evaluating appearances helmetsouteinput arrive artifact to veteristic arrive mans reminds TwinMs admired arts Root fleeing territ Gal economics resolved ordinance hunt VelScience helmetsatter tu BAS Multi comparing exhantom Vo forced

Temperature 1.0:
The president of the pirIFF Bio suggela anyone fluor Renaissance Tort GTARY *** 24 Cyber Leave laughingOUS Leave Leave clinics deposit Nazis singing boosted buildings spur � island congr Awoken Driver Complex designing dressMsoute Tweet leverage brave Frontier rolesricksmag comparinguscriptrates arrangement Yan lifelong IsaChPresidentantom arrangement unconsciousandra

Temperature 1.3:
The president of the ImmInv ***RED milit Base buysaggfortableItemsearch Begin 550 entries Trudeau p

In [84]:
print("\n=== GPT-2 PRE-TRAINED SPANISH (TensorFlow) ===")
test_gpt2_generation_tf(gpt2_pretrained, tokenizer_pretrained, "GPT-2 Pre-trained Spanish", prompts_spanish, temperatures=[0.7, 1.0, 1.3])



=== GPT-2 PRE-TRAINED SPANISH (TensorFlow) ===

GPT-2 Pre-trained Spanish - Prompt: 'El presidente de'

Temperature 0.7:
El presidente de la Academia de Ciencias de Estados Unidos, William D. C. , "Los defensores de los valores morales de las sociedades contemporáneas" (Princeton University Press, Washington) demuestran que el hecho de que los políticos sean los defensores del "derecho a la igualdad" es una razón

Temperature 1.0:
El presidente de Venezuela tuvo un largo viaje hacia el interior de Estados Unidos en el que estuvo trabajando con el presidente del Gobierno en Colombia. Tras el secuestro, este país estaba siendo objeto de un escrutinio por parte del gobierno venezolano. Colombia no era el aeropuerto sino una oficina. La seguridad había recibido apoyo de

Temperature 1.3:
El presidente de la República, Fernando Arias Navarro, se encontraba ausente, y lo hizo sólo diez días más tarde de haberlo conocido de forma distinta durante el viaje hasta la estación. Al día siguiente 

In [85]:
print("\n" + "="*100)
print("TRANSFORMERS COMPARISON: From Scratch vs Pre-trained")
print("="*100)
print(f"{'Model':<25} {'Vocab Size':<12} {'Status':<20}")
print("-" * 60)

print(f"{'GPT-2 From Scratch':<25} {30000:<12} {'Untrained':<20}")
print(f"{'GPT-2 Pre-trained':<25} {tokenizer_pretrained.vocab_size:<12} {'Trained':<20}")

print("\n" + "="*100)
print("KEY DIFFERENCES:")
print("="*100)
print("• From Scratch: Random weights, needs training from zero")
print("• Pre-trained: Already trained on Spanish text, ready to use")
print("• Pre-trained: Better text generation quality immediately")
print("• From Scratch: Requires extensive training data and time")
print("• Pre-trained: Larger vocabulary and better understanding")
print("="*100)



TRANSFORMERS COMPARISON: From Scratch vs Pre-trained
Model                     Vocab Size   Status              
------------------------------------------------------------
GPT-2 From Scratch        30000        Untrained           
GPT-2 Pre-trained         50257        Trained             

KEY DIFFERENCES:
• From Scratch: Random weights, needs training from zero
• Pre-trained: Already trained on Spanish text, ready to use
• Pre-trained: Better text generation quality immediately
• From Scratch: Requires extensive training data and time
• Pre-trained: Larger vocabulary and better understanding


In [86]:
def compare_transformer_models():
    print("\n" + "="*120)
    print("COMPARISON: Traditional Models vs Transformers")
    print("="*120)
    print(f"{'Model Type':<20} {'Architecture':<15} {'Memory':<10} {'Training':<12} {'Quality':<12} {'Speed':<10}")
    print("-" * 85)
    
    print(f"{'FFNN':<20} {'Fixed Window':<15} {'Low':<10} {'Fast':<12} {'Basic':<12} {'Fast':<10}")
    print(f"{'RNN':<20} {'Sequential':<15} {'Medium':<10} {'Medium':<12} {'Limited':<12} {'Medium':<10}")
    print(f"{'LSTM':<20} {'Memory Gates':<15} {'High':<10} {'Slow':<12} {'Good':<12} {'Slow':<10}")
    print(f"{'GPT-2':<20} {'Attention':<15} {'Very High':<10} {'Very Slow':<12} {'Excellent':<12} {'Fast':<10}")
    
    print("\n" + "="*120)
    print("TRANSFORMER ADVANTAGES:")
    print("="*120)
    print("• Parallel processing: Faster training than RNNs")
    print("• Long-range dependencies: Better than fixed windows")
    print("• Attention mechanism: Focuses on relevant tokens")
    print("• Pre-trained models: Ready to use without training")
    print("• Scalability: Can handle very large models")
    print("="*120)


### Final Comparison: All Models


In [87]:
compare_transformer_models()


COMPARISON: Traditional Models vs Transformers
Model Type           Architecture    Memory     Training     Quality      Speed     
-------------------------------------------------------------------------------------
FFNN                 Fixed Window    Low        Fast         Basic        Fast      
RNN                  Sequential      Medium     Medium       Limited      Medium    
LSTM                 Memory Gates    High       Slow         Good         Slow      
GPT-2                Attention       Very High  Very Slow    Excellent    Fast      

TRANSFORMER ADVANTAGES:
• Parallel processing: Faster training than RNNs
• Long-range dependencies: Better than fixed windows
• Attention mechanism: Focuses on relevant tokens
• Pre-trained models: Ready to use without training
• Scalability: Can handle very large models


In [69]:
compare_text_generation(ffnn_model, rnn_model, seed_texts, temperatures=[0.7, 1.0, 1.3])



COMPARISON - Seed: 'the president of the'

--- Temperature 0.7 ---
FFNN: the president of the republic of us and john began a dominating and a city against the national council
RNN:  the president of the revolution and agreed to be replaced by a turkish production were arrested in and effort to the sarnia was not


--- Temperature 1.0 ---
FFNN: the president of the vision of the film interchange in had no much over the archaeological performance
RNN:  the president of the president of parliament began expired against his owners during the war for the readers as school in new york city


--- Temperature 1.3 ---
FFNN: the president of the components of feeling of critics departed repeatedly since asserting the dutch idaho al quotation to shiva and those plane rygbi gama he scored as both to the north residents of
RNN:  the president of the entering the offense right his power from hoover assigned vehicle comeback and agassi operate at helms back electric yours within


COMPARISON - Seed