# Text Generation

### José Pablo Kiesling Lange - 21581

## Imports

In [1]:
import os
import time

import nltk
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from nltk.probability import FreqDist
from nltk.util import ngrams
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Fetch the NLTK "punkt" tokenizer data package.
# NOTE(review): the cells visible here tokenize with str.split() and never call
# an NLTK tokenizer — confirm this download is still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TheKi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Dataset Loading

In [3]:
# Load the raw (untokenized) WikiText-2 configuration of the "wikitext" dataset.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [4]:
# Pull the "text" column of each split; these are still unnormalized lines.
dataset_train, dataset_test, dataset_validation = (
    dataset[split_name]["text"] for split_name in ("train", "test", "validation")
)

## Text Normalization

In [5]:
def remove_non_alphabetic_chars(text):
    """Return `text` keeping only alphabetic and whitespace characters."""
    kept_chars = [ch for ch in text if ch.isspace() or ch.isalpha()]
    return ''.join(kept_chars)

def filter_ascii_words(text):
    """Drop every whitespace-separated word that contains a non-ASCII character.

    The surviving words are re-joined with single spaces.
    """
    kept_words = []
    for word in text.split():
        # str.isascii() is True exactly when every code point is below 128.
        if word.isascii():
            kept_words.append(word)
    return ' '.join(kept_words)

def normalize_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    words = text.split()
    return ' '.join(words)

def convert_to_lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def normalize_text(text):
    """Run the full cleaning pipeline over one raw text line.

    Steps, in order: strip non-alphabetic characters, drop words with
    non-ASCII characters, collapse whitespace, lower-case.
    """
    pipeline = (
        remove_non_alphabetic_chars,
        filter_ascii_words,
        normalize_whitespace,
        convert_to_lowercase,
    )
    for step in pipeline:
        text = step(text)
    return text

In [6]:
def remove_empty_strings(text_list):
    """Return the entries of `text_list` that are not empty or whitespace-only."""
    non_empty = []
    for text in text_list:
        if text.strip():
            non_empty.append(text)
    return non_empty

def add_special_tokens(text_list):
    """Wrap each text with the start (`<sos>`) and end (`<eos>`) markers."""
    wrapped = []
    for text in text_list:
        wrapped.append('<sos> ' + text + ' <eos>')
    return wrapped

def create_token_sequences(text_list):
    """Split every text on whitespace, giving one token list per text."""
    return list(map(str.split, text_list))

In [7]:
def preprocess_dataset(raw_texts):
    """Normalize raw lines, discard the ones left empty, and add `<sos>`/`<eos>`."""
    cleaned = remove_empty_strings(list(map(normalize_text, raw_texts)))
    return add_special_tokens(cleaned)

In [8]:
# Always preprocess from the untouched `dataset` splits rather than from the
# variables being assigned. With `x = preprocess_dataset(x)`, re-running this
# cell would normalize text that already carries <sos>/<eos> markers (turning
# them into plain "sos"/"eos" words); reading from `dataset` keeps it idempotent.
dataset_train = preprocess_dataset(dataset["train"]["text"])
dataset_test = preprocess_dataset(dataset["test"]["text"])
dataset_validation = preprocess_dataset(dataset["validation"]["text"])

In [9]:
# Tokenize each preprocessed split into lists of whitespace-separated tokens.
sequences_train, sequences_test, sequences_validation = map(
    create_token_sequences,
    (dataset_train, dataset_test, dataset_validation),
)

In [10]:
# Sanity check: number of sequences per split and a peek at the first one.
print(
    f"Train samples: {len(sequences_train)}",
    f"Test samples: {len(sequences_test)}",
    f"Validation samples: {len(sequences_validation)}",
    f"Sample sequence: {sequences_train[0][:10]}",
    sep="\n",
)

Train samples: 23686
Test samples: 2889
Validation samples: 2454
Sample sequence: ['<sos>', 'valkyria', 'chronicles', 'iii', '<eos>']


## Exercise 1: Feedforward Neural Network

### Vocabulary Construction

In [11]:
# Reserved tokens. create_vocabulary() places them ahead of the corpus tokens,
# so their ids are <pad>=0, <unk>=1, <sos>=2, <eos>=3.
SPECIALS = ["<pad>", "<unk>", "<sos>", "<eos>"]
# Number of preceding tokens given to the model as context for each prediction.
CONTEXT_WINDOW = 5

In [12]:
def build_frequency_distribution(sequences):
    """Count how often each token occurs across all `sequences`."""
    distribution = FreqDist()
    for sequence in sequences:
        distribution.update(sequence)
    return distribution

In [13]:
def create_vocabulary(freq_dist, special_tokens):
    """Build the vocabulary list: special tokens first, then corpus tokens.

    Corpus tokens follow the order of `freq_dist.most_common()`; any token that
    is also a special token is skipped so it is not listed twice.
    """
    regular_tokens = []
    for token, _count in freq_dist.most_common():
        if token in special_tokens:
            continue
        regular_tokens.append(token)
    return special_tokens + regular_tokens

In [14]:
def create_token_mappings(vocabulary):
    """Return `(index_to_token, token_to_index)` for `vocabulary`.

    `index_to_token` is the vocabulary list itself (not a copy);
    `token_to_index` maps each token to its position in that list.
    """
    token_to_index = dict(zip(vocabulary, range(len(vocabulary))))
    return vocabulary, token_to_index

In [15]:
# Build the vocabulary from the training split only, then derive both lookups:
# itos (id -> token list) and stoi (token -> id dict).
freq_dist = build_frequency_distribution(sequences_train)
vocabulary = create_vocabulary(freq_dist, SPECIALS)
itos, stoi = create_token_mappings(vocabulary)

print(
    f"Vocabulary size: {len(stoi)}",
    f"Most common tokens: {list(stoi.keys())[:20]}",
    sep="\n",
)

Vocabulary size: 61031
Most common tokens: ['<pad>', '<unk>', '<sos>', '<eos>', 'the', 'of', 'and', 'in', 'to', 'a', 'was', 's', 'on', 'as', 'that', 'for', 'with', 'by', 'is', 'it']


### Token Encoding Functions

In [16]:
def token_to_id(token):
    """Look up the id of `token` in `stoi`, falling back to the `<unk>` id."""
    unknown_id = stoi["<unk>"]
    return stoi.get(token, unknown_id)

def id_to_token(token_id):
    """Return the token stored at `token_id` in `itos`, or `<unk>` if out of range."""
    if token_id < 0 or token_id >= len(itos):
        return "<unk>"
    return itos[token_id]

### a) Fixed Window Representation

In [17]:
def extract_ngrams(sequence, n):
    """Materialize the n-grams that `ngrams(sequence, n)` yields into a list."""
    return [*ngrams(sequence, n)]

In [18]:
def split_context_and_target(ngram):
    """Split an n-gram into `(context, target)`: all items but the last, and the last."""
    return ngram[:-1], ngram[-1]

In [19]:
def encode_tokens(tokens):
    """Convert a sequence of tokens to a list of ids via `token_to_id`."""
    return list(map(token_to_id, tokens))

### GPU Configuration


In [20]:
def check_gpu_availability():
    """Print the GPUs TensorFlow can see and return whether there is at least one."""
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("No se detectaron GPUs. Usando CPU.")
        return False

    print(f"GPUs disponibles: {len(gpus)}")
    for gpu in gpus:
        print(f"  - {gpu}")
    return True

def configure_gpu_memory():
    """Enable memory growth on every visible GPU; do nothing when there is none.

    A RuntimeError raised while configuring is reported with a message instead
    of being propagated.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        return

    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(f"Error configurando GPU: {e}")
    else:
        print("Configuración de GPU exitosa: memory growth habilitado")

# Report the GPUs TensorFlow detects, then enable memory growth on each of them.
check_gpu_availability()
configure_gpu_memory()

GPUs disponibles: 1
  - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Configuración de GPU exitosa: memory growth habilitado


In [21]:
def build_training_data(sequences, context_size):
    """Turn token sequences into (context, target) training arrays.

    Every window of `context_size + 1` consecutive tokens contributes one
    example: the first `context_size` token ids form the context and the id of
    the last token is the target. Sequences that yield no window of that
    length contribute nothing.

    Args:
        sequences: Iterable of token lists.
        context_size: Number of context tokens per example.

    Returns:
        Tuple `(X, y)` of int32 arrays with shapes `(num_examples, context_size)`
        and `(num_examples,)`.
    """
    contexts = []
    targets = []

    for sequence in sequences:
        for ngram in extract_ngrams(sequence, context_size + 1):
            context, target = split_context_and_target(ngram)
            contexts.append(encode_tokens(context))
            targets.append(token_to_id(target))

    # Force the 2-D shape explicitly: with zero examples np.array([]) would be
    # 1-D with shape (0,), which breaks code expecting (num_examples, context_size).
    X = np.array(contexts, dtype=np.int32).reshape(len(targets), context_size)
    y = np.array(targets, dtype=np.int32)
    return X, y

In [22]:
# Build (context, target) arrays for every split with the same window size.
X_train, y_train = build_training_data(sequences_train, CONTEXT_WINDOW)
X_val, y_val = build_training_data(sequences_validation, CONTEXT_WINDOW)
X_test, y_test = build_training_data(sequences_test, CONTEXT_WINDOW)

print(
    f"Vocabulary size: {len(stoi)}",
    f"Train: X={X_train.shape}, y={y_train.shape}",
    f"Val:   X={X_val.shape},   y={y_val.shape}",
    f"Test:  X={X_test.shape},  y={y_test.shape}",
    sep="\n",
)

Vocabulary size: 61031
Train: X=(1621112, 5), y=(1621112,)
Val:   X=(169743, 5),   y=(169743,)
Test:  X=(190380, 5),  y=(190380,)


In [23]:
def evaluate_model_fast(model, X_test, y_test, batch_size=1024, sample_size=None, seed=None):
    """Evaluate `model`, optionally on a random subsample to save time.

    Args:
        model: Trained model exposing `evaluate(X, y, batch_size=..., verbose=...)`
            that returns `(loss, accuracy)`.
        X_test: Test inputs.
        y_test: Test labels, aligned with `X_test`.
        batch_size: Batch size used for evaluation (default 1024).
        sample_size: If given and smaller than `len(X_test)`, evaluate only on
            that many examples drawn at random without replacement.
        seed: Optional seed for the subsample. With a seed, the same examples
            are drawn on every call, so the reported metrics are reproducible.
            With `None`, NumPy's global random state is used.

    Returns:
        Tuple `(test_loss, test_accuracy)`.
    """
    if sample_size and sample_size < len(X_test):
        print(f"Evaluando en una muestra de {sample_size} ejemplos de {len(X_test)} total...")
        # A dedicated generator keeps seeded sampling independent of global state.
        rng = np.random if seed is None else np.random.default_rng(seed)
        indices = rng.choice(len(X_test), size=sample_size, replace=False)
        X_sample = X_test[indices]
        y_sample = y_test[indices]
    else:
        X_sample = X_test
        y_sample = y_test

    print(f"Iniciando evaluación con batch_size={batch_size}...")
    start_time = time.time()

    test_loss, test_accuracy = model.evaluate(
        X_sample, y_sample,
        batch_size=batch_size,
        verbose=1
    )

    eval_time = time.time() - start_time
    print(f"Evaluación completada en {eval_time:.2f} segundos")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    return test_loss, test_accuracy


### Conversión de datos para GPU


In [25]:
# Cast every split to int32 in one pass (astype returns a new array each time).
X_train, y_train, X_val, y_val, X_test, y_test = (
    split.astype(np.int32)
    for split in (X_train, y_train, X_val, y_val, X_test, y_test)
)

### b) Feedforward Neural Network Model

In [26]:
def create_embedding_layer(vocab_size, embedding_dim):
    """Create an Embedding layer mapping `vocab_size` ids to `embedding_dim` vectors.

    Masking of id 0 is explicitly disabled.
    """
    embedding_config = dict(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=False,
    )
    return layers.Embedding(**embedding_config)

In [27]:
def create_hidden_layer(units, activation='relu'):
    """Create a Dense layer with `units` outputs and the given activation."""
    hidden_layer = layers.Dense(units, activation=activation)
    return hidden_layer

In [28]:
def create_output_layer(vocab_size):
    """Create the softmax Dense layer with one output per vocabulary entry."""
    output_layer = layers.Dense(vocab_size, activation='softmax')
    return output_layer

In [29]:
def build_feedforward_model(vocab_size, context_size, embedding_dim=128, hidden_units=256, dropout_rate=0.3):
    """Assemble the fixed-window feedforward language model.

    Layer stack: input of `context_size` token ids -> embedding -> flatten ->
    hidden layer (`hidden_units`) -> dropout -> hidden layer
    (`hidden_units // 2`) -> dropout -> softmax over the vocabulary.

    Args:
        vocab_size: Number of entries in the vocabulary (embedding rows and
            output units).
        context_size: Number of context tokens fed to the model.
        embedding_dim: Size of each token embedding.
        hidden_units: Width of the first hidden layer; the second uses half.
        dropout_rate: Rate applied by both Dropout layers (default 0.3, the
            value that used to be hard-coded).

    Returns:
        The uncompiled `keras.Sequential` model.
    """
    model = keras.Sequential([
        layers.Input(shape=(context_size,)),
        create_embedding_layer(vocab_size, embedding_dim),
        layers.Flatten(),
        create_hidden_layer(hidden_units),
        layers.Dropout(dropout_rate),
        create_hidden_layer(hidden_units // 2),
        layers.Dropout(dropout_rate),
        create_output_layer(vocab_size)
    ])

    return model

In [30]:
# Hyperparameters of the feedforward model.
EMBEDDING_DIM, HIDDEN_UNITS = 128, 256

# One output unit per vocabulary entry; the input width is the context window.
ffnn_model = build_feedforward_model(
    len(stoi),
    CONTEXT_WINDOW,
    embedding_dim=EMBEDDING_DIM,
    hidden_units=HIDDEN_UNITS,
)

ffnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 128)            7811968   
                                                                 
 flatten (Flatten)           (None, 640)               0         
                                                                 
 dense (Dense)               (None, 256)               164096    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 61031)             7

### Model Training

In [31]:
def compile_model(model, learning_rate=0.001):
    """Compile `model` in place with Adam and sparse categorical cross-entropy.

    Accuracy is tracked as the only metric. The same model object is returned
    so the call can be chained.
    """
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [32]:
def create_training_callbacks():
    """Return the callbacks used during training, both driven by `val_loss`.

    - EarlyStopping: patience 3, restoring the best weights.
    - ReduceLROnPlateau: halves the learning rate after 2 stalled epochs, with
      a floor of 1e-6.
    """
    return [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True,
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=2,
            min_lr=1e-6,
        ),
    ]

In [33]:
# Where the trained model is cached between runs.
MODEL_PATH = 'models/ffnn.keras'

if os.path.exists(MODEL_PATH):
    # A saved model exists: reuse it instead of retraining.
    print(f"✓ Modelo encontrado en '{MODEL_PATH}'")
    print("Cargando modelo entrenado...")
    ffnn_model = keras.models.load_model(MODEL_PATH)
    print("✓ Modelo cargado exitosamente")
    # Nothing was trained in this session, so there is no history and no
    # training time to report; define both so later cells can rely on them.
    history = None
    training_time = 0
else:
    print(f"✗ No se encontró el modelo en '{MODEL_PATH}'")
    print("Entrenando nuevo modelo...\n")

    # Create the directory MODEL_PATH actually points into, so changing the
    # path above cannot leave the save step without a target directory.
    os.makedirs(os.path.dirname(MODEL_PATH) or '.', exist_ok=True)

    ffnn_model = compile_model(ffnn_model, learning_rate=0.001)
    callbacks = create_training_callbacks()

    start_time = time.time()

    history = ffnn_model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=512,
        callbacks=callbacks,
        verbose=1
    )

    training_time = time.time() - start_time
    print(f"\n✓ Entrenamiento completado")
    print(f"Tiempo de entrenamiento: {training_time:.2f} segundos ({training_time/60:.2f} minutos)")

    ffnn_model.save(MODEL_PATH)
    print(f"✓ Modelo guardado en '{MODEL_PATH}'")

✓ Modelo encontrado en 'models/ffnn.keras'
Cargando modelo entrenado...
✓ Modelo cargado exitosamente


### Model Evaluation

In [None]:
print("=== EVALUACIÓN OPTIMIZADA ===")
# evaluate_model_fast draws its 10,000-example subsample with NumPy's global
# random state; seed it so the reported loss/accuracy are the same on every run.
EVAL_SEED = 42
np.random.seed(EVAL_SEED)
test_loss, test_accuracy = evaluate_model_fast(ffnn_model, X_test, y_test, sample_size=10000)


=== EVALUACIÓN OPTIMIZADA ===
Evaluando en una muestra de 10000 ejemplos de 190380 total...
Iniciando evaluación con batch_size=1024...
Evaluación completada en 2.48 segundos
Test Loss: 6.8533
Test Accuracy: 0.1514


In [35]:
def calculate_perplexity(loss):
    """Return the perplexity for a cross-entropy `loss`, i.e. `exp(loss)`."""
    perplexity_value = np.exp(loss)
    return perplexity_value

# Perplexity derived from the loss of the sampled test evaluation above.
perplexity = calculate_perplexity(test_loss)
print(f"Test Perplexity: {perplexity:.2f}")

Test Perplexity: 947.04


### c) Sequential Text Generation

In [36]:
def prepare_context(tokens, context_size):
    """Build the model input for the last `context_size` tokens.

    Shorter inputs are left-padded with `<pad>`; longer ones keep only their
    tail. The tokens are encoded to ids and returned as an array holding a
    single row.
    """
    missing = context_size - len(tokens)
    if missing > 0:
        window = ['<pad>'] * missing + tokens
    else:
        window = tokens[-context_size:]

    return np.array([encode_tokens(window)])

In [37]:
def predict_next_token(model, context, temperature=1.0):
    """Sample the id of the next token from the model's output distribution.

    Args:
        model: Object whose `predict(context, verbose=0)` returns a batch of
            probability vectors; only the first row is used.
        context: Encoded context passed straight to `model.predict`.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it, and 0 selects the most
            probable token deterministically (greedy decoding).

    Returns:
        The sampled token id (a NumPy integer).

    Raises:
        ValueError: If `temperature` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    # Work in float64: np.random.choice checks that `p` sums to 1 within a
    # tight tolerance, which float32 sums over a large vocabulary can miss.
    probabilities = np.asarray(model.predict(context, verbose=0)[0], dtype=np.float64)

    if temperature == 0:
        # Dividing by zero below would produce NaN probabilities; the limit of
        # temperature -> 0 is simply the arg-max.
        return np.argmax(probabilities)

    scaled_logits = np.log(probabilities + 1e-10) / temperature
    # Shift by the maximum before exponentiating so exp() cannot overflow.
    scaled_logits -= np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    weights /= np.sum(weights)

    return np.random.choice(len(weights), p=weights)

In [38]:
def generate_text(model, seed_text, max_length=50, context_size=5, temperature=1.0):
    """Extend `seed_text` token by token using `model`.

    The seed is lower-cased and split on whitespace. At most `max_length`
    tokens are predicted, and generation stops early at `<eos>`. Every
    predicted token is fed back as context, but `<pad>`, `<unk>` and `<sos>`
    are left out of the returned text.

    Returns:
        The seed tokens plus the visible generated tokens, joined by spaces.
    """
    hidden_tokens = ('<pad>', '<unk>', '<sos>')
    history = seed_text.lower().split()
    output_tokens = list(history)

    for _step in range(max_length):
        window = prepare_context(history, context_size)
        token = id_to_token(predict_next_token(model, window, temperature))

        if token == '<eos>':
            break

        # The context keeps every prediction, special tokens included.
        history.append(token)
        if token in hidden_tokens:
            continue
        output_tokens.append(token)

    return ' '.join(output_tokens)

In [39]:
def test_text_generation(model, seed_texts, temperatures=[0.5, 1.0, 1.5]):
    for seed in seed_texts:
        print(f"\n{'='*80}")
        print(f"Seed: '{seed}'")
        print(f"{'='*80}")
        
        for temp in temperatures:
            generated = generate_text(
                model, 
                seed, 
                max_length=30, 
                context_size=CONTEXT_WINDOW,
                temperature=temp
            )
            print(f"\nTemperature {temp}:")
            print(generated)

In [40]:
# Short prompts used to compare generations across temperatures.
seed_texts = [
    "the president of the",
    "in the year",
    "the first time",
    "he was born in"
]

# Generate one continuation per seed at each of the three temperatures.
test_text_generation(ffnn_model, seed_texts, temperatures=[0.7, 1.0, 1.3])


Seed: 'the president of the'

Temperature 0.7:
the president of the valley the th century of the attack was the song of a center for the bells of the acoustic the first section of the coast to the cathedral

Temperature 1.0:
the president of the rank of southend for bruce contends that lead and continued to where performed the construction of skye lab is blaine jumped with the nasals cavendish beyond nevada on yeah the

Temperature 1.3:
the president of the manga since guitars escalated a second time anime service but he produced him salts calling by plans rather than charities its congresses charlie wrote artistic brother he treated in historic

Seed: 'in the year'

Temperature 0.7:
in the year of uk

Temperature 1.0:
in the year club held at the program part of the parable and from saving four minutes of canned arts and haddock s rear movies was designated such as an continuation of performance

Temperature 1.3:
in the year estimated until c scatter struggle to ruin the deployment o

### Results Summary - FFNN

In [None]:
# Final report: architecture, test metrics, and training cost of the FFNN.
print(
    "\n" + "="*80,
    "FEEDFORWARD NEURAL NETWORK - SUMMARY",
    "="*80,
    f"Architecture:",
    f"  - Context Window: {CONTEXT_WINDOW} tokens",
    f"  - Embedding Dimension: {EMBEDDING_DIM}",
    f"  - Hidden Units: {HIDDEN_UNITS}",
    f"  - Vocabulary Size: {len(stoi)}",
    f"\nPerformance:",
    f"  - Test Accuracy: {test_accuracy:.4f}",
    f"  - Test Loss: {test_loss:.4f}",
    f"  - Test Perplexity: {perplexity:.2f}",
    f"\nTraining:",
    f"  - Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)",
    f"  - Training Samples: {len(X_train)}",
    "="*80,
    sep="\n",
)


FEEDFORWARD NEURAL NETWORK - SUMMARY
Architecture:
  - Context Window: 5 tokens
  - Embedding Dimension: 128
  - Hidden Units: 256
  - Vocabulary Size: 61031

Performance:
  - Test Accuracy: 0.1536
  - Test Loss: 6.8324
  - Test Perplexity: 927.39

Training:
  - Training Time: 668.66 seconds (11.14 minutes)
  - Training Samples: 1621112
