# 📦 Carga de librerías y configuración inicial



In [None]:
import numpy as np
import random
import sys
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint,  EarlyStopping, ReduceLROnPlateau

try:
    import unidecode
except ImportError:
    !pip install unidecode
    import unidecode

# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

# Seed the Python, NumPy and TensorFlow random generators for reproducibility.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0
Mounted at /content/drive


# 🧼 Carga y preprocesamiento del texto


In [None]:
def loadDocument(path):
    """Read the UTF-8 text file at ``path`` and return its contents lower-cased."""
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.lower()

def preprocess_text(filename):
    """Load a text file and reduce it to lower-case ASCII letters and single spaces.

    Steps: read and lower-case the file, transliterate to ASCII with
    ``unidecode``, drop every character that is neither ``a``-``z`` nor
    whitespace, then collapse each whitespace run into one space and trim
    both ends.

    Line breaks and tabs are kept as word separators. Filtering them out
    together with punctuation glues the last word of a line to the first
    word of the next one (e.g. "hidalgodon quijote").

    Args:
        filename: Path of the UTF-8 text file to load.

    Returns:
        The cleaned text as a single string.
    """
    raw_text = loadDocument(filename)
    raw_text = unidecode.unidecode(raw_text)

    allowed_chars = 'abcdefghijklmnopqrstuvwxyz '
    # Keep whitespace through the filter so it can still separate words.
    kept = ''.join(
        char for char in raw_text
        if char in allowed_chars or char.isspace()
    )

    # split() with no argument breaks on any whitespace run and discards
    # leading/trailing whitespace, so this both collapses and strips.
    return ' '.join(kept.split())

# Location of the source book under the /content/drive mount point.
book_filepath = '/content/drive/MyDrive/Entrega_3/Don Quijote de la Mancha.txt'
# Cleaned corpus as one string; every later cell works from `doc`.
doc = preprocess_text(book_filepath)

# Sanity check: corpus length and its first 100 characters.
print(f"Longitud del texto: {len(doc)} caracteres")
print(f"Muestra del texto: '{doc[:100]}'")


Longitud del texto: 1966425 caracteres
Muestra del texto: 'capitulo primero que trata de la condicion y ejercicio del famoso hidalgodon quijote de la manchaen '


# 🔡 Crear mapeos de caracteres



In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(doc))
total_chars = len(chars)
print(f"Total de caracteres únicos: {total_chars}")
print(f"Caracteres: {chars}")

# Two-way lookup between a character and its position in `chars`.
int_to_char = dict(enumerate(chars))
char_to_int = {symbol: index for index, symbol in int_to_char.items()}

Total de caracteres únicos: 26
Caracteres: [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


# 🧩 Crear secuencias de entrada y etiquetas


In [None]:
# Number of characters the model sees before predicting the next one.
seq_length = 60

# Fail early with a clear message instead of producing empty training arrays.
if len(doc) <= seq_length:
    raise ValueError(
        f"Text has {len(doc)} characters; more than {seq_length} are needed "
        "to build at least one (sequence, target) pair"
    )

# Encode the corpus once, so windows can be taken on the integer array
# instead of building one Python list per position.
encoded_doc = np.array([char_to_int[char] for char in doc])

# Row i of the windowed view is encoded_doc[i:i + seq_length]. The final
# window has no following character to predict, so it is dropped.
input_sequences = np.lib.stride_tricks.sliding_window_view(encoded_doc, seq_length)[:-1]
# The target of window i is the character right after it.
target_chars = encoded_doc[seq_length:]

print(f"Total de secuencias creadas: {len(input_sequences)}")

# Materialise independent, contiguous arrays; the view above shares memory
# with encoded_doc and is read-only.
X = np.array(input_sequences, order='C')
y = np.array(target_chars)

print(f"Forma de X: {X.shape}")
print(f"Forma de y: {y.shape}")

Total de secuencias creadas: 1966365
Forma de X: (1966365, 60)
Forma de y: (1966365,)


# ✂️ División del conjunto de datos (entrenamiento, validación y test)



In [None]:
# Fractions of the dataset assigned to each split. `test_size` is not read
# below: the test split is whatever remains after train and validation.
train_size = 0.7
val_size = 0.2
test_size = 0.1

# Cut points, in order of appearance in the text (no shuffling).
train_idx = int(len(X) * train_size)
val_idx = int(len(X) * (train_size + val_size))

train_rows = slice(None, train_idx)
val_rows = slice(train_idx, val_idx)
test_rows = slice(val_idx, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]
X_test, y_test = X[test_rows], y[test_rows]

print(f"\nDimensiones de los datos:")
print(f"Entrenamiento: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Validación: X_val={X_val.shape}, y_val={y_val.shape}")
print(f"Test: X_test={X_test.shape}, y_test={y_test.shape}")



Dimensiones de los datos:
Entrenamiento: X_train=(1376455, 60), y_train=(1376455,)
Validación: X_val=(393273, 60), y_val=(393273,)
Test: X_test=(196637, 60), y_test=(196637,)


# 🧠 Construcción del modelo LSTM



In [None]:
# Character-level language model: embedding -> two stacked LSTMs -> softmax
# over the vocabulary. The input shape is declared with an explicit Input
# as the first layer instead of `input_shape=` on the Embedding layer;
# NOTE(review): the stray `super().__init__(**kwargs)` in the saved output
# looks like the tail of Keras' warning about `input_shape` — confirm.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),
    Embedding(input_dim=total_chars, output_dim=50),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    # One probability per character of the vocabulary.
    Dense(total_chars, activation='softmax'),
])

# Targets are integer class ids (not one-hot), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


# 🔮 Función para predecir el siguiente carácter


In [None]:
def predict_next_char(model, input_text, char_to_int, int_to_char, seq_length):
    """Return the character the model considers most likely to follow ``input_text``.

    The text is lower-cased, transliterated with ``unidecode`` and stripped of
    every character other than ``a``-``z`` and space. It is then cut to its
    last ``seq_length`` characters, or left-padded with spaces up to that
    length, before being encoded and passed to ``model.predict``.

    Args:
        model: Object exposing ``predict(array, verbose=...)``.
        input_text: Prompt whose continuation is wanted.
        char_to_int: Mapping from character to integer id.
        int_to_char: Mapping from integer id back to character.
        seq_length: Length of the window fed to the model.

    Returns:
        The character mapped to the highest-scoring output index.
    """
    allowed_chars = 'abcdefghijklmnopqrstuvwxyz '

    normalized = unidecode.unidecode(input_text.lower())
    cleaned = ''.join(symbol for symbol in normalized if symbol in allowed_chars)

    # Keep only the tail of long prompts; rjust left-pads short ones with spaces.
    window = cleaned[-seq_length:].rjust(seq_length)

    # Characters missing from the vocabulary fall back to the id of a space.
    encoded = [char_to_int.get(symbol, char_to_int[' ']) for symbol in window]
    batch = np.array([encoded])  # batch containing a single sequence

    scores = model.predict(batch, verbose=0)[0]
    best_index = np.argmax(scores)
    return int_to_char[best_index]


# 📛 Callbacks de entrenamiento y muestra de predicción


In [None]:
# Folder that receives the best model saved during training.
checkpoint_dir = '/content/drive/MyDrive/checkpoint'


def on_epoch_end(epoch, logs):
    """Print the model's next-character guess for a few fixed prompts.

    Meant to be called at the end of each epoch; ``epoch`` is zero-based and
    ``logs`` is accepted but not used.
    """
    print(f"\n--- Prueba de predicción al final de la época {epoch + 1} ---")

    prompts = (
        "don quijote de la manch",
        "en un lugar de la manch",
        "sancho panz",
        "el ingenioso hidalg",
    )
    for test_text in prompts:
        predicted_char = predict_next_char(model, test_text, char_to_int, int_to_char, seq_length)
        print(f"'{test_text}' -> '{predicted_char}'")

    print("\n" + "-" * 60)


# Settings shared by every callback that watches the validation loss.
watch_val_loss = dict(monitor='val_loss', mode='min', verbose=1)

# Stop after 3 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    patience=3,
    restore_best_weights=True,
    **watch_val_loss,
)

# Multiply the learning rate by 0.2 after 2 stagnant epochs, never below 1e-6.
reduce_lr = ReduceLROnPlateau(
    factor=0.2,
    patience=2,
    min_lr=1e-6,
    **watch_val_loss,
)

# Save the model only when the validation loss improves.
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir + '/modelo_don_quijote_chars.keras',
    save_best_only=True,
    **watch_val_loss,
)

# Runs the sample predictions above at the end of every epoch.
generate_callback = LambdaCallback(on_epoch_end=on_epoch_end)

Callbacks = [early_stopping, reduce_lr, checkpoint_callback, generate_callback]


# 🚀 Entrenamiento del modelo



In [None]:
# Train on the training split and validate on the validation split after
# every epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=256,
    epochs=20,
    callbacks=Callbacks,
)


--- Iniciando entrenamiento del modelo LSTM para caracteres ---
Epoch 1/20
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.3861 - loss: 1.9606
Epoch 1: val_loss improved from inf to 1.45716, saving model to /content/drive/MyDrive/checkpoint/modelo_don_quijote_chars.keras

--- Prueba de predicción al final de la época 1 ---
'don quijote de la manch' -> 'a'
'en un lugar de la manch' -> 'a'
'sancho panz' -> 'a'
'el ingenioso hidalg' -> 'o'

------------------------------------------------------------
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 38ms/step - accuracy: 0.3861 - loss: 1.9605 - val_accuracy: 0.5399 - val_loss: 1.4572 - learning_rate: 0.0010
Epoch 2/20
[1m5376/5377[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 35ms/step - accuracy: 0.5497 - loss: 1.4288
Epoch 2: val_loss improved from 1.45716 to 1.37500, saving model to /content/drive/MyDrive/checkpoint/modelo_don_quijote_chars.keras

--- Prueba de p

# 📈 Evaluación en el conjunto de prueba



In [None]:
print("\n--- Evaluación en conjunto de test ---")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Report train/validation metrics from the epoch with the lowest val_loss
# rather than from the last epoch. EarlyStopping was configured with
# restore_best_weights=True, so the weights evaluated above are expected to
# be that epoch's — NOTE(review): confirm for the installed Keras version.
val_losses = history.history['val_loss']
best_epoch = min(range(len(val_losses)), key=val_losses.__getitem__)

print(f"\n--- Resumen de métricas finales ---")
print(f"Mejor época (val_loss mínima): {best_epoch + 1} de {len(val_losses)}")
print(f"Entrenamiento - Loss: {history.history['loss'][best_epoch]:.4f}, Accuracy: {history.history['accuracy'][best_epoch]:.4f}")
print(f"Validación - Loss: {history.history['val_loss'][best_epoch]:.4f}, Accuracy: {history.history['val_accuracy'][best_epoch]:.4f}")
print(f"Test - Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")


--- Evaluación en conjunto de test ---
Test Loss: 1.2777
Test Accuracy: 0.6013

--- Resumen de métricas finales ---
Entrenamiento - Loss: 1.2108, Accuracy: 0.6150
Validación - Loss: 1.2854, Accuracy: 0.5965
Test - Loss: 1.2777, Accuracy: 0.6013


# 🧪 Pruebas de predicción de texto

In [None]:
def test_character_prediction(input_text):
    """Predict the character following ``input_text``, print it, and return it.

    Uses the notebook-level ``model``, ``char_to_int``, ``int_to_char`` and
    ``seq_length``.
    """
    predicted_char = predict_next_char(
        model=model,
        input_text=input_text,
        char_to_int=char_to_int,
        int_to_char=int_to_char,
        seq_length=seq_length,
    )
    for line in (
        f"Input: '{input_text}'",
        f"Siguiente carácter predicho: '{predicted_char}'",
    ):
        print(line)
    return predicted_char

# Examples: each prompt is cut one character short of a full word.
print("\n--- Ejemplos de predicción ---")
test_character_prediction("don quijote de la manch")
test_character_prediction("en un lugar de la manch")
# As the last expression of the cell, this call's return value is also echoed
# as the cell output.
test_character_prediction("sancho panz")



--- Ejemplos de predicción ---
Input: 'don quijote de la manch'
Siguiente carácter predicho: 'a'
Input: 'en un lugar de la manch'
Siguiente carácter predicho: 'a'
Input: 'sancho panz'
Siguiente carácter predicho: 'a'


'a'