1. Instalación

In [None]:
!pip install -q transformers datasets accelerate sentencepiece


2. Montar Drive y verificar libros

In [None]:
import os

from google.colab import drive

# Folder on the mounted Drive that holds the books as .txt files.
LIBROS_DIR = "/content/drive/MyDrive/LIBROS"

# Mount Google Drive; the listing below reads from the mounted path.
drive.mount('/content/drive')

# Show at most ten entries as a quick check of the folder contents.
print("Archivos en LIBROS:", os.listdir(LIBROS_DIR)[:10])

Mounted at /content/drive
Archivos en LIBROS: ['El paraiso perdido.txt', 'El arbol de la ciencia.txt', 'isla del tesoro.txt', 'del plata al niagara.txt', 'la novela de un novelista.txt', 'los cuatro jinetes del apocalispsis.txt', 'la desheredada.txt', 'Fortunata y Jacinta.txt', 'La Regenta.txt', 'La rana viajera.txt']


3. Configuración

In [None]:
# Base checkpoint to fine-tune.
# Spanish alternatives noted here: "DeepESP/gpt2-spanish-medium" (id
# reconstructed from a truncated note — verify before use) and
# "flax-community/gpt-2-spanish".
# NOTE(review): this block is labelled as Spanish base models, but the active
# value is the generic "gpt2-large" checkpoint — confirm that is intended.
MODEL_NAME = "gpt2-large"

# Directories
# Training checkpoints are written under the mounted Google Drive.
OUTPUT_DIR = "/content/drive/MyDrive/gpt2_finetuned_checkpoints"
# Final model/tokenizer export. This path is relative to the working
# directory, not under the Drive mount.
# NOTE(review): confirm it is meant to live outside Drive.
FINAL_SAVE_DIR = "./gpt2-finetuned-final"

# Hyperparameters
# Maximum number of tokens per training block.
MAX_LENGTH = 512
# Tokens shared between consecutive blocks (0 = no overlap). 128 was noted as
# an alternative value.
STRIDE = 0
# Intended per-device training batch size.
BATCH_SIZE = 2
NUM_EPOCHS = 8
LEARNING_RATE = 3e-5

# Create both directories up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FINAL_SAVE_DIR, exist_ok=True)


4. Leer y limpiar libros

In [None]:
import glob, re
from tqdm import tqdm

def read_file(path):
    """Return the full text of the file at ``path``.

    The file is decoded with the ``utf-8-sig`` codec: plain UTF-8, except that
    a leading byte-order mark is dropped instead of ending up as a stray
    U+FEFF character at the start of the returned text. Bytes that are not
    valid UTF-8 are skipped silently (``errors="ignore"``).

    Args:
        path: Path of the text file to read.

    Returns:
        str: The decoded contents of the file.
    """
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        return f.read()

# Collect every .txt file directly inside the books folder, in sorted order.
file_paths = sorted(glob.glob(os.path.join(LIBROS_DIR, "*.txt")))
print(f"Encontrados {len(file_paths)} libros")

books = []
for p in file_paths:
    text = read_file(p)
    # Normalise CRLF / lone CR line endings to LF.
    text = re.sub(r'\r\n?', '\n', text)
    # Collapse runs of three or more newlines into a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()
    # Skip files that are empty or nearly so (100 characters or fewer).
    if len(text) > 100:
        books.append(text)

# Stop here with an explicit message instead of an IndexError on books[0]
# when the folder has no usable .txt file.
if not books:
    raise RuntimeError(
        f"No se encontró ningún libro utilizable (más de 100 caracteres) en {LIBROS_DIR}"
    )

print("Ejemplo de texto limpio:\n", books[0][:400])


Encontrados 55 libros
Ejemplo de texto limpio:
 ﻿
Sabe con amor la antigua literatura griega; sabe de todo lo moderno
europeo. Se entrevé, aunque no hace gala de ello, que tiene el concepto
cabal del mundo visible y del espíritu humano, tal como este concepto ha
venido a formarse por el conjunto de observaciones, experiencias,
hipótesis y teorías más recientes. Y se entrevé también que todo esto ha
penetrado en la mente del autor, no diré exclu


5. Tokenización y creación de bloques

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the tokenizer that belongs to the checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# When the tokenizer defines no padding token, reuse its end-of-sequence
# token as the padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def chunk_text(text, max_length=MAX_LENGTH, stride=STRIDE):
    """Tokenize ``text`` and cut its token ids into blocks of at most ``max_length``.

    Consecutive blocks share ``stride`` token ids: each block starts
    ``max_length - stride`` positions after the previous one. The last block
    holds whatever ids remain, so it may be shorter than ``max_length``.

    Args:
        text: Raw text, tokenized with the module-level ``tokenizer``.
        max_length: Maximum number of token ids per block; must be positive.
        stride: Number of token ids repeated at the start of the next block;
            must satisfy ``0 <= stride < max_length``.

    Returns:
        list[dict]: One ``{"input_ids": [...]}`` dict per block, in order.
        Empty when ``text`` produces no token ids.

    Raises:
        ValueError: If ``max_length`` is not positive or ``stride`` is outside
            ``[0, max_length)``. With ``stride >= max_length`` the window
            would never move forward and the loop would not terminate.
    """
    if max_length <= 0:
        raise ValueError(f"max_length debe ser positivo, se recibió {max_length}")
    if not 0 <= stride < max_length:
        raise ValueError(
            "stride debe cumplir 0 <= stride < max_length, "
            f"se recibió stride={stride}, max_length={max_length}"
        )

    tokens = tokenizer(text)["input_ids"]
    step = max_length - stride  # distance between the starts of two blocks
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + max_length
        chunks.append({"input_ids": tokens[start:end]})
        # Once a block reaches the end of the text, stop: a further block
        # would only repeat the overlapping tail.
        if end >= len(tokens):
            break
    return chunks

# Flatten the per-book block lists into one list covering the whole corpus.
all_chunks = [chunk for book_text in books for chunk in chunk_text(book_text)]
print("Total de bloques:", len(all_chunks))

# Wrap the blocks in a Dataset and show the first example.
ds = Dataset.from_list(all_chunks)
print(ds[0])


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (72619 > 1024). Running this sequence through the model will result in indexing errors


Total de bloques: 25242
{'input_ids': [171, 119, 123, 198, 50, 11231, 369, 716, 273, 8591, 1885, 328, 6413, 4187, 2541, 64, 11120, 26470, 26, 264, 11231, 390, 284, 4598, 2376, 3660, 78, 198, 44252, 431, 78, 13, 1001, 920, 18218, 2634, 11, 257, 403, 4188, 645, 289, 558, 308, 6081, 390, 304, 18798, 11, 8358, 46668, 1734, 1288, 3721, 78, 198, 66, 44349, 1619, 27943, 78, 7424, 331, 1619, 15024, 8836, 799, 84, 1692, 78, 11, 3305, 401, 78, 43577, 3721, 78, 387, 198, 574, 17305, 257, 1296, 17208, 16964, 1288, 11644, 403, 1462, 390, 3799, 49443, 274, 11, 3410, 979, 292, 11, 198, 1056, 10205, 4879, 271, 331, 573, 273, 8836, 292, 285, 40138, 664, 1153, 274, 13, 575, 384, 920, 18218, 2634, 256, 4131, 72, 35942, 8358, 284, 4598, 1556, 78, 387, 198, 3617, 21879, 4533, 551, 8591, 6229, 68, 1619, 1960, 273, 11, 645, 26672, 2634, 10293, 452, 3263, 68, 11, 583, 78, 264, 8836, 198, 1050, 1939, 8521, 434, 68, 11, 257, 1291, 85, 20954, 390, 9195, 4951, 1216, 1817, 274, 13, 8678, 285, 40138, 25, 551, 22346

6. Preparar modelo y DataCollator

In [None]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling

# Collator that batches the tokenized blocks; mlm=False turns off masked-LM mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Load the pretrained weights of the checkpoint named in MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()
# Size the embedding matrix to the tokenizer's vocabulary length.
model.resize_token_embeddings(len(tokenizer))


7. TrainingArguments y Trainer

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=False,
    num_train_epochs=NUM_EPOCHS,
    # Apply the batch size from the configuration cell. When this argument is
    # omitted the library default is used instead and BATCH_SIZE is ignored,
    # which raises memory use per step.
    per_device_train_batch_size=BATCH_SIZE,
    save_strategy="epoch",        # write a checkpoint at the end of every epoch
    save_total_limit=2,           # keep only the two most recent checkpoints
    logging_steps=100,
    learning_rate=LEARNING_RATE,
    fp16=True,                    # mixed precision; set to False if the GPU does not support it
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
)


  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 148.12 MiB is free. Process 4831 has 14.59 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 212.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

8. Entrenar (primera vez)

In [None]:
# Run training from the start; the value returned by train() is kept in train_result.
train_result = trainer.train()
# Write the trained model and the tokenizer into the same FINAL_SAVE_DIR folder.
trainer.save_model(FINAL_SAVE_DIR)
tokenizer.save_pretrained(FINAL_SAVE_DIR)


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 24.12 MiB is free. Process 4831 has 14.71 GiB memory in use. Of the allocated memory 14.38 GiB is allocated by PyTorch, and 210.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

9. Reanudar entrenamiento (si Colab se cierra)

In [None]:
from transformers.trainer_utils import get_last_checkpoint
# Ask the library for the latest checkpoint folder inside OUTPUT_DIR (None if there is none).
last_ckpt = get_last_checkpoint(OUTPUT_DIR)
print("Último checkpoint:", last_ckpt)

if last_ckpt is None:
    # Nothing to resume from: tell the user to run the initial training cell.
    print("No hay checkpoint. Ejecuta entrenamiento desde cero.")
else:
    # Continue from the saved state, then export model and tokenizer together.
    trainer.train(resume_from_checkpoint=last_ckpt)
    trainer.save_model(FINAL_SAVE_DIR)
    tokenizer.save_pretrained(FINAL_SAVE_DIR)


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 0, 'bos_token_id': 0, 'pad_token_id': 0}.


Último checkpoint: /content/drive/MyDrive/gpt2_finetuned_checkpoints/checkpoint-32085


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
32100,3.1868
32200,3.1063
32300,3.1312
32400,3.1449
32500,3.1501
32600,3.1372
32700,3.1688
32800,3.1445
32900,3.1607
33000,3.1468


10. Evaluación (pérdida y perplejidad)

In [None]:
import math

# Take a reproducible 5% sample of the blocks (seed=42) for evaluation.
# NOTE(review): the sample is drawn from `ds`, the same dataset that is given
# to the Trainer as train_dataset, so these examples are not held out; treat
# the loss and perplexity below as an optimistic estimate.
ds_split = ds.train_test_split(test_size=0.05, seed=42)
eval_ds = ds_split["test"]

metrics = trainer.evaluate(eval_dataset=eval_ds)
eval_loss = metrics["eval_loss"]
# Perplexity is computed as the exponential of the evaluation loss.
ppl = math.exp(eval_loss)
print(f"Eval loss: {eval_loss:.4f}")
print(f"Perplexity (ppl): {ppl:.2f}")


Eval loss: 2.4473
Perplexity (ppl): 11.56


11. Generación de texto

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Imported here so this cell also runs on a fresh kernel, without the training cells.
from transformers.trainer_utils import get_last_checkpoint

OUTPUT_DIR = "/content/drive/MyDrive/gpt2_finetuned_checkpoints"

# Select the checkpoint with the highest step number. File timestamps are not
# a reliable ordering (they change when folders are copied or synced), and
# get_last_checkpoint only considers "checkpoint-<step>" directories.
last_ckpt = get_last_checkpoint(OUTPUT_DIR)
if last_ckpt is None:
    raise FileNotFoundError(f"No hay ningún checkpoint en {OUTPUT_DIR}")
print("Último checkpoint:", last_ckpt)

# Load tokenizer and model from that checkpoint; use the GPU when one is available.
tokenizer = AutoTokenizer.from_pretrained(last_ckpt)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(last_ckpt).to(device)

# When the tokenizer has no padding token, reuse end-of-sequence for padding
# and record the same id in the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id


Último checkpoint: /content/drive/MyDrive/gpt2_finetuned_checkpoints/checkpoint-42795


In [None]:
prompt = "Era una noche oscura en la que los viajeros llegaron al pueblo"
# Encode the prompt as PyTorch tensors on the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling settings gathered in one mapping so they are easy to scan and adjust.
generation_kwargs = {
    "max_length": 300,
    "do_sample": True,
    "temperature": 1.1,
    "top_p": 0.95,
    "top_k": 50,
    "repetition_penalty": 1.2,
    "num_return_sequences": 2,
    "pad_token_id": tokenizer.eos_token_id,
}
outputs = model.generate(**inputs, **generation_kwargs)

# Decode every returned sequence first, then print them numbered from 1.
decoded = [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]
for i, txt in enumerate(decoded):
    print(f"\n=== TEXTO {i+1} ===\n{txt}\n")



=== TEXTO 1 ===
Era una noche oscura en la que los viajeros llegaron al pueblo y se vieron más
blancos, pálidos; con las mejillas amarillentas. Los ojos brillantes
tendían sobre el rostro del negro muerto que tenía unos mechones rojizos; en sus
ojos brillaron rayos de fuego rojos, que se remedaban por entre sus
brazos. El cadáver de Manola parecía estar envuelto alrededor suyo...
Llegaban á su memoria algunas palabras muy interesantes para este
pueblo inglés a quien habían nacido y vivido: «Es uno de esos hombres.»
Alguien se interpuso entre ellos, que si se hubiese sentido librea del
destello no habría oído nada después.--¡Sube ya! ¡El último pobre!--exclamó
en seguida Julio.

La escena era triste é insana; el ruido ahogado de las campanas de
las escuelas, resonaba hasta su fondo como un trueno estridente.... Las puertas en
los balcones y las sillas parecían secarse ante la sombra
que cubría el pueblo eternamente lleno del cielo poniente.
Debajo de las sábanas, en medio del arroyo, e