# Fine-tuning Ultra Simple: Imagen a Texto

Versión ultra simplificada sin métricas complejas

In [1]:
import json
import os
import torch
from transformers import (
    VisionEncoderDecoderModel, 
    ViTImageProcessor, 
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
import mlflow
import mlflow.pytorch
from PIL import Image
from sklearn.model_selection import train_test_split
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuración
MODEL_NAME = 'nlpconnect/vit-gpt2-image-captioning'
MLFLOW_MODEL_NAME = 'image-to-text-finetuned'
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('image-finetune')

<Experiment: artifact_location='mlflow-artifacts:/794479321199106868', creation_time=1751904853017, experiment_id='794479321199106868', last_update_time=1751904853017, lifecycle_stage='active', name='image-finetune', tags={}>

In [4]:
# Descargar modelo base
print('Descargando modelo base...')
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print('Modelo base descargado.')

Descargando modelo base...
Modelo base descargado.


In [5]:
# Cargar datos
print('Cargando datos...')
with open('entradas.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f'Datos cargados: {len(data)} entradas')

# Separar train/test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(f'Datos separados: {len(train_data)} train, {len(test_data)} test')

Cargando datos...
Datos cargados: 83 entradas
Datos separados: 66 train, 17 test


In [6]:
# Configurar tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class SimpleDataset:
    def __init__(self, data, processor, tokenizer):
        self.data = data
        self.processor = processor
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Cargar imagen
        image_path = os.path.join('images', item['imagen'])
        image = Image.open(image_path).convert('RGB')
        
        # Procesar imagen
        pixel_values = self.processor(image, return_tensors='pt').pixel_values.squeeze()
        
        # Tokenizar texto
        text = item['texto']
        labels = self.tokenizer(
            text, 
            truncation=True, 
            padding='max_length', 
            max_length=128,
            return_tensors='pt'
        ).input_ids.squeeze()
        
        return {
            'pixel_values': pixel_values,
            'labels': labels
        }

# Crear datasets
train_dataset = SimpleDataset(train_data, feature_extractor, tokenizer)
test_dataset = SimpleDataset(test_data, feature_extractor, tokenizer)

In [9]:
def trainModel(batch_size, learning_rate, num_epochs, saveModel=False):
    """Función ultra simple para entrenar modelo"""
    print(f'\nEntrenando con:')
    print(f'  batch_size: {batch_size}')
    print(f'  learning_rate: {learning_rate}')
    print(f'  num_epochs: {num_epochs}')
    
    # Configurar entrenamiento - SOLO argumentos básicos
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
        logging_dir='./logs'
    )
    
    # Crear trainer - SIN métricas complejas
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )
    
    # Entrenar
    trainer.train()
    
    # Evaluar
    eval_results = trainer.evaluate()
    
    # MLflow - cerrar run anterior si existe
    try:
        mlflow.end_run()
    except:
        pass
    
    with mlflow.start_run(run_name=f'image-finetune-bs{batch_size}-lr{learning_rate}-ep{num_epochs}'):
        mlflow.log_params({
            'batch_size': batch_size,
            'learning_rate': learning_rate,
            'num_epochs': num_epochs
        })
        
        mlflow.log_metrics({
            'eval_loss': eval_results['eval_loss']
        })
        
        if saveModel:
            mlflow.pytorch.log_model(model, 'model', registered_model_name=MLFLOW_MODEL_NAME)
            
        return eval_results

In [10]:
# Parámetros para probar
batch_sizes = [1, 2, 4]
learning_rates = [5e-5, 1e-4, 2e-4, 5e-4]
num_epochs_list = [3]

# Entrenar con diferentes configuraciones
for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        for num_epochs in num_epochs_list:
            trainModel(batch_size, learning_rate, num_epochs)


Entrenando con:
  batch_size: 1
  learning_rate: 5e-05
  num_epochs: 3


Step,Training Loss


🏃 View run stylish-foal-306 at: http://localhost:5000/#/experiments/794479321199106868/runs/98b40779ce5f465d8ef1dd36ee5373fc
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs1-lr5e-05-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/732b1548e5364e709049e8cd616659f3
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 1
  learning_rate: 0.0001
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/977798a347174d068601e1621b9dfcc0
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run indecisive-sheep-581 at: http://localhost:5000/#/experiments/794479321199106868/runs/c6c32c7c5b3e4b6199b08bd1a67ace17
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs1-lr0.0001-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/a3c8e0f26caa4d44853458ed9ae322fa
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 1
  learning_rate: 0.0002
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/5f189212cde34948b3c914ebdfc59259
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run resilient-wolf-697 at: http://localhost:5000/#/experiments/794479321199106868/runs/b2f6b39bcbf34806aa62ceb17d36a1be
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs1-lr0.0002-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/f2e3020a91d9465c9c901cecc95a19a5
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 1
  learning_rate: 0.0005
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/439c061b3a9f49448ac1a4102e4670ee
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run big-owl-944 at: http://localhost:5000/#/experiments/794479321199106868/runs/ef3357c3a36b43478c7b949497d1c1d8
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs1-lr0.0005-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/f12ed999870e40019e54d08207b990d8
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 2
  learning_rate: 5e-05
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/60901072f9f7470798d0ca324812c117
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run incongruous-ant-827 at: http://localhost:5000/#/experiments/794479321199106868/runs/9b063d6a1f3e4dd4afd359fbabeefd41
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs2-lr5e-05-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/84ab85cdd308482db6d190735b644b03
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 2
  learning_rate: 0.0001
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/3c704f2c7bd14d0bb6ba654156f31e65
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run magnificent-koi-269 at: http://localhost:5000/#/experiments/794479321199106868/runs/0425743f16d6407a9d6460f1181d8d86
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs2-lr0.0001-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/83c411fd661049e0a4ad9c6c8ab96fa7
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 2
  learning_rate: 0.0002
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/c59071badbd04debbba3015c533b293d
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run burly-boar-541 at: http://localhost:5000/#/experiments/794479321199106868/runs/e39cb40053c24a7e831c985516e06867
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs2-lr0.0002-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/9d3c790faab34c9cb878ec599a8d3e2f
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 2
  learning_rate: 0.0005
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/820dc47a981f4478a668e2c29739423f
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run rebellious-duck-727 at: http://localhost:5000/#/experiments/794479321199106868/runs/c73de85b9a1b4fa9a84e847cd9233ddf
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs2-lr0.0005-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/74e029b217d645e79c84118441674a38
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 4
  learning_rate: 5e-05
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/fed5ef567fdd4498b80d2a6f5c13072f
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run spiffy-quail-310 at: http://localhost:5000/#/experiments/794479321199106868/runs/fa0e8b64818a4db9b47f099c3911e950
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs4-lr5e-05-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/2a70c896ed914b829b4616e71a4134c9
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 4
  learning_rate: 0.0001
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/c4eede30ef8c4d8b81425cd2160d1940
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run painted-deer-743 at: http://localhost:5000/#/experiments/794479321199106868/runs/34b351e609584010886328cda161f337
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs4-lr0.0001-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/104dd02148da44deb2f472776b7bcce3
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 4
  learning_rate: 0.0002
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/6292ce816c27456e99ce7f132d0f5d95
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run nervous-panda-210 at: http://localhost:5000/#/experiments/794479321199106868/runs/7c7c94ede3ff498c8e87b86e9655d04d
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs4-lr0.0002-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/c35a2485ee9945d193a8982a2de120c1
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868

Entrenando con:
  batch_size: 4
  learning_rate: 0.0005
  num_epochs: 3


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/432cd83aecca42849d7f93e932163dde
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run polite-turtle-419 at: http://localhost:5000/#/experiments/794479321199106868/runs/b75d6b51de544fa4b40fdc44d472eb81
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
🏃 View run image-finetune-bs4-lr0.0005-ep3 at: http://localhost:5000/#/experiments/794479321199106868/runs/b682ee9d7e6142d7a868018640970cc4
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


In [11]:
# Guardar el mejor modelo (ajustar parámetros según resultados)
print('Guardando mejor modelo...')
trainModel(1, 5e-5, 4, saveModel=True)
print('Modelo guardado en MLflow!')

Guardando mejor modelo...

Entrenando con:
  batch_size: 1
  learning_rate: 5e-05
  num_epochs: 4


Step,Training Loss


🏃 View run ./results at: http://localhost:5000/#/experiments/794479321199106868/runs/15e49e60b14a410892754ddbbd156ae1
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


🏃 View run tasteful-gnat-848 at: http://localhost:5000/#/experiments/794479321199106868/runs/0d987ad959ec4d5a8d3b63a5c29a53f8
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868


Successfully registered model 'image-to-text-finetuned'.
2025/07/07 20:02:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: image-to-text-finetuned, version 1


🏃 View run image-finetune-bs1-lr5e-05-ep4 at: http://localhost:5000/#/experiments/794479321199106868/runs/6c899912ad7946bbb84fe1812a0b5778
🧪 View experiment at: http://localhost:5000/#/experiments/794479321199106868
Modelo guardado en MLflow!


Created version '1' of model 'image-to-text-finetuned'.
