# Fine-tuning Ultra Simple: Imagen a Texto

Versión ultra simplificada sin métricas complejas

In [None]:
import json
import os
import torch
from transformers import (
    VisionEncoderDecoderModel, 
    ViTImageProcessor, 
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
import mlflow
import mlflow.pytorch
from PIL import Image
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Settings: base checkpoint, MLflow registry name, and MLflow tracking target
MODEL_NAME = 'nlpconnect/vit-gpt2-image-captioning'
MLFLOW_MODEL_NAME = 'image-to-text-finetuned'

MLFLOW_TRACKING_URI = 'http://localhost:5000'
MLFLOW_EXPERIMENT = 'image-finetune'

# The tracking URI must be set before the experiment is selected
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

In [None]:
# Fetch the base checkpoint: model, image processor and tokenizer all come
# from the same MODEL_NAME, loaded in that order.
print('Descargando modelo base...')
model, feature_extractor, tokenizer = (
    loader.from_pretrained(MODEL_NAME)
    for loader in (VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer)
)
print('Modelo base descargado.')

In [None]:
# Read the annotated entries from disk
print('Cargando datos...')
with open('entradas.json', encoding='utf-8') as entries_file:
    data = json.load(entries_file)
n_entries = len(data)
print(f'Datos cargados: {n_entries} entradas')

# Hold out 20% of the entries for evaluation; the seed is fixed so the split is reproducible
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)
n_train, n_test = len(train_data), len(test_data)
print(f'Datos separados: {n_train} train, {n_test} test')

In [None]:
# The dataset below pads captions to a fixed length, which needs a pad token;
# when the tokenizer defines none, reuse its end-of-sequence token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

class SimpleDataset:
    """Map-style dataset pairing images on disk with tokenized captions.

    Each entry of ``data`` is a mapping with an ``'imagen'`` key (file name
    inside the ``images`` directory) and a ``'texto'`` key (the caption).

    Args:
        data: Indexable, sized collection of entries as described above.
        processor: Image processor, called as
            ``processor(image, return_tensors='pt')``; its result must expose
            ``pixel_values``.
        tokenizer: Tokenizer, called on the caption text; its result must
            expose ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, data, processor, tokenizer):
        self.data = data
        self.processor = processor
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of entries."""
        return len(self.data)

    def _encode_labels(self, text):
        """Tokenize ``text`` to 128 positions and mask the padding with -100.

        Padding positions are identified through the attention mask rather
        than by token id, so real tokens that share the pad id are kept.
        -100 is the default ``ignore_index`` of the cross-entropy loss, so
        padded positions no longer contribute to the training loss.
        """
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis added by return_tensors='pt'
        labels = encoded.input_ids.squeeze(0).clone()
        padding_positions = encoded.attention_mask.squeeze(0) == 0
        labels[padding_positions] = -100
        return labels

    def __getitem__(self, idx):
        """Return ``{'pixel_values': ..., 'labels': ...}`` for entry ``idx``."""
        item = self.data[idx]

        # Load the image; the context manager closes the file handle as soon
        # as the RGB copy has been created.
        image_path = os.path.join('images', item['imagen'])
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Drop only the batch axis so single-sample tensors keep their shape
        processed = self.processor(image, return_tensors='pt')
        pixel_values = processed.pixel_values.squeeze(0)

        return {
            'pixel_values': pixel_values,
            'labels': self._encode_labels(item['texto'])
        }

# Wrap each split in a dataset that shares the image processor and tokenizer
train_dataset = SimpleDataset(
    data=train_data,
    processor=feature_extractor,
    tokenizer=tokenizer,
)
test_dataset = SimpleDataset(
    data=test_data,
    processor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
def trainModel(batch_size, learning_rate, num_epochs, saveModel=False):
    """Fine-tune a fresh copy of the base model and log the run to MLflow.

    Every call trains its own copy of the module-level ``model``, so the
    base weights stay untouched and runs with different hyper-parameters do
    not build on each other's weights.

    Args:
        batch_size: Per-device batch size used for training and evaluation.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        saveModel: When true, the fine-tuned model is also logged to MLflow
            and registered under ``MLFLOW_MODEL_NAME``.

    Returns:
        The metrics dict returned by ``trainer.evaluate()``.
    """
    import copy

    print('\nEntrenando con:')
    print(f'  batch_size: {batch_size}')
    print(f'  learning_rate: {learning_rate}')
    print(f'  num_epochs: {num_epochs}')

    # Train a private copy: Trainer updates weights in place, and reusing the
    # shared model would make each run start from the previous run's result.
    run_model = copy.deepcopy(model)

    # Training configuration - basic arguments only
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
        logging_dir='./logs'
    )

    # Trainer without custom metrics; evaluation reports the loss only
    trainer = Trainer(
        model=run_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    trainer.train()
    eval_results = trainer.evaluate()

    # Record hyper-parameters and the evaluation loss as one MLflow run
    with mlflow.start_run(run_name=f'image-finetune-bs{batch_size}-lr{learning_rate}-ep{num_epochs}'):
        mlflow.log_params({
            'batch_size': batch_size,
            'learning_rate': learning_rate,
            'num_epochs': num_epochs
        })

        mlflow.log_metrics({
            'eval_loss': eval_results['eval_loss']
        })

        if saveModel:
            # Log the copy that was actually trained in this call
            mlflow.pytorch.log_model(run_model, 'model', registered_model_name=MLFLOW_MODEL_NAME)

        return eval_results

In [None]:
# Hyper-parameter grid to explore
batch_sizes = [1, 2, 4]
learning_rates = [5e-5, 1e-4, 2e-4, 5e-4]
num_epochs_list = [3]

# Enumerate every combination (batch size outermost, epochs innermost)
# and train once per configuration.
sweep_configs = [
    (bs, lr, epochs)
    for bs in batch_sizes
    for lr in learning_rates
    for epochs in num_epochs_list
]
for batch_size, learning_rate, num_epochs in sweep_configs:
    trainModel(batch_size, learning_rate, num_epochs)

In [None]:
# Retrain with the chosen configuration and register the result in MLflow
# (adjust the values below according to the sweep results)
print('Guardando mejor modelo...')
trainModel(
    batch_size=2,
    learning_rate=1e-4,
    num_epochs=3,
    saveModel=True,
)
print('Modelo guardado en MLflow!')