In [1]:
import os
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model
import wandb
import evaluate  
from typing import Literal

In [2]:
# Clase ReceipesDataset
class ReceipesDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        transform_image: bool = False,
        split: Literal["train", "val", "test"] = "train",
        split_size: list = [0.7, 0.1, 0.2],
        data_size: float = 1.0,
        processor = AutoProcessor.from_pretrained("model_resources")
    

    ):
        super(ReceipesDataset, self).__init__()
        self.img_path = os.path.join(data_path, "FoodImages", "Food Images")
        self.cap_path = os.path.join(
            data_path, "FoodIngredientsAndReceipesDatasetWithImageNameMapping.csv"
        )
        self.cap_data = pd.read_csv(self.cap_path)
        self.transform_image = transform_image
        self.split = split
        self.processor = processor 
        
        # Limpieza de datos
        self.cap_data = self.cap_data.dropna(subset=["Title"])
        self.cap_data = self.cap_data[
            self.cap_data["Title"].apply(lambda x: len(x.split()) > 0)
        ]
        self.cap_data = self.cap_data[
            self.cap_data["Image_Name"].apply(lambda x: x != "#NAME?")
        ]

        # División de datos
        total_size = len(self.cap_data)
        train_end = int(split_size[0] * total_size)
        val_end = train_end + int(split_size[1] * total_size)

        if split == "train":
            self.cap_data = self.cap_data[:train_end]
        elif split == "val":
            self.cap_data = self.cap_data[train_end:val_end]
        elif split == "test":
            self.cap_data = self.cap_data[val_end:]

        self.cap_data = self.cap_data.sample(frac=data_size, random_state=42)

    def __len__(self):
        return len(self.cap_data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_path, self.cap_data.iloc[idx]["Image_Name"])
        img_name += ".jpg"
        image = Image.open(img_name).convert("RGB")
        if self.transform_image:
            image = transform(image)
        caption = self.cap_data.iloc[idx]["Title"]
        encoding = self.processor(images = image , text= caption, padding= "max_length", return_tensors= "pt")
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        return encoding

transform = transforms.Compose([
	transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

"""
transform = transforms.Compose([
	transforms.Resize((224, 224)),
	transforms.ToTensor(),
	transforms.Normalize(mean=[0.485, 0.456, 0.406],
						 std=[0.229, 0.224, 0.225])
])
"""


'\ntransform = transforms.Compose([\n\ttransforms.Resize((224, 224)),\n\ttransforms.ToTensor(),\n\ttransforms.Normalize(mean=[0.485, 0.456, 0.406],\n\t\t\t\t\t\t std=[0.229, 0.224, 0.225])\n])\n'

In [3]:
# Configuración del modelo BLIP
model_id = "model_resources"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForVision2Seq.from_pretrained(model_id)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Configuración PEFT
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "self.query",
        "self.key",
        "self.value",
        "output.dense",
        "self_attn.qkv",
        "self_attn.projection",
        "mlp.fc1",
        "mlp.fc2",
    ],
)

model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 5,455,872 || all params: 252,869,948 || trainable%: 2.1576


In [4]:
# Inicialización del dataset
data_path = "/home/ldomene/caption_data/receipes"
train_dataset = ReceipesDataset(data_path=data_path, transform_image=False, split="train")
val_dataset = ReceipesDataset(data_path=data_path, transform_image=False, split="val")

"""
# Collate Function para BLIP
def collator(batch):
    images, captions = zip(*batch)
    image_inputs = processor(images=list(images), return_tensors="pt", padding=True)
    text_inputs = processor.tokenizer(
        list(captions), padding=True, return_tensors="pt"
    )
    return {
        "pixel_values": image_inputs["pixel_values"].to(device),
        "input_ids": text_inputs["input_ids"].to(device),
        "attention_mask": text_inputs["attention_mask"].to(device),
    }
"""


# DataLoader
train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=4
)
val_dataloader = DataLoader(
    val_dataset, shuffle=False, batch_size=4
)

# Optimización
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Inicialización de WandB
wandb.init(
    project="blip-finetuning",
    config={
        "epochs": 10,
        "batch_size": 4,
        "learning_rate": 1e-4,
    },
)

# Métricas
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mluisdomene[0m ([33muab-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


[nltk_data] Downloading package wordnet to /home/ldomene/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ldomene/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ldomene/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Entrenamiento
config = wandb.config
model.train()
for epoch in range(config.epochs):
    print(f"Epoch {epoch + 1}")
    total_loss = 0

    for idx, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        print(type(input_ids))
        print(input_ids.shape)
        
        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            labels=input_ids,
            attention_mask=attention_mask,
        )
        
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        wandb.log({"batch_loss": loss.item(), "epoch": epoch + 1})

    # Evaluación
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for val_batch in val_dataloader:
            val_pixel_values = val_batch["pixel_values"]
            val_input_ids = val_batch["input_ids"]

            generated_output = model.generate(
                pixel_values=val_pixel_values, max_new_tokens=64
            )
            predictions.extend(
                processor.batch_decode(generated_output, skip_special_tokens=True)
            )
            references.extend(
                processor.batch_decode(val_input_ids, skip_special_tokens=True)
            )

    res_bleu_1 = bleu.compute(
        predictions=predictions, references=[[ref] for ref in references], max_order=1
    )
    res_bleu_2 = bleu.compute(
        predictions=predictions, references=[[ref] for ref in references], max_order=2
    )
    res_meteor = meteor.compute(
        predictions=predictions, references=[[ref] for ref in references]
    )
    res_rouge = rouge.compute(
        predictions=predictions, references=[[ref] for ref in references]
    )

    avg_train_loss = total_loss / len(train_dataloader)
    print(
        f"Epoch {epoch + 1}: Train Loss = {avg_train_loss:.4f}, BLEU-1 = {res_bleu_1['bleu']:.4f}, BLEU-2 = {res_bleu_2['bleu']:.4f}, METEOR = {res_meteor['meteor']:.4f}, ROUGE-L = {res_rouge['rougeL']:.4f}"
    )

    wandb.log(
        {
            "epoch_train_loss": avg_train_loss,
            "BLEU-1": res_bleu_1["bleu"],
            "BLEU-2": res_bleu_2["bleu"],
            "ROUGE-L": res_rouge["rougeL"],
            "METEOR": res_meteor["meteor"],
        }
    )

    model.train()

Epoch 1
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])
<class 'torch.Tensor'>
torch.Size([4, 512])


KeyboardInterrupt: 

In [None]:
# Guardar modelo
os.makedirs("/home/ldomene/CAP-GIA/blip", exist_ok=True)
model.save_pretrained("/home/ldomene/CAP-GIA/blip/model")
artifact = wandb.Artifact("blip-finetuned-model", type="model")
artifact.add_dir("/home/ldomene/CAP-GIA/blip/model")
wandb.log_artifact(artifact)

wandb.finish()


In [None]:
wandb.finish()
