In [None]:
!pip install lmdb

import argparse
import os
import numpy as np
import math
import itertools
import signal
import pandas as pd
import random
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_absolute_error, r2_score
import lmdb
import pickle
from io import BytesIO
from sklearn.preprocessing import StandardScaler
import torchvision.transforms as transforms
from torchvision.utils import save_image
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torch.autograd import Variable
from torchvision import models

import gc

import zipfile
from io import BytesIO

import torch.nn as nn
import torch.nn.functional as F
import torch

import matplotlib.pyplot as plt
from PIL import Image

from torch.amp import GradScaler, autocast
from torch.utils.data import random_split

import glob



In [None]:
# Mount Google Drive (Colab-specific) so the CSV, the image archive and the
# model checkpoints stored under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

Using cuda device


In [None]:
# Load the processed car-listing table from Drive.
# NOTE(review): the LMDB-building cell further down reads this same CSV again
# and re-indexes it by 'url', so this load looks redundant — confirm before removing.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/data/datos_coches_procesados.csv')

In [None]:
# Unpack the image archive stored on Drive into a local working directory.
import zipfile
import os

ruta_zip = "/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/data/imagenes_comprimidas.zip"
carpeta_destino = "dataset_extraido"

# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(carpeta_destino, exist_ok=True)

# The whole archive is extracted on every run of this cell.
with zipfile.ZipFile(ruta_zip, 'r') as zip_ref:
    zip_ref.extractall(carpeta_destino)

print("Descompresión completada.")

Descompresión completada.


In [None]:
# Paths to the extracted images, the listing CSV and the LMDB output.
root_dir = "dataset_extraido/imagenes_comprimidas"
csv_path = "/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/data/datos_coches_procesados.csv"
lmdb_path = "dataset.lmdb"

# Read the CSV and index it by listing URL; image sub-folders are matched
# against this index by name.
df = pd.read_csv(csv_path)
df = df.set_index('url')


# Build the LMDB database: one record per listing folder, keyed by folder name.
# Each value is a pickle of {"images": [(file_name, raw_bytes), ...], "metadata": {...}}.
env = lmdb.open(lmdb_path, map_async=True, map_size=20*1024**3, meminit=False, writemap=True, lock=False)

try:
    with env.begin(write=True) as txn:
        for url in os.listdir(root_dir):
            folder_path = os.path.join(root_dir, url)
            if not os.path.isdir(folder_path) or url not in df.index:
                continue

            images = []

            # Load every image of the sub-folder. Names are sorted so the image
            # sequence stored for a listing is the same on every run
            # (os.listdir order is arbitrary). The sort is lexicographic.
            for img_name in sorted(os.listdir(folder_path)):
                img_path = os.path.join(folder_path, img_name)
                if not os.path.isfile(img_path):
                    continue  # nested directories cannot be read as image bytes
                with open(img_path, 'rb') as f:
                    images.append((img_name, f.read()))  # keep the name and the content

            if not images:
                continue

            row = df.loc[url]
            if isinstance(row, pd.DataFrame):
                # Duplicated 'url' in the CSV: .loc returns several rows; keep the first
                # so that "metadata" is always a flat {column: value} dict.
                row = row.iloc[0]

            key = url.encode('utf-8')  # the folder name alone is the key
            value = pickle.dumps({
                "images": images,    # list of (name, bytes)
                "metadata": row.to_dict(),
            })

            txn.put(key, value)

    # The environment is opened with map_async=True, so force a synchronous
    # flush before anything else reads the file.
    env.sync(True)
finally:
    # Later cells open this same path again; release the writer handle first.
    env.close()

# Funciones para partir el dataset

In [None]:
random.seed(42)
def split_keys(path, train, dev, seed=42):
  """Split the keys of an LMDB database into train / dev / test lists.

  Args:
    path: location of the LMDB environment to read.
    train: fraction of the keys assigned to the training split.
    dev: fraction of the keys assigned to the dev split; the remainder
      becomes the test split.
    seed: seed of the private shuffle generator. With the default the
      permutation is the same one a freshly seeded ``random.seed(42)``
      followed by ``random.shuffle`` produces, but it no longer depends on
      how much the global generator was used before the call.

  Returns:
    Tuple ``(train_keys, dev_keys, test_keys)`` of lists of raw key bytes.

  Raises:
    ValueError: if the fractions are negative or add up to more than 1.
  """
  if train < 0 or dev < 0 or train + dev > 1:
    raise ValueError(f"invalid split fractions: train={train}, dev={dev}")

  env = lmdb.open(path, readonly=True, lock=False)
  try:
    with env.begin() as txn:
      # values=False: only keys are needed, so the pickled image payloads
      # are not read while collecting them.
      keys = list(txn.cursor().iternext(keys=True, values=False))
  finally:
    env.close()  # do not leave an extra handle on the database open

  random.Random(seed).shuffle(keys)

  n = len(keys)
  train_end = int(train * n)
  dev_end = int((train + dev) * n)
  return keys[:train_end], keys[train_end:dev_end], keys[dev_end:]

In [None]:
def set_scaler(y_train):
  """Fit a StandardScaler on raw price strings.

  Each price is parsed like ``LMDBDataset`` does: the text before the first
  comma is kept, dots are dropped, and the result is read as an integer
  (``"12.500,00"`` -> ``12500``). Entries that are not strings (e.g. NaN from
  a missing CSV cell) or that do not parse are skipped instead of aborting
  the fit.

  Args:
    y_train: iterable of raw price values.

  Returns:
    A ``StandardScaler`` fitted on the parsed prices, shaped ``(n, 1)``.

  Raises:
    ValueError: if no price could be parsed.
  """
  import warnings

  parsed = []
  skipped = 0
  for raw in y_train:
    if not isinstance(raw, str):
      skipped += 1
      continue
    try:
      parsed.append(int(raw.split(",")[0].replace('.', '')))
    except ValueError:
      skipped += 1

  if skipped:
    warnings.warn(f"set_scaler: skipped {skipped} unparseable price value(s)")
  if not parsed:
    raise ValueError("set_scaler: no parseable prices to fit the scaler on")

  targets = np.array(parsed).reshape(-1, 1)
  scaler = StandardScaler()
  scaler.fit(targets)
  return scaler

In [None]:
def get_labels_train(df, train_keys):
  """Return the raw 'precio' values of the rows matching the given keys.

  Args:
    df: DataFrame whose index holds the listing identifiers as strings.
    train_keys: iterable of UTF-8 encoded keys (bytes).

  Returns:
    List with the 'precio' column of every row whose index label appears
    among the decoded keys.
  """
  wanted_labels = []
  for raw_key in train_keys:
    wanted_labels.append(raw_key.decode('utf-8'))
  shared_labels = df.index.intersection(wanted_labels)
  selected_rows = df.loc[shared_labels]
  return selected_rows['precio'].tolist()

In [None]:
class LMDBDataset(Dataset):
    """Dataset of car listings stored in LMDB as pickled records.

    Every record is expected to be a dict with "images" (list of
    ``(name, bytes)``) and "metadata" (dict with at least 'marca', 'modelo'
    and 'desc'; 'año', 'km' and 'precio' are parsed leniently).

    Args:
        lmdb_path: path of the LMDB environment (opened read-only).
        keys: sequence of raw keys this split may serve.
        scaler: fitted object with ``transform([[value]])`` used to normalise
            the price target.
        transform: callable applied to each PIL image; defaults to
            ``transforms.ToTensor()``.
    """

    def __init__(self, lmdb_path ,keys, scaler, transform=None):
        self.env = lmdb.open(lmdb_path, readonly=True, lock=False)
        self.keys = keys
        self.transform = transform or transforms.ToTensor()
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self.normalizar = False
        self.scaler = scaler

    def __len__(self):
        return len(self.keys)

    @staticmethod
    def _parse_int_field(metadata, field):
        """Read ``metadata[field]`` as an integer, or None when missing/invalid."""
        try:
            return int(float(metadata[field]))
        except (KeyError, TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def _parse_precio(metadata):
        """Parse a price like '12.500,00' into 12500; -1 when missing/invalid."""
        try:
            return int(metadata['precio'].split(",")[0].replace('.', ''))
        except (KeyError, AttributeError, TypeError, ValueError):
            # NOTE(review): the -1 sentinel is still pushed through the scaler and
            # returned as a regular target — consider filtering such keys upstream.
            return -1

    def __getitem__(self, idx):
        """Return ``(image_list, text, num_images, normalised_price)`` for one listing.

        Raises:
            KeyError: if the key of this index is absent from the database.
        """
        key = self.keys[idx]
        with self.env.begin() as txn:
            raw = txn.get(key)
            if raw is None:
                raise KeyError(f"key {key!r} not found in the LMDB database")
            # pickle is only safe here because the database is produced by this
            # notebook; never point this class at an untrusted LMDB file.
            data = pickle.loads(raw)

        images = data["images"]
        metadata = data["metadata"]
        year = self._parse_int_field(metadata, 'año')
        km = self._parse_int_field(metadata, 'km')
        precio = self._parse_precio(metadata)

        # Text handed to the language model.
        text = (
            f"Modelo: {metadata['marca']!s} {metadata['modelo']!s} "
            f"año: {year!s} Kilómetros: {km!s} {metadata['desc']!s}"
        )

        precio = self.normalizar_precio(precio)

        image_list = []
        for img_name, img_bytes in images:
            img = Image.open(BytesIO(img_bytes)).convert("RGB")
            image_list.append(self.transform(img))

        return image_list, text, len(images), precio

    def normalizar_precio(self, precio):
        """Scale a single price with the fitted scaler and return the scalar."""
        return self.scaler.transform([[precio]])[0][0]

In [None]:
def custom_collate_fn(batch):
    """Collate listings that each carry a variable number of images.

    Args:
        batch: iterable of ``(images, metadata, num_images, precio)`` samples,
            where ``images`` is a list of equally shaped tensors.

    Returns:
        Tuple ``(all_images, all_metadata, num_images_per_group, precios)``:
        every image of the batch stacked along a new first dimension, followed
        by the per-sample metadata, image counts and targets as plain lists.
    """
    samples = list(batch)

    # Flatten the per-listing image lists, keeping sample order.
    flat_images = [img for imgs, _, _, _ in samples for img in imgs]
    metadata_per_sample = [meta for _, meta, _, _ in samples]
    group_sizes = [count for _, _, count, _ in samples]
    targets = [price for _, _, _, price in samples]

    stacked = torch.stack(flat_images, dim=0)  # [total_images, C, H, W]
    return stacked, metadata_per_sample, group_sizes, targets

In [None]:
# 85 % of the keys go to train, 5 % to dev and the remainder to test.
train_keys, dev_keys, test_keys = split_keys(lmdb_path, 0.85, 0.05)

In [None]:
# Fit the price scaler on the training split only; the same fitted scaler is
# later handed to the dev and test datasets.
train_precios = get_labels_train(df, train_keys)
scaler = set_scaler(train_precios)

In [None]:
# One dataset per split; all three read the same LMDB file and share the scaler.
train_dataset = LMDBDataset(lmdb_path, train_keys, scaler)
dev_dataset = LMDBDataset(lmdb_path, dev_keys, scaler)
test_dataset = LMDBDataset(lmdb_path, test_keys, scaler)

In [None]:
# Loader over the dev split.
# NOTE(review): the training cells further down build and use trainloader /
# devloader / testloader; this `dataloader` variable does not appear to be
# passed anywhere afterwards — confirm whether it is still needed.
dataloader = DataLoader(dev_dataset, batch_size = 32, collate_fn=custom_collate_fn)

In [None]:
# Sanity check: number of listings in the train, dev and test splits (in that order).
print(len(train_dataset))
print(len(dev_dataset))
print(len(test_dataset))

6907
406
813


# Modelos

In [None]:
class MultiImageFlexibleVGG(nn.Module):
    """Wrap an image backbone and run it over a flat batch of images.

    Args:
        model: module mapping an image batch to one feature vector per image.
        feature_dim: unused; kept so existing constructor calls keep working.
    """

    def __init__(self, model, feature_dim=4096):
        super().__init__()
        self.model = model

    def forward(self, images):
        """Return the backbone features of ``images``.

        Gradients are tracked only when the caller has them enabled AND the
        backbone holds at least one parameter with ``requires_grad=True``.
        An unconditional ``torch.no_grad()`` here would silently stop the
        unfrozen backbone layers from ever being updated. A fully frozen
        backbone still runs without building a graph, which saves memory.
        """
        first_param = next(self.parameters(), None)
        # A parameter-less backbone has no device of its own: keep the input where it is.
        target_device = first_param.device if first_param is not None else images.device
        all_imgs = images.to(target_device)

        has_trainable = any(p.requires_grad for p in self.model.parameters())
        with torch.set_grad_enabled(torch.is_grad_enabled() and has_trainable):
            feats = self.model(all_imgs)
        return feats

In [None]:
class FeatLSTM(nn.Module):
  """Summarise each listing's image features with a bidirectional LSTM.

  Args:
    input_size: width of one per-image feature vector.
    hidden_size: LSTM hidden size per direction.
  """

  def __init__(self, input_size, hidden_size):
    super(FeatLSTM, self).__init__()
    self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True, bidirectional=True)

  def forward(self, x, lengths):
    """Encode consecutive row groups of ``x`` into one vector each.

    Args:
      x: 2-D tensor whose rows are per-image features, with the images of a
        listing stored contiguously.
      lengths: number of rows belonging to each listing, in order.

    Returns:
      Tensor of shape ``(len(lengths), 2 * hidden_size)`` holding the final
      forward and backward hidden states of every listing, on ``x``'s device.
    """
    lengths = torch.as_tensor(lengths)
    if lengths.numel() == 0:
      return x.new_zeros((0, 2 * self.lstm.hidden_size))

    cut_indices = torch.cumsum(lengths, dim=0)[:-1]
    sequences = torch.tensor_split(x, cut_indices.tolist(), dim=0)

    # One packed call replaces a Python loop of per-listing LSTM calls and the
    # repeated torch.cat; enforce_sorted=False keeps the original listing order.
    packed = nn.utils.rnn.pack_sequence(list(sequences), enforce_sorted=False)
    _, (h, _) = self.lstm(packed)

    # h is (num_directions, batch, hidden): put the batch first, then join both directions.
    return h.transpose(0, 1).reshape(len(sequences), -1)

In [None]:
# Para el MODELO DE LENGUAJE
# https://huggingface.co/Recognai/distilbert-base-es-multilingual-cased

In [None]:
class TextModel(nn.Module):
  """Tokenise raw strings and return one embedding per string.

  Args:
    tokenizer: callable accepting ``(texts, padding, truncation, return_tensors)``
      and returning a mapping that supports ``.to(device)``.
    model: callable taking the tokenizer output as keyword arguments and
      returning an object with a ``last_hidden_state`` tensor.
    device: device the tokenised inputs are moved to before the forward pass.
  """

  def __init__(self, tokenizer, model, device):
    super().__init__()
    self.device = device
    self.tokenizer = tokenizer
    self.model = model

  def forward(self, x):
    """Return the last hidden state at sequence position 0 for every text in ``x``."""
    encoded = self.tokenizer(
        x,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ).to(self.device)
    hidden_states = self.model(**encoded).last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position

In [None]:
class MultimodalModel(nn.Module):
  """Price regressor fusing per-listing image features with a text embedding.

  Args:
    multiVGG: module producing one feature vector per image.
    featLSTM: module reducing the per-image features of each listing to one vector.
    textModel: module producing one embedding per text.
    device: device all three sub-modules are moved to.
    image_latent_dim: width of the per-listing image vector.
    text_latent_dim: width of the text embedding.
  """

  def __init__(self, multiVGG, featLSTM, textModel, device, image_latent_dim=1024, text_latent_dim=768):
    super().__init__()
    self.device = device
    self.multiVGG = multiVGG.to(device)
    self.featLSTM = featLSTM.to(device)
    self.textModel = textModel.to(device)
    self.text_latent_dim = text_latent_dim
    self.image_latent_dim = image_latent_dim
    self.latent_dim = text_latent_dim + image_latent_dim

    # Regression head: fused latent -> 1024 -> 512 -> 1.
    head_layers = [
        nn.Linear(self.latent_dim, 1024),
        nn.ReLU(),
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Linear(512, 1),
    ]
    self.regresion = nn.Sequential(*head_layers)

  def forward(self, images, text, lengths):
    """Return one prediction per listing, shaped ``(num_listings, 1)``.

    ``lengths`` tells how many consecutive rows of ``images`` belong to each
    listing; image features come first in the fused vector, text second.
    """
    per_image_feats = self.multiVGG(images)
    text_latent = self.textModel(text)
    per_listing_feats = self.featLSTM(per_image_feats, lengths)
    fused = torch.cat((per_listing_feats, text_latent), dim=1)
    return self.regresion(fused)


# Trainer

In [None]:
class EarlyStopping:
    """Track a validation loss and flag when it has stopped improving.

    Args:
        patience: number of consecutive non-improving calls after which
            ``should_stop`` becomes True.
        min_delta: margin by which a loss must undercut the best one seen so
            far to count as an improvement.
    """

    def __init__(self, patience=5, min_delta=0.0):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = float('inf')
        self.counter = 0
        self.should_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss``; updates ``best_loss``, ``counter`` and ``should_stop``."""
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        # No improvement: one more strike towards stopping.
        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True


In [None]:
class Trainer():
  """Training / evaluation driver for the multimodal price regressor.

  Args:
    model: module called as ``model(images, text, lengths)``.
    dataset: training dataset (stored only; the loops use the loaders).
    dataloader: loader yielding ``(images, text, num_images, y)`` training batches.
    devloader: loader used by ``eval``.
    testloader: loader used by ``test``.
    num_epochs: maximum number of training epochs.
    learning_rate: Adam learning rate.
    device: device batches are moved to.
    save: when truthy, the weights are written to ``best_path`` whenever the
      dev loss beats the early-stopping tracker's best loss.
    earlystopping: callable tracker exposing ``best_loss`` and ``should_stop``.
    unfreeze_epoch: zero-based epoch index after which ``descongelar_capas``
      is called.
  """

  def __init__(self, model, dataset, dataloader, devloader, testloader, num_epochs, learning_rate, device, save, earlystopping, unfreeze_epoch=8):
    self.model = model
    self.dataset = dataset
    self.dataloader = dataloader
    self.num_epochs = num_epochs
    self.learning_rate = learning_rate
    self.device = device
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.MSELoss()
    # Stored under its own name: assigning the flag to ``self.save`` would shadow
    # the ``save`` method and make ``trainer.save()`` fail with a TypeError.
    self.save_best = bool(save)
    self.earlystopping = earlystopping
    self.devloader = devloader
    self.testloader = testloader
    self.unfreeze_epoch = unfreeze_epoch
    self.save_path ='/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/multimodal_model.pth'
    # Checkpoint written during training when the dev loss improves.
    self.best_path = '/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/multimodal_model-2.pth'

  def _prepare_batch(self, images, num_images, y):
    """Move one collated batch to ``self.device`` as ``(images, lengths, targets)``."""
    lengths = torch.tensor(num_images).to(self.device)
    targets = torch.tensor(y, dtype=torch.float32).to(self.device)
    return images.to(self.device), lengths, targets

  def train(self):
    """Run the training loop with per-epoch evaluation, checkpointing and early stopping."""
    for epoch in range(self.num_epochs):
      self.model.train()
      for images, text, num_images, y in self.dataloader:
        images, lengths, y = self._prepare_batch(images, num_images, y)

        self.optimizer.zero_grad()

        output = self.model(images, text, lengths)
        loss = self.criterion(output.flatten(), y.flatten())
        loss.backward()
        self.optimizer.step()
        print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {loss.item()}")

      eval_loss = self.eval()

      # Compare against the tracker BEFORE it is updated with this epoch's loss.
      if self.save_best and self.earlystopping.best_loss > eval_loss:
        torch.save(self.model.state_dict(), self.best_path)

      print(f"Epoch {epoch+1}/{self.num_epochs}, Eval Loss: {eval_loss}")
      if epoch == self.unfreeze_epoch:
        self.descongelar_capas()
      self.earlystopping(eval_loss)
      if self.earlystopping.should_stop:
        print("Early stopping triggered")
        break

  def eval(self):
    """Return the mean of the per-batch MSE losses over ``devloader``.

    Raises:
      ValueError: if ``devloader`` yields no batches.
    """
    self.model.eval()
    losses = []
    with torch.no_grad():
      for images, text, num_images, y in self.devloader:
        images, lengths, y = self._prepare_batch(images, num_images, y)
        output = self.model(images, text, lengths)
        loss = self.criterion(output.flatten(), y.flatten())
        losses.append(loss.item())
    if not losses:
      raise ValueError("devloader produced no batches; cannot compute an evaluation loss")
    return sum(losses)/len(losses)

  def test(self):
    """Print and return ``(mae, r2)`` over ``testloader``.

    Both metrics are computed on the targets exactly as the loader delivers
    them, i.e. in whatever scale the dataset applies to the price.
    """
    self.model.eval()
    targets = []
    preds = []
    with torch.no_grad():
      for images, text, num_images, y in self.testloader:
        images, lengths, y = self._prepare_batch(images, num_images, y)
        output = self.model(images, text, lengths)
        preds.append(output)
        targets.append(y)

    # The collected items are already tensors: concatenate them directly rather
    # than re-wrapping each one with torch.tensor (which emits a copy warning).
    y_pred = torch.cat(preds).to('cpu').numpy().flatten()
    y_true = torch.cat(targets).to('cpu').numpy().flatten()

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MAE: {mae:.4f}")
    print(f"R²:  {r2:.4f}")

    return mae, r2

  def save(self, path=None):
    """Write the model's state dict to ``path`` (default: ``self.save_path``)."""
    if path is None:
      torch.save(self.model.state_dict(), self.save_path)
    else:
      torch.save(self.model.state_dict(), path)

  def descongelar_capas(self):
    """Mark extra backbone / text-model parameters as trainable.

    Parameters are selected by substring match on their names: '8.3.ghost2'
    in the image branch and '5.' in the text branch.
    """
    print("Descongelando capas...")
    for name, param in self.model.multiVGG.named_parameters():
      if '8.3.ghost2' in name:
        param.requires_grad = True
    for name, param in self.model.textModel.named_parameters():
      if '5.' in name:
        param.requires_grad = True


# Inicialización de los modelos, entrenamiento y testeo

In [None]:
# Image backbone: timm's 'ghostnet_100' with pretrained weights.
import timm
v_model = timm.create_model('ghostnet_100', pretrained=True)
# Replace the classification layer with a pass-through so the model outputs
# the features that would feed the classifier (FeatLSTM below is built with
# input size 1280 — presumably that feature width; confirm).
v_model.classifier = nn.Identity()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Freeze the whole backbone except its last stage (blocks.9) and conv_head.
# The loop covers ALL parameters of v_model and matches on name prefixes:
# iterating only over v_model.blocks would leave conv_stem / bn1 trainable,
# and a bare "'9' in name" test would match any name that merely contains a 9.
trainable_prefixes = ('blocks.9.', 'conv_head.')
for name, param in v_model.named_parameters():
    param.requires_grad = name.startswith(trainable_prefixes)

In [None]:
# List every backbone parameter together with its trainable / frozen state.
for param_name, parameter in v_model.named_parameters():
    if parameter.requires_grad:
        estado = "entrenable"
    else:
        estado = "congelada"
    print(f"{param_name:50} → {estado}")

conv_stem.weight                                   → entrenable
bn1.weight                                         → entrenable
bn1.bias                                           → entrenable
blocks.0.0.ghost1.primary_conv.0.weight            → congelada
blocks.0.0.ghost1.primary_conv.1.weight            → congelada
blocks.0.0.ghost1.primary_conv.1.bias              → congelada
blocks.0.0.ghost1.cheap_operation.0.weight         → congelada
blocks.0.0.ghost1.cheap_operation.1.weight         → congelada
blocks.0.0.ghost1.cheap_operation.1.bias           → congelada
blocks.0.0.ghost2.primary_conv.0.weight            → congelada
blocks.0.0.ghost2.primary_conv.1.weight            → congelada
blocks.0.0.ghost2.primary_conv.1.bias              → congelada
blocks.0.0.ghost2.cheap_operation.0.weight         → congelada
blocks.0.0.ghost2.cheap_operation.1.weight         → congelada
blocks.0.0.ghost2.cheap_operation.1.bias           → congelada
blocks.1.0.ghost1.primary_conv.0.weight            →

In [None]:
# Image branch: the backbone wrapper plus a bidirectional LSTM over per-image
# features (input size 1280, hidden size 512 per direction).
multiVGG = MultiImageFlexibleVGG(v_model).to(device)
featLSTM = FeatLSTM(1280, 512).to(device)

In [None]:
# Text branch backbone: tokenizer and encoder loaded from the Hugging Face Hub
# checkpoint "Recognai/distilbert-base-es-multilingual-cased".
tokenizer = AutoTokenizer.from_pretrained("Recognai/distilbert-base-es-multilingual-cased")
model = AutoModel.from_pretrained("Recognai/distilbert-base-es-multilingual-cased")

In [None]:
# Freeze the text encoder except the parameters whose name contains
# '5.output_layer_norm' or '5.ffn'.
for name, param in model.named_parameters():
    keep_trainable = ('5.output_layer_norm' in name) or ('5.ffn' in name)
    param.requires_grad = keep_trainable


In [None]:
# List every text-encoder parameter together with its trainable / frozen state.
for param_name, parameter in model.named_parameters():
    if parameter.requires_grad:
        estado = "entrenable"
    else:
        estado = "congelada"
    print(f"{param_name:50} → {estado}")

embeddings.word_embeddings.weight                  → congelada
embeddings.position_embeddings.weight              → congelada
embeddings.LayerNorm.weight                        → congelada
embeddings.LayerNorm.bias                          → congelada
transformer.layer.0.attention.q_lin.weight         → congelada
transformer.layer.0.attention.q_lin.bias           → congelada
transformer.layer.0.attention.k_lin.weight         → congelada
transformer.layer.0.attention.k_lin.bias           → congelada
transformer.layer.0.attention.v_lin.weight         → congelada
transformer.layer.0.attention.v_lin.bias           → congelada
transformer.layer.0.attention.out_lin.weight       → congelada
transformer.layer.0.attention.out_lin.bias         → congelada
transformer.layer.0.sa_layer_norm.weight           → congelada
transformer.layer.0.sa_layer_norm.bias             → congelada
transformer.layer.0.ffn.lin1.weight                → congelada
transformer.layer.0.ffn.lin1.bias                  → co

In [None]:
# Wrap tokenizer + encoder so the text branch can be called directly on a list of strings.
textmodel = TextModel(tokenizer, model, device).to(device)

In [None]:
# Full model with the default latent sizes (image 1024 = 2 x 512 from the
# bidirectional FeatLSTM built above, text 768).
multimodal_model = MultimodalModel(multiVGG, featLSTM, textmodel, device)

In [None]:
# Only the training loader shuffles. The dev loss is an average of per-batch
# means and the last dev batch is partial, so shuffling the dev/test loaders
# would make the reported loss vary between runs for identical weights.
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)
devloader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)
testloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
# Stop after 12 consecutive epochs whose dev loss fails to beat the best one by more than 0.01.
earlystopping = EarlyStopping(patience = 12, min_delta=0.01)

In [None]:
# Up to 30 epochs, Adam learning rate 0.001, checkpointing enabled (save=True).
trainer = Trainer(multimodal_model.to(device), train_dataset, trainloader, devloader, testloader, 30, 0.001, device, True, earlystopping)

In [None]:
# Run training; the loss of every batch and the dev loss of every epoch are printed.
trainer.train()

Epoch 1/30, Loss: 0.1713414490222931
Epoch 1/30, Loss: 0.6342011094093323
Epoch 1/30, Loss: 1.241392731666565
Epoch 1/30, Loss: 0.4843325912952423
Epoch 1/30, Loss: 0.2563340663909912
Epoch 1/30, Loss: 2.6854259967803955
Epoch 1/30, Loss: 0.1396353542804718
Epoch 1/30, Loss: 0.26212918758392334
Epoch 1/30, Loss: 0.3765050172805786
Epoch 1/30, Loss: 0.273301899433136
Epoch 1/30, Loss: 0.2794382572174072
Epoch 1/30, Loss: 0.4893430471420288
Epoch 1/30, Loss: 0.13794009387493134
Epoch 1/30, Loss: 0.20097225904464722
Epoch 1/30, Loss: 0.11067746579647064
Epoch 1/30, Loss: 35.74848937988281
Epoch 1/30, Loss: 0.7789672613143921
Epoch 1/30, Loss: 0.14330121874809265
Epoch 1/30, Loss: 1.0006150007247925
Epoch 1/30, Loss: 0.21298864483833313
Epoch 1/30, Loss: 0.22624196112155914
Epoch 1/30, Loss: 2.789778709411621
Epoch 1/30, Loss: 0.2403961718082428
Epoch 1/30, Loss: 0.15360543131828308
Epoch 1/30, Loss: 0.17403990030288696
Epoch 1/30, Loss: 0.24317757785320282
Epoch 1/30, Loss: 0.147191241383

In [None]:
# Reload the best checkpoint written during training. map_location keeps the
# load working when the current runtime's device differs from the one that
# saved the file (e.g. a CPU-only session).
best_checkpoint = '/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/multimodal_model-2.pth'
multimodal_model.load_state_dict(torch.load(best_checkpoint, map_location=device))
trainer = Trainer(multimodal_model.to(device), train_dataset, trainloader, devloader, testloader, 30, 0.001, device, True, earlystopping)

In [None]:
# Evaluate on the test split. The targets come out of LMDBDataset already
# passed through the scaler, so MAE is expressed in scaled units, not in the
# original price currency.
trainer.test()

MAE: 0.4086
R²:  0.1921


  y_pred = torch.cat([torch.tensor(p) for p in preds]).to('cpu').numpy().flatten()
  y_true = torch.cat([torch.tensor(t) for t in targets]).to('cpu').numpy().flatten()


(0.408601850271225, 0.19210952520370483)