In [None]:
import argparse
import os
import numpy as np
import math
import itertools
import signal
import pandas as pd
import random
from sklearn.metrics import mean_absolute_error, r2_score
import pickle
from io import BytesIO

import torchvision.transforms as transforms
from torchvision.utils import save_image
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torch.autograd import Variable
from torchvision import models

import gc

import zipfile
from io import BytesIO

import torch.nn as nn
import torch.nn.functional as F
import torch

import matplotlib.pyplot as plt
from PIL import Image

from torch.amp import GradScaler, autocast
from torch.utils.data import random_split

import glob

In [None]:
# Colab-only: mount Google Drive at /content/drive. The dataset CSV and the model
# checkpoints used by later cells are read from / written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Choose the compute device once; later cells pass this string to the model and trainer.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [None]:
# Load the listings table from Drive. Later cells read the columns
# 'marca', 'modelo', 'año', 'km', 'desc' and 'precio' from this frame.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/datos_coches_procesados.csv')

In [None]:
# Seed the global `random` generator; split_keys() draws its shuffle from it.
random.seed(42)
def split_keys(df, train, dev):
  """Shuffle the row positions of ``df`` and cut them into train/dev/test lists.

  Args:
    df: any object supporting ``len()``; only its length is used.
    train: fraction of the rows assigned to the training split.
    dev: fraction of the rows assigned to the dev split.

  Returns:
    A ``(train_keys, dev_keys, test_keys)`` tuple of integer lists. The test
    split receives whatever is left after the train and dev cuts.
  """
  n = len(df)
  keys = list(range(n))
  random.shuffle(keys)

  # Cut points are truncated to integers, so rounding remainders go to the test split.
  train_end = int(train * n)
  dev_end = int((train + dev) * n)

  return keys[:train_end], keys[train_end:dev_end], keys[dev_end:]

In [None]:
def set_scaler(y_train):
  """Fit a ``StandardScaler`` on the integer part of the given price strings.

  Each entry is cut at the first comma and every '.' is removed before the
  conversion to ``int`` (e.g. ``"12.500,00"`` becomes ``12500``).

  Args:
    y_train: iterable of price strings.

  Returns:
    The fitted ``StandardScaler``.
  """
  prices = [int(raw.split(",")[0].replace('.', '')) for raw in y_train]
  # StandardScaler expects a 2-D array: one column, one row per price.
  column = np.array(prices).reshape(-1, 1)
  scaler = StandardScaler()
  scaler.fit(column)
  return scaler

In [None]:
def get_labels_train(df, train_keys):
  """Return the 'precio' values of the rows labelled ``train_keys``, as a list.

  Args:
    df: frame with a 'precio' column.
    train_keys: index labels of the training rows; the output keeps their order.
  """
  return df.loc[train_keys, 'precio'].tolist()

In [None]:
class TextDataset(Dataset):
    """Dataset yielding ``(text, scaled_price)`` pairs for selected rows of a listings frame.

    The text concatenates brand, model, year, kilometres and free-text description.
    The price string is reduced to its integer part and passed through ``scaler``.

    Args:
        df: frame with the columns 'marca', 'modelo', 'año', 'km', 'desc' and 'precio'.
        scaler: object exposing ``transform(2-D array-like)``, applied to each price.
        lista_elementos: index labels of ``df`` that make up this split.
    """

    # Conversion failures expected from missing or malformed cells. Anything else
    # (e.g. KeyboardInterrupt, a missing column) is allowed to propagate.
    _PARSE_ERRORS = (AttributeError, TypeError, ValueError, OverflowError)

    # Value used when the price cannot be parsed. It is still scaled and returned
    # as a target, exactly as before.
    # NOTE(review): filtering such rows upstream would likely be preferable.
    _MISSING_PRICE = -1

    def __init__(self, df, scaler, lista_elementos):
        self.df = df
        self.scaler = scaler
        self.keys = lista_elementos

    def __len__(self):
        return len(self.keys)

    @classmethod
    def _to_int_or_none(cls, value):
        """Convert a numeric-looking cell to ``int``; return ``None`` if it cannot be parsed."""
        try:
            return int(float(value))
        except cls._PARSE_ERRORS:
            return None

    @classmethod
    def _parse_price(cls, raw):
        """Parse a price string such as ``"12.500,00"`` to ``12500``; return the sentinel on failure."""
        try:
            return int(raw.split(",")[0].replace('.', ''))
        except cls._PARSE_ERRORS:
            return cls._MISSING_PRICE

    def __getitem__(self, idx):
        """Return ``(text, scaled_price)`` for the ``idx``-th key of this split."""
        metadata = self.df.loc[self.keys[idx]]

        año = self._to_int_or_none(metadata['año'])
        km = self._to_int_or_none(metadata['km'])
        precio = self._parse_price(metadata['precio'])

        # transform() takes and returns a 2-D structure; unwrap the single value.
        precio = self.scaler.transform([[precio]])[0][0]
        text = (
            f"Modelo: {metadata['marca']!s} {metadata['modelo']!s} "
            f"año: {año!s} Kilómetros: {km!s} {metadata['desc']!s}"
        )

        return text, precio

In [None]:
# 85% of the rows go to train and 5% to dev; the remaining rows form the test split.
train_keys, dev_keys, test_keys = split_keys(df, 0.85, 0.05)

In [None]:
# Raw 'precio' values of the training rows only (used to fit the target scaler).
y_train = get_labels_train(df, train_keys)

In [None]:
# The scaler is fitted on training prices only; dev and test reuse these statistics.
scaler = set_scaler(y_train)

In [None]:
# All three datasets share the same frame and the train-fitted scaler;
# they differ only in which row keys they expose.
train_dataset = TextDataset(df, scaler, train_keys)
dev_dataset = TextDataset(df, scaler, dev_keys)
test_dataset = TextDataset(df, scaler, test_keys)

In [None]:
# Only the training loader is shuffled. Dev and test keep a fixed order so that
# repeated evaluations batch the data identically: the dev loss is an average of
# per-batch means, so random batch composition would make it fluctuate between runs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
class TextModel(nn.Module):
  """Text regressor: tokenizer -> encoder -> linear head on the first token's hidden state.

  Args:
    tokenizer: callable that turns a batch of strings into encoder inputs ('pt' tensors).
    model: encoder whose output exposes ``last_hidden_state``; the head expects a
      hidden size of 768.
    device: device the tokenized batch is moved to before the encoder runs.
  """
  def __init__(self, tokenizer, model, device):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.device = device
    self.fc = nn.Linear(768, 1)

  def forward(self, x):
    """Return one prediction per input string, shaped ``(batch, 1)``."""
    encoded = self.tokenizer(x, padding=True, truncation=True, return_tensors='pt').to(self.device)
    hidden_states = self.model(**encoded).last_hidden_state
    # Only the hidden state at sequence position 0 feeds the regression head.
    first_token = hidden_states[:, 0, :]
    return self.fc(first_token)

In [None]:
# Tokenizer and encoder are loaded from the same Hugging Face Hub checkpoint so the
# vocabulary matches the weights (files are downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained("Recognai/distilbert-base-es-multilingual-cased")
model = AutoModel.from_pretrained("Recognai/distilbert-base-es-multilingual-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/174k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/255M [00:00<?, ?B/s]

In [None]:
# Freeze the encoder, leaving trainable only the parameters whose names contain
# '5.ffn' or '5.output_layer_norm' (the feed-forward block and output layer norm
# of encoder layer 5). The regression head is not part of `model`, so it is
# unaffected by this cell.
for name, param in model.named_parameters():
    param.requires_grad = ('5.output_layer_norm' in name) or ('5.ffn' in name)

In [None]:
# List every encoder parameter together with whether it will receive gradients.
for name, param in model.named_parameters():
    if param.requires_grad:
        estado = "entrenable"
    else:
        estado = "congelada"
    print(f"{name:50} → {estado}")

embeddings.word_embeddings.weight                  → congelada
embeddings.position_embeddings.weight              → congelada
embeddings.LayerNorm.weight                        → congelada
embeddings.LayerNorm.bias                          → congelada
transformer.layer.0.attention.q_lin.weight         → congelada
transformer.layer.0.attention.q_lin.bias           → congelada
transformer.layer.0.attention.k_lin.weight         → congelada
transformer.layer.0.attention.k_lin.bias           → congelada
transformer.layer.0.attention.v_lin.weight         → congelada
transformer.layer.0.attention.v_lin.bias           → congelada
transformer.layer.0.attention.out_lin.weight       → congelada
transformer.layer.0.attention.out_lin.bias         → congelada
transformer.layer.0.sa_layer_norm.weight           → congelada
transformer.layer.0.sa_layer_norm.bias             → congelada
transformer.layer.0.ffn.lin1.weight                → entrenable
transformer.layer.0.ffn.lin1.bias                  → e

In [None]:
class EarlyStopping:
    """Signal a stop when the validation loss has not improved for ``patience`` checks.

    An evaluation counts as an improvement only when it beats the best loss seen
    so far by more than ``min_delta``.

    The instance is stateful: ``best_loss``, ``counter`` and ``should_stop`` persist
    between calls. Call :meth:`reset` (or create a new instance) before starting a
    new training run, otherwise a previously triggered stop ends the next run at
    once.

    Args:
        patience: number of consecutive non-improving evaluations tolerated.
        min_delta: minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=5, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.reset()

    def reset(self):
        """Forget all history so the instance can be reused for a fresh run."""
        self.best_loss = float('inf')
        self.counter = 0
        self.should_stop = False

    def __call__(self, val_loss):
        """Record one validation loss and return the current ``should_stop`` flag."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True
        return self.should_stop


In [None]:
class Trainer():
  """Train a text regression model with Adam and MSE, with dev-set early stopping.

  Args:
    model: module called as ``model(batch_of_texts)`` that returns predictions.
    dataset: training dataset (stored for reference; not used by the loops).
    dataloader: iterable of ``(texts, targets)`` training batches.
    devloader: iterable of ``(texts, targets)`` batches used for validation.
    testloader: iterable of ``(texts, targets)`` batches used by :meth:`test`.
    num_epochs: maximum number of epochs.
    learning_rate: Adam learning rate.
    device: device the target tensors are moved to.
    save: when truthy, write a checkpoint whenever the dev loss beats the best
      loss recorded by ``earlystopping``. Stored as ``self.save_best`` so it does
      not shadow the :meth:`save` method.
    earlystopping: callable taking the dev loss and exposing ``best_loss`` and
      ``should_stop``.
    save_path: checkpoint location; defaults to the notebook's Drive path.
    unfreeze_epoch: zero-based epoch index after which :meth:`descongelar_capas`
      is called; ``None`` disables unfreezing.
  """

  DEFAULT_SAVE_PATH = '/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/text_model-2.pth'

  def __init__(self, model, dataset, dataloader, devloader, testloader, num_epochs, learning_rate, device, save, earlystopping, save_path=None, unfreeze_epoch=7):
    self.model = model
    self.dataset = dataset
    self.dataloader = dataloader
    self.devloader = devloader
    self.testloader = testloader
    self.num_epochs = num_epochs
    self.learning_rate = learning_rate
    self.device = device
    # All parameters, frozen ones included, are registered with Adam so that
    # layers unfrozen later are optimized without rebuilding the optimizer.
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.MSELoss()
    # Kept under a different name than the constructor argument: an instance
    # attribute called `save` would hide the save() method below.
    self.save_best = bool(save)
    self.earlystopping = earlystopping
    self.unfreeze_epoch = unfreeze_epoch
    self.save_path = self.DEFAULT_SAVE_PATH if save_path is None else save_path

  def _targets(self, y):
    """Return the batch targets as a detached float32 tensor on the training device."""
    return y.detach().to(device=self.device, dtype=torch.float32)

  def train(self):
    """Run the training loop, printing the loss per batch and the dev loss per epoch."""
    for epoch in range(self.num_epochs):
      self.model.train()
      for text, y in self.dataloader:
        y = self._targets(y)

        self.optimizer.zero_grad()

        output = self.model(text)
        loss = self.criterion(output.flatten(), y.flatten())
        loss.backward()
        self.optimizer.step()
        print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {loss.item()}")

      eval_loss = self.eval()

      # Compare against the best loss *before* early stopping records this epoch.
      if self.save_best and self.earlystopping.best_loss > eval_loss:
        self.save()
      if self.unfreeze_epoch is not None and epoch == self.unfreeze_epoch:
        self.descongelar_capas()
      print(f"Epoch {epoch+1}/{self.num_epochs}, Eval Loss: {eval_loss}")

      self.earlystopping(eval_loss)
      if self.earlystopping.should_stop:
        print("Early stopping triggered")
        break

  def eval(self):
    """Return the mean of the per-batch MSE losses over the dev loader.

    Raises:
      ValueError: if the dev loader yields no batches.
    """
    self.model.eval()
    losses = []
    with torch.no_grad():
      for text, y in self.devloader:
        y = self._targets(y)
        output = self.model(text)
        losses.append(self.criterion(output.flatten(), y.flatten()).item())
    if not losses:
      raise ValueError("devloader produced no batches; cannot compute the evaluation loss")
    return sum(losses)/len(losses)

  def test(self):
    """Print and return ``(mae, r2)`` over the test loader.

    Metrics are computed on the targets exactly as the loader yields them, so
    they are expressed in the loader's target units.
    """
    self.model.eval()
    targets = []
    preds = []
    with torch.no_grad():
      for text, y in self.testloader:
        y = self._targets(y)
        output = self.model(text)
        preds.append(output.flatten().cpu())
        targets.append(y.flatten().cpu())

    # The collected items are already tensors: concatenate them directly instead
    # of re-wrapping each one with torch.tensor(), which emits a UserWarning.
    y_pred = torch.cat(preds).numpy()
    y_true = torch.cat(targets).numpy()

    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"MAE: {mae:.4f}")
    print(f"R²:  {r2:.4f}")

    return mae, r2

  def save(self, path=None):
    """Write the model's state dict to ``path`` (default: ``self.save_path``)."""
    target = self.save_path if path is None else path
    torch.save(self.model.state_dict(), target)

  def descongelar_capas(self, patron='.layer.5.'):
    """Mark as trainable every model parameter whose name contains ``patron``.

    Parameter names are taken from ``self.model.named_parameters()``, so the
    pattern is matched against fully qualified names (wrapper prefix included).
    """
    print("Descongelando capas...")
    for name, param in self.model.named_parameters():
      if patron in name:
        param.requires_grad = True

In [None]:
# Wrap the (partially frozen) encoder with the single-output regression head.
textmodel = TextModel(tokenizer, model, device)

In [None]:
# Stop once the dev loss has failed to improve by more than 0.01 for `patience`
# (default 5) consecutive evaluations. The instance keeps its counter and stop flag
# between training runs, so re-run this cell before starting a new run.
earlystopping = EarlyStopping(min_delta=0.01)

In [None]:
# Resume from the previously saved weights. map_location remaps the stored tensors
# onto the active device, so a checkpoint written on a GPU runtime also loads on a
# CPU-only one.
textmodel.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/MDT/ProyectoIndividual/text_model.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Positional arguments after the loaders: at most 15 epochs, learning rate 1e-3,
# the target device, checkpointing enabled (save=True), and the early-stopping tracker.
trainer = Trainer(textmodel.to(device), train_dataset, train_loader, dev_loader, test_loader, 15, 0.001, device, True, earlystopping)

In [None]:
# Run the training loop: prints the loss of every batch and the dev loss after each epoch.
trainer.train()

Epoch 1/15, Loss: 0.3202843964099884
Epoch 1/15, Loss: 3.0397017002105713
Epoch 1/15, Loss: 0.19698786735534668
Epoch 1/15, Loss: 0.3779381513595581
Epoch 1/15, Loss: 1.2014055252075195
Epoch 1/15, Loss: 0.2957454025745392
Epoch 1/15, Loss: 0.44046419858932495
Epoch 1/15, Loss: 0.11500632762908936
Epoch 1/15, Loss: 0.5856974124908447
Epoch 1/15, Loss: 0.16816100478172302
Epoch 1/15, Loss: 0.15348844230175018
Epoch 1/15, Loss: 0.6598346829414368
Epoch 1/15, Loss: 0.19764700531959534
Epoch 1/15, Loss: 0.18695330619812012
Epoch 1/15, Loss: 1.8551902770996094
Epoch 1/15, Loss: 0.5436362624168396
Epoch 1/15, Loss: 0.3645628094673157
Epoch 1/15, Loss: 0.6686082482337952
Epoch 1/15, Loss: 0.14829027652740479
Epoch 1/15, Loss: 0.14135059714317322
Epoch 1/15, Loss: 0.12950676679611206
Epoch 1/15, Loss: 0.29596877098083496
Epoch 1/15, Loss: 44.78174591064453
Epoch 1/15, Loss: 0.25416308641433716
Epoch 1/15, Loss: 0.18625211715698242
Epoch 1/15, Loss: 0.11293862760066986
Epoch 1/15, Loss: 0.20846

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/15, Eval Loss: 0.5995785250113561
Early stopping triggered


In [None]:
# Report MAE and R² on the test split. Targets are the scaler-transformed prices,
# so the MAE is expressed in standardized units rather than in currency.
trainer.test()

MAE: 0.2533
R²:  0.2045


  y_pred = torch.cat([torch.tensor(p) for p in preds]).to('cpu').numpy().flatten()
  y_true = torch.cat([torch.tensor(t) for t in targets]).to('cpu').numpy().flatten()


(0.2533424496650696, 0.20452231168746948)