# Modelo

## Dividir el Conjunto de Datos Basado en Usuarios

In [1]:
# import pandas as pd
# data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_train_pca50.parquet"
# df = pd.read_parquet(data_path)

In [None]:
# num_pos = df['add_to_cart'].sum()
# num_neg = len(df) - num_pos

# print(num_pos)
# print(num_neg)

# pos_weight_value = num_neg / num_pos
# print(pos_weight_value)

2766405
44047154
15.922163963700182


In [None]:
'''
import pickle
import pandas as pd
import numpy as np

# Cargar el conjunto completo
import pyarrow.parquet as pq
data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_train_pca50.parquet"

table = pq.ParquetFile(data_path)
dfs = [table.read_row_group(i).to_pandas() for i in range(table.num_row_groups)]
full_df = pd.concat(dfs, ignore_index=True)

# Obtener user_ids únicos y barajarlos
user_ids = full_df['user_id'].unique()
np.random.shuffle(user_ids)

# Dividir usuarios en entrenamiento y validación (80-20)
split_index = int(0.8 * len(user_ids))
train_user_ids = set(user_ids[:split_index])
val_user_ids = set(user_ids[split_index:])

# Guardar los conjuntos de user_ids
with open('train_user_ids.pkl', 'wb') as f:
    pickle.dump(train_user_ids, f)

with open('val_user_ids.pkl', 'wb') as f:
    pickle.dump(val_user_ids, f)
'''

## Entrenamiendo del modelo

In [9]:
# Importar las librerías necesarias
import torch
import torch.nn as nn
import torch.optim as optim
import os
import psutil
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from torch.utils.data import get_worker_info
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from torch.cuda.amp import GradScaler, autocast
import json
from sklearn.metrics import ndcg_score
import pickle

# Configuración global de características
FEATURE_COLS = ["time_since_last", "session_relative_position", "session_duration",
                "R", "F", "M", "device_type", "pagetype", "discount", "cod_section", "family",
                "is_new_user", "has_session_history"]

FEATURE_DIM = len(FEATURE_COLS)
print(f"FEATURE_DIM: {FEATURE_DIM}")


# ✅ 1️⃣ Configuración del Entrenamiento
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo: {device}")

# 🔥 Función para Monitoreo de Memoria
def print_memory_usage(epoch=None):
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    print(f"🖥️ RAM Usage: {mem_info.rss / 1e9:.2f} GB")
    if torch.cuda.is_available():
        print(f"🔥 GPU Usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"🔥 GPU Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    if epoch is not None:
        print(f"📊 Memoria después del Epoch {epoch}")

# ✅ Implementación de Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean', pos_weight=None):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = torch.tensor(1.0, dtype=torch.float32, device=device) if pos_weight is None else pos_weight

    def forward(self, inputs, targets):
        # ✅ Clamping para evitar valores extremos
        inputs = torch.clamp(inputs, min=-10, max=10)

        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, pos_weight=self.pos_weight, reduction='none'
        )
        probas = torch.sigmoid(inputs)
        pt = targets * probas + (1 - targets) * (1 - probas)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean()

# ✅ 2️⃣ Definir Modelo Ajustado con Dropout y Regularización
# ✅ Modelo GRU con Atención
class AttentionLayer(nn.Module):
    def __init__(self, hidden_dim):
        super(AttentionLayer, self).__init__()
        self.attn_weights = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        """
        gru_output: Tensor de tamaño (batch, seq_len, hidden_dim)
        Devuelve:
        - Scores de cada producto en la sesión (batch, seq_len)
        """
        attn_scores = self.attn_weights(gru_output).squeeze(-1)  # (batch, seq_len)
        attn_weights = torch.softmax(attn_scores, dim=-1)  # Normalizamos
        return attn_weights

class GRURecommender(nn.Module):
    def __init__(self, input_dim=50, feature_dim=FEATURE_DIM, hidden_dim=128, num_layers=2, output_dim=1):
        super(GRURecommender, self).__init__()
        self.gru = nn.GRU(input_dim + feature_dim, hidden_dim, num_layers, batch_first=True, dropout=0.3)
        self.attention = AttentionLayer(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

        # Inicialización de pesos
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.GRU):
            for name, param in m.named_parameters():
                if 'weight' in name:
                    nn.init.xavier_uniform_(param)
                elif 'bias' in name:
                    nn.init.zeros_(param)

    def forward(self, product_embeddings, session_features):
            x = torch.cat((product_embeddings, session_features), dim=-1)
            gru_output, _ = self.gru(x)
            # Eliminar clamping
            # gru_output = torch.clamp(gru_output, min=-10, max=10)
            attn_weights = self.attention(gru_output)
            weighted_output = gru_output * attn_weights.unsqueeze(-1)
            scores = self.fc(self.dropout(weighted_output)).squeeze(-1)
            # Eliminar clamping
            # return torch.clamp(scores, min=-10, max=10)
            return scores

model = GRURecommender(input_dim=50, feature_dim=FEATURE_DIM, hidden_dim=128, num_layers=2, output_dim=1).to(device)
print(f"Modelo en: {next(model.parameters()).device}")

# ✅ 3️⃣ Definir Pérdida y Optimizador con Focal Loss y regularización
# pos_weight_value = 1
# pos_weight = torch.tensor(pos_weight_value, dtype=torch.float32, device=device)

# num_pos = df['add_to_cart'].sum()
# num_neg = len(df) - num_pos
# pos_weight_value = num_neg / num_pos
pos_weight_value = 10
pos_weight = torch.tensor(pos_weight_value, dtype=torch.float32, device=device)
print(f"Nuevo pos_weight: {pos_weight}")

# criterion = FocalLoss(alpha=0.5, gamma=2.5, pos_weight=pos_weight)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
# scaler = torch.amp.GradScaler('cuda')

print(f"Criterion: {criterion}")
print(f"Optimizer: {optimizer}")

# ✅ 4️⃣ Definir Funciones de Evaluación con Métricas Adicionales
def evaluate(model, data_loader, threshold=0.5):
    model.eval()
    all_targets = []
    all_outputs = []

    with torch.no_grad():
        for batch in data_loader:
            product_embeddings, session_features, targets, _, _ = batch
            product_embeddings = product_embeddings.to(device)
            session_features = session_features.to(device)
            targets = targets.to(device)

            outputs = model(product_embeddings, session_features)  # (batch, seq_len)
            outputs = torch.sigmoid(outputs)  # Convertimos logits a probabilidades

            all_targets.extend(targets.cpu().numpy().flatten())
            all_outputs.extend(outputs.cpu().numpy().flatten())

    # ✅ Convertir listas a arrays de NumPy
    all_targets = np.array(all_targets)
    all_outputs = np.array(all_outputs)

    # ✅ Filtrar NaN antes de calcular métricas
    mask = ~np.isnan(all_targets) & ~np.isnan(all_outputs)
    all_targets = all_targets[mask]
    all_outputs = all_outputs[mask]

    if len(all_targets) == 0:
        print("⚠️ No hay datos válidos para calcular métricas.")
        return float('nan'), float('nan'), float('nan'), float('nan')

    # Ahora calculamos las predicciones y la distribución
    predicted_classes = (all_outputs >= threshold).astype(int)
    unique_classes, counts = np.unique(predicted_classes, return_counts=True)
    print("Distribución de las predicciones del modelo:")
    for cls, count in zip(unique_classes, counts):
        print(f"Clase {cls}: {count} muestras")

    # Calculamos las métricas
    auc_roc = roc_auc_score(all_targets, all_outputs)
    average_precision = average_precision_score(all_targets, all_outputs)
    f1 = f1_score(all_targets, predicted_classes, zero_division=0)

    print(f"F1-score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
    return auc_roc, average_precision, float('nan'), f1

def evaluate_ndcg(model, data_loader, k=5):
    model.eval()
    ndcg_scores = []

    with torch.no_grad():
        for batch in data_loader:
            product_embeddings, session_features, targets, session_ids, partnumbers_list = batch
            product_embeddings = product_embeddings.to(device)
            session_features = session_features.to(device)
            outputs = model(product_embeddings, session_features)
            outputs = outputs.cpu().numpy()
            targets = targets.cpu().numpy()

            for output, target in zip(outputs, targets):
                # Asegúrate de que output y target están alineados y son arrays numpy
                ndcg = ndcg_score([target], [output], k=k)
                ndcg_scores.append(ndcg)

    avg_ndcg = np.mean(ndcg_scores)
    print(f"NDCG@{k}: {avg_ndcg:.4f}")
    return avg_ndcg


# ✅ 5️⃣ Función de Entrenamiento con Persistencia del Mejor Modelo
def train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, epochs):
    best_ndcg = 0.0
    # Eliminamos el escalador de gradientes ya que no usamos AMP
    # scaler = torch.cuda.amp.GradScaler('cuda')
    
    for epoch in range(epochs):
        model.train()
        print(f"--- Epoch {epoch+1}/{epochs} ---")

        train_loader = DataLoader(
            train_dataset,
            batch_size=256,
            shuffle=False,  # No es necesario shuffle con IterableDataset
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=True
        )

        total_loss = 0.0
        batch_count = 0
        
        print_memory_usage(epoch="Inicio")
        print("Iniciando el bucle de batches...")

        for batch_idx, batch in enumerate(train_loader):
            product_embeddings, session_features, targets, _, _ = batch
            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # 🚨 Verificar si hay NaN o Inf en las entradas
            if torch.isnan(product_embeddings).any() or torch.isinf(product_embeddings).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `product_embeddings`. Saltando batch...")
                continue
            if torch.isnan(session_features).any() or torch.isinf(session_features).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `session_features`. Saltando batch...")
                continue

            optimizer.zero_grad()
            # Sin AMP ni autocast
            outputs = model(product_embeddings, session_features)

            # 🚨 Verificar valores extremos en `outputs`
            if torch.isnan(outputs).any() or torch.isinf(outputs).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `outputs`. Saltando batch...")
                continue

            # Eliminamos el clamping para evitar interferir con los gradientes
            # outputs = torch.clamp(outputs, min=-5, max=5)

            targets = targets[:, :outputs.shape[1]]
            loss = criterion(outputs, targets)

            # ✅ Verificar si la pérdida es infinita o NaN antes de `backward()`
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `loss`. Saltando batch...")
                continue

            # Retropropagación sin AMP
            loss.backward()

            # Verificar si los gradientes son finitos
            grads_finite = True
            for name, param in model.named_parameters():
                if param.grad is not None and (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()):
                    grads_finite = False
                    print(f"⚠️ Batch {batch_idx}: NaN o Inf en gradientes en {name}. Saltando actualización.")
                    break

            if grads_finite:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            else:
                optimizer.zero_grad()
                continue

            total_loss += loss.item()
            batch_count += 1

        scheduler.step()

        avg_loss = total_loss / batch_count if batch_count > 0 else float('inf')
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
        print_memory_usage(epoch=epoch+1)

        val_loader = DataLoader(
            val_dataset,
            batch_size=256,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=True
        )

        auc_roc, avg_precision, mapk_score, f1 = evaluate(model, val_loader)
        print(f"Validation AUC-ROC: {auc_roc:.4f}, Average Precision: {avg_precision:.4f}, MAP@5: {mapk_score:.4f}")
        val_ndcg = evaluate_ndcg(model, val_loader)
        print(f"Validation NDCG: {val_ndcg:.4f}")
        
        if val_ndcg > best_ndcg:
            best_ndcg = val_ndcg
            model_filename = f"best_model_epoch_{epoch+1}_ndcg_{val_ndcg:.4f}.pt"
            torch.save(model.state_dict(), model_filename)
            print(f"✅ Nuevo mejor modelo guardado: {model_filename}")
        else:
            print(f"No hay mejora en NDCG@5: {val_ndcg:.4f} <= {best_ndcg:.4f}")

    return model_filename


# ✅ 6️⃣ Cargar Dataset con Normalización y Manejo de Outliers
MAX_SEQ_LENGTH = 50

class IterableSessionDataset(IterableDataset):
    def __init__(self, df_path, fraction=1.0, mode='train', balance=True, user_ids=None, train_user_ids=None):
        self.df_path = df_path
        self.feature_cols = FEATURE_COLS
        self.fraction = fraction
        self.mode = mode
        self.balance = balance
        self.user_ids = user_ids
        self.train_user_ids = train_user_ids

        # Cargar el DataFrame completo para calcular estadísticas globales
        table = pq.ParquetFile(self.df_path)
        num_row_groups = table.metadata.num_row_groups
        dfs = [table.read_row_group(i).to_pandas() for i in range(num_row_groups)]
        full_df = pd.concat(dfs, ignore_index=True)
        
        # Verificar que train_user_ids está disponible
        if self.train_user_ids is None:
            raise ValueError("train_user_ids no está definido. Debes proporcionarlo al inicializar.")
        
        # Calcular 'is_new_user' en full_df
        full_df['is_new_user'] = full_df['user_id'].apply(lambda x: 0 if x in self.train_user_ids else 1)
        
        # Calcular 'has_session_history' en full_df
        full_df['has_session_history'] = full_df.groupby('session_id').cumcount().apply(lambda x: 0 if x == 0 else 1)
        
        self.global_feature_means = full_df[self.feature_cols].mean()
        self.global_feature_stds = full_df[self.feature_cols].std().replace(0, 1e-6)

    def __iter__(self):
        
        worker_info = get_worker_info()
        if worker_info is None:
            return self._data_iterator()
        else:
            num_workers = worker_info.num_workers
            worker_id = worker_info.id
            return self._data_iterator(worker_id, num_workers)
        
        
    def _data_iterator(self, worker_id=0, num_workers=1):
        table = pq.ParquetFile(self.df_path)
        total_row_groups = table.metadata.num_row_groups
        num_row_groups_to_use = max(1, int(total_row_groups * self.fraction))

        if self.mode == 'train':
            row_groups = range(0, int(num_row_groups_to_use * 0.8))
        else:
            row_groups = range(int(num_row_groups_to_use * 0.8), num_row_groups_to_use)

        for i in row_groups:
            batch = table.read_row_group(i)
            df = batch.to_pandas()
            
            df_majority = df[df.add_to_cart == 0]
            df_minority = df[df.add_to_cart == 1]
            df_minority_oversampled = df_minority.sample(len(df_majority), replace=True, random_state=42)
            df_balanced = pd.concat([df_majority, df_minority_oversampled])
            df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
            
            # Filtrar por user_ids
            if self.user_ids is not None:
                df = df[df['user_id'].isin(self.user_ids)]
                        
            # Añadir columna 'is_new_user'
            df['is_new_user'] = df['user_id'].apply(lambda x: 0 if x in self.train_user_ids else 1)
            # Añadir columna 'has_session_history'
            df['has_session_history'] = df.groupby('session_id').cumcount().apply(lambda x: 0 if x == 0 else 1)

            # Aplicar transformaciones logarítmicas
            for col in ['time_since_last', 'session_duration']:
                df[col] = np.log1p(df[col])

            # Normalización global segura
            df[self.feature_cols] = (df[self.feature_cols] - self.global_feature_means) / self.global_feature_stds

            # Reemplazar posibles NaN resultantes de la normalización
            df[self.feature_cols] = df[self.feature_cols].fillna(0.0)

            # Continuar con el resto del procesamiento...
            # if self.balance:
            #     df_1 = df[df["add_to_cart"] == 1]
            #     df_0 = df[df["add_to_cart"] == 0]
            #     if len(df_1) > 0 and len(df_0) > 0:
            #         if len(df_1) > len(df_0):
            #             df_0 = df_0.sample(n=len(df_1), replace=True, random_state=42)
            #         else:
            #             df_1 = df_1.sample(n=len(df_0), replace=True, random_state=42)
            #    df = pd.concat([df_0, df_1]).sample(frac=1, random_state=42).reset_index(drop=True)

            for session_id, session_data in df.groupby("session_id"):
                product_embeddings = session_data["embedding_reduced"].tolist()
                partnumbers = session_data["partnumber"].tolist()

                # Verificar si hay NaN en embeddings
                if any(embedding is None or np.isnan(embedding).any() for embedding in product_embeddings):
                    continue

                product_embeddings = torch.tensor(np.array(product_embeddings), dtype=torch.float32)

                session_features = torch.tensor(session_data[self.feature_cols].values, dtype=torch.float32)

                # Verificar si hay NaN en features
                if torch.isnan(session_features).any():
                    continue

                targets = torch.tensor(session_data["add_to_cart"].values, dtype=torch.float32)
                yield product_embeddings, session_features, targets, session_id, partnumbers
                
                
# ✅ Función de Colación
def collate_fn(batch):
    product_embeddings, session_features, targets, session_ids, partnumbers_list = zip(*batch)

    if len(batch) == 0:
        return torch.empty(0), torch.empty(0), torch.empty(0), [], []

    max_len = min(max([x.shape[0] for x in product_embeddings]), MAX_SEQ_LENGTH)

    padded_embeddings = torch.zeros((len(batch), max_len, product_embeddings[0].shape[1]))
    padded_features = torch.zeros((len(batch), max_len, session_features[0].shape[1]))
    padded_targets = torch.zeros((len(batch), max_len))

    for i in range(len(batch)):
        seq_len = min(product_embeddings[i].shape[0], max_len)
        padded_embeddings[i, :seq_len] = product_embeddings[i][:seq_len]
        padded_features[i, :seq_len] = session_features[i][:seq_len]
        padded_targets[i, :seq_len] = targets[i][:seq_len]

    return padded_embeddings, padded_features, padded_targets, session_ids, partnumbers_list


# ✅ 7️⃣ Crear Datasets de Entrenamiento y Validación
data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_train_pca50.parquet"
fraction = 0.05
with open('train_user_ids.pkl', 'rb') as f:
    train_user_ids = pickle.load(f)

with open('val_user_ids.pkl', 'rb') as f:
    val_user_ids = pickle.load(f)

train_dataset = IterableSessionDataset(
    data_path, fraction=fraction, mode='train', balance=True, user_ids=train_user_ids, train_user_ids=train_user_ids
)
val_dataset = IterableSessionDataset(
    data_path, fraction=fraction, mode='val', balance=False, user_ids=val_user_ids, train_user_ids=train_user_ids
)
# train_dataset = IterableSessionDataset(data_path, fraction=fraction, mode='train')
# val_dataset = IterableSessionDataset(data_path, fraction=fraction, mode='val')


# ✅ 8️⃣ Ejecutar Entrenamiento
EPOCHS = 5
print(f'\nFraction of Dataset: {fraction*100}%\n')
print(f'--- Número de epochs: {EPOCHS} ---')
best_model_file = train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, EPOCHS)

FEATURE_DIM: 13
Dispositivo: cuda
Modelo en: cuda:0
Nuevo pos_weight: 10.0
Criterion: BCEWithLogitsLoss()
Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 1e-05
    lr: 1e-05
    maximize: False
    weight_decay: 0.0001
)

Fraction of Dataset: 5.0%

--- Número de epochs: 5 ---
--- Epoch 1/5 ---
🖥️ RAM Usage: 7.88 GB
🔥 GPU Usage: 0.02 GB
🔥 GPU Cached: 0.39 GB
📊 Memoria después del Epoch Inicio
Iniciando el bucle de batches...
Epoch [1/5], Loss: 1.3867
🖥️ RAM Usage: 8.27 GB
🔥 GPU Usage: 0.02 GB
🔥 GPU Cached: 0.39 GB
📊 Memoria después del Epoch 1
Distribución de las predicciones del modelo:
Clase 0: 9301 muestras
Clase 1: 108849 muestras
F1-score: 0.2668, AUC-ROC: 0.7033
Validation AUC-ROC: 0.7033, Average Precision: 0.4817, MAP@5: nan
NDCG@5: 0.2077
Validation NDCG: 0.2077
✅ Nuevo mejor modelo guardado: best_model_epoch_1_ndcg_0.2077.pt
--- Epoch

---

In [2]:
# Importar las librerías necesarias
import torch
import torch.nn as nn
import torch.optim as optim
import os
import psutil
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from torch.utils.data import get_worker_info
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from torch.cuda.amp import GradScaler, autocast
import json
from sklearn.metrics import ndcg_score
import pickle

# Configuración global de características
FEATURE_COLS = ["time_since_last", "session_relative_position", "session_duration",
                "R", "F", "M", "device_type", "pagetype", "discount", "cod_section", "family",
                "is_new_user", "has_session_history"]

FEATURE_DIM = len(FEATURE_COLS)
print(f"FEATURE_DIM: {FEATURE_DIM}")

# ✅ 1️⃣ Configuración del Entrenamiento
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo: {device}")

# 🔥 Función para Monitoreo de Memoria
def print_memory_usage(epoch=None):
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    print(f"🖥️ RAM Usage: {mem_info.rss / 1e9:.2f} GB")
    if torch.cuda.is_available():
        print(f"🔥 GPU Usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"🔥 GPU Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    if epoch is not None:
        print(f"📊 Memoria después del Epoch {epoch}")

# ✅ Implementación de Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean', pos_weight=None):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = torch.tensor(1.0, dtype=torch.float32, device=device) if pos_weight is None else pos_weight

    def forward(self, inputs, targets):
        # ✅ Clamping para evitar valores extremos
        inputs = torch.clamp(inputs, min=-10, max=10)

        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, pos_weight=self.pos_weight, reduction='none'
        )
        probas = torch.sigmoid(inputs)
        pt = targets * probas + (1 - targets) * (1 - probas)
        focal_term = (1 - pt) ** self.gamma
        loss = self.alpha * focal_term * bce_loss
        return loss.mean()

# ✅ 2️⃣ Definir Modelo Ajustado con Dropout y Regularización
# ✅ Modelo GRU con Atención
class AttentionLayer(nn.Module):
    def __init__(self, hidden_dim):
        super(AttentionLayer, self).__init__()
        self.attn_weights = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        """
        gru_output: Tensor de tamaño (batch, seq_len, hidden_dim)
        Devuelve:
        - Scores de cada producto en la sesión (batch, seq_len)
        """
        attn_scores = self.attn_weights(gru_output).squeeze(-1)  # (batch, seq_len)
        attn_weights = torch.softmax(attn_scores, dim=-1)  # Normalizamos
        return attn_weights

class GRURecommender(nn.Module):
    def __init__(self, input_dim=50, feature_dim=FEATURE_DIM, hidden_dim=128, num_layers=2, output_dim=1):
        super(GRURecommender, self).__init__()
        self.gru = nn.GRU(input_dim + feature_dim, hidden_dim, num_layers, batch_first=True, dropout=0.3)
        self.attention = AttentionLayer(hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

        # Inicialización de pesos
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.GRU):
            for name, param in m.named_parameters():
                if 'weight' in name:
                    nn.init.xavier_uniform_(param)
                elif 'bias' in name:
                    nn.init.zeros_(param)

    def forward(self, product_embeddings, session_features):
        x = torch.cat((product_embeddings, session_features), dim=-1)
        gru_output, _ = self.gru(x)
        attn_weights = self.attention(gru_output)
        weighted_output = gru_output * attn_weights.unsqueeze(-1)
        scores = self.fc(self.dropout(weighted_output)).squeeze(-1)
        return scores

# ✅ 3️⃣ Configuración de pérdida, optimizador y scheduler se hará dentro de cada fold más adelante

# ✅ 4️⃣ Definir Funciones de Evaluación con Métricas Adicionales
def evaluate(model, data_loader, threshold=0.5):
    model.eval()
    all_targets = []
    all_outputs = []

    with torch.no_grad():
        for batch in data_loader:
            product_embeddings, session_features, targets, _, _ = batch
            product_embeddings = product_embeddings.to(device)
            session_features = session_features.to(device)
            targets = targets.to(device)

            outputs = model(product_embeddings, session_features)  # (batch, seq_len)
            outputs = torch.sigmoid(outputs)  # Convertimos logits a probabilidades

            all_targets.extend(targets.cpu().numpy().flatten())
            all_outputs.extend(outputs.cpu().numpy().flatten())

    # ✅ Convertir listas a arrays de NumPy
    all_targets = np.array(all_targets)
    all_outputs = np.array(all_outputs)

    # ✅ Filtrar NaN antes de calcular métricas
    mask = ~np.isnan(all_targets) & ~np.isnan(all_outputs)
    all_targets = all_targets[mask]
    all_outputs = all_outputs[mask]

    if len(all_targets) == 0:
        print("⚠️ No hay datos válidos para calcular métricas.")
        return float('nan'), float('nan'), float('nan'), float('nan')

    # Ahora calculamos las predicciones y la distribución
    predicted_classes = (all_outputs >= threshold).astype(int)
    unique_classes, counts = np.unique(predicted_classes, return_counts=True)
    print("Distribución de las predicciones del modelo:")
    for cls, count in zip(unique_classes, counts):
        print(f"Clase {cls}: {count} muestras")

    # Calculamos las métricas
    auc_roc = roc_auc_score(all_targets, all_outputs)
    average_precision = average_precision_score(all_targets, all_outputs)
    f1 = f1_score(all_targets, predicted_classes, zero_division=0)

    print(f"F1-score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
    return auc_roc, average_precision, float('nan'), f1

def evaluate_ndcg(model, data_loader, k=5):
    model.eval()
    ndcg_scores = []

    with torch.no_grad():
        for batch in data_loader:
            product_embeddings, session_features, targets, session_ids, partnumbers_list = batch
            product_embeddings = product_embeddings.to(device)
            session_features = session_features.to(device)
            outputs = model(product_embeddings, session_features)
            outputs = outputs.cpu().numpy()
            targets = targets.cpu().numpy()

            for output, target in zip(outputs, targets):
                # Asegúrate de que output y target están alineados y son arrays numpy
                ndcg = ndcg_score([target], [output], k=k)
                ndcg_scores.append(ndcg)

    avg_ndcg = np.mean(ndcg_scores)
    print(f"NDCG@{k}: {avg_ndcg:.4f}")
    return avg_ndcg

# ✅ 5️⃣ Función de Entrenamiento con Persistencia del Mejor Modelo
def train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, epochs):
    best_ndcg = 0.0
    model_filename = None  # Para almacenar el nombre del mejor modelo de este fold
    
    for epoch in range(epochs):
        model.train()
        print(f"--- Epoch {epoch+1}/{epochs} ---")

        train_loader = DataLoader(
            train_dataset,
            batch_size=256,
            shuffle=False,  # No es necesario shuffle con IterableDataset
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=True
        )

        total_loss = 0.0
        batch_count = 0

        print_memory_usage(epoch="Inicio")
        print("Iniciando el bucle de batches...")

        for batch_idx, batch in enumerate(train_loader):
            product_embeddings, session_features, targets, _, _ = batch
            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # 🚨 Verificar si hay NaN o Inf en las entradas
            if torch.isnan(product_embeddings).any() or torch.isinf(product_embeddings).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `product_embeddings`. Saltando batch...")
                continue
            if torch.isnan(session_features).any() or torch.isinf(session_features).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `session_features`. Saltando batch...")
                continue

            optimizer.zero_grad()
            outputs = model(product_embeddings, session_features)

            # 🚨 Verificar valores extremos en `outputs`
            if torch.isnan(outputs).any() or torch.isinf(outputs).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `outputs`. Saltando batch...")
                continue

            targets = targets[:, :outputs.shape[1]]
            loss = criterion(outputs, targets)

            # ✅ Verificar si la pérdida es infinita o NaN antes de `backward()`
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print(f"⚠️ Batch {batch_idx}: `NaN` o `Inf` en `loss`. Saltando batch...")
                continue

            loss.backward()

            # Verificar si los gradientes son finitos
            grads_finite = True
            for name, param in model.named_parameters():
                if param.grad is not None and (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()):
                    grads_finite = False
                    print(f"⚠️ Batch {batch_idx}: NaN o Inf en gradientes en {name}. Saltando actualización.")
                    break

            if grads_finite:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            else:
                optimizer.zero_grad()
                continue

            total_loss += loss.item()
            batch_count += 1

        scheduler.step()

        avg_loss = total_loss / batch_count if batch_count > 0 else float('inf')
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
        print_memory_usage(epoch=epoch+1)

        val_loader = DataLoader(
            val_dataset,
            batch_size=256,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=True
        )

        auc_roc, avg_precision, mapk_score, f1 = evaluate(model, val_loader)
        print(f"Validation AUC-ROC: {auc_roc:.4f}, Average Precision: {avg_precision:.4f}, MAP@5: {mapk_score:.4f}")
        val_ndcg = evaluate_ndcg(model, val_loader)
        print(f"Validation NDCG: {val_ndcg:.4f}")

        if val_ndcg > best_ndcg:
            best_ndcg = val_ndcg
            model_filename = f"CV_best_model_epoch_{epoch+1}_ndcg_{val_ndcg:.4f}.pt"
            torch.save(model.state_dict(), model_filename)
            print(f"✅ Nuevo mejor modelo guardado: {model_filename}")
        else:
            print(f"No hay mejora en NDCG@5: {val_ndcg:.4f} <= {best_ndcg:.4f}")

    return model_filename

# ✅ 6️⃣ Cargar Dataset con Normalización y Manejo de Outliers
MAX_SEQ_LENGTH = 50

class IterableSessionDataset(IterableDataset):
    def __init__(self, df_path, fraction=1.0, mode='train', balance=True, user_ids=None, train_user_ids=None):
        self.df_path = df_path
        self.feature_cols = FEATURE_COLS
        self.fraction = fraction
        self.mode = mode
        self.balance = balance
        self.user_ids = user_ids
        self.train_user_ids = train_user_ids

        # Cargar el DataFrame completo para calcular estadísticas globales de entrenamiento
        table = pq.ParquetFile(self.df_path)
        num_row_groups = table.metadata.num_row_groups
        dfs = [table.read_row_group(i).to_pandas() for i in range(num_row_groups)]
        full_df = pd.concat(dfs, ignore_index=True)

        # Verificar que train_user_ids está disponible
        if self.train_user_ids is None:
            raise ValueError("train_user_ids no está definido. Debes proporcionarlo al inicializar.")

        # Calcular 'is_new_user' en full_df
        full_df['is_new_user'] = full_df['user_id'].apply(lambda x: 0 if x in self.train_user_ids else 1)

        # Calcular 'has_session_history' en full_df
        full_df['has_session_history'] = full_df.groupby('session_id').cumcount().apply(lambda x: 0 if x == 0 else 1)

        self.global_feature_means = full_df[self.feature_cols].mean()
        self.global_feature_stds = full_df[self.feature_cols].std().replace(0, 1e-6)

    def __iter__(self):

        worker_info = get_worker_info()
        if worker_info is None:
            return self._data_iterator()
        else:
            num_workers = worker_info.num_workers
            worker_id = worker_info.id
            return self._data_iterator(worker_id, num_workers)

    def _data_iterator(self, worker_id=0, num_workers=1):
        table = pq.ParquetFile(self.df_path)
        total_row_groups = table.metadata.num_row_groups
        num_row_groups_to_use = max(1, int(total_row_groups * self.fraction))

        if self.mode == 'train':
            row_groups = range(0, int(num_row_groups_to_use * 0.8))
        else:
            row_groups = range(int(num_row_groups_to_use * 0.8), num_row_groups_to_use)

        for i in row_groups:
            batch = table.read_row_group(i)
            df = batch.to_pandas()

            # Balanceo del dataset
            df_majority = df[df.add_to_cart == 0]
            df_minority = df[df.add_to_cart == 1]
            if len(df_minority) > 0:
                df_minority_oversampled = df_minority.sample(len(df_majority), replace=True, random_state=42)
                df_balanced = pd.concat([df_majority, df_minority_oversampled])
                df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

            # Filtrar por user_ids si se proporcionan
            if self.user_ids is not None:
                df = df[df['user_id'].isin(self.user_ids)]

            # Añadir columna 'is_new_user'
            df['is_new_user'] = df['user_id'].apply(lambda x: 0 if x in self.train_user_ids else 1)
            # Añadir columna 'has_session_history'
            df['has_session_history'] = df.groupby('session_id').cumcount().apply(lambda x: 0 if x == 0 else 1)

            # Aplicar transformaciones logarítmicas
            for col in ['time_since_last', 'session_duration']:
                df[col] = np.log1p(df[col])

            # Normalización global segura
            df[self.feature_cols] = (df[self.feature_cols] - self.global_feature_means) / self.global_feature_stds

            # Reemplazar posibles NaN resultantes de la normalización
            df[self.feature_cols] = df[self.feature_cols].fillna(0.0)

            for session_id, session_data in df.groupby("session_id"):
                product_embeddings = session_data["embedding_reduced"].tolist()
                partnumbers = session_data["partnumber"].tolist()

                # Verificar si hay NaN en embeddings
                if any(embedding is None or np.isnan(embedding).any() for embedding in product_embeddings):
                    continue

                product_embeddings = torch.tensor(np.array(product_embeddings), dtype=torch.float32)

                session_features = torch.tensor(session_data[self.feature_cols].values, dtype=torch.float32)

                # Verificar si hay NaN en features
                if torch.isnan(session_features).any():
                    continue

                targets = torch.tensor(session_data["add_to_cart"].values, dtype=torch.float32)
                yield product_embeddings, session_features, targets, session_id, partnumbers


# ✅ Función de Colación
def collate_fn(batch):
    product_embeddings, session_features, targets, session_ids, partnumbers_list = zip(*batch)

    if len(batch) == 0:
        return torch.empty(0), torch.empty(0), torch.empty(0), [], []

    max_len = min(max([x.shape[0] for x in product_embeddings]), MAX_SEQ_LENGTH)

    padded_embeddings = torch.zeros((len(batch), max_len, product_embeddings[0].shape[1]))
    padded_features = torch.zeros((len(batch), max_len, session_features[0].shape[1]))
    padded_targets = torch.zeros((len(batch), max_len))

    for i in range(len(batch)):
        seq_len = min(product_embeddings[i].shape[0], max_len)
        padded_embeddings[i, :seq_len] = product_embeddings[i][:seq_len]
        padded_features[i, :seq_len] = session_features[i][:seq_len]
        padded_targets[i, :seq_len] = targets[i][:seq_len]

    return padded_embeddings, padded_features, padded_targets, session_ids, partnumbers_list

# ✅ 7️⃣ Crear Datasets y preparación para validación cruzada
data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_train_pca50.parquet"
fraction = 0.05

with open('train_user_ids.pkl', 'rb') as f:
    train_user_ids = pickle.load(f)

with open('val_user_ids.pkl', 'rb') as f:
    val_user_ids = pickle.load(f)

# ✅ 8️⃣ Implementación de K-Fold Cross-Validation
EPOCHS = 5
k = 5  # Número de folds para cross-validation

# Mezclar y dividir train_user_ids en k folds
train_user_ids = list(train_user_ids)  # Convertir set a lista
np.random.shuffle(train_user_ids)
folds = np.array_split(train_user_ids, k)

all_fold_metrics = []

for fold in range(k):
    print(f"\n==== Fold {fold+1}/{k} ====")

    # Definir IDs de usuario para este fold
    val_fold_ids = folds[fold]
    # Combinar los folds restantes para formar el conjunto de entrenamiento
    train_fold_ids = np.concatenate([folds[i] for i in range(k) if i != fold])
    
    # Crear datasets para este fold
    train_dataset = IterableSessionDataset(
        data_path, fraction=fraction, mode='train', balance=True,
        user_ids=train_fold_ids, train_user_ids=train_fold_ids
    )
    val_dataset = IterableSessionDataset(
        data_path, fraction=fraction, mode='val', balance=False,
        user_ids=val_fold_ids, train_user_ids=train_fold_ids
    )
    
    # Re-inicializar modelo, criterio, optimizador y scheduler para cada fold
    model = GRURecommender(input_dim=50, feature_dim=FEATURE_DIM, hidden_dim=128, num_layers=2, output_dim=1).to(device)
    pos_weight_value = 10
    pos_weight = torch.tensor(pos_weight_value, dtype=torch.float32, device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
    
    # Entrenar el modelo para este fold
    best_model_file = train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, EPOCHS)
    
    # Crear DataLoader para evaluación en el fold actual
    val_loader = DataLoader(
        val_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=True
    )
    
    # Evaluar el rendimiento en el conjunto de validación para este fold
    auc_roc, avg_precision, mapk_score, f1 = evaluate(model, val_loader)
    val_ndcg = evaluate_ndcg(model, val_loader)
    
    # Almacenar métricas del fold actual
    all_fold_metrics.append((auc_roc, avg_precision, mapk_score, f1, val_ndcg))

# Convertir resultados a array de NumPy para calcular promedios
all_fold_metrics = np.array(all_fold_metrics)
avg_metrics = np.mean(all_fold_metrics, axis=0)

print(f"\nPromedio de métricas en {k}-fold cross-validation:")
print(f"AUC-ROC: {avg_metrics[0]:.4f}, Average Precision: {avg_metrics[1]:.4f}, "
      f"MAP@5: {avg_metrics[2]:.4f}, F1-score: {avg_metrics[3]:.4f}, NDCG@5: {avg_metrics[4]:.4f}")


FEATURE_DIM: 13
Dispositivo: cuda

==== Fold 1/5 ====
--- Epoch 1/5 ---
🖥️ RAM Usage: 7.08 GB
🔥 GPU Usage: 0.00 GB
🔥 GPU Cached: 0.00 GB
📊 Memoria después del Epoch Inicio
Iniciando el bucle de batches...
Epoch [1/5], Loss: 1.3782
🖥️ RAM Usage: 8.66 GB
🔥 GPU Usage: 0.02 GB
🔥 GPU Cached: 0.38 GB
📊 Memoria después del Epoch 1
Distribución de las predicciones del modelo:
Clase 0: 12898 muestras
Clase 1: 76602 muestras
F1-score: 0.2446, AUC-ROC: 0.6320
Validation AUC-ROC: 0.6320, Average Precision: 0.4517, MAP@5: nan
NDCG@5: 0.2051
Validation NDCG: 0.2051
✅ Nuevo mejor modelo guardado: CV_best_model_epoch_1_ndcg_0.2051.pt
--- Epoch 2/5 ---
🖥️ RAM Usage: 8.72 GB
🔥 GPU Usage: 0.02 GB
🔥 GPU Cached: 0.38 GB
📊 Memoria después del Epoch Inicio
Iniciando el bucle de batches...
Epoch [2/5], Loss: 1.3472
🖥️ RAM Usage: 8.72 GB
🔥 GPU Usage: 0.02 GB
🔥 GPU Cached: 0.38 GB
📊 Memoria después del Epoch 2
Distribución de las predicciones del modelo:
Clase 0: 54742 muestras
Clase 1: 34758 muestras
F1-score:

# Predicciones

In [9]:
def load_product_attributes(products_file_path):
    df_products = pd.read_parquet(products_file_path)
    product_attributes = {}
    for idx, row in df_products.iterrows():
        partnumber = int(row['partnumber'])
        product_attributes[partnumber] = {
            'cod_section': row.get('cod_section', None),
            'family': row.get('family', None),
            # Añadir más atributos si es necesario
        }
    return product_attributes

In [16]:
class IterableSessionDataset(IterableDataset):
    def __init__(self, df_path, fraction=1.0, mode='train', balance=True):
        self.df_path = df_path
        self.feature_cols = FEATURE_COLS
        self.fraction = fraction
        self.mode = mode
        self.balance = balance  # Solo aplica para entrenamiento
        # Cargar estadísticas globales para normalización
        self.global_feature_means = None
        self.global_feature_stds = None
        self._compute_global_statistics()

    def _compute_global_statistics(self):
        # Cargar una muestra del dataset para calcular la media y desviación estándar
        table = pq.ParquetFile(self.df_path)
        total_row_groups = table.metadata.num_row_groups
        num_row_groups_to_use = max(1, int(total_row_groups * self.fraction))

        dfs = []
        for i in range(num_row_groups_to_use):
            batch = table.read_row_group(i)
            df = batch.to_pandas()
            dfs.append(df)

        full_df = pd.concat(dfs)
        # Determinar columnas disponibles entre las características esperadas
        available_cols = [col for col in self.feature_cols if col in full_df.columns]
        self.global_feature_means = full_df[available_cols].mean()
        self.global_feature_stds = full_df[available_cols].std().replace(0, 1e-6)

        # Asignar valores por defecto para columnas faltantes
        for col in self.feature_cols:
            if col not in available_cols:
                self.global_feature_means[col] = 0.0
                self.global_feature_stds[col] = 1.0

    def __iter__(self):
        """ Devuelve un iterador sobre los datos. """
        worker_info = get_worker_info()
        if worker_info is None:
            return self._data_iterator()
        else:
            num_workers = worker_info.num_workers
            worker_id = worker_info.id
            return self._data_iterator(worker_id, num_workers)

    def _data_iterator(self, worker_id=0, num_workers=1):
        """ Iterador interno que carga los datos desde Parquet. """
        table = pq.ParquetFile(self.df_path)
        total_row_groups = table.metadata.num_row_groups
        num_row_groups_to_use = max(1, int(total_row_groups * self.fraction))

        if self.mode == 'train':
            row_groups = range(0, int(num_row_groups_to_use * 0.8))
        elif self.mode == 'val':
            row_groups = range(int(num_row_groups_to_use * 0.8), num_row_groups_to_use)
        elif self.mode == 'test':
            row_groups = range(0, num_row_groups_to_use)
        else:
            raise ValueError("Modo inválido. Use 'train', 'val' o 'test'.")

        for i in row_groups:
            batch = table.read_row_group(i)
            df = batch.to_pandas()

            # Asegurar que todas las columnas de FEATURE_COLS estén presentes
            for col in self.feature_cols:
                if col not in df.columns:
                    df[col] = 0.0

            # Aplicar transformaciones logarítmicas
            for col in ['time_since_last', 'session_duration']:
                df[col] = np.log1p(df[col])

            # Normalización global segura
            df[self.feature_cols] = (df[self.feature_cols] - self.global_feature_means) / self.global_feature_stds
            df[self.feature_cols] = df[self.feature_cols].fillna(0.0)

            # Para el modo 'train' y 'val', podemos balancear los datos
            if self.mode in ['train', 'val'] and self.balance:
                df_1 = df[df["add_to_cart"] == 1]
                df_0 = df[df["add_to_cart"] == 0]
                if len(df_1) > 0 and len(df_0) > 0:
                    if len(df_1) > len(df_0):
                        df_0 = df_0.sample(n=len(df_1), replace=True, random_state=42)
                    else:
                        df_1 = df_1.sample(n=len(df_0), replace=True, random_state=42)
                    df = pd.concat([df_0, df_1]).sample(frac=1, random_state=42).reset_index(drop=True)

            for session_id, session_data in df.groupby("session_id"):
                product_embeddings = session_data["embedding_reduced"].tolist()
                partnumbers = session_data["partnumber"].tolist()

                # Verificar si hay NaN en embeddings
                if any(embedding is None or np.isnan(embedding).any() for embedding in product_embeddings):
                    continue

                product_embeddings = torch.tensor(np.array(product_embeddings), dtype=torch.float32)
                session_features = torch.tensor(session_data[self.feature_cols].values, dtype=torch.float32)

                # Verificar si hay NaN en features
                if torch.isnan(session_features).any():
                    continue

                if self.mode in ['train', 'val']:
                    targets = torch.tensor(session_data["add_to_cart"].values, dtype=torch.float32)
                else:
                    targets = None  # No hay targets en el conjunto de prueba

                yield product_embeddings, session_features, targets, session_id, partnumbers


In [17]:
def collate_fn(batch):
    product_embeddings, session_features, targets, session_ids, partnumbers_list = zip(*batch)

    if len(batch) == 0:
        return torch.empty(0), torch.empty(0), None, [], []

    max_len = min(max([x.shape[0] for x in product_embeddings]), MAX_SEQ_LENGTH)

    padded_embeddings = torch.zeros((len(batch), max_len, product_embeddings[0].shape[1]))
    padded_features = torch.zeros((len(batch), max_len, session_features[0].shape[1]))

    if targets[0] is not None:
        padded_targets = torch.zeros((len(batch), max_len))
    else:
        padded_targets = None  # No hay targets en el conjunto de prueba

    for i in range(len(batch)):
        seq_len = min(product_embeddings[i].shape[0], max_len)
        padded_embeddings[i, :seq_len] = product_embeddings[i][:seq_len]
        padded_features[i, :seq_len] = session_features[i][:seq_len]
        if targets[0] is not None:
            padded_targets[i, :seq_len] = targets[i][:seq_len]

    return padded_embeddings, padded_features, padded_targets, session_ids, partnumbers_list

In [18]:
# Cargar los atributos de los productos
product_attributes = load_product_attributes("/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/optimized_products_transformed.parquet")

In [19]:
def inference_v2(model, test_dataset, device, top_k=5):
    model.eval()
    test_loader = DataLoader(
        test_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=True
    )

    recommendations = {}
    global_top_recommendations = []

    with torch.no_grad():
        for batch in test_loader:
            product_embeddings, session_features, _, session_ids, partnumbers_list = batch

            if product_embeddings.shape[0] == 0:
                continue  # Saltar batch vacío

            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)

            outputs = model(product_embeddings, session_features)  # (batch, seq_len)
            scores = torch.sigmoid(outputs).cpu().numpy()  # Convertimos logits a probabilidades

            for sid, parts, score_seq in zip(session_ids, partnumbers_list, scores):
                # Asegurar que `parts` y `score_seq` tienen la misma longitud
                if len(parts) != len(score_seq):
                    min_len = min(len(parts), len(score_seq))
                    parts = parts[:min_len]
                    score_seq = score_seq[:min_len]

                product_scores = list(zip(parts, score_seq))
                product_scores.sort(key=lambda x: x[1], reverse=True)

                top_recommendations = [int(p) for p, s in product_scores[:top_k]]
                top_recommendations = list(dict.fromkeys(top_recommendations))

                # Recopilar productos similares basados en atributos
                if len(top_recommendations) < top_k:
                    # Obtener atributos de los productos ya recomendados
                    similar_candidates = []
                    for rec in top_recommendations:
                        rec_attrs = product_attributes.get(rec, {})
                        # Buscar otros productos con atributos similares
                        for p in parts:
                            p = int(p)
                            if p in top_recommendations:
                                continue
                            p_attrs = product_attributes.get(p, {})
                            # Comparar atributos (por ejemplo, cod_section y family)
                            if (p_attrs.get("cod_section") == rec_attrs.get("cod_section") and
                                p_attrs.get("family") == rec_attrs.get("family")):
                                similar_candidates.append(p)
                    # Añadir candidatos similares si es necesario
                    for candidate in similar_candidates:
                        if candidate not in top_recommendations and len(top_recommendations) < top_k:
                            top_recommendations.append(candidate)

                    # Acumular productos para fallback global si aún faltan recomendaciones
                    if len(top_recommendations) < top_k:
                        global_top_recommendations.extend(
                            [p for p in parts if p not in top_recommendations]
                        )

                recommendations[int(sid)] = top_recommendations

    # Fallback global
    product_counts = {}
    for p in global_top_recommendations:
        product_counts[p] = product_counts.get(p, 0) + 1
    sorted_global_top = sorted(product_counts.items(), key=lambda x: x[1], reverse=True)
    global_top = [p for p, count in sorted_global_top]

    for sid, recs in recommendations.items():
        if len(recs) < top_k:
            additional = [p for p in global_top if p not in recs]
            recs.extend(additional[:top_k - len(recs)])
            recommendations[sid] = recs

    output_json = {
        "target": recommendations
    }

    with open("submission_v3.json", "w") as f:
        json.dump(output_json, f, indent=4)

    print("Archivo 'submission_v3.json' generado con éxito.")

In [20]:
# Cargar el mejor modelo entrenado
best_model_file = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/src/models/new_model/CV_best_model_epoch_5_ndcg_0.2583.pt'

model.load_state_dict(torch.load(best_model_file, weights_only=True))
model.to(device)
model.eval()

# Preparar el dataset de prueba
test_data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_test_full.parquet"
test_dataset = IterableSessionDataset(test_data_path, fraction=1.0, mode='test', balance=False)

# Ejecutar la inferencia
inference_v2(model, test_dataset, device, top_k=5)

Archivo 'submission_v3.json' generado con éxito.
