# Modelo - fraction = 1

In [None]:
# Importar las librerías necesarias
import torch
import torch.nn as nn
import torch.optim as optim
import os
import psutil
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from torch.utils.data import get_worker_info
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
from torch.amp import GradScaler, autocast
import json

# Global feature configuration: per-event columns fed to the model alongside
# the product embedding. FEATURE_DIM is derived so the two never drift apart.
FEATURE_COLS = [
    "time_since_last",
    "session_relative_position",
    "session_duration",
    "R",
    "F",
    "M",
    "device_type",
    "pagetype",
    "discount",
    "cod_section",
    "family",
]
FEATURE_DIM = len(FEATURE_COLS)

# ✅ 1️⃣ Training setup: use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Dispositivo: {device}")

# 🔥 Función para Monitoreo de Memoria
def print_memory_usage(epoch=None):
    """Print the memory footprint of the current process.

    Reports the resident set size of this process and, when CUDA is
    available, the allocated and reserved GPU memory, all in GB (1e9 bytes).

    Args:
        epoch: Optional label; when given, a trailing line naming it is
            printed after the memory figures.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"🖥️ RAM Usage: {rss_bytes / 1e9:.2f} GB")

    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / 1e9
        print(f"🔥 GPU Usage: {allocated_gb:.2f} GB")
        reserved_gb = torch.cuda.memory_reserved() / 1e9
        print(f"🔥 GPU Cached: {reserved_gb:.2f} GB")

    if epoch is None:
        return
    print(f"📊 Memoria después del Epoch {epoch}")

# ✅ Implementación de Focal Loss
class FocalLoss(nn.Module):
    """Focal loss on logits for binary targets.

    Each element's binary cross-entropy (optionally weighted by
    ``pos_weight``) is scaled by ``alpha * (1 - p_t) ** gamma``, where
    ``p_t`` is the predicted probability of the element's target class.

    Args:
        alpha: Constant multiplier applied to every element.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-element loss.
        pos_weight: Forwarded to ``binary_cross_entropy_with_logits``.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean', pos_weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against ``targets``."""
        per_element_bce = nn.functional.binary_cross_entropy_with_logits(
            inputs,
            targets,
            pos_weight=self.pos_weight,
            reduction='none',
        )
        prob_positive = torch.sigmoid(inputs)
        # Probability the model assigns to the target class of each element.
        prob_target = targets * prob_positive + (1 - targets) * (1 - prob_positive)
        modulating_factor = (1 - prob_target) ** self.gamma
        weighted = self.alpha * modulating_factor * per_element_bce

        if self.reduction == 'sum':
            return weighted.sum()
        if self.reduction == 'mean':
            return weighted.mean()
        return weighted

# ✅ 2️⃣ Definir Modelo Ajustado con Dropout y Regularización
class GRURecommender(nn.Module):
    """GRU over a session's events that emits one logit per timestep.

    The input at each timestep is the product embedding concatenated with
    the session feature vector. The GRU output goes through dropout and a
    linear head; with ``output_dim == 1`` the trailing dimension is squeezed.

    Args:
        input_dim: Size of each product embedding.
        feature_dim: Number of session feature columns.
        hidden_dim: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        output_dim: Size of the linear head output.
    """

    def __init__(self, input_dim=50, feature_dim=FEATURE_DIM, hidden_dim=128, num_layers=2, output_dim=1):
        super().__init__()
        # Inter-layer dropout only exists with more than one GRU layer;
        # requesting it for a single layer just triggers a PyTorch warning.
        gru_dropout = 0.3 if num_layers > 1 else 0.0
        self.gru = nn.GRU(input_dim + feature_dim, hidden_dim, num_layers, batch_first=True, dropout=gru_dropout)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        for name, param in self.gru.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)

    @staticmethod
    def _standardize_over_time(x):
        """Z-score ``x`` along dim 1 (the sequence axis for batch_first input)."""
        centered = x - x.mean(dim=1, keepdim=True)
        if x.size(1) > 1:
            spread = x.std(dim=1, keepdim=True)
        else:
            # The unbiased std of a single element is NaN, which would poison
            # every output of a batch whose padded width is 1.
            spread = torch.zeros_like(centered)
        return centered / (spread + 1e-6)

    def forward(self, product_embeddings, session_features):
        """Return per-timestep logits for a batch of sessions.

        Both inputs are standardised along dim 1 before being concatenated
        on the last dimension and passed through the GRU.
        """
        product_embeddings = self._standardize_over_time(product_embeddings)
        session_features = self._standardize_over_time(session_features)
        x = torch.cat((product_embeddings, session_features), dim=-1)
        out, _ = self.gru(x)
        out = self.dropout(out)
        out = self.fc(out)
        return out.squeeze(-1)

model = GRURecommender().to(device)
print(f"Modelo en: {next(model.parameters()).device}")

# ✅ 3️⃣ Loss and optimizer: focal loss with a fixed positive-class weight,
# AdamW with weight decay, and a step learning-rate schedule.
pos_weight_value = 5.0
pos_weight = torch.tensor(pos_weight_value, dtype=torch.float32, device=device)
print(f"Pos_weight calculado: {pos_weight.item()}")

criterion = FocalLoss(alpha=0.25, gamma=2.0, pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
# Gradient scaling is only meaningful for CUDA mixed precision; disable it
# explicitly on CPU instead of relying on PyTorch's warn-and-disable fallback.
scaler = GradScaler("cuda", enabled=device.type == "cuda")

print(f"Criterion: {criterion}")
print(f"Optimizer: {optimizer}")

# ✅ 4️⃣ Definir Funciones de Evaluación con Métricas Adicionales y NDCG
def compute_ndcg(actuals, predictions, k=5):
    """Return NDCG@k of ``predictions`` against relevance labels ``actuals``.

    Gains are ``2**rel - 1`` and the discount at 0-based rank ``i`` is
    ``log2(i + 2)``.

    Args:
        actuals: Relevance labels (array or any sequence of numbers).
        predictions: Scores aligned with ``actuals``; higher ranks first.
        k: Number of top-ranked items to consider.

    Returns:
        DCG@k divided by the ideal DCG@k, or 0.0 when the input is empty or
        the ideal DCG is zero.
    """
    # Accept plain sequences too; negating a Python list would raise.
    actuals = np.asarray(actuals, dtype=float).ravel()
    predictions = np.asarray(predictions, dtype=float).ravel()
    if actuals.size == 0:
        return 0.0

    def _dcg(relevances):
        discounts = np.log2(np.arange(2, relevances.size + 2))
        return float(np.sum((2.0 ** relevances - 1.0) / discounts))

    ranked = actuals[np.argsort(-predictions)[:k]]
    ideal = np.sort(actuals)[::-1][:k]
    dcg = _dcg(ranked)
    idcg = _dcg(ideal)
    return dcg / idcg if idcg > 0 else 0.0

def evaluate(model, data_loader, threshold=0.5):
    """Score ``model`` on ``data_loader`` and print/return ranking metrics.

    Each batch is expected to be the 5-tuple produced by ``collate_fn``:
    padded embeddings, padded features, padded targets, session ids and the
    per-session partnumber lists. The partnumber lists give each session's
    true length, which is used to drop padded timesteps from the metrics
    (padding targets are zeros and would otherwise count as negatives).

    Args:
        model: Model called as ``model(product_embeddings, session_features)``
            and returning per-timestep logits.
        data_loader: Iterable of collated batches.
        threshold: Probability cut-off for precision/recall/F1.

    Returns:
        ``(auc_roc, average_precision, mapk_score, f1)``. AUC-ROC and average
        precision are NaN when scikit-learn cannot compute them; with no data
        at all the result is ``(nan, nan, 0.0, 0.0)``.
    """
    model.eval()
    target_chunks = []
    score_chunks = []
    with torch.no_grad():
        for batch in data_loader:
            product_embeddings, session_features, targets, _, partnumbers_list = batch
            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)
            outputs = model(product_embeddings, session_features)
            scores = torch.sigmoid(outputs).float().cpu()
            targets = targets.cpu()

            # Keep only real timesteps: positions past a session's length
            # are zero padding added by the collate function.
            padded_len = targets.shape[1]
            seq_lens = torch.tensor(
                [min(len(parts), padded_len) for parts in partnumbers_list]
            )
            valid = torch.arange(padded_len).unsqueeze(0) < seq_lens.unsqueeze(1)
            target_chunks.append(targets[valid].numpy())
            score_chunks.append(scores[valid].numpy())

    if not target_chunks:
        # Nothing to evaluate; scikit-learn would raise on empty arrays.
        return float('nan'), float('nan'), 0.0, 0.0

    all_targets = np.concatenate(target_chunks)
    all_outputs = np.concatenate(score_chunks)
    try:
        auc_roc = roc_auc_score(all_targets, all_outputs)
    except ValueError:
        auc_roc = float('nan')
    try:
        average_precision = average_precision_score(all_targets, all_outputs)
    except ValueError:
        average_precision = float('nan')
    y_pred = (all_outputs >= threshold).astype(int)
    precision = precision_score(all_targets, y_pred, zero_division=0)
    recall = recall_score(all_targets, y_pred, zero_division=0)
    f1 = f1_score(all_targets, y_pred, zero_division=0)
    mapk_score = calculate_mapk(all_targets, all_outputs, k=5)
    ndcg = compute_ndcg(all_targets, all_outputs, k=5)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, NDCG@5: {ndcg:.4f}")
    return auc_roc, average_precision, mapk_score, f1

def calculate_mapk(actuals, predictions, k=5):
    """Return average precision at ``k`` for a single ranked list.

    Items are ranked by descending ``predictions``; an item is relevant when
    its label in ``actuals`` is greater than zero. The result is the sum of
    precision@i over the relevant ranks ``i <= k`` divided by
    ``min(k, number of relevant items)``.

    The previous implementation returned the mean label of the top ``k``
    items (precision@k), which does not match the MAP@k name it is reported
    under, and produced NaN on empty input.

    Args:
        actuals: Binary relevance labels (array or sequence).
        predictions: Scores aligned with ``actuals``.
        k: Cut-off rank.

    Returns:
        A float in [0, 1]; 0.0 when there is no input, no relevant item, or
        ``k <= 0``.
    """
    actuals = np.asarray(actuals, dtype=float).ravel()
    predictions = np.asarray(predictions, dtype=float).ravel()
    if actuals.size == 0 or k <= 0:
        return 0.0

    total_relevant = int((actuals > 0).sum())
    if total_relevant == 0:
        return 0.0

    hits = actuals[np.argsort(-predictions)[:k]] > 0
    precision_at_rank = np.cumsum(hits) / np.arange(1, hits.size + 1)
    return float((precision_at_rank * hits).sum() / min(k, total_relevant))

# ✅ 5️⃣ Función de Entrenamiento con Persistencia del Mejor Modelo
def train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, epochs):
    """Train ``model`` and checkpoint it whenever validation F1 improves.

    Uses the module-level ``device``, ``scaler`` and ``collate_fn``. Each
    epoch builds fresh loaders, runs one pass with gradient scaling and
    gradient-norm clipping (max norm 1.0), steps the scheduler, evaluates on
    the validation set and saves ``model.state_dict()`` when F1 beats the
    best value so far.

    NOTE(review): the loss is computed over every padded position of the
    batch, including zero padding added by the collate function.

    Args:
        model: Model to train in place.
        train_dataset: Dataset for the training loader.
        val_dataset: Dataset for the validation loader.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler stepped once per epoch.
        epochs: Number of epochs to run.

    Returns:
        Filename of the last saved (best) checkpoint, or ``None`` when no
        epoch was run.
    """
    on_cuda = device.type == "cuda"
    # Start below any reachable F1 so the first epoch always produces a
    # checkpoint; previously an F1 that never exceeded 0.0 left
    # ``model_filename`` unbound at the return statement.
    best_f1 = float("-inf")
    model_filename = None
    for epoch in range(epochs):
        model.train()
        print(f"--- Epoch {epoch+1}/{epochs} ---")
        train_loader = DataLoader(
            train_dataset,
            batch_size=256,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=on_cuda  # pinning only helps host-to-GPU copies
        )
        total_loss = 0.0
        batch_count = 0
        print_memory_usage(epoch="Inicio")
        print("Iniciando el bucle de batches...")
        for batch in train_loader:
            product_embeddings, session_features, targets, _, _ = batch
            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            optimizer.zero_grad()
            # Mixed precision only on CUDA; on CPU this is a no-op context.
            with autocast(device_type=device.type, enabled=on_cuda):
                outputs = model(product_embeddings, session_features)
                loss = criterion(outputs, targets)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            # The total gradient norm is non-finite exactly when some gradient
            # holds NaN/Inf, so one check replaces a scan of every parameter.
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            if not torch.isfinite(grad_norm):
                print("NaN o Inf encontrado en los gradientes. Saltando la actualización.")
                optimizer.zero_grad()
                scaler.update()
                continue
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            batch_count += 1
        scheduler.step()
        avg_loss = total_loss / batch_count if batch_count > 0 else float('inf')
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
        print_memory_usage(epoch=epoch+1)
        val_loader = DataLoader(
            val_dataset,
            batch_size=256,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_fn,
            pin_memory=on_cuda
        )
        auc_roc, avg_precision, mapk_score, f1 = evaluate(model, val_loader)
        print(f"Validation AUC-ROC: {auc_roc:.4f}, Average Precision: {avg_precision:.4f}, MAP@5: {mapk_score:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            model_filename = f"best_model_epoch_{epoch+1}_f1_{f1:.4f}.pt"
            torch.save(model.state_dict(), model_filename)
            print(f"Nuevo mejor modelo guardado: {model_filename}")
    return model_filename

# ✅ 6️⃣ Dataset loading with normalisation and outlier handling
# Upper bound on the padded sequence length; collate_fn truncates longer
# sessions to this many timesteps.
MAX_SEQ_LENGTH = 50

class IterableSessionDataset(IterableDataset):
    """Stream per-session tensors from a Parquet file, one row group at a time.

    The first ``fraction`` of the file's row groups is used. ``'train'``
    reads the first 80% of those, ``'val'`` the remainder and ``'test'`` all
    of them. With several DataLoader workers the selected row groups are
    split into contiguous shares, one per worker.

    Each yielded item is ``(product_embeddings, session_features, targets,
    session_id, partnumbers)``.

    Args:
        df_path: Path of the Parquet file.
        fraction: Fraction of row groups to use (at least one is used).
        mode: ``'train'``, ``'val'`` or ``'test'``; checked on iteration.
    """

    def __init__(self, df_path, fraction=1.0, mode='train'):
        self.df_path = df_path
        self.feature_cols = FEATURE_COLS  # extended session feature columns
        self.fraction = fraction
        self.mode = mode

    def __iter__(self):
        worker_info = get_worker_info()
        if worker_info is None:
            return self._data_iterator()
        return self._data_iterator(worker_info.id, worker_info.num_workers)

    def _row_group_bounds(self, total_row_groups, worker_id=0, num_workers=1):
        """Return the ``(start, end)`` row-group range this worker reads."""
        usable = max(1, int(total_row_groups * self.fraction))
        # Never index past the end of the file when fraction > 1.
        usable = min(usable, total_row_groups)
        train_groups = int(usable * 0.8)
        if self.mode == 'train':
            first, last = 0, train_groups
        elif self.mode == 'val':
            first, last = train_groups, usable
        elif self.mode == 'test':
            first, last = 0, usable
        else:
            raise ValueError("Invalid mode: must be 'train', 'val', or 'test'.")
        span = last - first
        # Ceiling division so every row group is assigned to some worker.
        per_worker = (span + num_workers - 1) // num_workers if num_workers > 0 else span
        start = first + worker_id * per_worker
        end = min(start + per_worker, last)
        return start, end

    def _preprocess(self, df):
        """Log-scale the duration columns and z-score all feature columns."""
        for col in ['time_since_last', 'session_duration']:
            if col in df.columns:
                df[col] = np.log1p(df[col])
            else:
                df[col] = 0.0
        for col in self.feature_cols:
            if col not in df.columns:
                df[col] = 0.0
        features = df[self.feature_cols]
        # std() is NaN for a single-row group; treat that as zero spread so
        # the features become 0 instead of NaN.
        spread = features.std().fillna(0.0)
        df[self.feature_cols] = (features - features.mean()) / (spread + 1e-6)
        return df

    def _data_iterator(self, worker_id=0, num_workers=1):
        """Yield one item per session from this worker's row groups.

        Sessions containing a missing or NaN embedding are skipped. In
        ``'test'`` mode the targets are all zeros; otherwise they are the
        ``add_to_cart`` column clamped to [0, 1].
        """
        parquet_file = pq.ParquetFile(self.df_path)
        start, end = self._row_group_bounds(
            parquet_file.metadata.num_row_groups, worker_id, num_workers
        )
        for group_idx in range(start, end):
            df = self._preprocess(parquet_file.read_row_group(group_idx).to_pandas())
            for session_id, session_data in df.groupby("session_id"):
                raw_embeddings = session_data["embedding_reduced"].tolist()
                partnumbers = session_data["partnumber"].tolist()
                if any(emb is None or np.isnan(emb).any() for emb in raw_embeddings):
                    continue
                product_embeddings = torch.tensor(np.array(raw_embeddings), dtype=torch.float32)
                session_features = torch.tensor(session_data[self.feature_cols].values, dtype=torch.float32)
                if self.mode == 'test':
                    targets = torch.zeros(product_embeddings.shape[0], dtype=torch.float32)
                else:
                    # mode was validated above, so this is 'train' or 'val'.
                    targets = torch.tensor(session_data["add_to_cart"].values, dtype=torch.float32)
                    targets = torch.clamp(targets, min=0.0, max=1.0)
                yield product_embeddings, session_features, targets, session_id, partnumbers

# ✅ Muestreo ponderado en la función de colación
def collate_fn(batch, oversample_factor=2, max_seq_length=None):
    """Pad a list of session samples into batch tensors.

    Each sample is ``(embeddings, features, targets, session_id,
    partnumbers)``. Sessions containing at least one positive target are
    repeated ``oversample_factor`` times and the batch is shuffled; batches
    without positives (e.g. test data, whose targets are all zero) keep their
    order. Sequences are zero-padded to the longest one and truncated to
    ``max_seq_length``.

    The previous version decided whether to oversample from the first sample
    only, so identical batches were treated differently depending on order.

    NOTE(review): validation batches that contain positives are oversampled
    too, because the batch itself carries no train/val marker.

    Args:
        batch: List of samples as described above.
        oversample_factor: How many copies of each positive session to keep.
        max_seq_length: Truncation length; defaults to ``MAX_SEQ_LENGTH``.

    Returns:
        ``(padded_embeddings, padded_features, padded_targets, session_ids,
        partnumbers_list)``.
    """
    limit = MAX_SEQ_LENGTH if max_seq_length is None else max_seq_length
    if len(batch) == 0:
        # Placeholder batch. The embedding width must be the embedding size
        # alone (50), since the model concatenates the features itself, and
        # the partnumbers entry must be a per-session list like real batches.
        return (torch.zeros(1, limit, 50),
                torch.zeros(1, limit, FEATURE_DIM),
                torch.zeros(1, limit), [0], [[]])

    positive_samples = [sample for sample in batch if sample[2].sum().item() > 0]
    if positive_samples and oversample_factor > 1:
        negative_samples = [sample for sample in batch if not sample[2].sum().item() > 0]
        samples = negative_samples + positive_samples * oversample_factor
        np.random.shuffle(samples)
    else:
        samples = list(batch)

    product_embeddings, session_features, targets, session_ids, partnumbers_list = zip(*samples)
    max_len = min(max(emb.shape[0] for emb in product_embeddings), limit)
    padded_embeddings = torch.zeros((len(samples), max_len, product_embeddings[0].shape[1]))
    padded_features = torch.zeros((len(samples), max_len, session_features[0].shape[1]))
    padded_targets = torch.zeros((len(samples), max_len))
    for row, (emb, feat, tgt) in enumerate(zip(product_embeddings, session_features, targets)):
        seq_len = min(emb.shape[0], max_len)
        padded_embeddings[row, :seq_len] = emb[:seq_len]
        padded_features[row, :seq_len] = feat[:seq_len]
        padded_targets[row, :seq_len] = tgt[:seq_len]
    return padded_embeddings, padded_features, padded_targets, session_ids, partnumbers_list

# ✅ 7️⃣ Create the training and validation datasets
# Both read the same Parquet file; the dataset's mode selects which share of
# its row groups each one iterates over.
data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/processed_train_pca50.parquet"
fraction = 1
train_dataset = IterableSessionDataset(data_path, fraction=fraction, mode='train')
val_dataset = IterableSessionDataset(data_path, fraction=fraction, mode='val')

# ✅ 8️⃣ Run training; train() returns the filename of the best checkpoint.
EPOCHS = 10
print(f'--- Número de epochs: {EPOCHS} ---')
best_model_file = train(model, train_dataset, val_dataset, criterion, optimizer, scheduler, EPOCHS)




# Inferencia

In [3]:
# 📤 9️⃣ Inferencia y Generación de Predicciones
def inference(model, test_dataset, device, top_k=5, output_path="submission.json"):
    """Score test sessions and write top-``top_k`` recommendations as JSON.

    For each session the products at the ``top_k`` highest-scoring positions
    are taken and de-duplicated in rank order. Sessions left with fewer than
    ``top_k`` products are completed from a global list: the products those
    sessions did not recommend, ordered by how often they occurred.

    NOTE(review): de-duplication happens after truncating to ``top_k``
    positions, so a session's own lower-ranked products only reach it through
    the global list.

    Args:
        model: Trained model returning per-timestep logits.
        test_dataset: Dataset yielding test sessions.
        device: Device on which the model runs.
        top_k: Number of recommendations per session.
        output_path: Destination JSON file, written as
            ``{"target": {session_id: [partnumber, ...]}}``.
    """
    from collections import Counter
    from itertools import islice

    model.eval()
    test_loader = DataLoader(
        test_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=True
    )

    recommendations = {}
    leftover_counts = Counter()

    with torch.no_grad():
        for batch in test_loader:
            product_embeddings, session_features, _, session_ids, partnumbers_list = batch
            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)
            outputs = model(product_embeddings, session_features)
            scores = torch.sigmoid(outputs).cpu().numpy()
            for sid, parts, score_seq in zip(session_ids, partnumbers_list, scores):
                # Normalise ids once so every membership test below compares
                # ints with ints.
                parts = [int(p) for p in parts]
                # zip() stops at the shorter input, which drops padded scores.
                ranked = sorted(zip(parts, score_seq), key=lambda pair: pair[1], reverse=True)
                top_recommendations = list(dict.fromkeys(p for p, _ in ranked[:top_k]))
                if len(top_recommendations) < top_k:
                    chosen = set(top_recommendations)
                    leftover_counts.update(p for p in parts if p not in chosen)
                recommendations[int(sid)] = top_recommendations

    # most_common() orders by count, ties in first-seen order.
    global_top = [p for p, _ in leftover_counts.most_common()]

    for recs in recommendations.values():
        missing = top_k - len(recs)
        if missing > 0:
            present = set(recs)
            # Stop scanning as soon as the session is complete instead of
            # materialising the whole filtered list for every session.
            recs.extend(islice((p for p in global_top if p not in present), missing))

    output_json = {
        "target": recommendations
    }

    with open(output_path, "w") as f:
        json.dump(output_json, f, indent=4)

    print(f"Archivo '{output_path}' generado con éxito.")

# Load the best checkpoint produced by training. map_location lets a
# checkpoint saved from the GPU be restored on a CPU-only machine as well.
model.load_state_dict(torch.load(best_model_file, map_location=device, weights_only=True))
model.to(device)
model.eval()

# Build the test dataset (all row groups, zero targets).
test_data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/test_full.parquet"
test_dataset = IterableSessionDataset(test_data_path, fraction=1.0, mode='test')

# Run inference and write the submission file.
inference(model, test_dataset, device, top_k=5)

Archivo 'submission.json' generado con éxito.


# Más personalización en la inferencia

In [4]:
import pandas as pd

def load_product_attributes(file_path):
    """Load product attributes from a Parquet file into a lookup dict.

    Args:
        file_path: Path of a Parquet file with the columns ``partnumber``,
            ``cod_section``, ``family`` and ``discount``.

    Returns:
        ``{partnumber: {"cod_section": ..., "family": ..., "discount": ...}}``.
        When a partnumber appears more than once, its last row wins.
    """
    # Add more attribute columns here if needed.
    attribute_cols = ["cod_section", "family", "discount"]
    df = pd.read_parquet(file_path, columns=["partnumber", *attribute_cols])

    # Build the dict in one vectorised step instead of iterating rows with
    # iterrows(), which is slow and upcasts mixed-dtype rows. Keeping the last
    # duplicate matches the overwrite order of a row-by-row fill, and a unique
    # index is required by to_dict(orient="index").
    unique_products = df.drop_duplicates(subset="partnumber", keep="last")
    return unique_products.set_index("partnumber")[attribute_cols].to_dict(orient="index")




In [None]:
import torch
import json
from torch.utils.data import DataLoader

# inference_v2 below reads 'product_attributes' as a module-level global,
# so it must be loaded before that function is called.
# For example:
# product_attributes = load_product_attributes("path_to_products_file")

# Load the attribute dictionary with the loader function.
product_attributes = load_product_attributes("/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/optimized_products_transformed.parquet")


def inference_v2(model, test_dataset, device, top_k=5, output_path="submission_v2.json"):
    """Like ``inference`` but completes short lists with similar products first.

    For each session the products at the ``top_k`` highest-scoring positions
    are taken and de-duplicated. If fewer than ``top_k`` remain, other
    products of the same session sharing ``cod_section`` and ``family`` with
    an already recommended product are appended. Anything still missing is
    filled from a global list of the products those sessions left out,
    ordered by frequency.

    Attributes are read from the module-level ``product_attributes`` dict.
    Products absent from it are never treated as similar; previously two
    unknown products matched each other because both lookups returned None.

    Args:
        model: Trained model returning per-timestep logits.
        test_dataset: Dataset yielding test sessions.
        device: Device on which the model runs.
        top_k: Number of recommendations per session.
        output_path: Destination JSON file, written as
            ``{"target": {session_id: [partnumber, ...]}}``.
    """
    from collections import Counter
    from itertools import islice

    model.eval()
    test_loader = DataLoader(
        test_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn,
        pin_memory=True
    )

    recommendations = {}
    leftover_counts = Counter()

    with torch.no_grad():
        for batch in test_loader:
            product_embeddings, session_features, _, session_ids, partnumbers_list = batch

            product_embeddings = product_embeddings.to(device, non_blocking=True)
            session_features = session_features.to(device, non_blocking=True)

            outputs = model(product_embeddings, session_features)
            scores = torch.sigmoid(outputs).cpu().numpy()

            for sid, parts, score_seq in zip(session_ids, partnumbers_list, scores):
                # Normalise ids once so every membership test compares ints.
                parts = [int(p) for p in parts]
                # zip() stops at the shorter input, which drops padded scores.
                ranked = sorted(zip(parts, score_seq), key=lambda pair: pair[1], reverse=True)
                top_recommendations = list(dict.fromkeys(p for p, _ in ranked[:top_k]))

                if len(top_recommendations) < top_k:
                    # Collect same-session products whose attributes match
                    # one of the products already recommended.
                    seeds = list(top_recommendations)
                    similar_candidates = []
                    for rec in seeds:
                        rec_attrs = product_attributes.get(rec)
                        if not rec_attrs:
                            continue
                        for p in parts:
                            if p in seeds:
                                continue
                            p_attrs = product_attributes.get(p)
                            if not p_attrs:
                                continue
                            if (p_attrs.get("cod_section") == rec_attrs.get("cod_section") and
                                    p_attrs.get("family") == rec_attrs.get("family")):
                                similar_candidates.append(p)

                    for candidate in similar_candidates:
                        if len(top_recommendations) >= top_k:
                            break
                        if candidate not in top_recommendations:
                            top_recommendations.append(candidate)

                    # Feed the global fallback if the session is still short.
                    if len(top_recommendations) < top_k:
                        chosen = set(top_recommendations)
                        leftover_counts.update(p for p in parts if p not in chosen)

                recommendations[int(sid)] = top_recommendations

    # Global fallback; most_common() orders by count, ties in first-seen order.
    global_top = [p for p, _ in leftover_counts.most_common()]

    for recs in recommendations.values():
        missing = top_k - len(recs)
        if missing > 0:
            present = set(recs)
            # Stop scanning as soon as the session is complete instead of
            # materialising the whole filtered list for every session.
            recs.extend(islice((p for p in global_top if p not in present), missing))

    output_json = {
        "target": recommendations
    }

    with open(output_path, "w") as f:
        json.dump(output_json, f, indent=4)

    print(f"Archivo '{output_path}' generado con éxito.")

# Load the best checkpoint produced by training. map_location lets a
# checkpoint saved from the GPU be restored on a CPU-only machine as well.
model.load_state_dict(torch.load(best_model_file, map_location=device, weights_only=True))
model.to(device)
model.eval()

# Build the test dataset (all row groups, zero targets).
test_data_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed_v2/test_full.parquet"
test_dataset = IterableSessionDataset(test_data_path, fraction=1.0, mode='test')

# Run the attribute-aware inference and write the submission file.
inference_v2(model, test_dataset, device, top_k=5)


Archivo 'submission_v2.json' generado con éxito.
