In [1]:
import tensorflow as tf
print("GPU disponible:", tf.config.list_physical_devices('GPU'))

2025-01-13 10:02:18.631435: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-13 10:02:18.761481: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736758938.809560    1405 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736758938.822557    1405 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-13 10:02:18.938450: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

GPU disponible: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# **Refractorización del pipeline**

## Bloque de Preparación

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import implicit
import json
import os

# Definir rutas de los archivos
product_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/products_data.pkl'
user_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/user_data.csv'
train_enriched_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/hybrid_model/train_preprocessed.pkl"
test_enriched_path = "/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/hybrid_model/test_preprocessed.pkl"

# Cargar los datasets
products = pd.read_pickle(product_path)
users = pd.read_csv(user_path)
train = pd.read_pickle(train_enriched_path)
test = pd.read_pickle(test_enriched_path)

# Preprocesamiento de datos
def preprocess_data(train, test):
    # Convertir 'pagetype' a numérico en 'train'
    train['pagetype'] = pd.to_numeric(train['pagetype'], errors='coerce')
    train['pagetype'] = train['pagetype'].fillna(-1)
    train['pagetype'] = train['pagetype'].astype('int16')
    
    # Obtener los user_id únicos en 'train'
    train_user_ids = set(train['user_id'].unique())
    train_user_ids.discard(-1)  # Remover -1 si representa usuarios no logueados

    # Identificar sesiones con interacciones en 'test'
    test_sessions_with_interactions = set(test[test['partnumber'].notnull()]['session_id'].unique())
    
    return train_user_ids, test_sessions_with_interactions

# Clasificar sesiones
def classify_sessions(test, train_user_ids, test_sessions_with_interactions):
    def classify_session(row):
        user_id = row['user_id']
        session_id = row['session_id']
        
        if user_id == -1:
            # Usuario no logueado
            if session_id in test_sessions_with_interactions:
                return 'Usuario recurrente no logueado'
            else:
                return 'Usuario nuevo no logueado'
        else:
            # Usuario logueado
            if user_id in train_user_ids:
                return 'Usuario recurrente logueado'
            else:
                return 'Usuario nuevo logueado'
    test['user_class'] = test.apply(classify_session, axis=1)
    return test

# Ejecutar preprocesamiento y clasificación
train_user_ids, test_sessions_with_interactions = preprocess_data(train, test)
test = classify_sessions(test, train_user_ids, test_sessions_with_interactions)

In [3]:
print(f'Columnas de test {test.columns}')
print(len(test.columns))

Columnas de test Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype', 'user_class'],
      dtype='object')
9


In [None]:
print(f'Columnas de train {train.columns}')
print(len(train.columns))

Columnas de train Index(['session_id', 'date', 'timestamp_local', 'add_to_cart', 'user_id',
       'country', 'partnumber', 'device_type', 'pagetype'],
      dtype='object')
9


In [None]:
print(train.info)
print(test.info)

<bound method DataFrame.info of           session_id       date         timestamp_local  add_to_cart  user_id  \
10725642     4696487 2024-06-15 2024-06-15 11:02:27.710            1       -1   
35628113     4572779 2024-06-15 2024-06-15 03:43:20.699            1       -1   
35628118     4572779 2024-06-15 2024-06-15 03:43:08.240            1       -1   
9475628      3239459 2024-06-15 2024-06-15 03:37:24.111            1       -1   
9475555      3239459 2024-06-15 2024-06-15 03:37:03.935            1       -1   
...              ...        ...                     ...          ...      ...   
11700533      150565 2024-06-01 2024-06-01 02:00:00.337            0       -1   
14954883     4285254 2024-06-01 2024-06-01 02:00:00.127            0       -1   
44709791     4040227 2024-06-01 2024-06-01 02:00:00.083            0    64048   
2046107      2464407 2024-06-01 2024-06-01 02:00:00.056            0     4996   
35407364     4747800 2024-06-01 2024-06-01 02:00:00.051            0       -1

## Funciones de cada Modelo

### Modelo de Popularidad

In [6]:
def train_popularity_model(train):
    # Calcular la popularidad de los productos en el conjunto de entrenamiento
    product_popularity = train.groupby('partnumber')['add_to_cart'].sum().reset_index()
    product_popularity.rename(columns={'add_to_cart': 'popularity'}, inplace=True)
    product_popularity.sort_values(by='popularity', ascending=False, inplace=True)
    popular_products = product_popularity['partnumber'].tolist()
    return popular_products

# Entrenar el modelo de popularidad
popular_products = train_popularity_model(train)

def recommend_by_popularity(popular_products, top_n=5):
    return popular_products[:top_n]

In [7]:
def enhanced_popularity_model(train_data):
    # Crear una copia explícita del DataFrame
    train = train_data.copy()
    
    # 1. Considerar el tipo de página
    page_weights = {
        0: 1.0,    # producto
        1: 0.5,    # categoría
        2: 0.3,    # búsqueda
        -1: 0.1    # otros
    }
    train.loc[:, 'page_weight'] = train['pagetype'].map(page_weights)
    
    # 2. Calcular popularidad ponderada
    popularity_df = train.groupby('partnumber').agg({
        'add_to_cart': 'sum',
        'page_weight': 'count'
    }).reset_index()
    
    # 3. Normalizar los valores
    popularity_df['add_to_cart_norm'] = (popularity_df['add_to_cart'] - popularity_df['add_to_cart'].min()) / \
                                      (popularity_df['add_to_cart'].max() - popularity_df['add_to_cart'].min())
    popularity_df['page_weight_norm'] = (popularity_df['page_weight'] - popularity_df['page_weight'].min()) / \
                                      (popularity_df['page_weight'].max() - popularity_df['page_weight'].min())
    
    # 4. Calcular score final
    popularity_df['popularity_score'] = (
        0.7 * popularity_df['add_to_cart_norm'] +
        0.3 * popularity_df['page_weight_norm']
    )
    
    return popularity_df.sort_values('popularity_score', ascending=False)['partnumber'].tolist()

In [8]:
def get_country_specific_recommendations(train_data, country):
    # Crear una copia explícita para el filtrado por país
    country_data = train_data[train_data['country'] == country].copy()
    
    # Si no hay suficientes datos para el país, usar datos globales
    if len(country_data) < 100:  # umbral arbitrario, ajustar según necesidad
        return enhanced_popularity_model(train_data.copy())
    
    # Obtener productos populares específicos del país
    country_popular = enhanced_popularity_model(country_data)
    
    return country_popular

def blend_recommendations(general_recs, country_recs, ratio=0.7):
    # Asegurarse de que tenemos suficientes recomendaciones
    if not general_recs:
        return country_recs[:5]
    if not country_recs:
        return general_recs[:5]
    
    num_general = int(5 * ratio)
    num_country = 5 - num_general
    
    final_recs = general_recs[:num_general]
    country_specific = [x for x in country_recs if x not in final_recs]
    final_recs.extend(country_specific[:num_country])
    
    # Asegurarse de que tenemos exactamente 5 recomendaciones
    while len(final_recs) < 5:
        if len(general_recs) > len(final_recs):
            next_rec = next(rec for rec in general_recs if rec not in final_recs)
            final_recs.append(next_rec)
    
    return final_recs[:5]

### Modelo Basado en Contenido

In [9]:
def prepare_content_model(products):
    # Asegurarnos de que los embeddings son arrays de NumPy
    products['embedding'] = products['embedding'].apply(np.array)
    # Crear un diccionario {partnumber: embedding}
    embeddings_dict = dict(zip(products['partnumber'], products['embedding']))
    return embeddings_dict

def find_similar_products(partnumber, embeddings_dict, top_n=5):
    target_embedding = embeddings_dict.get(partnumber)
    if not isinstance(target_embedding, np.ndarray):
        print(f"Advertencia: El embedding del producto {partnumber} es inválido.")
        return []

    # Obtener todas las embeddings y los partnumbers correspondientes
    all_partnumbers = []
    all_embeddings = []
    for pnum, emb in embeddings_dict.items():
        if isinstance(emb, np.ndarray) and emb.shape == target_embedding.shape:
            all_partnumbers.append(pnum)
            all_embeddings.append(emb)

    # Convertir a arrays de NumPy
    all_embeddings = np.stack(all_embeddings)

    # Calcular la similitud de coseno
    similarities = cosine_similarity([target_embedding], all_embeddings)[0]

    # Obtener los índices de los productos más similares (excluyendo el propio producto)
    similar_indices = similarities.argsort()[::-1]
    similar_partnumbers = []
    for idx in similar_indices:
        if all_partnumbers[idx] != partnumber:
            similar_partnumbers.append(all_partnumbers[idx])
        if len(similar_partnumbers) == top_n:
            break

    return similar_partnumbers

def recommend_by_content(partnumbers_interacted, embeddings_dict, top_n=5):
    recommendations = []
    for partnumber in partnumbers_interacted:
        # Obtener productos similares
        similar_products = find_similar_products(partnumber, embeddings_dict, top_n=top_n)
        # Añadir los productos similares a la lista de recomendaciones
        recommendations.extend(similar_products)

    # Eliminar productos ya vistos y duplicados
    recommendations = [p for p in recommendations if p not in partnumbers_interacted]
    recommendations = list(dict.fromkeys(recommendations))

    return recommendations[:top_n]

# Preparar el modelo basado en contenido
embeddings_dict = prepare_content_model(products)

### Modelo Colaborativo

In [10]:
import tensorflow as tf
print("GPU disponible:", tf.config.list_physical_devices('GPU'))

GPU disponible: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# 1. Importaciones
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import json
from scipy.sparse import csr_matrix
import gc
import psutil
import cupy as cp
from sklearn.preprocessing import StandardScaler

In [12]:
# 2. Función de preparación de datos
def prepare_collaborative_model(train_logged_in, products, sample_fraction=0.05, batch_size=1000):
    """
    Preparar datos para LambdaRank con ranking explícito
    """
    print(f"Tamaño original del dataset: {len(train_logged_in)}")
    
    # Muestrear usuarios
    unique_users = train_logged_in['user_id'].unique()
    num_users_to_sample = int(len(unique_users) * sample_fraction)
    sampled_users = cp.random.choice(unique_users, size=num_users_to_sample, replace=False)
    
    # Filtrar dataset
    train_sample = train_logged_in[train_logged_in['user_id'].isin(cp.asnumpy(sampled_users))].copy()
    print(f"Tamaño del dataset muestreado: {len(train_sample)}")
    
    # Preparar datos
    le_user = LabelEncoder()
    le_product = LabelEncoder()
    
    train_sample['user_encoded'] = le_user.fit_transform(train_sample['user_id'])
    train_sample['product_encoded'] = le_product.fit_transform(train_sample['partnumber'])
    
    # Crear ranking por usuario
    train_sample['relevance'] = train_sample['add_to_cart']
    
    features = []
    labels = []
    group_sizes = []
    current_group_size = 0
    
    # Procesar por usuario para mantener estructura de ranking
    for user_id in train_sample['user_id'].unique():
        user_data = train_sample[train_sample['user_id'] == user_id]
        
        # Asegurar que tenemos suficientes interacciones
        if len(user_data) < 2:  # Necesitamos al menos 2 items para ranking
            continue
            
        # Crear features para cada interacción del usuario
        user_features = []
        user_labels = []
        
        for _, row in user_data.iterrows():
            product_data = products.loc[row['partnumber']]
            feature_vector = [
                row['user_encoded'],
                row['product_encoded'],
                row['pagetype'],
                product_data['discount'],
                product_data['color_id'],
                product_data['cod_section'],
                product_data['family'],
                *product_data['embedding']
            ]
            user_features.append(feature_vector)
            user_labels.append(row['relevance'])
        
        if user_features:
            features.extend(user_features)
            labels.extend(user_labels)
            group_sizes.append(len(user_features))
            current_group_size += 1
            
        if current_group_size % 100 == 0:
            print(f"Procesados {current_group_size} usuarios")
    
    # Convertir a arrays numpy
    features_array = np.array(features, dtype=np.float32)
    labels_array = np.array(labels, dtype=np.int32)
    
    print(f"Features shape: {features_array.shape}")
    print(f"Labels shape: {labels_array.shape}")
    print(f"Número de grupos: {len(group_sizes)}")
    print(f"Distribución de tamaños de grupo: min={min(group_sizes)}, max={max(group_sizes)}, avg={np.mean(group_sizes):.2f}")
    
    return features_array, labels_array, group_sizes, le_user, le_product

In [13]:
def create_validation_set(X, y, group_sizes, validation_fraction=0.2):
    """
    Crear conjunto de validación manteniendo grupos intactos
    """
    n_groups = len(group_sizes)
    n_val_groups = int(n_groups * validation_fraction)
    
    # Índices aleatorios para validación
    val_group_indices = np.random.choice(n_groups, n_val_groups, replace=False)
    
    # Crear máscaras para train y validación
    start_idx = 0
    train_mask = np.ones(len(X), dtype=bool)
    
    for i, size in enumerate(group_sizes):
        if i in val_group_indices:
            train_mask[start_idx:start_idx + size] = False
        start_idx += size
    
    val_mask = ~train_mask
    
    return (X[train_mask], y[train_mask], [s for i, s in enumerate(group_sizes) if i not in val_group_indices],
            X[val_mask], y[val_mask], [s for i, s in enumerate(group_sizes) if i in val_group_indices])

In [14]:
def train_lambdarank_model(train_logged_in, products, sample_fraction=0.05, batch_size=1000, validation_fraction=0.2):
    """
    Entrenar modelo LambdaRank con ajustes para evitar el sobreajuste perfecto
    """
    # Preparar datos
    X, y, group_sizes, le_user, le_product = prepare_collaborative_model(
        train_logged_in, 
        products,
        sample_fraction=sample_fraction,
        batch_size=batch_size
    )
    
    # Normalizar features numéricas
    feature_names = ['user_id', 'product_id', 'pagetype', 'discount', 'color_id', 
                    'section', 'family'] + [f'emb_{i}' for i in range(X.shape[1]-7)]
    
    # Función para feature engineering
    def add_interaction_features(X, feature_names):
        X_df = pd.DataFrame(X, columns=feature_names)
        
        # Más interacciones
        X_df['pagetype_x_user'] = X_df['pagetype'] * X_df['user_id']
        X_df['pagetype_x_discount'] = X_df['pagetype'] * X_df['discount']
        X_df['user_x_section'] = X_df['user_id'] * X_df['section']
        
        # Agregaciones por usuario
        user_stats = X_df.groupby('user_id').agg({
            'pagetype': ['mean', 'std'],
            'discount': 'mean'
        }).fillna(0)
        
        feature_names.extend(['pagetype_x_user', 'pagetype_x_discount', 'user_x_section'])
        
        return X_df.values, feature_names
    
    # Aplicar feature engineering
    print("Aplicando feature engineering...")
    X, feature_names = add_interaction_features(X, feature_names)
    
    # Escalar features
    print("Escalando features...")
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Crear conjuntos de entrenamiento y validación
    print("Creando conjuntos de train y validación...")
    X_train, y_train, group_sizes_train, X_val, y_val, group_sizes_val = create_validation_set(
        X, y, group_sizes, validation_fraction=validation_fraction
    )
    
    print("Creando datasets de LightGBM...")
    train_data = lgb.Dataset(
        X_train, 
        label=y_train,
        group=group_sizes_train,
        feature_name=feature_names
    )
    
    valid_data = lgb.Dataset(
        X_val,
        label=y_val,
        group=group_sizes_val,
        reference=train_data
    )
    
    # Parámetros significativamente ajustados
    
    params = {
        'objective': 'lambdarank',
        'metric': ['ndcg', 'map', 'auc'],
        'ndcg_eval_at': [5, 10, 15],
        
        # Ajustes para permitir más splits
        'num_leaves': 256,              # Reducir
        'max_depth': 6,                 # Reducir
        'min_data_in_leaf': 10,         # Aumentar
        'min_gain_to_split': 0.0001,    # Reducir
        
        # Learning rate más agresivo
        'learning_rate': 0.01,          # Aumentar
        
        # Muestreo más agresivo
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        
        # Regularización más suave
        'lambda_l1': 0.005,
        'lambda_l2': 0.005,
        
        # Balance más agresivo
        'scale_pos_weight': 3.0,
        'pos_bagging_fraction': 1.0,
        'neg_bagging_fraction': 0.5,
        
        # Otros ajustes
        'max_bin': 255,
        'min_sum_hessian_in_leaf': 0.00001,
        'boost_from_average': True,
        'first_metric_only': False,     # Permitir múltiples métricas
        'feature_fraction_bynode': 0.8,
        
        # Nuevos parámetros
        'boost_from_average': True,
        'is_unbalance': True,
        'early_stopping_round': 50      # Más paciencia
    }

    
    print("Entrenando modelo LambdaRank en CPU...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=300,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=10),
            lgb.reset_parameter(learning_rate=lambda iter: 
                                    0.01 * (0.995 ** iter)),
            # Nuevo callback para feature importance
            lgb.record_evaluation({
                'feature_importance': []
            }),
            # Añadir callback personalizado para métricas adicionales
            lgb.callback.record_evaluation({
                'ndcg@5': [],
                'ndcg@10': [],
                'ndcg@15': [],
                'map@5': [],
                'map@10': [],
                'map@15': [],
                'auc': [],
                'average_precision': [],
                'precision@5': [],
                'recall@5': []
                })
            ]
    )       
        
    # Análisis detallado
    print("\nMétricas finales:")
    metrics = ['ndcg@5', 'ndcg@10', 'map']
    datasets = ['training', 'validation']
    
    for dataset in datasets:
        print(f"\n{dataset.capitalize()}:")
        for metric in metrics:
            try:
                value = model.best_score[dataset][metric]
                print(f"  {metric}: {value:.4f}")
            except:
                print(f"  {metric}: No disponible")
    
    # Análisis de features
    importance = model.feature_importance(importance_type='gain')
    feature_imp = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)
    
    print("\nTop 10 features más importantes:")
    print(feature_imp.head(10))
    
    # Guardar información del modelo
    model_info = {
        'params': params,
        'best_iteration': model.best_iteration,
        'feature_importance': feature_imp.to_dict(),
        'best_score': model.best_score
    }
    
    return model, le_user, le_product, model_info

# Uso
print("Iniciando entrenamiento del modelo LambdaRank...")
train_logged_in = train[train['user_id'] != -1].copy()

# Entrenar modelo con validación
lambdarank_model, le_user, le_product, model_info = train_lambdarank_model(
    train_logged_in, 
    products, 
    sample_fraction=0.05,
    batch_size=2000,
    validation_fraction=0.15
)

# Imprimir información detallada del modelo
print("\nInformación del modelo:")
print(f"Mejor iteración: {model_info['best_iteration']}")
print("\nParámetros utilizados:")
for param, value in model_info['params'].items():
    print(f"{param}: {value}")
    
    

Iniciando entrenamiento del modelo LambdaRank...
Tamaño original del dataset: 5488678
Tamaño del dataset muestreado: 275401
Procesados 100 usuarios
Procesados 200 usuarios
Procesados 300 usuarios
Procesados 400 usuarios
Procesados 500 usuarios
Procesados 600 usuarios
Procesados 700 usuarios
Procesados 800 usuarios
Procesados 900 usuarios
Procesados 1000 usuarios
Procesados 1100 usuarios
Procesados 1200 usuarios
Procesados 1300 usuarios
Procesados 1400 usuarios
Procesados 1500 usuarios
Procesados 1600 usuarios
Procesados 1700 usuarios
Procesados 1800 usuarios
Procesados 1900 usuarios
Procesados 2000 usuarios
Procesados 2100 usuarios
Procesados 2200 usuarios
Procesados 2300 usuarios
Procesados 2400 usuarios
Procesados 2500 usuarios
Procesados 2600 usuarios
Procesados 2700 usuarios
Procesados 2800 usuarios
Procesados 2900 usuarios
Procesados 3000 usuarios
Procesados 3100 usuarios
Procesados 3200 usuarios
Procesados 3300 usuarios
Procesados 3400 usuarios
Procesados 3500 usuarios
Procesados

In [15]:
# Copia de código
'''def train_lambdarank_model(train_logged_in, products, sample_fraction=0.05, batch_size=1000, validation_fraction=0.2):
    """
    Entrenar modelo LambdaRank con ajustes para evitar el sobreajuste perfecto
    """
    # Preparar datos
    X, y, group_sizes, le_user, le_product = prepare_collaborative_model(
        train_logged_in, 
        products,
        sample_fraction=sample_fraction,
        batch_size=batch_size
    )
    
    # Normalizar features numéricas
    feature_names = ['user_id', 'product_id', 'pagetype', 'discount', 'color_id', 
                    'section', 'family'] + [f'emb_{i}' for i in range(X.shape[1]-7)]
    
    # Función para feature engineering
    def add_interaction_features(X, feature_names):
        X_df = pd.DataFrame(X, columns=feature_names)
        
        # Más interacciones
        X_df['pagetype_x_user'] = X_df['pagetype'] * X_df['user_id']
        X_df['pagetype_x_discount'] = X_df['pagetype'] * X_df['discount']
        X_df['user_x_section'] = X_df['user_id'] * X_df['section']
        
        # Agregaciones por usuario
        user_stats = X_df.groupby('user_id').agg({
            'pagetype': ['mean', 'std'],
            'discount': 'mean'
        }).fillna(0)
        
        feature_names.extend(['pagetype_x_user', 'pagetype_x_discount', 'user_x_section'])
        
        return X_df.values, feature_names
    
    # Aplicar feature engineering
    print("Aplicando feature engineering...")
    X, feature_names = add_interaction_features(X, feature_names)
    
    # Escalar features
    print("Escalando features...")
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Crear conjuntos de entrenamiento y validación
    print("Creando conjuntos de train y validación...")
    X_train, y_train, group_sizes_train, X_val, y_val, group_sizes_val = create_validation_set(
        X, y, group_sizes, validation_fraction=validation_fraction
    )
    
    print("Creando datasets de LightGBM...")
    train_data = lgb.Dataset(
        X_train, 
        label=y_train,
        group=group_sizes_train,
        feature_name=feature_names
    )
    
    valid_data = lgb.Dataset(
        X_val,
        label=y_val,
        group=group_sizes_val,
        reference=train_data
    )
    
    # Parámetros significativamente ajustados
    
    params = {
        'objective': 'lambdarank',
        'metric': ['ndcg', 'map', 'auc'],
        'ndcg_eval_at': [5, 10, 15],
        
        # Ajustes para permitir más splits
        'num_leaves': 256,              # Reducir
        'max_depth': 6,                 # Reducir
        'min_data_in_leaf': 10,         # Aumentar
        'min_gain_to_split': 0.0001,    # Reducir
        
        # Learning rate más agresivo
        'learning_rate': 0.01,          # Aumentar
        
        # Muestreo más agresivo
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        
        # Regularización más suave
        'lambda_l1': 0.005,
        'lambda_l2': 0.005,
        
        # Balance más agresivo
        'scale_pos_weight': 3.0,
        'pos_bagging_fraction': 1.0,
        'neg_bagging_fraction': 0.5,
        
        # Otros ajustes
        'max_bin': 255,
        'min_sum_hessian_in_leaf': 0.00001,
        'boost_from_average': True,
        'first_metric_only': False,     # Permitir múltiples métricas
        'feature_fraction_bynode': 0.8,
        
        # Nuevos parámetros
        'boost_from_average': True,
        'is_unbalance': True,
        'early_stopping_round': 50      # Más paciencia
    }

    
    print("Entrenando modelo LambdaRank en CPU...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=300,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=10),
            lgb.reset_parameter(learning_rate=lambda iter: 
                                    0.01 * (0.995 ** iter)),
            # Nuevo callback para feature importance
            lgb.record_evaluation({
                'feature_importance': []
            }),
            # Añadir callback personalizado para métricas adicionales
            lgb.callback.record_evaluation({
                'ndcg@5': [],
                'ndcg@10': [],
                'ndcg@15': [],
                'map@5': [],
                'map@10': [],
                'map@15': [],
                'auc': [],
                'average_precision': [],
                'precision@5': [],
                'recall@5': []
                })
            ]
    )       
        
    # Análisis detallado
    print("\nMétricas finales:")
    metrics = ['ndcg@5', 'ndcg@10', 'map']
    datasets = ['training', 'validation']
    
    for dataset in datasets:
        print(f"\n{dataset.capitalize()}:")
        for metric in metrics:
            try:
                value = model.best_score[dataset][metric]
                print(f"  {metric}: {value:.4f}")
            except:
                print(f"  {metric}: No disponible")
    
    # Análisis de features
    importance = model.feature_importance(importance_type='gain')
    feature_imp = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)
    
    print("\nTop 10 features más importantes:")
    print(feature_imp.head(10))
    
    # Guardar información del modelo
    model_info = {
        'params': params,
        'best_iteration': model.best_iteration,
        'feature_importance': feature_imp.to_dict(),
        'best_score': model.best_score
    }
    
    return model, le_user, le_product, model_info

# Uso
print("Iniciando entrenamiento del modelo LambdaRank...")
train_logged_in = train[train['user_id'] != -1].copy()

# Entrenar modelo con validación
lambdarank_model, le_user, le_product, model_info = train_lambdarank_model(
    train_logged_in, 
    products, 
    sample_fraction=0.05,
    batch_size=2000,
    validation_fraction=0.15
)

# Imprimir información detallada del modelo
print("\nInformación del modelo:")
print(f"Mejor iteración: {model_info['best_iteration']}")
print("\nParámetros utilizados:")
for param, value in model_info['params'].items():
    print(f"{param}: {value}")
    
    '''

'def train_lambdarank_model(train_logged_in, products, sample_fraction=0.05, batch_size=1000, validation_fraction=0.2):\n    """\n    Entrenar modelo LambdaRank con ajustes para evitar el sobreajuste perfecto\n    """\n    # Preparar datos\n    X, y, group_sizes, le_user, le_product = prepare_collaborative_model(\n        train_logged_in, \n        products,\n        sample_fraction=sample_fraction,\n        batch_size=batch_size\n    )\n    \n    # Normalizar features numéricas\n    feature_names = [\'user_id\', \'product_id\', \'pagetype\', \'discount\', \'color_id\', \n                    \'section\', \'family\'] + [f\'emb_{i}\' for i in range(X.shape[1]-7)]\n    \n    # Función para feature engineering\n    def add_interaction_features(X, feature_names):\n        X_df = pd.DataFrame(X, columns=feature_names)\n        \n        # Más interacciones\n        X_df[\'pagetype_x_user\'] = X_df[\'pagetype\'] * X_df[\'user_id\']\n        X_df[\'pagetype_x_discount\'] = X_df[\'pagetype\'] *

In [16]:
def recommend_by_collaborative(user_id, model, le_user, le_product, products, top_n=10):
    try:
        if user_id in le_user.classes_:
            user_encoded = le_user.transform([user_id])[0]
            predictions = []
            batch_size = 1000
            all_products = products.index.values
            
            for i in range(0, len(all_products), batch_size):
                batch_products = all_products[i:i+batch_size]
                batch_features = []
                
                for prod in batch_products:
                    if prod in le_product.classes_:
                        prod_encoded = le_product.transform([prod])[0]
                        product_data = products.loc[prod]
                        
                        # Features base
                        base_features = [
                            user_encoded,
                            prod_encoded,
                            0,  # pagetype default
                            product_data['discount'],
                            product_data['color_id'],
                            product_data['cod_section'],
                            product_data['family']
                        ]
                        
                        # Embeddings
                        emb_features = product_data['embedding']
                        
                        # Features de interacción
                        interaction_features = [
                            0 * user_encoded,  # pagetype_x_user (con pagetype=0)
                            0 * product_data['discount'],  # pagetype_x_discount
                            user_encoded * product_data['cod_section'],  # user_x_section
                        ]
                        
                        features = base_features + list(emb_features) + interaction_features
                        batch_features.append(features)
                
                if batch_features:
                    scores = model.predict(batch_features)
                    for prod, score in zip(batch_products, scores):
                        predictions.append((prod, score))
            
            predictions.sort(key=lambda x: x[1], reverse=True)
            return [p[0] for p in predictions[:top_n]]
            
    except Exception as e:
        print(f"LambdaRank failed for user {user_id}: {str(e)}")
    
    return recommend_by_popularity(popular_products, top_n=10)

## Generación de recomendaciones

In [17]:
#  6. Función de recomendación por sesión
def generate_recommendations_for_session(session_id, user_id, session_data, user_class):
    country = session_data['country'].iloc[0]
    
    if user_class == 'Usuario recurrente logueado':
        recs = recommend_by_collaborative(
            user_id,
            lambdarank_model,
            le_user,
            le_product,
            products,
            top_n=10
        )
        
        country_recs = get_country_specific_recommendations(train, country)
        recs = blend_recommendations(recs, country_recs, ratio=0.8)  # Aumentar peso de LambdaRank
        
    elif user_class == 'Usuario recurrente no logueado':
        partnumbers_interacted = session_data['partnumber'].unique()
        if len(partnumbers_interacted) > 0:
            recs = recommend_by_content(partnumbers_interacted, embeddings_dict, top_n=10)
            country_recs = get_country_specific_recommendations(train, country)
            recs = blend_recommendations(recs, country_recs, ratio=0.7)  # Ajustar ratio
        else:
            recs = recommend_by_popularity(popular_products, top_n=5)
    else:
        country_recs = get_country_specific_recommendations(train, country)
        general_recs = recommend_by_popularity(popular_products, top_n=10)
        recs = blend_recommendations(general_recs, country_recs, ratio=0.6)  # Ajustar ratio
    
    return recs[:5]

# 7. Generar todas las recomendaciones
def generate_all_recommendations(test):
    user_recommendations = {}
    sessions = test[['session_id', 'user_id', 'user_class']].drop_duplicates()

    for _, row in sessions.iterrows():
        session_id = row['session_id']
        user_id = row['user_id']
        user_class = row['user_class']
        session_data = test[test['session_id'] == session_id]
        recs = generate_recommendations_for_session(session_id, user_id, session_data, user_class)
        user_recommendations[session_id] = recs
    return user_recommendations

# 8. Generar recomendaciones
user_recommendations = generate_all_recommendations(test)

# 9. Verificar completitud
total_sessions_in_test = test['session_id'].nunique()
total_sessions_with_recommendations = len(user_recommendations)

print(f"Total de sesiones en el conjunto de prueba: {total_sessions_in_test}")
print(f"Total de sesiones con recomendaciones: {total_sessions_with_recommendations}")

if total_sessions_in_test == total_sessions_with_recommendations:
    print("Todas las sesiones tienen recomendaciones.")
else:
    print(f"Faltan recomendaciones para {total_sessions_in_test - total_sessions_with_recommendations} sesiones.")

Total de sesiones en el conjunto de prueba: 7349
Total de sesiones con recomendaciones: 7349
Todas las sesiones tienen recomendaciones.


## Generación del Json

In [25]:
# 10. Preparar y guardar JSON
def prepare_output_for_json(user_recommendations):
    user_recommendations_str_keys = {str(session_id): recs for session_id, recs in user_recommendations.items()}
    for session_id, recs in user_recommendations_str_keys.items():
        user_recommendations_str_keys[session_id] = [int(p) for p in recs]
    return user_recommendations_str_keys

def save_recommendations_to_json(user_recommendations_str_keys, output_path):
    """
    Guardar las recomendaciones en formato JSON
    """
    # Crear el directorio si no existe
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Preparar el formato final
    output = {'target': user_recommendations_str_keys}
    
    # Guardar el archivo
    with open(output_path, 'w') as f:
        json.dump(output, f)
    print(f"Archivo guardado exitosamente en: {output_path}")

# Preparar y guardar JSON
output_json_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3.json'
user_recommendations_str_keys = prepare_output_for_json(user_recommendations)
save_recommendations_to_json(user_recommendations_str_keys, output_json_path)

# Verificar JSON
def verify_json_output(output_path):
    with open(output_path, 'r') as f:
        data = json.load(f)
    print(f"Número de sesiones en el JSON: {len(data['target'])}")
    print("\nPrimeras 5 sesiones en el JSON:")
    for session_id in list(data['target'].keys())[:5]:
        print(f"Sesión {session_id}: {data['target'][session_id]}")

verify_json_output(output_json_path)

Archivo guardado exitosamente en: /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3.json
Número de sesiones en el JSON: 7349

Primeras 5 sesiones en el JSON:
Sesión 746: [38002, 12468, 834, 9039, 17547]
Sesión 1306: [693, 42576, 23029, 24271, 20647]
Sesión 1364: [15604, 32851, 13891, 24271, 20647]
Sesión 1377: [40779, 24271, 2763, 14230, 25032]
Sesión 2251: [40779, 24271, 2763, 14230, 25032]


## Analizar resultados y comparación

In [None]:
def analyze_recommendations(user_recommendations, test, train):
    print("=== Análisis de Recomendaciones ===")
    
    # 1. Distribución de tipos de usuario
    user_types = test.groupby('user_class').size()
    print("\nDistribución de tipos de usuario:")
    print(user_types)
    
    # 2. Análisis por país
    country_dist = test.groupby('country').size()
    print("\nDistribución por país:")
    print(country_dist)
    
    # 3. Verificar diversidad de recomendaciones
    all_recs = [item for sublist in user_recommendations.values() for item in sublist]
    unique_recs = len(set(all_recs))
    print(f"\nNúmero total de productos únicos recomendados: {unique_recs}")
    
    # 4. Verificar overlap con productos populares
    popular_products_set = set(popular_products[:100])
    rec_overlap = sum(1 for x in set(all_recs) if x in popular_products_set)
    print(f"Overlap con productos populares: {rec_overlap/unique_recs:.2%}")
    
    # 5. Análisis de recomendaciones por tipo de usuario
    print("\nAnálisis por tipo de usuario:")
    for user_class in test['user_class'].unique():
        class_sessions = test[test['user_class'] == user_class]['session_id'].unique()
        class_recs = [user_recommendations[s] for s in class_sessions if s in user_recommendations]
        class_unique_recs = len(set([item for sublist in class_recs for item in sublist]))
        print(f"\n{user_class}:")
        print(f"Productos únicos recomendados: {class_unique_recs}")
    
    # 6. Verificar recomendaciones por país
    print("\nEjemplo de recomendaciones por país:")
    for country in test['country'].unique()[:3]:
        country_sessions = test[test['country'] == country]['session_id'].unique()[:2]
        print(f"\nPaís: {country}")
        for session in country_sessions:
            print(f"Session {session}: {user_recommendations[session]}")

def compare_versions(json_path_v3, json_path_v4):
    """
    Compara las versiones del modelo anterior y LambdaRank
    """
    # Cargar ambas versiones
    with open(json_path_v3, 'r') as f:
        v3 = json.load(f)
    with open(json_path_v4, 'r') as f:
        v4 = json.load(f)
    
    # Comparar recomendaciones
    different_recs = 0
    total_sessions = len(v3['target'])
    
    # Análisis detallado de cambios
    changes_analysis = {
        'total_different': 0,
        'completely_different': 0,
        'partially_different': 0,
        'overlap_stats': []
    }
    
    for session_id in v3['target']:
        old_recs = set(v3['target'][session_id])
        new_recs = set(v4['target'][session_id])
        
        if old_recs != new_recs:
            different_recs += 1
            
            # Analizar el tipo de cambio
            overlap = len(old_recs.intersection(new_recs))
            if overlap == 0:
                changes_analysis['completely_different'] += 1
            else:
                changes_analysis['partially_different'] += 1
                changes_analysis['overlap_stats'].append(overlap/5)  # 5 es el número total de recomendaciones
    
    changes_analysis['total_different'] = different_recs
    
    # Imprimir resultados
    print(f"Sesiones con recomendaciones diferentes: {different_recs/total_sessions:.2%}")
    print(f"Cambios completos: {changes_analysis['completely_different']/total_sessions:.2%}")
    print(f"Cambios parciales: {changes_analysis['partially_different']/total_sessions:.2%}")
    if changes_analysis['overlap_stats']:
        print(f"Promedio de overlap en cambios parciales: {np.mean(changes_analysis['overlap_stats']):.2%}")
    
    # Mostrar ejemplos de cambios
    print("\nEjemplos de cambios en recomendaciones:")
    for session_id in list(v3['target'].keys())[:5]:
        if v3['target'][session_id] != v4['target'][session_id]:
            print(f"\nSession {session_id}:")
            print(f"Anterior (v3): {v3['target'][session_id]}")
            print(f"Nueva (v4): {v4['target'][session_id]}")

def analyze_lambdarank_performance(user_recommendations, test):
    print("\n=== Análisis de Rendimiento de LambdaRank ===")
    
    # Análisis existente...
    
    # Añadir análisis por país
    print("\nRendimiento por país:")
    for country in test['country'].unique():
        country_sessions = test[test['country'] == country]['session_id'].unique()
        country_recs = [user_recommendations[s] for s in country_sessions if s in user_recommendations]
        print(f"\nPaís {country}:")
        print(f"Sesiones: {len(country_sessions)}")
        print(f"Productos únicos recomendados: {len(set([item for sublist in country_recs for item in sublist]))}")
    
    # Añadir análisis de cobertura de categorías
    print("\nCobertura de categorías:")
    all_recs = [item for sublist in user_recommendations.values() for item in sublist]
    rec_categories = products.loc[all_recs]['cod_section'].unique()
    print(f"Categorías cubiertas: {len(rec_categories)}")
    
    return lambdarank_used, unique_recs, rec_counts

# Ejecutar análisis
print("Ejecutando análisis completo...")
analyze_recommendations(user_recommendations, test, train)
compare_versions('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions_save/old/predictions_3_shared_stratify_v3_436.json', '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3.json')
lambdarank_used, unique_recs, rec_counts = analyze_lambdarank_performance(user_recommendations, test)

In [27]:
def analyze_recommendations_detailed(user_recommendations, test, train, products):
    print("=== Análisis Detallado de Recomendaciones ===")
    
    # 1. Métricas básicas
    print("\n1. Métricas Básicas:")
    all_recs = [item for sublist in user_recommendations.values() for item in sublist]
    unique_recs = len(set(all_recs))
    total_recs = len(all_recs)
    print(f"Total de recomendaciones: {total_recs}")
    print(f"Productos únicos recomendados: {unique_recs}")
    print(f"Ratio de diversidad: {unique_recs/total_recs:.4f}")
    
    # 2. Distribución por tipo de usuario
    print("\n2. Distribución por Tipo de Usuario:")
    user_types = test.groupby('user_class').agg({
        'session_id': 'count',
        'user_id': 'nunique'
    }).rename(columns={
        'session_id': 'num_sessions',
        'user_id': 'unique_users'
    })
    print(user_types)
    
    # 3. Análisis por país
    print("\n3. Análisis por País:")
    country_stats = {}
    for country in test['country'].unique():
        country_sessions = test[test['country'] == country]['session_id'].unique()
        country_recs = [user_recommendations[s] for s in country_sessions if s in user_recommendations]
        country_products = set([item for sublist in country_recs for item in sublist])
        
        country_stats[country] = {
            'sessions': len(country_sessions),
            'unique_products': len(country_products),
            'avg_products_per_session': len([item for sublist in country_recs for item in sublist]) / len(country_sessions)
        }
    
    for country, stats in country_stats.items():
        print(f"\nPaís {country}:")
        for metric, value in stats.items():
            print(f"- {metric}: {value:.2f}")
    
    # 4. Análisis de categorías
    print("\n4. Análisis de Categorías:")
    rec_products = pd.DataFrame(all_recs, columns=['partnumber'])
    rec_products = rec_products.merge(products[['cod_section', 'family']], 
                                    left_on='partnumber', 
                                    right_index=True)
    
    category_stats = rec_products.groupby('cod_section').agg({
        'partnumber': 'count',
        'family': 'nunique'
    }).rename(columns={
        'partnumber': 'recommendations',
        'family': 'unique_families'
    })
    
    print("\nDistribución por categoría:")
    print(category_stats.sort_values('recommendations', ascending=False).head())
    
    # 5. Análisis de diversidad temporal
    print("\n5. Análisis de Cobertura:")
    total_products = len(products)
    total_categories = products['cod_section'].nunique()
    total_families = products['family'].nunique()
    
    print(f"Cobertura de productos: {unique_recs/total_products:.2%}")
    print(f"Cobertura de categorías: {category_stats.index.nunique()/total_categories:.2%}")
    print(f"Cobertura de familias: {rec_products['family'].nunique()/total_families:.2%}")
    
    # 6. Análisis de balance
    print("\n6. Análisis de Balance:")
    rec_counts = pd.Series(all_recs).value_counts()
    print(f"Estadísticas de frecuencia de recomendación:")
    print(f"- Media: {rec_counts.mean():.2f}")
    print(f"- Mediana: {rec_counts.median():.2f}")
    print(f"- Desv. Est.: {rec_counts.std():.2f}")
    print(f"- Max: {rec_counts.max():.2f}")
    print(f"- Min: {rec_counts.min():.2f}")
    
    # 7. Análisis de similitud entre recomendaciones
    print("\n7. Análisis de Similitud:")
    def calculate_session_similarity(recs1, recs2):
        return len(set(recs1) & set(recs2)) / len(set(recs1) | set(recs2))
    
    similarities = []
    sample_sessions = list(user_recommendations.keys())[:1000]  # Limitar para eficiencia
    for i in range(len(sample_sessions)-1):
        for j in range(i+1, len(sample_sessions)):
            sim = calculate_session_similarity(
                user_recommendations[sample_sessions[i]],
                user_recommendations[sample_sessions[j]]
            )
            similarities.append(sim)
    
    print(f"Similitud media entre sesiones: {np.mean(similarities):.4f}")
    print(f"Desviación estándar de similitud: {np.std(similarities):.4f}")
    
    # 8. Métricas de negocio
    print("\n8. Métricas de Negocio:")
    discount_ratio = len([p for p in all_recs if products.loc[p, 'discount']]) / len(all_recs)
    print(f"Ratio de productos con descuento: {discount_ratio:.2%}")
    
    # 9. Resumen de hallazgos
    print("\n9. Resumen de Hallazgos Clave:")
    print(f"- Diversidad general: {unique_recs/total_recs:.2%}")
    print(f"- Cobertura de catálogo: {unique_recs/total_products:.2%}")
    print(f"- Balance entre países: {np.std([s['unique_products'] for s in country_stats.values()])/np.mean([s['unique_products'] for s in country_stats.values()]):.2%} (CV)")
    
    return {
        'unique_recs': unique_recs,
        'country_stats': country_stats,
        'category_stats': category_stats,
        'rec_counts': rec_counts,
        'similarities': similarities
    }

# Ejecutar el análisis detallado
analysis_results = analyze_recommendations_detailed(user_recommendations, test, train, products)

=== Análisis Detallado de Recomendaciones ===

1. Métricas Básicas:
Total de recomendaciones: 36745
Productos únicos recomendados: 7918
Ratio de diversidad: 0.2155

2. Distribución por Tipo de Usuario:
                                num_sessions  unique_users
user_class                                                
Usuario nuevo logueado                  3768           916
Usuario recurrente logueado             1998           495
Usuario recurrente no logueado         23509             1

3. Análisis por País:

País 57:
- sessions: 1122.00
- unique_products: 2465.00
- avg_products_per_session: 5.00

País 34:
- sessions: 2860.00
- unique_products: 4712.00
- avg_products_per_session: 5.00

País 25:
- sessions: 2065.00
- unique_products: 1540.00
- avg_products_per_session: 5.00

País 29:
- sessions: 1302.00
- unique_products: 2623.00
- avg_products_per_session: 5.00

4. Análisis de Categorías:

Distribución por categoría:
             recommendations  unique_families
cod_section      

---