In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from lightgbm.callback import early_stopping, log_evaluation

# Configurar LightGBM para uso dinámico de memoria en GPU
def configure_gpu():
    try:
        import os
        os.environ['LIGHTGBM_DEVICE'] = 'gpu'
        os.environ['LIGHTGBM_GPU_PLATFORM_ID'] = '0'
        os.environ['LIGHTGBM_GPU_DEVICE_ID'] = '0'
        print("GPU configurada para LightGBM.")
    except ImportError:
        print("Error al configurar GPU. Continuando con CPU.")

configure_gpu()

GPU configurada para LightGBM.


In [23]:
print("\n--- Cargando datos ---")
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data_final.pkl')
test_df= pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl')


--- Cargando datos ---


In [21]:
print(test_df.columns)
print(train_df.columns)

Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype'],
      dtype='object')
Index(['session_id', 'date', 'timestamp_local', 'add_to_cart', 'user_id',
       'country', 'partnumber', 'device_type', 'pagetype', 'hour',
       'day_of_week', 'week', 'session_interactions', 'discount_category',
       'similar_products', 'color_id', 'family', 'cod_section', 'discount',
       'popularity', 'cluster'],
      dtype='object')


In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation


def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9

    categorical_columns = ['discount_category', 'similar_products']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)

    return df

# Preprocesar los datos
train_df = preprocess_data(train_df)

# Asegurar que todas las columnas sean numéricas o booleanas
compatible_columns = train_df.select_dtypes(include=['int64', 'float64', 'bool', 'int32', 'int16', 'int8']).columns
train_df = train_df[compatible_columns]

# Separar las características, etiquetas y grupos
X = train_df.drop(['add_to_cart'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos en entrenamiento y validación respetando los grupos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu'  # Cambiado para CPU
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=callbacks
)

print("\n--- Guardando el modelo ---")
model.save_model('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model.txt')

print("\n--- Modelo entrenado y guardado con éxito ---")



--- Preprocesando datos ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.847607	train's ndcg@3: 0.862816	train's ndcg@5: 0.873727	val's ndcg@1: 0.846946	val's ndcg@3: 0.862283	val's ndcg@5: 0.873344
[20]	train's ndcg@1: 0.848607	train's ndcg@3: 0.863873	train's ndcg@5: 0.874765	val's ndcg@1: 0.847769	val's ndcg@3: 0.863228	val's ndcg@5: 0.874127
[30]	train's ndcg@1: 0.849251	train's ndcg@3: 0.86456	train's ndcg@5: 0.875424	val's ndcg@1: 0.848557	val's ndcg@3: 0.863929	val's ndcg@5: 0.874886
[40]	train's ndcg@1: 0.849653	train's ndcg@3: 0.864986	train's ndcg@5: 0.875834	val's ndcg@1: 0.848723	val's ndcg@3: 0.864267	val's ndcg@5: 0.87516
[50]	train's ndcg@1: 0.849938	train's ndcg@3: 0.865301	train's ndcg@5: 0.876147	val's ndcg@1: 0.849026	val's ndcg@3: 0.86453	val's ndcg@5: 0.875442
[60]	train's ndcg@1: 0.850298	train's ndcg@3: 0.865639	train's ndcg@5: 0.876479	val's 

In [12]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation


def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9

    categorical_columns = ['discount_category', 'similar_products']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)

    return df

# Preprocesar los datos
train_df = preprocess_data(train_df)

# Asegurar que todas las columnas sean numéricas o booleanas
compatible_columns = train_df.select_dtypes(include=['int64', 'float64', 'bool', 'int32', 'int16', 'int8']).columns
train_df = train_df[compatible_columns]

# Separar las características, etiquetas y grupos
X = train_df.drop(['add_to_cart'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos en entrenamiento y validación respetando los grupos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.03,  # Reducir tasa de aprendizaje
    'num_leaves': 40,      # Aumentar el número de hojas
    'min_data_in_leaf': 15,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu'        # Cambiar a GPU
}


callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=callbacks
)

print("\n--- Guardando el modelo ---")
model.save_model('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model.txt')

print("\n--- Modelo entrenado y guardado con éxito ---")



--- Preprocesando datos ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.847609	train's ndcg@3: 0.862797	train's ndcg@5: 0.873717	val's ndcg@1: 0.847182	val's ndcg@3: 0.862263	val's ndcg@5: 0.873228
[20]	train's ndcg@1: 0.848629	train's ndcg@3: 0.863958	train's ndcg@5: 0.874863	val's ndcg@1: 0.848183	val's ndcg@3: 0.863404	val's ndcg@5: 0.87431
[30]	train's ndcg@1: 0.849208	train's ndcg@3: 0.864619	train's ndcg@5: 0.875463	val's ndcg@1: 0.848535	val's ndcg@3: 0.863813	val's ndcg@5: 0.874794
[40]	train's ndcg@1: 0.849541	train's ndcg@3: 0.864961	train's ndcg@5: 0.875813	val's ndcg@1: 0.848947	val's ndcg@3: 0.864223	val's ndcg@5: 0.875164
[50]	train's ndcg@1: 0.849929	train's ndcg@3: 0.865359	train's ndcg@5: 0.876201	val's ndcg@1: 0.849245	val's ndcg@3: 0.864519	val's ndcg@5: 0.875475
[60]	train's ndcg@1: 0.850132	train's ndcg@3: 0.865593	train's ndcg@5: 0.876416	val'

In [13]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation


def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9

    categorical_columns = ['discount_category', 'similar_products']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)

    return df

# Preprocesar los datos
train_df = preprocess_data(train_df)

# Asegurar que todas las columnas sean numéricas o booleanas
compatible_columns = train_df.select_dtypes(include=['int64', 'float64', 'bool', 'int32', 'int16', 'int8']).columns
train_df = train_df[compatible_columns]

# Separar las características, etiquetas y grupos
X = train_df.drop(['add_to_cart'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos en entrenamiento y validación respetando los grupos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.02,  # Reducir tasa de aprendizaje para un ajuste más fino
    'num_leaves': 50,      # Capturar más patrones
    'min_data_in_leaf': 10,  # Más flexibilidad en las hojas
    'max_bin': 255,  # Granularidad en las características
    'feature_fraction': 0.8,  # Submuestreo de características
    'bagging_fraction': 0.8,  # Submuestreo de datos
    'bagging_freq': 5,  # Frecuencia de muestreo
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu'  # Cambiar a 'gpu' si es posible
}



callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=callbacks
)

print("\n--- Guardando el modelo ---")
model.save_model('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model_v2.txt')

print("\n--- Modelo entrenado y guardado con éxito ---")



--- Preprocesando datos ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.848265	train's ndcg@3: 0.863583	train's ndcg@5: 0.874437	val's ndcg@1: 0.84776	val's ndcg@3: 0.863032	val's ndcg@5: 0.874029
[20]	train's ndcg@1: 0.849301	train's ndcg@3: 0.864663	train's ndcg@5: 0.875493	val's ndcg@1: 0.848581	val's ndcg@3: 0.863982	val's ndcg@5: 0.874905
[30]	train's ndcg@1: 0.849852	train's ndcg@3: 0.865224	train's ndcg@5: 0.876051	val's ndcg@1: 0.849229	val's ndcg@3: 0.864419	val's ndcg@5: 0.875388
[40]	train's ndcg@1: 0.850244	train's ndcg@3: 0.865644	train's ndcg@5: 0.876474	val's ndcg@1: 0.849721	val's ndcg@3: 0.864789	val's ndcg@5: 0.875769
[50]	train's ndcg@1: 0.850557	train's ndcg@3: 0.865971	train's ndcg@5: 0.87679	val's ndcg@1: 0.849843	val's ndcg@3: 0.864944	val's ndcg@5: 0.876002
[60]	train's ndcg@1: 0.850779	train's ndcg@3: 0.866218	train's ndcg@5: 0.877004	val's

---

## Modelo utilizando únicamente las características alineadas con el dataset de test(evaluación)

In [18]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Cargar y preprocesar datos ---
def preprocess_data(df, columns_to_keep):
    print("\n--- Preprocesando datos ---")
    df = df[columns_to_keep]  # Seleccionar solo las columnas permitidas
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    return df

# Columnas permitidas (comunes entre `train` y `test`)
columns_to_keep = [
    'session_id', 'date', 'timestamp_local', 'user_id', 
    'country', 'partnumber', 'device_type', 'pagetype', 'add_to_cart'
]

# Cargar y preprocesar el dataset de entrenamiento
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df, columns_to_keep)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos en entrenamiento y validación respetando los grupos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# --- Configurar y entrenar el modelo ---
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.03,
    'num_leaves': 40,
    'min_data_in_leaf': 15,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu'
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=callbacks
)

# Guardar el modelo entrenado
print("\n--- Guardando el modelo ---")
model.save_model('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_aligned_model.txt')
print("\n--- Modelo entrenado y guardado con éxito ---")



--- Preprocesando datos ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.842234	train's ndcg@3: 0.8571	train's ndcg@5: 0.86801	val's ndcg@1: 0.841675	val's ndcg@3: 0.856382	val's ndcg@5: 0.867438
[20]	train's ndcg@1: 0.842289	train's ndcg@3: 0.857207	train's ndcg@5: 0.868128	val's ndcg@1: 0.841882	val's ndcg@3: 0.856652	val's ndcg@5: 0.86765
[30]	train's ndcg@1: 0.842335	train's ndcg@3: 0.85724	train's ndcg@5: 0.868167	val's ndcg@1: 0.841814	val's ndcg@3: 0.856495	val's ndcg@5: 0.867429
[40]	train's ndcg@1: 0.842551	train's ndcg@3: 0.857358	train's ndcg@5: 0.868279	val's ndcg@1: 0.841843	val's ndcg@3: 0.856529	val's ndcg@5: 0.867586
[50]	train's ndcg@1: 0.842476	train's ndcg@3: 0.857344	train's ndcg@5: 0.868283	val's ndcg@1: 0.841838	val's ndcg@3: 0.856515	val's ndcg@5: 0.867532
Early stopping, best iteration is:
[1]	train's ndcg@1: 0.842482	train's ndcg@3: 0.857505

In [21]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, classification_report
import pandas as pd
import numpy as np
import lightgbm as lgb

# --- Cargar el modelo entrenado ---
print("\n--- Cargando el modelo entrenado ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_aligned_model.txt'
model = lgb.Booster(model_file=model_path)

# --- Cargar y preprocesar el conjunto de validación ---
print("\n--- Cargando el conjunto de validación ---")
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')

# Preprocesamiento básico
def preprocess_data(df):
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    return df

train_df = preprocess_data(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos en entrenamiento y validación respetando los grupos
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_val = X.iloc[val_idx]
y_val = y.iloc[val_idx]

# --- Generar predicciones ---
print("\n--- Generando predicciones ---")
y_val_pred = model.predict(X_val)

# Convertir las probabilidades en etiquetas binarias
y_val_pred_binary = np.where(y_val_pred >= 0.5, 1, 0)

# --- Cálculo de métricas adicionales ---
print("\n--- Calculando métricas de evaluación ---")
roc_auc = roc_auc_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred_binary)
recall = recall_score(y_val, y_val_pred_binary)
f1 = f1_score(y_val, y_val_pred_binary)
accuracy = accuracy_score(y_val, y_val_pred_binary)

# Mostrar resultados
print("\n--- Métricas de Evaluación ---")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Reporte completo
print("\n--- Classification Report ---")
print(classification_report(y_val, y_val_pred_binary))


--- Cargando el modelo entrenado ---

--- Cargando el conjunto de validación ---

--- Generando predicciones ---

--- Calculando métricas de evaluación ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



--- Métricas de Evaluación ---
ROC AUC Score: 0.6503
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Accuracy: 0.9405

--- Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97   8741049
           1       0.00      0.00      0.00    553207

    accuracy                           0.94   9294256
   macro avg       0.47      0.50      0.48   9294256
weighted avg       0.88      0.94      0.91   9294256



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Cargar el modelo entrenado ---
print("\n--- Cargando el modelo entrenado ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_aligned_model.txt'
model = lgb.Booster(model_file=model_path)

# --- Cargar el conjunto de prueba ---
print("\n--- Cargando el conjunto de prueba ---")
test_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl')
print("Columnas en test_df:", test_df.columns)

# --- Obtener productos populares ---
print("\n--- Calculando productos populares ---")
popular_products = test_df['partnumber'].value_counts().index.tolist()

# --- Preprocesar el conjunto de prueba ---
def preprocess_test_data(test_df):
    print("\n--- Preprocesando `test_df` ---")
    test_df['date'] = pd.to_datetime(test_df['date']).astype(int) / 10**9
    test_df['timestamp_local'] = pd.to_datetime(test_df['timestamp_local']).astype(int) / 10**9
    feature_columns = [
        'session_id', 'date', 'timestamp_local', 'user_id', 'country',
        'partnumber', 'device_type', 'pagetype'
    ]
    return test_df[feature_columns]

test_df = preprocess_test_data(test_df)
print("Columnas procesadas en test_df:", test_df.columns)

# --- Generar predicciones para cada session_id ---
print("\n--- Generando predicciones ---")
session_ids = test_df['session_id'].unique()
predictions = {}

for session_id in session_ids:
    session_data = test_df[test_df['session_id'] == session_id].copy()

    if session_data.empty:
        predictions[str(session_id)] = []
        continue

    session_features = session_data.drop(['session_id'], axis=1)

    if session_features.shape[1] != model.num_feature():
        raise ValueError(f"El número de características no coincide con el modelo. "
                         f"Esperado: {model.num_feature()}, Actual: {session_features.shape[1]}")

    session_data['score'] = model.predict(session_features)

    recommended_products = (
        session_data.sort_values(by='score', ascending=False)['partnumber']
        .drop_duplicates()
        .tolist()
    )

    # Completar con productos populares si faltan
    for product in popular_products:
        if len(recommended_products) >= 5:
            break
        if product not in recommended_products:
            recommended_products.append(product)

    # Garantizar exactamente 5 valores únicos
    predictions[str(session_id)] = recommended_products[:5]

# --- Guardar el archivo predictions_3.json ---
print("\n--- Guardando el archivo predictions_3_final.json ---")
output = {"target": predictions}
output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_final.json'
with open(output_path, 'w') as f:
    json.dump(output, f, indent=4)

print(f"Archivo predictions_3_final.json generado con éxito en {output_path}.")



--- Cargando el modelo entrenado ---

--- Cargando el conjunto de prueba ---
Columnas en test_df: Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype'],
      dtype='object')

--- Calculando productos populares ---

--- Preprocesando `test_df` ---
Columnas procesadas en test_df: Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype'],
      dtype='object')

--- Generando predicciones ---

--- Guardando el archivo predictions_3_final.json ---
Archivo predictions_3_final.json generado con éxito en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_final.json.


Tarea 3: 26% completada

235/900 puntos

---


---

In [16]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento de datos ---
def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

# Cargar y preprocesar datos
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.005,
    'num_leaves': 100,
    'min_data_in_leaf': 25,
    'max_bin': 200,
    'subsample': 0.8,
    'subsample_freq': 1,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu'
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando datos ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.843253	train's ndcg@3: 0.858145	train's ndcg@5: 0.869074	val's ndcg@1: 0.842437	val's ndcg@3: 0.857323	val's ndcg@5: 0.868362
[20]	train's ndcg@1: 0.843383	train's ndcg@3: 0.858346	train's ndcg@5: 0.869241	val's ndcg@1: 0.842444	val's ndcg@3: 0.857375	val's ndcg@5: 0.868239
[30]	train's ndcg@1: 0.843541	train's ndcg@3: 0.858482	train's ndcg@5: 0.869385	val's ndcg@1: 0.842645	val's ndcg@3: 0.857444	val's ndcg@5: 0.868347
[40]	train's ndcg@1: 0.843909	train's ndcg@3: 0.858738	train's ndcg@5: 0.869665	val's ndcg@1: 0.842535	val's ndcg@3: 0.857529	val's ndcg@5: 0.868446
[50]	train's ndcg@1: 0.844293	train's ndcg@3: 0.859177	train's ndcg@5: 0.870055	val's ndcg@1: 0.842964	val's ndcg@3: 0.857816	val's ndcg@5: 0.868718
[60]	train's ndcg@1: 0.844481	train's ndcg@3: 0.859532	train's ndcg@5: 0.870347	val

In [17]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, classification_report
import numpy as np

# --- Generar predicciones para conjunto de validación ---
print("\n--- Evaluando el modelo ---")
y_val_pred = model.predict(X_val)

# Convertir las probabilidades en etiquetas binarias para métricas de clasificación
y_val_pred_binary = np.where(y_val_pred >= 0.5, 1, 0)

# --- Cálculo de métricas adicionales ---
roc_auc = roc_auc_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred_binary)
recall = recall_score(y_val, y_val_pred_binary)
f1 = f1_score(y_val, y_val_pred_binary)
accuracy = accuracy_score(y_val, y_val_pred_binary)

# Imprimir resultados
print("\n--- Métricas de Evaluación ---")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Reporte completo
print("\n--- Classification Report ---")
print(classification_report(y_val, y_val_pred_binary))


--- Evaluando el modelo ---

--- Métricas de Evaluación ---
ROC AUC Score: 0.6537
Precision: 0.9990
Recall: 0.1081
F1 Score: 0.1951
Accuracy: 0.9469

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.95      1.00      0.97   8741049
           1       1.00      0.11      0.20    553207

    accuracy                           0.95   9294256
   macro avg       0.97      0.55      0.58   9294256
weighted avg       0.95      0.95      0.93   9294256



In [11]:
import pandas as pd
import lightgbm as lgb
import json

# Cargar modelo entrenado
print("\n--- Cargando el modelo entrenado ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model.txt'
model = lgb.Booster(model_file=model_path)

# Cargar conjunto de prueba
print("\n--- Cargando el conjunto de prueba ---")
test_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl')

# Preprocesar conjunto de prueba
print("\n--- Preprocesando conjunto de prueba ---")
test_df['date'] = pd.to_datetime(test_df['date']).astype(int) / 10**9
test_df['timestamp_local'] = pd.to_datetime(test_df['timestamp_local']).astype(int) / 10**9
test_df['hour'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.hour
test_df['day_of_week'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.dayofweek

# Generar predicciones
print("\n--- Generando predicciones ---")
session_ids = test_df['session_id'].unique()
predictions = {}

popular_products = test_df['partnumber'].value_counts().index.tolist()

for session_id in session_ids:
    session_data = test_df[test_df['session_id'] == session_id].copy()
    if session_data.empty:
        predictions[str(session_id)] = []
        continue

    session_features = session_data.drop(['session_id'], axis=1)
    if session_features.shape[1] != model.num_feature():
        raise ValueError(f"El número de características no coincide con el modelo. "
                         f"Esperado: {model.num_feature()}, Actual: {session_features.shape[1]}")
    
    session_data['score'] = model.predict(session_features)
    recommended_products = (
        session_data.sort_values(by='score', ascending=False)['partnumber']
        .drop_duplicates()
        .tolist()
    )
    
    # Completar con productos populares si faltan
    for product in popular_products:
        if len(recommended_products) >= 5:
            break
        if product not in recommended_products:
            recommended_products.append(product)
    
    predictions[str(session_id)] = recommended_products[:5]

# Guardar predicciones
output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched.json'
print(f"\n--- Guardando predicciones en {output_path} ---")
with open(output_path, 'w') as f:
    json.dump({"target": predictions}, f, indent=4)
print("Predicciones guardadas con éxito.")



--- Cargando el modelo entrenado ---

--- Cargando el conjunto de prueba ---

--- Preprocesando conjunto de prueba ---

--- Generando predicciones ---

--- Guardando predicciones en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched.json ---
Predicciones guardadas con éxito.


---

### balanced


In [22]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento de datos ---
def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

# Cargar y preprocesar datos
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Calcular scale_pos_weight
pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\n--- Calculado scale_pos_weight: {pos_weight:.2f} ---")

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.03,
    'num_leaves': 40,
    'min_data_in_leaf': 15,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu',
    'scale_pos_weight': pos_weight  # Balancear las clases
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_weighted_model.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando datos ---

--- Calculado scale_pos_weight: 15.95 ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.842865	train's ndcg@3: 0.857555	train's ndcg@5: 0.868467	val's ndcg@1: 0.842335	val's ndcg@3: 0.857066	val's ndcg@5: 0.867949
[20]	train's ndcg@1: 0.842795	train's ndcg@3: 0.857643	train's ndcg@5: 0.868573	val's ndcg@1: 0.842271	val's ndcg@3: 0.857012	val's ndcg@5: 0.86807
[30]	train's ndcg@1: 0.842801	train's ndcg@3: 0.857622	train's ndcg@5: 0.868538	val's ndcg@1: 0.842333	val's ndcg@3: 0.857153	val's ndcg@5: 0.868048
[40]	train's ndcg@1: 0.842782	train's ndcg@3: 0.857621	train's ndcg@5: 0.868574	val's ndcg@1: 0.842069	val's ndcg@3: 0.856895	val's ndcg@5: 0.867818
[50]	train's ndcg@1: 0.84276	train's ndcg@3: 0.857571	train's ndcg@5: 0.868533	val's ndcg@1: 0.842264	val's ndcg@3: 0.857073	val's ndcg@5: 0.868044
Early stopping, best iteration is:
[2]	tra

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, classification_report

# --- Cargar el modelo entrenado ---
print("\n--- Cargando el modelo entrenado ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_weighted_model.txt'
model = lgb.Booster(model_file=model_path)

# --- Cargar los datos de validación ---
print("\n--- Cargando datos de validación ---")
val_data_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl'
val_df = pd.read_pickle(val_data_path)

# --- Preprocesar datos ---
print("\n--- Preprocesando datos ---")
val_df['date'] = pd.to_datetime(val_df['date']).astype(int) / 10**9
val_df['timestamp_local'] = pd.to_datetime(val_df['timestamp_local']).astype(int) / 10**9
val_df['hour'] = pd.to_datetime(val_df['timestamp_local'], unit='s').dt.hour
val_df['day_of_week'] = pd.to_datetime(val_df['timestamp_local'], unit='s').dt.dayofweek

# Separar las características y etiquetas
X_val = val_df.drop(['add_to_cart', 'session_id'], axis=1)
y_val = val_df['add_to_cart']

# Predecir las probabilidades de las etiquetas positivas
print("\n--- Generando predicciones ---")
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int)

# --- Calcular métricas ---
print("\n--- Calculando métricas ---")
roc_auc = roc_auc_score(y_val, y_pred_proba)
precision = precision_score(y_val, y_pred, zero_division=0)
recall = recall_score(y_val, y_pred, zero_division=0)
f1 = f1_score(y_val, y_pred, zero_division=0)
accuracy = accuracy_score(y_val, y_pred)

# --- Mostrar resultados ---
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# --- Informe de clasificación ---
print("\n--- Classification Report ---")
print(classification_report(y_val, y_pred, zero_division=0))



--- Cargando el modelo entrenado ---

--- Cargando datos de validación ---

--- Preprocesando datos ---

--- Generando predicciones ---

--- Calculando métricas ---
ROC AUC Score: 0.6517
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Accuracy: 0.9410

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.94      1.00      0.97  43805662
           1       0.00      0.00      0.00   2745783

    accuracy                           0.94  46551445
   macro avg       0.47      0.50      0.48  46551445
weighted avg       0.89      0.94      0.91  46551445



In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Generar predicciones y guardar en JSON ---
def json_generator(model_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    # --- Cargar el conjunto de prueba ---
    print("\n--- Cargando el conjunto de prueba ---")
    test_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl')
    print("Columnas en test_df:", test_df.columns)

    # --- Obtener productos populares ---
    print("\n--- Calculando productos populares ---")
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    # --- Preprocesar el conjunto de prueba ---
    def preprocess_test_data(test_df):
        print("\n--- Preprocesando `test_df` ---")
        test_df['date'] = pd.to_datetime(test_df['date']).astype(int) / 10**9
        test_df['timestamp_local'] = pd.to_datetime(test_df['timestamp_local']).astype(int) / 10**9
        test_df['hour'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.hour
        test_df['day_of_week'] = pd.to_datetime(test_df['timestamp_local'], unit='s').dt.dayofweek
        feature_columns = [
            'session_id', 'date', 'timestamp_local', 'user_id', 'country',
            'partnumber', 'device_type', 'pagetype', 'hour', 'day_of_week'
        ]
        return test_df[feature_columns]

    test_df = preprocess_test_data(test_df)
    print("Columnas procesadas en test_df:", test_df.columns)

    # --- Generar predicciones para cada session_id ---
    print("\n--- Generando predicciones ---")
    session_ids = test_df['session_id'].unique()
    predictions = {}

    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = []
            continue

        session_features = session_data.drop(['session_id'], axis=1)

        if session_features.shape[1] != model.num_feature():
            raise ValueError(f"El número de características no coincide con el modelo. "
                             f"Esperado: {model.num_feature()}, Actual: {session_features.shape[1]}")

        session_data['score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Completar con productos populares si faltan
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in recommended_products:
                recommended_products.append(product)

        # Garantizar exactamente 5 valores únicos
        predictions[str(session_id)] = recommended_products[:5]

    # --- Guardar el archivo predictions_3.json ---
    print("\n--- Guardando el archivo predictions_3.json ---")
    output = {"target": predictions}
    with open(output_path, 'w') as f:
        json.dump(output, f, indent=4)

    print(f"Archivo predictions_3.json generado con éxito en {output_path}.")

# Parámetros de entrada y salida
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_weighted_model.txt'
output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_weighted.json'

# Generar JSON
json_generator(model_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando el conjunto de prueba ---
Columnas en test_df: Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype'],
      dtype='object')

--- Calculando productos populares ---

--- Preprocesando `test_df` ---
Columnas procesadas en test_df: Index(['session_id', 'date', 'timestamp_local', 'user_id', 'country',
       'partnumber', 'device_type', 'pagetype', 'hour', 'day_of_week'],
      dtype='object')

--- Generando predicciones ---

--- Guardando el archivo predictions_3.json ---
Archivo predictions_3.json generado con éxito en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_weighted.json.


---