In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento de datos ---
def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

def enrich_features(df):
    print("\n--- Enriqueciendo características ---")
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# Cargar y enriquecer datos
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df)
train_df = enrich_features(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Calcular scale_pos_weight
pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\n--- Calculado scale_pos_weight: {pos_weight:.2f} ---")

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.03,
    'num_leaves': 40,
    'min_data_in_leaf': 15,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu',
    'scale_pos_weight': pos_weight
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando datos ---

--- Enriqueciendo características ---

--- Calculado scale_pos_weight: 15.95 ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.843676	train's ndcg@3: 0.85858	train's ndcg@5: 0.869545	val's ndcg@1: 0.842769	val's ndcg@3: 0.85768	val's ndcg@5: 0.868707
[20]	train's ndcg@1: 0.843748	train's ndcg@3: 0.858609	train's ndcg@5: 0.869587	val's ndcg@1: 0.84282	val's ndcg@3: 0.857825	val's ndcg@5: 0.868845
[30]	train's ndcg@1: 0.843708	train's ndcg@3: 0.858674	train's ndcg@5: 0.869643	val's ndcg@1: 0.843045	val's ndcg@3: 0.857745	val's ndcg@5: 0.868878
[40]	train's ndcg@1: 0.843737	train's ndcg@3: 0.858736	train's ndcg@5: 0.86968	val's ndcg@1: 0.842918	val's ndcg@3: 0.857886	val's ndcg@5: 0.868909
[50]	train's ndcg@1: 0.8437	train's ndcg@3: 0.858691	train's ndcg@5: 0.869702	val's ndcg@1: 0.843054	val's ndcg@3: 0.857943	val's ndcg@5: 0.868944
Early s

In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, precision_score, recall_score, f1_score

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_test_data(df):
    print("\n--- Preprocesando y enriqueciendo datos de prueba ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Generar JSON ---
def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_test_data(test_df)

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data[X.columns.tolist()]
        session_data.loc[:, 'score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        while len(recommended_products) < 5:
            recommended_products.append(popular_products[len(recommended_products) % len(popular_products)])

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")


# --- Ejecutar predicciones y métricas ---
if __name__ == "__main__":
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model.txt'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched.json'

    # Generar JSON
    generate_predictions(model_path, test_path, output_path)




--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos de prueba ---

--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched.json


In [31]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_test_data(df):
    print("\n--- Preprocesando y enriqueciendo datos de prueba ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Generar JSON ---
def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_test_data(test_df)

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data[X.columns.tolist()]
        session_data.loc[:, 'score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Eliminar repeticiones antes de completar
        recommended_products = list(dict.fromkeys(recommended_products))

        # Completar con productos populares si faltan
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in recommended_products:
                recommended_products.append(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")


# --- Ejecutar predicciones ---
if __name__ == "__main__":
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model.txt'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_v2.json'

    # Generar JSON
    generate_predictions(model_path, test_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos de prueba ---

--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_v2.json


Tarea 3: 26% completada

236/900 puntos

---

---

---

In [44]:
import json

# Ruta del JSON a validar
json_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_test.json'

def validate_json(file_path):
    print(f"\n--- Validando archivo JSON: {file_path} ---")

    # Cargar el JSON
    with open(file_path, 'r') as f:
        data = json.load(f)

    recommendations = data.get("target", {})

    # Inicializar contadores
    total_sessions = len(recommendations)
    sessions_with_repeated_recommendations = 0
    sessions_with_less_than_5_recommendations = 0

    # Verificar cada sesión
    for session_id, products in recommendations.items():
        # Verificar si hay recomendaciones repetidas
        if len(products) != len(set(products)):
            sessions_with_repeated_recommendations += 1

        # Verificar si tiene menos de 5 recomendaciones
        if len(products) < 5:
            sessions_with_less_than_5_recommendations += 1

    # Imprimir resultados
    print(f"Total de sesiones: {total_sessions}")
    print(f"Sesiones con recomendaciones repetidas: {sessions_with_repeated_recommendations}")
    print(f"Sesiones con menos de 5 recomendaciones: {sessions_with_less_than_5_recommendations}")

    return {
        "total_sessions": total_sessions,
        "sessions_with_repeated_recommendations": sessions_with_repeated_recommendations,
        "sessions_with_less_than_5_recommendations": sessions_with_less_than_5_recommendations,
    }
    

# Validar JSON
validate_json(json_path)



--- Validando archivo JSON: /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_v4.json ---
Total de sesiones: 7349
Sesiones con recomendaciones repetidas: 0
Sesiones con menos de 5 recomendaciones: 0


{'total_sessions': 7349,
 'sessions_with_repeated_recommendations': 0,
 'sessions_with_less_than_5_recommendations': 0}

---

In [33]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento de datos ---
def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

def enrich_features(df):
    print("\n--- Enriqueciendo características ---")
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# Cargar y enriquecer datos
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df)
train_df = enrich_features(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Calcular scale_pos_weight
pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\n--- Calculado scale_pos_weight: {pos_weight:.2f} ---")

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.01,
    'num_leaves': 50,
    'min_data_in_leaf': 20,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu',
    'scale_pos_weight': pos_weight
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model_v2.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando datos ---

--- Enriqueciendo características ---

--- Calculado scale_pos_weight: 15.95 ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.843844	train's ndcg@3: 0.858705	train's ndcg@5: 0.869633	val's ndcg@1: 0.843356	val's ndcg@3: 0.858004	val's ndcg@5: 0.869006
[20]	train's ndcg@1: 0.8438	train's ndcg@3: 0.858696	train's ndcg@5: 0.869646	val's ndcg@1: 0.843328	val's ndcg@3: 0.85796	val's ndcg@5: 0.869004
[30]	train's ndcg@1: 0.843757	train's ndcg@3: 0.858743	train's ndcg@5: 0.869626	val's ndcg@1: 0.843098	val's ndcg@3: 0.857892	val's ndcg@5: 0.868939
[40]	train's ndcg@1: 0.843857	train's ndcg@3: 0.858853	train's ndcg@5: 0.86979	val's ndcg@1: 0.842889	val's ndcg@3: 0.857743	val's ndcg@5: 0.868811
[50]	train's ndcg@1: 0.843939	train's ndcg@3: 0.858961	train's ndcg@5: 0.869852	val's ndcg@1: 0.842683	val's ndcg@3: 0.857623	val's ndcg@5: 0.86867
[60]	t

In [43]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_test_data(df):
    print("\n--- Preprocesando y enriqueciendo datos de prueba ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Generar JSON ---
def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_test_data(test_df)

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data[X.columns.tolist()]
        session_data['score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .tolist()
        )

        # Eliminar duplicados mientras se conserva el orden
        recommended_products = list(dict.fromkeys(recommended_products))

        # Completar con productos populares para asegurar 5 elementos únicos
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in recommended_products:
                recommended_products.append(product)

        # Verificar que las recomendaciones tengan exactamente 5 valores únicos
        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")


# --- Ejecutar predicciones ---
if __name__ == "__main__":
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_enriched_model_v2.txt'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_v4.json'

    # Generar JSON
    generate_predictions(model_path, test_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos de prueba ---

--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_enriched_v4.json


---

In [47]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_align_data(df):
    print("\n--- Preprocesando y alineando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

def enrich_features(df):
    print("\n--- Enriqueciendo características ---")
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Cargar y preprocesar datos ---
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_and_align_data(train_df)
train_df = enrich_features(train_df)

# Filtrar características disponibles en test_df
features_to_use = ['date', 'timestamp_local', 'hour', 'day_of_week', 
                   'session_length', 'country_popularity', 'product_interaction_rate',
                   'user_id', 'country', 'partnumber', 'device_type', 'pagetype']
train_df = train_df[features_to_use + ['add_to_cart', 'session_id']]

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Calcular scale_pos_weight
pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\n--- Calculado scale_pos_weight: {pos_weight:.2f} ---")

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': ['ndcg', 'auc', 'map'],
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.01,
    'num_leaves': 50,
    'min_data_in_leaf': 20,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'cpu',
    'scale_pos_weight': pos_weight
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_aligned_model.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando y alineando datos ---

--- Enriqueciendo características ---

--- Calculado scale_pos_weight: 15.95 ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.843844	train's ndcg@3: 0.858705	train's ndcg@5: 0.869633	train's auc: 0.663763	train's map@1: 0.843844	train's map@3: 0.847475	train's map@5: 0.852684	val's ndcg@1: 0.843356	val's ndcg@3: 0.858004	val's ndcg@5: 0.869006	val's auc: 0.6637	val's map@1: 0.843356	val's map@3: 0.846688	val's map@5: 0.851942
[20]	train's ndcg@1: 0.8438	train's ndcg@3: 0.858696	train's ndcg@5: 0.869646	train's auc: 0.660903	train's map@1: 0.8438	train's map@3: 0.847472	train's map@5: 0.852692	val's ndcg@1: 0.843328	val's ndcg@3: 0.85796	val's ndcg@5: 0.869004	val's auc: 0.661137	val's map@1: 0.843328	val's map@3: 0.84665	val's map@5: 0.851911
[30]	train's ndcg@1: 0.843757	train's ndcg@3: 0.858743	train's ndcg@5: 0.869626	trai

In [48]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_align_test_data(df):
    print("\n--- Preprocesando y alineando datos de prueba ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    df['session_length'] = df.groupby('session_id')['partnumber'].transform('count')
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / df['session_length']
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / df['session_length']
    return df

# --- Generar JSON ---
def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    # Características alineadas al modelo entrenado
    aligned_features = [
        'date', 'timestamp_local', 'hour', 'day_of_week', 
        'session_length', 'country_popularity', 'product_interaction_rate',
        'user_id', 'country', 'partnumber', 'device_type', 'pagetype'
    ]

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_align_test_data(test_df)

    # Asegurar que las columnas coincidan con las características del modelo
    missing_features = set(aligned_features) - set(test_df.columns)
    if missing_features:
        print(f"Advertencia: Faltan las siguientes características en los datos de prueba: {missing_features}")
        for feature in missing_features:
            test_df[feature] = 0  # Rellenar las columnas faltantes con valores predeterminados

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data[aligned_features]
        session_data['score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Eliminar repeticiones antes de completar
        recommended_products = list(dict.fromkeys(recommended_products))

        # Completar con productos populares si faltan
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in recommended_products:
                recommended_products.append(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")

# --- Ejecutar predicciones ---
if __name__ == "__main__":
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_aligned_model.txt'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_aligned.json'

    # Generar JSON
    generate_predictions(model_path, test_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y alineando datos de prueba ---

--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_aligned.json


---

In [5]:
# Hiperparámetros.
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import uniform, randint
from itertools import product
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_align_data(df):
    print("\n--- Preprocesando y alineando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Función para entrenar y validar el modelo ---
def train_and_evaluate(params, X_train, y_train, groups_train, X_val, y_val, groups_val):
    lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'val'],
        num_boost_round=200,
        callbacks=[
            early_stopping(stopping_rounds=50, verbose=False),
            log_evaluation(period=10)
        ]
    )
    
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    return mse, model

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from itertools import product
from scipy.stats import randint, uniform

# Función de ajuste manual de hiperparámetros
def manual_hyperparameter_tuning(X, y, groups, param_distributions, n_iter=50):
    print("\n--- Realizando ajuste de hiperparámetros ---")
    best_score = -np.inf
    best_params = None
    best_model = None

    # Generar combinaciones de parámetros
    param_grid = [
        {key: dist.rvs() for key, dist in param_distributions.items()}
        for _ in range(n_iter)
    ]

    for i, params in enumerate(param_grid):
        print(f"\n--- Probando conjunto de parámetros {i + 1}/{n_iter} ---")
        print(params)
        
        # Dividir los datos en entrenamiento y validación respetando grupos
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, val_idx = next(gss.split(X, y, groups=groups))

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        groups_train = groups.iloc[train_idx].value_counts().values
        groups_val = groups.iloc[val_idx].value_counts().values

        # Crear conjuntos de datos de LightGBM
        lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

        # Configurar y entrenar el modelo
        model = lgb.train(
            {**params, 'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_eval_at': [1, 3, 5], 'verbose': -1},
            lgb_train,
            valid_sets=[lgb_train, lgb_val],
            valid_names=['train', 'val'],
            num_boost_round=200,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=True),
                lgb.log_evaluation(period=10)
            ]
        )

        # Obtener el mejor puntaje
        score = model.best_score['val']['ndcg@5']
        print(f"--- Puntaje actual: {score} ---")

        if score > best_score:
            best_score = score
            best_params = params
            best_model = model

    return best_params, best_score, best_model

# Ejemplo de ejecución
param_distributions = {
    'num_leaves': randint(20, 100),
    'learning_rate': uniform(0.01, 0.1),
    'min_data_in_leaf': randint(10, 50),
    'max_bin': randint(50, 300),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
}

# Suponiendo que ya tienes X, y, groups definidos
best_params, best_score, best_model = manual_hyperparameter_tuning(
    X, y, groups,
    param_distributions,
    n_iter=50
)

print("\n--- Mejores parámetros ---")
print(best_params)
print("\n--- Mejor puntaje ---")
print(best_score)

# Guardar el modelo final
print("\n--- Guardando el mejor modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_tuned_model.txt'
best_model.save_model(model_path)
print(f"Modelo guardado en {model_path}")



--- Realizando ajuste de hiperparámetros ---

--- Probando conjunto de parámetros 1/50 ---
{'num_leaves': 62, 'learning_rate': 0.10300419603594589, 'min_data_in_leaf': 47, 'max_bin': 129, 'subsample': 0.6124099504493548, 'colsample_bytree': 0.7483568265898739}


KeyboardInterrupt: 

---

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from lightgbm.callback import early_stopping, log_evaluation

# --- Preprocesamiento de datos ---
def preprocess_data(df):
    print("\n--- Preprocesando datos ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    return df

def enrich_features(df):
    print("\n--- Enriqueciendo características ---")
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# Cargar y enriquecer datos
train_df = pd.read_pickle('/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl')
train_df = preprocess_data(train_df)
train_df = enrich_features(train_df)

# Separar características, etiquetas y grupos
X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
y = train_df['add_to_cart']
groups = train_df['session_id']

# Calcular scale_pos_weight
pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\n--- Calculado scale_pos_weight: {pos_weight:.2f} ---")

# Dividir datos
print("\n--- Dividiendo datos ---")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
groups_train = groups.iloc[train_idx].value_counts().values
groups_val = groups.iloc[val_idx].value_counts().values

# Configurar y entrenar modelo
print("\n--- Configurando y entrenando el modelo LambdaMART ---")
lgb_train = lgb.Dataset(X_train, label=y_train, group=groups_train)
lgb_val = lgb.Dataset(X_val, label=y_val, group=groups_val, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.1022,
    'num_leaves': 74,
    'min_data_in_leaf': 12,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'max_bin': 255,
    'subsample': 0.783,
    'colsample_bytree': 0.726,
    'device': 'cpu',
    'scale_pos_weight': pos_weight
}

callbacks = [
    early_stopping(stopping_rounds=50, verbose=True),
    log_evaluation(period=10)
]

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    num_boost_round=200,
    callbacks=callbacks
)

# Guardar modelo
print("\n--- Guardando el modelo ---")
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_hyper_model.txt'
model.save_model(model_path)
print(f"\n--- Modelo guardado en {model_path} ---")



--- Preprocesando datos ---

--- Enriqueciendo características ---

--- Calculado scale_pos_weight: 15.95 ---

--- Dividiendo datos ---

--- Configurando y entrenando el modelo LambdaMART ---
Training until validation scores don't improve for 50 rounds
[10]	train's ndcg@1: 0.84378	train's ndcg@3: 0.858809	train's ndcg@5: 0.869743	val's ndcg@1: 0.842981	val's ndcg@3: 0.857758	val's ndcg@5: 0.868794
[20]	train's ndcg@1: 0.844208	train's ndcg@3: 0.859226	train's ndcg@5: 0.87023	val's ndcg@1: 0.843053	val's ndcg@3: 0.858036	val's ndcg@5: 0.869007
[30]	train's ndcg@1: 0.844918	train's ndcg@3: 0.860025	train's ndcg@5: 0.870956	val's ndcg@1: 0.843237	val's ndcg@3: 0.858457	val's ndcg@5: 0.869469
[40]	train's ndcg@1: 0.845783	train's ndcg@3: 0.860867	train's ndcg@5: 0.871765	val's ndcg@1: 0.843602	val's ndcg@3: 0.858688	val's ndcg@5: 0.869788
[50]	train's ndcg@1: 0.846616	train's ndcg@3: 0.861667	train's ndcg@5: 0.872528	val's ndcg@1: 0.843738	val's ndcg@3: 0.85894	val's ndcg@5: 0.870031
[60]

In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_test_data(df):
    print("\n--- Preprocesando y enriqueciendo datos de prueba ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek
    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length
    return df

# --- Generar JSON ---
def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_test_data(test_df)

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data[X.columns.tolist()]
        session_data.loc[:, 'score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Eliminar repeticiones antes de completar
        recommended_products = list(dict.fromkeys(recommended_products))

        # Completar con productos populares si faltan
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in recommended_products:
                recommended_products.append(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")


# --- Ejecutar predicciones ---
if __name__ == "__main__":
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_hyper_model.txt'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_hyper_v1.json'

    # Generar JSON
    generate_predictions(model_path, test_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos de prueba ---

--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_3_hyper_v1.json
