In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import json
from sklearn.model_selection import train_test_split

# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_data(df, mode='train'):
    print(f"\n--- Preprocesando y enriqueciendo datos ({mode}) ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek

    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length

    # Características adicionales
    df['hour_bucket'] = pd.cut(
        df['hour'], 
        bins=[0, 6, 12, 18, 24], 
        labels=['Noche1', 'Mañana', 'Tarde', 'Noche2'],  # Etiquetas únicas
        include_lowest=True
    )
    df['day_period_popularity'] = df.groupby('hour_bucket')['partnumber'].transform('count')


    return df

# --- Entrenamiento del modelo ---
def train_lambdamart(train_path, model_path):
    print("\n--- Cargando datos de entrenamiento ---")
    train_df = pd.read_pickle(train_path)
    train_df = preprocess_and_enrich_data(train_df)

    # Dividir características y etiquetas
    X = train_df.drop(['add_to_cart', 'session_id'], axis=1)
    y = train_df['add_to_cart']
    groups = train_df['session_id'].value_counts().values

    # Crear Dataset LightGBM
    train_data = lgb.Dataset(X, label=y, group=groups)

    # Parámetros del modelo
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': [1, 3, 5],
        'learning_rate': 0.05,
        'num_leaves': 70,
        'max_bin': 255,
        'min_data_in_leaf': 20,
        'boosting_type': 'gbdt',
        'verbose': -1
    }

    # Entrenar modelo
    print("\n--- Entrenando modelo Lambdamart ---")
    model = lgb.train(
        params, train_data,
        num_boost_round=500,
        valid_sets=[train_data],
        valid_names=['train'],
    )

    # Guardar modelo
    model.save_model(model_path)
    print(f"Modelo guardado en {model_path}")

def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_data(test_df, mode='test')

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data.drop(['session_id'], axis=1)
        session_data['score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Completar recomendaciones con productos populares sin duplicados
        unique_recommended_products = set(recommended_products)
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in unique_recommended_products:
                recommended_products.append(product)
                unique_recommended_products.add(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")


# --- Ejecutar ---
if __name__ == "__main__":
    train_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl'
    test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
    model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model.txt'
    output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_lambda_v1.json'

    train_lambdamart(train_path, model_path)
    generate_predictions(model_path, test_path, output_path)



--- Cargando datos de entrenamiento ---

--- Preprocesando y enriqueciendo datos (train) ---


  df['day_period_popularity'] = df.groupby('hour_bucket')['partnumber'].transform('count')



--- Entrenando modelo Lambdamart ---
Modelo guardado en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model.txt

--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos (test) ---


  df['day_period_popularity'] = df.groupby('hour_bucket')['partnumber'].transform('count')



--- Generando predicciones ---

--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_lambda_v1.json


In [6]:
# --- Preprocesamiento y enriquecimiento ---
def preprocess_and_enrich_data(df, mode='train'):
    print(f"\n--- Preprocesando y enriqueciendo datos ({mode}) ---")
    df['date'] = pd.to_datetime(df['date']).astype(int) / 10**9
    df['timestamp_local'] = pd.to_datetime(df['timestamp_local']).astype(int) / 10**9
    df['hour'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.hour
    df['day_of_week'] = pd.to_datetime(df['timestamp_local'], unit='s').dt.dayofweek

    session_length = df.groupby('session_id')['partnumber'].transform('count')
    df['session_length'] = session_length
    df['country_popularity'] = df.groupby('country')['partnumber'].transform('count') / session_length
    df['product_interaction_rate'] = df.groupby('partnumber')['session_id'].transform('nunique') / session_length

    # Características adicionales
    df['hour_bucket'] = pd.cut(
        df['hour'], 
        bins=[0, 6, 12, 18, 24], 
        labels=['Noche1', 'Mañana', 'Tarde', 'Noche2'],  # Etiquetas únicas
        include_lowest=True
    )
    df['day_period_popularity'] = df.groupby('hour_bucket')['partnumber'].transform('count')


    return df

def generate_predictions(model_path, test_path, output_path):
    print("\n--- Cargando el modelo entrenado ---")
    model = lgb.Booster(model_file=model_path)

    print("\n--- Cargando datos de prueba ---")
    test_df = pd.read_pickle(test_path)
    test_df = preprocess_and_enrich_data(test_df, mode='test')

    session_ids = test_df['session_id'].unique()
    predictions = {}
    popular_products = test_df['partnumber'].value_counts().index.tolist()

    print("\n--- Generando predicciones ---")
    for session_id in session_ids:
        session_data = test_df[test_df['session_id'] == session_id].copy()

        if session_data.empty:
            predictions[str(session_id)] = popular_products[:5]
            continue

        session_features = session_data.drop(['session_id'], axis=1)
        session_data['score'] = model.predict(session_features)

        recommended_products = (
            session_data.sort_values(by='score', ascending=False)['partnumber']
            .drop_duplicates()
            .tolist()
        )

        # Completar recomendaciones con productos populares sin duplicados
        unique_recommended_products = set(recommended_products)
        for product in popular_products:
            if len(recommended_products) >= 5:
                break
            if product not in unique_recommended_products:
                recommended_products.append(product)
                unique_recommended_products.add(product)

        predictions[str(session_id)] = recommended_products[:5]

    print("\n--- Guardando predicciones ---")
    with open(output_path, 'w') as f:
        json.dump({"target": predictions}, f, indent=4)
    print(f"Predicciones guardadas en {output_path}")
    
train_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/train_data.pkl'
test_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/data/processed/new_processed/test_data.pkl'
model_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/models/lambdamart_model.txt'
output_path = '/home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_lambda_v1.json'

generate_predictions(model_path, test_path, output_path)



--- Cargando el modelo entrenado ---

--- Cargando datos de prueba ---

--- Preprocesando y enriqueciendo datos (test) ---

--- Generando predicciones ---


  df['day_period_popularity'] = df.groupby('hour_bucket')['partnumber'].transform('count')



--- Guardando predicciones ---
Predicciones guardadas en /home/pablost/Hackathon_inditex_data_science/hackathon-inditex-data-recommender/predictions/predictions_lambda_v1.json
