In [None]:
# Install necessary libraries
!pip install lightgbm pandas scikit-learn

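# Import the project modules from the uploaded files.
# NOTE(review): the three-dot relative import below only resolves when this code runs inside a package; in a standalone notebook cell it raises ImportError ("attempted relative import with no known parent package") - confirm the intended module path.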
from ...services.unified_feature_engineering import InstacartDataPreprocessor
from stacked_basket_model import StackedBasketModel
import pandas as pd
import json

print("Libraries and modules imported.")

# --- Step 1: Preprocess the data ---
print("Starting data preprocessing...")
preprocessor = InstacartDataPreprocessor(data_path='.')

# Load the raw data explicitly before running the pipeline.
preprocessor.load_raw_data()

# Per the original notes, save_processed_data runs the full pipeline, creates
# the "processed" folder at output_path and hands the results back keyed by name.
processed_data = preprocessor.save_processed_data(output_path="./processed")

# Pull out the three entries the training and evaluation steps rely on.
features_df, instacart_future_df, keyset = (
    processed_data[entry]
    for entry in ('features', 'instacart_future', 'keyset')
)

print("Data preprocessing complete.")
print(f"features_df shape: {features_df.shape}")
print(f"instacart_future_df shape: {instacart_future_df.shape}")


# --- Step 2: Train the Stacked Model ---
# Creates a StackedBasketModel with its default constructor arguments and fits
# it on the three objects unpacked from the preprocessing result.
print("\nInitializing the Stacked Model...")
stacked_model = StackedBasketModel()

# The train method handles the full two-stage training process.
# NOTE(review): the completion message below implies train() writes
# stage1_lgbm.pkl and stage2_gbc.pkl - confirm in StackedBasketModel, since
# nothing in this cell saves them explicitly.
stacked_model.train(features_df, instacart_future_df, keyset)

print("\n✅✅✅ Two-stage training complete. Download stage1_lgbm.pkl and stage2_gbc.pkl.")


# --- Step 3 (Optional but Recommended): Final Evaluation on Test Set ---
# Predicts a basket for every test user with the trained stacked model, pairs
# it with that user's ground-truth basket and passes the pairs to the evaluator.
print("\n--- Starting Final Evaluation on Test Set ---")

import pprint

from evaluator import BasketPredictionEvaluator

test_users = keyset['test']

# Build the user_id -> raw products lookup once, instead of boolean-masking the
# whole frame for every test user (that was O(users * rows)).
# keep='first' mirrors the previous `.iloc[0]`, which used the first matching row.
actual_products_by_user = (
    instacart_future_df
    .drop_duplicates(subset='user_id', keep='first')
    .set_index('user_id')['products']
    .to_dict()
)

predictions_for_eval = []
for user_id in test_users:
    # Get the final predicted basket from the now-trained stacked model
    predicted_basket = stacked_model.predict(features_df, user_id)

    # Ground truth: the 'products' cell is decoded with json.loads; users with
    # no row in instacart_future_df get an empty basket. A membership test
    # (rather than .get) keeps a present-but-null cell failing in json.loads,
    # exactly as before.
    if user_id in actual_products_by_user:
        actual_basket = json.loads(actual_products_by_user[user_id])
    else:
        actual_basket = []

    predictions_for_eval.append({
        "user_id": user_id,
        "predicted_products": predicted_basket,
        "actual_products": actual_basket,
    })

# Use the comprehensive evaluator
final_evaluator = BasketPredictionEvaluator()
final_metrics = final_evaluator.evaluate_model(predictions_for_eval)

# Print the detailed results
pprint.pprint(final_metrics)

print("--- Final Evaluation Complete ---")