In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import json
import pickle
from datetime import datetime
import time
import os
import sys

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report
)

# Gradient Boosting
import xgboost as xgb
import lightgbm as lgb

# Plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots



In [4]:
# Deep Learning
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, optimizers, callbacks
    from tensorflow.keras import models as tf_models
    import tensorflow.keras.backend as K

    # GPU configuration
    gpus = tf.config.list_physical_devices('GPU')
    if len(gpus) > 0:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(policy)
        print(f"GPU enabled: {gpus[0].name}")
    else:
        print("Using CPU")

    tf.get_logger().setLevel('ERROR')
    TF_AVAILABLE = True

except ImportError:
    TF_AVAILABLE = False
    print("TensorFlow not available")

# Hyperparameter tuning
try:
    import optuna
    OPTUNA_AVAILABLE = True
    print("Optuna available for hyperparameter tuning")
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not available")

GPU enabled: /physical_device:GPU:0
Optuna not available


In [5]:
# Environment detection
try:
    from google.colab import drive
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')
    BASE_PATH = Path("/content/drive/MyDrive/23")
    IN_COLAB = True
    print("Running in Google Colab")
except ImportError:
    BASE_PATH = Path("../")
    IN_COLAB = False
    print("Running locally")

# Setup paths
data_processed = BASE_PATH / "data" / "processed"
experiments_path = BASE_PATH / "experiments"
checkpoints_path = experiments_path / "checkpoints"
predictions_path = BASE_PATH / "predictions"

# Create directories
for path in [experiments_path, checkpoints_path, predictions_path]:
    path.mkdir(parents=True, exist_ok=True)

# Random seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
if TF_AVAILABLE:
    tf.random.set_seed(RANDOM_SEED)

# Display config
plt.style.use('default')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

print(f"Setup complete. Random seed: {RANDOM_SEED}")

Running in Google Colab
Setup complete. Random seed: 42


## 1. Data Loading & Validation

In [6]:
# Load dataset
dataset_file = data_processed / "final_training_dataset.parquet"
if not dataset_file.exists():
    dataset_file = data_processed / "final_training_dataset.csv"
    if not dataset_file.exists():
        raise FileNotFoundError(f"Dataset not found at: {data_processed}")

print(f"Loading dataset from: {dataset_file}")
if dataset_file.suffix == '.parquet':
    df = pd.read_parquet(dataset_file)
else:
    df = pd.read_csv(dataset_file)

print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Basic info
print(f"Users: {df['user_id'].nunique():,}")
print(f"Products: {df['product_id'].nunique():,}")
print(f"Samples: {len(df):,}")

# Check imbalance
target_dist = df['reordered'].value_counts()
reorder_rate = df['reordered'].mean()
print(f"Reorder rate: {reorder_rate:.1%}")
display(target_dist)

Loading dataset from: /content/drive/MyDrive/23/data/processed/final_training_dataset.parquet
Dataset shape: (1384617, 33)
Memory usage: 348.6 MB
Users: 131,209
Products: 39,123
Samples: 1,384,617
Reorder rate: 59.9%


Unnamed: 0_level_0,count
reordered,Unnamed: 1_level_1
1,828824
0,555793


In [7]:
# Features
feature_cols = [col for col in df.columns if col not in ['user_id', 'product_id', 'reordered']]
numeric_features = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df[feature_cols].select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Features: {len(feature_cols)} total")
print(f"Numeric: {len(numeric_features)}")
print(f"Categorical: {len(categorical_features)}")

# Handle missing values
X = df[feature_cols].copy()
y = df['reordered'].copy()
user_ids = df['user_id'].copy()

if X.isnull().any().any():
    print("Handling missing values...")
    for col in numeric_features:
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].median())
    for col in categorical_features:
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else 'unknown')
else:
    print("No missing values found")

print("Data validation complete")

Features: 30 total
Numeric: 30
Categorical: 0
No missing values found
Data validation complete


## 2. Train-Test Split

In [8]:
# Check if preprocessed data already exists
train_test_files = {
    'X_train': data_processed / "X_train.parquet",
    'X_test': data_processed / "X_test.parquet",
    'y_train': data_processed / "y_train.parquet",
    'y_test': data_processed / "y_test.parquet",
    'X_train_scaled': data_processed / "X_train_scaled.parquet",
    'X_test_scaled': data_processed / "X_test_scaled.parquet",
    'scaler': data_processed / "feature_scaler.joblib"
}

all_files_exist = all(file_path.exists() for file_path in train_test_files.values())

if all_files_exist:
    print("Loading preprocessed train-test split...")
    X_train = pd.read_parquet(train_test_files['X_train'])
    X_test = pd.read_parquet(train_test_files['X_test'])
    y_train = pd.read_parquet(train_test_files['y_train']).squeeze()
    y_test = pd.read_parquet(train_test_files['y_test']).squeeze()
    X_train_scaled = pd.read_parquet(train_test_files['X_train_scaled'])
    X_test_scaled = pd.read_parquet(train_test_files['X_test_scaled'])
    scaler = joblib.load(train_test_files['scaler'])

    print(f"Loaded - Train: {len(X_train):,}, Test: {len(X_test):,}")

else:
    print("Creating new train-test split...")

    # User-based split
    unique_users = df['user_id'].unique()
    n_train_users = int(len(unique_users) * 0.8)
    np.random.shuffle(unique_users)
    train_users = set(unique_users[:n_train_users])
    test_users = set(unique_users[n_train_users:])

    train_mask = df['user_id'].isin(train_users)
    test_mask = df['user_id'].isin(test_users)

    X_train = X[train_mask].copy()
    X_test = X[test_mask].copy()
    y_train = y[train_mask].copy()
    y_test = y[test_mask].copy()

    print(f"Train: {len(X_train):,} samples ({y_train.mean():.1%} reorder rate)")
    print(f"Test: {len(X_test):,} samples ({y_test.mean():.1%} reorder rate)")

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    # Save preprocessed data
    print("Saving train-test split for future use...")
    X_train.to_parquet(train_test_files['X_train'])
    X_test.to_parquet(train_test_files['X_test'])
    y_train.to_frame('reordered').to_parquet(train_test_files['y_train'])
    y_test.to_frame('reordered').to_parquet(train_test_files['y_test'])
    X_train_scaled.to_parquet(train_test_files['X_train_scaled'])
    X_test_scaled.to_parquet(train_test_files['X_test_scaled'])
    joblib.dump(scaler, train_test_files['scaler'])

    print("Train-test split saved successfully")

print("Data preprocessing complete")

Loading preprocessed train-test split...
Loaded - Train: 1,105,234, Test: 279,383
Data preprocessing complete


## 3. Baseline Models

In [9]:
trained_models = {}
model_results = {}

def calculate_metrics(y_true, y_pred, y_pred_proba, model_name):
    metrics = {
        'model': model_name,
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_pred_proba)
    }

    # Precision@K and Recall@K
    for k in [10, 20, 50]:
        if len(y_pred_proba) >= k:
            top_k_indices = np.argsort(y_pred_proba)[-k:]
            y_pred_at_k = np.zeros_like(y_pred)
            y_pred_at_k[top_k_indices] = 1

            metrics[f'precision_at_{k}'] = precision_score(y_true, y_pred_at_k, zero_division=0)
            metrics[f'recall_at_{k}'] = recall_score(y_true, y_pred_at_k, zero_division=0)

    return metrics

print("Training baseline models...")

Training baseline models...


In [10]:
# 1. Logistic Regression
start_time = time.time()
lr_model = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000, class_weight='balanced')
lr_model.fit(X_train_scaled, y_train)
train_time_lr = time.time() - start_time

y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

trained_models['logistic_regression'] = lr_model
model_results['logistic_regression'] = {
    'y_pred': y_pred_lr,
    'y_pred_proba': y_pred_proba_lr,
    'train_time': train_time_lr
}
print(f"Logistic Regression: {train_time_lr:.2f}s")

Logistic Regression: 2.71s


In [11]:
# 2. XGBoost
start_time = time.time()
xgb_model = xgb.XGBClassifier(
    random_state=RANDOM_SEED,
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum()
)

xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
train_time_xgb = time.time() - start_time

y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

trained_models['xgboost'] = xgb_model
model_results['xgboost'] = {
    'y_pred': y_pred_xgb,
    'y_pred_proba': y_pred_proba_xgb,
    'train_time': train_time_xgb
}
print(f"XGBoost: {train_time_xgb:.2f}s")

XGBoost: 15.04s


In [12]:
# 3. LightGBM
start_time = time.time()
lgb_model = lgb.LGBMClassifier(
    random_state=RANDOM_SEED,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    class_weight='balanced',
    verbose=-1
)

lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(10, verbose=False)])
train_time_lgb = time.time() - start_time

y_pred_lgb = lgb_model.predict(X_test)
y_pred_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]

trained_models['lightgbm'] = lgb_model
model_results['lightgbm'] = {
    'y_pred': y_pred_lgb,
    'y_pred_proba': y_pred_proba_lgb,
    'train_time': train_time_lgb
}
print(f"LightGBM: {train_time_lgb:.2f}s")

print(f"Baseline models training complete: {len(trained_models)} models")

LightGBM: 15.95s
Baseline models training complete: 3 models


## 4. Advanced Models (TensorFlow)

In [13]:
advanced_models = {}
advanced_results = {}

if TF_AVAILABLE:
    X_train_tf = X_train_scaled.values.astype(np.float32)
    X_test_tf = X_test_scaled.values.astype(np.float32)
    y_train_tf = y_train.values.astype(np.float32)
    y_test_tf = y_test.values.astype(np.float32)

    batch_size = 2048 if len(gpus) > 0 else 512
    epochs = 50

    print(f"TensorFlow models - Batch size: {batch_size}, Max epochs: {epochs}")

    # Callbacks
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=8, min_lr=1e-7)

else:
    print("TensorFlow not available - skipping advanced models")

TensorFlow models - Batch size: 2048, Max epochs: 50


In [14]:
if TF_AVAILABLE:
    # 1. MLP Model
    def create_mlp_model(input_dim, hidden_dims=[512, 256, 128], dropout=0.3):
        model = tf_models.Sequential()
        model.add(layers.Dense(hidden_dims[0], input_dim=input_dim, activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout))

        for hidden_dim in hidden_dims[1:]:
            model.add(layers.Dense(hidden_dim, activation='relu'))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(dropout))

        model.add(layers.Dense(1, activation='sigmoid'))
        return model

    start_time = time.time()
    mlp_model = create_mlp_model(input_dim=len(feature_cols))
    mlp_model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    device_context = tf.device('/GPU:0' if len(gpus) > 0 else '/CPU:0')
    with device_context:
        history_mlp = mlp_model.fit(
            X_train_tf, y_train_tf,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_test_tf, y_test_tf),
            callbacks=[early_stopping, reduce_lr],
            verbose=0
        )

    train_time_mlp = time.time() - start_time

    y_pred_proba_mlp = mlp_model.predict(X_test_tf, batch_size=batch_size).flatten()
    y_pred_mlp = (y_pred_proba_mlp > 0.5).astype(int)

    advanced_models['mlp'] = mlp_model
    advanced_results['mlp'] = {
        'y_pred': y_pred_mlp,
        'y_pred_proba': y_pred_proba_mlp,
        'train_time': train_time_mlp
    }
    print(f"MLP: {train_time_mlp:.2f}s")

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
MLP: 61.88s


In [15]:
if TF_AVAILABLE:
    # 2. Wide & Deep Model
    def create_wide_deep_model(input_dim, wide_dim=None, deep_dims=[256, 128], dropout=0.3):
        wide_dim = wide_dim or input_dim // 2

        inputs = layers.Input(shape=(input_dim,))
        wide_input = layers.Lambda(lambda x: x[:, :wide_dim])(inputs)
        wide_output = layers.Dense(1, use_bias=False)(wide_input)

        deep_hidden = inputs
        for hidden_dim in deep_dims:
            deep_hidden = layers.Dense(hidden_dim, activation='relu')(deep_hidden)
            deep_hidden = layers.BatchNormalization()(deep_hidden)
            deep_hidden = layers.Dropout(dropout)(deep_hidden)

        deep_output = layers.Dense(1, use_bias=False)(deep_hidden)
        combined = layers.Add()([wide_output, deep_output])
        outputs = layers.Activation('sigmoid')(combined)

        return tf_models.Model(inputs=inputs, outputs=outputs)

    start_time = time.time()
    wd_model = create_wide_deep_model(input_dim=len(feature_cols))
    wd_model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    with device_context:
        history_wd = wd_model.fit(
            X_train_tf, y_train_tf,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_test_tf, y_test_tf),
            callbacks=[early_stopping, reduce_lr],
            verbose=0
        )

    train_time_wd = time.time() - start_time

    y_pred_proba_wd = wd_model.predict(X_test_tf, batch_size=batch_size).flatten()
    y_pred_wd = (y_pred_proba_wd > 0.5).astype(int)

    advanced_models['wide_deep'] = wd_model
    advanced_results['wide_deep'] = {
        'y_pred': y_pred_wd,
        'y_pred_proba': y_pred_proba_wd,
        'train_time': train_time_wd
    }
    print(f"Wide & Deep: {train_time_wd:.2f}s")

    # GPU cleanup
    if len(gpus) > 0:
        K.clear_session()
        tf.keras.backend.clear_session()
        import gc
        gc.collect()

    print(f"Advanced models complete: {len(advanced_results)} models")

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Wide & Deep: 54.69s
Advanced models complete: 2 models


In [16]:
# Ensemble model
start_time = time.time()
xgb_proba = model_results['xgboost']['y_pred_proba']
lgb_proba = model_results['lightgbm']['y_pred_proba']

ensemble_proba = 0.6 * xgb_proba + 0.4 * lgb_proba
ensemble_pred = (ensemble_proba > 0.5).astype(int)
train_time_ensemble = time.time() - start_time

advanced_results['ensemble'] = {
    'y_pred': ensemble_pred,
    'y_pred_proba': ensemble_proba,
    'train_time': train_time_ensemble
}
print(f"Ensemble: {train_time_ensemble:.4f}s")

Ensemble: 0.0022s


## 5. Model Evaluation & Comparison

In [17]:
all_model_results = {**model_results, **advanced_results}
all_metrics = []

print("Evaluating all models...")
for model_name, result in all_model_results.items():
    metrics = calculate_metrics(y_test, result['y_pred'], result['y_pred_proba'], model_name)
    metrics['train_time'] = result['train_time']
    all_metrics.append(metrics)

comparison_df = pd.DataFrame(all_metrics)

print("Model Comparison:")
display_columns = ['model', 'f1', 'precision', 'recall', 'roc_auc', 'precision_at_10', 'train_time']
display(comparison_df[display_columns].round(4))

# Best model
best_f1_idx = comparison_df['f1'].idxmax()
best_model_name = comparison_df.loc[best_f1_idx, 'model']
best_f1_score = comparison_df.loc[best_f1_idx, 'f1']

print(f"\nBest model: {best_model_name} (F1: {best_f1_score:.4f})")

Evaluating all models...
Model Comparison:


Unnamed: 0,model,f1,precision,recall,roc_auc,precision_at_10,train_time
0,logistic_regression,1.0,1.0,1.0,1.0,1.0,2.7085
1,xgboost,1.0,1.0,1.0,1.0,1.0,15.0378
2,lightgbm,1.0,1.0,1.0,1.0,1.0,15.9509
3,mlp,1.0,1.0,1.0,1.0,1.0,61.8783
4,wide_deep,1.0,1.0,1.0,1.0,1.0,54.6854
5,ensemble,1.0,1.0,1.0,1.0,1.0,0.0022



Best model: logistic_regression (F1: 1.0000)


## 6. Export Artifacts

In [19]:
experiment_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_name = f"instacart_recommendation_{experiment_timestamp}"
experiment_dir = checkpoints_path / experiment_name
experiment_dir.mkdir(exist_ok=True)

print(f"Saving artifacts to: {experiment_name}")

# Save models
model_files = {}
for model_name, model in trained_models.items():
    filename = f"{model_name}_{experiment_timestamp}.joblib"
    file_path = experiment_dir / filename
    joblib.dump(model, file_path)
    model_files[model_name] = str(file_path)

# Save TensorFlow models
if TF_AVAILABLE and advanced_models:
    for model_name, model in advanced_models.items():
        model_dir = experiment_dir / f"{model_name}_{experiment_timestamp}.keras"
        try:
            model.save(str(model_dir))
            model_files[model_name] = str(model_dir)
            print(f"Saved TensorFlow model: {model_name}")
        except Exception as e:
            print(f"Warning: Could not save {model_name}: {e}")
            # Try alternative save method
            try:
                alt_model_dir = experiment_dir / f"{model_name}_{experiment_timestamp}_weights.h5"
                model.save_weights(str(alt_model_dir))
                model_files[f"{model_name}_weights"] = str(alt_model_dir)
                print(f"Saved weights for: {model_name}")
            except Exception as e2:
                print(f"Could not save weights for {model_name}: {e2}")

# Save preprocessing
joblib.dump(scaler, experiment_dir / "feature_scaler.joblib")

# Save feature config
feature_config = {
    'feature_columns': feature_cols,
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'n_features': len(feature_cols)
}
with open(experiment_dir / "feature_config.json", 'w') as f:
    json.dump(feature_config, f, indent=2)

# Save results
comparison_df.to_csv(experiment_dir / "model_comparison.csv", index=False)

print(f"Models saved: {len(model_files)}")

Saving artifacts to: instacart_recommendation_20251124_135924
Saved TensorFlow model: mlp
Saved TensorFlow model: wide_deep
Models saved: 5


In [20]:
# Save experiment config
experiment_config = {
    'experiment_name': experiment_name,
    'timestamp': experiment_timestamp,
    'best_model': best_model_name,
    'best_f1_score': float(best_f1_score),
    'dataset_info': {
        'train_size': len(X_train),
        'test_size': len(X_test),
        'n_features': len(feature_cols),
        'reorder_rate': float(reorder_rate)
    },
    'models_trained': list(all_model_results.keys()),
    'random_seed': RANDOM_SEED
}

with open(experiment_dir / "experiment_config.json", 'w') as f:
    json.dump(experiment_config, f, indent=2)

print(f"Artifacts saved: {len(list(experiment_dir.glob('*')))} files")

Artifacts saved: 9 files


## 7. Deployment Export

In [21]:
deployment_dir = checkpoints_path / "deployment"
deployment_dir.mkdir(exist_ok=True)

# Get best model
best_model_obj = trained_models.get(best_model_name)
if best_model_obj is None:
    best_model_obj = trained_models['xgboost']  # Fallback

# Export final model
joblib.dump(best_model_obj, deployment_dir / "model.joblib")
with open(deployment_dir / "model.pkl", 'wb') as f:
    pickle.dump(best_model_obj, f)

# Export scaler
joblib.dump(scaler, deployment_dir / "standardizer.joblib")

# Export metadata
metadata = {
    'model_info': {
        'name': best_model_name,
        'version': '1.0',
        'creation_date': datetime.now().isoformat()
    },
    'performance': dict(comparison_df[comparison_df['model'] == best_model_name].iloc[0]),
    'deployment': {
        'requires_scaling': True,
        'input_features': len(feature_cols),
        'prediction_threshold': 0.5
    }
}

with open(deployment_dir / "metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2, default=str)

# Export feature order
with open(deployment_dir / "feature_order.json", 'w') as f:
    json.dump({
        'feature_names': feature_cols,
        'n_features': len(feature_cols),
        'preprocessing': {'scaling_required': True}
    }, f, indent=2)

print(f"Deployment files exported to: {deployment_dir}")

Deployment files exported to: /content/drive/MyDrive/23/experiments/checkpoints/deployment


## 8. Demo Predictions

In [None]:
n_demo_samples = min(10000, len(X_test))
demo_indices = np.random.choice(len(X_test), n_demo_samples, replace=False)

X_demo = X_test.iloc[demo_indices].copy()
y_demo_true = y_test.iloc[demo_indices].copy()

y_demo_pred = best_model_obj.predict(X_demo)
y_demo_proba = best_model_obj.predict_proba(X_demo)[:, 1]

# Get corresponding user and product IDs from test indice
demo_test_indices = X_test.iloc[demo_indices].index
demo_df_subset = df.loc[demo_test_indices]
demo_users = demo_df_subset['user_id'].values
demo_products = demo_df_subset['product_id'].values

demo_predictions_df = pd.DataFrame({
    'user_id': demo_users,
    'product_id': demo_products,
    'actual_reordered': y_demo_true.values,
    'predicted_reordered': y_demo_pred,
    'reorder_probability': y_demo_proba,
    'prediction_correct': (y_demo_pred == y_demo_true).astype(int)
})

demo_predictions_df.to_parquet(predictions_path / "demo_predictions.parquet", index=False)
demo_predictions_df.to_csv(predictions_path / "demo_predictions.csv", index=False)

print(f"Demo predictions exported: {len(demo_predictions_df):,} samples")
display(demo_predictions_df.head())

Demo predictions exported: 10,000 samples


Unnamed: 0,user_id,product_id,actual_reordered,predicted_reordered,reorder_probability,prediction_correct
633330,101488,12276,0,0,0.0,1
872794,187708,33041,0,0,0.0,1
1088825,4866,7644,1,1,1.0,1
1198012,72901,39928,1,1,1.0,1
1146403,201530,7538,1,0,7.271349e-30,0


In [24]:
# Dashboard metadata
dashboard_metadata = {
    'model_info': {
        'name': best_model_name,
        'f1_score': float(best_f1_score),
        'accuracy': float(comparison_df[comparison_df['model'] == best_model_name]['accuracy'].iloc[0])
    },
    'demo_data': {
        'total_samples': len(demo_predictions_df),
        'unique_users': demo_predictions_df['user_id'].nunique(),
        'unique_products': demo_predictions_df['product_id'].nunique(),
        'overall_accuracy': float(demo_predictions_df['prediction_correct'].mean())
    },
    'generation_info': {
        'timestamp': datetime.now().isoformat(),
        'model_used': best_model_name
    }
}

with open(predictions_path / "dashboard_metadata.json", 'w') as f:
    json.dump(dashboard_metadata, f, indent=2)

print("Dashboard metadata saved")

Dashboard metadata saved


## Summary

In [25]:
print("\n" + "="*50)
print("MODEL TRAINING & EVALUATION COMPLETE")
print("="*50)
print(f"Best Model: {best_model_name}")
print(f"F1-Score: {best_f1_score:.4f}")
print(f"Experiment: {experiment_name}")
print(f"Artifacts: {experiment_dir}")
print(f"Deployment: {deployment_dir}")
print(f"Demo Data: {predictions_path}")
print("="*50)


MODEL TRAINING & EVALUATION COMPLETE
Best Model: logistic_regression
F1-Score: 1.0000
Experiment: instacart_recommendation_20251124_135924
Artifacts: /content/drive/MyDrive/23/experiments/checkpoints/instacart_recommendation_20251124_135924
Deployment: /content/drive/MyDrive/23/experiments/checkpoints/deployment
Demo Data: /content/drive/MyDrive/23/predictions
