In [1]:
# PARAMETERS (Papermill-friendly)
# Configure base directory and derive common paths for Phase 2
import os

# Prefer env var for robustness when papermill parameters cell tag is missing
_env_base = os.getenv("RETAILSENSE_BASE_DIR")
if _env_base and _env_base.strip():
    RETAILSENSE_BASE_DIR = _env_base
elif 'RETAILSENSE_BASE_DIR' not in globals() or not RETAILSENSE_BASE_DIR:
    RETAILSENSE_BASE_DIR = r"F:\RetailSense_Lite"

BASE_DIR = RETAILSENSE_BASE_DIR
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
OUTPUT_DIR = os.path.join(BASE_DIR, 'outputs')
NOTEBOOKS_DIR = os.path.join(BASE_DIR, 'notebooks')

CLEANED_DATA_PATH = os.path.join(PROCESSED_DIR, 'cleaned_data.csv')
FEATURES_DATA_PATH = os.path.join(PROCESSED_DIR, 'data_with_all_features.csv')
FORECAST_CSV = os.path.join(OUTPUT_DIR, 'forecasting_results.csv')
ANOMALIES_CSV = os.path.join(OUTPUT_DIR, 'anomalies.csv')

# Ensure all paths are absolute
FEATURES_DATA_PATH = os.path.abspath(FEATURES_DATA_PATH)
CLEANED_DATA_PATH = os.path.abspath(CLEANED_DATA_PATH)
FORECAST_CSV = os.path.abspath(FORECAST_CSV)
ANOMALIES_CSV = os.path.abspath(ANOMALIES_CSV)

os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"üìÅ BASE_DIR: {BASE_DIR}")
print(f"üìÅ PROCESSED_DIR: {PROCESSED_DIR}")
print(f"üìÅ OUTPUT_DIR: {OUTPUT_DIR}")
print(f"üìÑ CLEANED_DATA_PATH: {CLEANED_DATA_PATH}")
print(f"üìÑ FEATURES_DATA_PATH: {FEATURES_DATA_PATH}")


üìÅ BASE_DIR: F:\RetailSense_Lite
üìÅ PROCESSED_DIR: F:\RetailSense_Lite\data\processed
üìÅ OUTPUT_DIR: F:\RetailSense_Lite\outputs
üìÑ CLEANED_DATA_PATH: F:\RetailSense_Lite\data\processed\cleaned_data.csv
üìÑ FEATURES_DATA_PATH: F:\RetailSense_Lite\data\processed\data_with_all_features.csv


In [2]:
# Cell 1: Setup and Imports for Phase 2
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
import joblib

warnings.filterwarnings('ignore')

# --------------------------
# Use parameterized base/paths from Parameters cell
# --------------------------
PROJECT_ROOT = BASE_DIR
DATA_CLEANED_PATH = CLEANED_DATA_PATH
DATA_FEATURES_PATH = FEATURES_DATA_PATH
OUTPUT_DIR = OUTPUT_DIR
FORECAST_CSV = FORECAST_CSV
ANOMALIES_CSV = ANOMALIES_CSV
NOTEBOOKS_DIR = NOTEBOOKS_DIR

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fix sys.path to see project root and models folder
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# --------------------------
# Import custom modules
# --------------------------
from models.feature_engineering import FeatureEngineering
from models.baselines import BaselineModels
from models.forecasting import AdvancedForecasting
from models.anomaly_detection import AnomalyDetection

# --------------------------
# Debugging Info
# --------------------------
print(f"üìÇ Current Working Directory: {os.getcwd()}")
print(f"üîß Project Root Directory: {PROJECT_ROOT}")
print(f"üìÅ Output Directory: {OUTPUT_DIR}")
print(f"üìö Input (Cleaned Data) Path: {DATA_CLEANED_PATH}")
print(f"üìö Output (Feature Data) Path: {DATA_FEATURES_PATH}")
print(f"üìö Forecast CSV Path: {FORECAST_CSV}")
print(f"üìö Anomalies CSV Path: {ANOMALIES_CSV}")
print(f"üîç sys.path includes: {[p for p in sys.path if 'RetailSense_Lite' in p]}")

print("\nüöÄ RETAILSENSE PHASE 2: FEATURE ENGINEERING & MODELS INITIATED")
print("=" * 60)
print(f"üìÖ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

üìÇ Current Working Directory: F:\RetailSense_Lite
üîß Project Root Directory: F:\RetailSense_Lite
üìÅ Output Directory: F:\RetailSense_Lite\outputs
üìö Input (Cleaned Data) Path: F:\RetailSense_Lite\data\processed\cleaned_data.csv
üìö Output (Feature Data) Path: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
üìö Forecast CSV Path: F:\RetailSense_Lite\outputs\forecasting_results.csv
üìö Anomalies CSV Path: F:\RetailSense_Lite\outputs\anomalies.csv
üîç sys.path includes: ['f:\\RetailSense_Lite\\retailsense_env', 'f:\\RetailSense_Lite\\retailsense_env\\Lib\\site-packages', 'f:\\RetailSense_Lite\\retailsense_env\\Lib\\site-packages\\win32', 'f:\\RetailSense_Lite\\retailsense_env\\Lib\\site-packages\\win32\\lib', 'f:\\RetailSense_Lite\\retailsense_env\\Lib\\site-packages\\Pythonwin', 'F:\\RetailSense_Lite']

üöÄ RETAILSENSE PHASE 2: FEATURE ENGINEERING & MODELS INITIATED
üìÖ Started at: 2025-11-01 23:16:41


In [3]:
# Cell 2: Initialize Phase 2 Components + Upload Gate
import sys
import os
import pandas as pd
import glob
import shutil

# =====================================================
# Setup Project Environment (parameterized)
# =====================================================
PROJECT_ROOT = BASE_DIR

# Ensure sys.path includes the project root for module imports
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(f"üìÇ Project root set to: {PROJECT_ROOT}")
print(f"üîç Models folder present? {'models' in os.listdir(PROJECT_ROOT)}")

# =====================================================
# Utility: Clear all Phase 2 outputs to avoid stale dashboards
# =====================================================
def clear_phase2_outputs():
    try:
        # Known output files to remove
        paths = [
            os.path.join(OUTPUT_DIR, "model_performance_comparison.png"),
            os.path.join(OUTPUT_DIR, "phase2_business_insights.csv"),
            os.path.join(OUTPUT_DIR, "forecasting_results.csv"),
            os.path.join(OUTPUT_DIR, "anomalies.csv"),
            os.path.join(OUTPUT_DIR, "xgboost_model.pkl"),
            os.path.join(OUTPUT_DIR, "lightgbm_model.pkl"),
            os.path.join(OUTPUT_DIR, "isolation_forest_model.pkl"),
            os.path.join(OUTPUT_DIR, "ocsvm_model.pkl"),
            os.path.join(OUTPUT_DIR, "anomaly_dashboard.png")
        ]
        for p in paths:
            if os.path.exists(p):
                os.remove(p)
        # Also clear engineered dataset to avoid accidental reuse
        if os.path.exists(FEATURES_DATA_PATH):
            os.remove(FEATURES_DATA_PATH)
        print("üßπ Cleared previous Phase 2 outputs to prevent stale results.")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to clear some outputs: {e}")

# =====================================================
# Import Custom Modules
# =====================================================
from models.feature_engineering import FeatureEngineering
from models.baselines import BaselineModels
from models.forecasting import AdvancedForecasting
from models.anomaly_detection import AnomalyDetection

# =====================================================
# Initialize Phase 2 Components
# =====================================================
def initialize_phase2_components():
    """Initialize all Phase 2 ML pipeline components."""
    components = {
        "feature_engineering": FeatureEngineering(),     # Feature creation
        "baseline_models": BaselineModels(),             # ARIMA & Prophet
        "advanced_forecasting": AdvancedForecasting(),   # XGBoost & LightGBM
        "anomaly_detection": AnomalyDetection()          # Isolation Forest & One-Class SVM
    }
    print("‚úÖ Phase 2 components initialized successfully")
    return components

# Instantiate pipeline components
components = initialize_phase2_components()

# =====================================================
# Require Uploaded Data Gate
# =====================================================
# Priority: env var UPLOADED_DATA_PATH -> default path under data/uploaded/
DEFAULT_UPLOAD_PATH = os.path.join(BASE_DIR, 'data', 'uploaded', 'uploaded_data.csv')
UPLOADED_DATA_PATH = os.getenv('UPLOADED_DATA_PATH')
if not UPLOADED_DATA_PATH or not UPLOADED_DATA_PATH.strip():
    UPLOADED_DATA_PATH = DEFAULT_UPLOAD_PATH

# Normalize path
UPLOADED_DATA_PATH = os.path.normpath(UPLOADED_DATA_PATH)

SKIP_PHASE2 = False
if not os.path.exists(UPLOADED_DATA_PATH):
    # No uploaded file -> clear outputs and skip downstream execution
    clear_phase2_outputs()
    SKIP_PHASE2 = True
    print("‚ùå No uploaded data file found.")
    print("   Set env 'UPLOADED_DATA_PATH' or place file at: " + DEFAULT_UPLOAD_PATH)
    print("   Phase 2 execution will be skipped to avoid showing stale results.")
else:
    # Use the uploaded file as the canonical cleaned input for Phase 2
    CLEANED_DATA_PATH = UPLOADED_DATA_PATH
    # Clear outputs before generating new ones to ensure dashboard reflects only this upload
    clear_phase2_outputs()
    print(f"‚úÖ Using uploaded data file: {CLEANED_DATA_PATH}")

# =====================================================
# Load Cleaned Data from Uploaded file (optionally filter by selected product)
# =====================================================
if not SKIP_PHASE2:
    DATA_CLEANED_PATH = CLEANED_DATA_PATH

    if os.path.exists(DATA_CLEANED_PATH):
        df_cleaned = pd.read_csv(DATA_CLEANED_PATH)
        print(f"üì• Uploaded dataset loaded: {df_cleaned.shape[0]} rows √ó {df_cleaned.shape[1]} columns")

        # Optional filtering: PRODUCT_NAME may actually be an ID or name; support common columns
        try:
            selected_value = None
            # Prefer environment variable for robustness
            env_product = os.getenv("PRODUCT_NAME")
            if env_product and env_product.strip():
                selected_value = env_product.strip()
            elif 'PRODUCT_NAME' in globals() and PRODUCT_NAME not in (None, "", "None"):
                selected_value = str(PRODUCT_NAME).strip()
            elif 'PRODUCT_ID' in globals() and PRODUCT_ID not in (None, "", "None"):
                selected_value = str(PRODUCT_ID).strip()

            if selected_value:
                product_cols = [
                    'product_id', 'product_name', 'sku', 'item_id', 'item', 'product', 'item_name', 'name'
                ]
                existing = [c for c in product_cols if c in df_cleaned.columns]
                if existing:
                    # Filter where any matching column equals selected value (string compare)
                    mask = False
                    for c in existing:
                        mask = mask | (df_cleaned[c].astype(str) == selected_value)
                    before = len(df_cleaned)
                    df_cleaned = df_cleaned[mask].copy()
                    after = len(df_cleaned)
                    print(f"üîé Product filter applied on columns {existing}. Rows: {before} ‚Üí {after}")

                    # Persist filtered dataset to a temporary processed file and point CLEANED_DATA_PATH to it
                    filtered_path = os.path.join(PROCESSED_DIR, "cleaned_data_filtered.csv")
                    df_cleaned.to_csv(filtered_path, index=False)
                    CLEANED_DATA_PATH = filtered_path
                    print(f"üíæ Filtered cleaned data saved to: {filtered_path}")
                else:
                    print("‚ö†Ô∏è No product identifier columns found; skipping product filter.")
        except Exception as e:
            print(f"‚ö†Ô∏è Product filtering skipped due to error: {e}")
    else:
        raise FileNotFoundError(f"‚ùå Missing uploaded file: {DATA_CLEANED_PATH}")

üìÇ Project root set to: F:\RetailSense_Lite
üîç Models folder present? True
‚úÖ FeatureEngineering initialized
‚úÖ Phase 2 components initialized successfully
üßπ Cleared previous Phase 2 outputs to prevent stale results.
‚úÖ Using uploaded data file: F:\RetailSense_Lite\data\uploaded\uploaded_data.csv
üì• Uploaded dataset loaded: 525 rows √ó 16 columns


In [4]:
# Cell 3: Feature Engineering Pipeline
import os

def run_feature_engineering(data_path):
    """Run complete feature engineering pipeline for Phase 2."""
    print("\nüîß STEP 1: FEATURE ENGINEERING")
    print("-" * 40)
    
    fe = components['feature_engineering']
    
    # Load and process cleaned data from Phase 1
    fe.load_data(data_path)
    
    # Run complete feature engineering (time-based, categorical encoding, metrics)
    engineered_df, feature_list = fe.run_complete_feature_engineering()
    
    # Define engineered output path for Phase 3 (parameterized) - ensure absolute
    engineered_path = os.path.abspath(FEATURES_DATA_PATH)
    
    # Ensure directory exists
    output_dir = os.path.dirname(engineered_path)
    os.makedirs(output_dir, exist_ok=True)
    
    # Save engineered data
    print(f"üíæ Saving to: {engineered_path}")
    saved_path = fe.save_engineered_data(engineered_path)
    
    # CRITICAL: Verify file was actually saved
    if not os.path.exists(saved_path):
        raise FileNotFoundError(f"‚ùå CRITICAL ERROR: File was not saved! Expected at: {saved_path}")
    
    # Verify file has content
    file_size = os.path.getsize(saved_path)
    if file_size == 0:
        raise ValueError(f"‚ùå CRITICAL ERROR: Saved file is empty! Path: {saved_path}")
    
    print(f"‚úÖ Feature engineering completed!")
    print(f"üìä Features created: {len(feature_list)}")
    print(f"üìÅ Data saved to: {saved_path}")
    print(f"‚úÖ File verification: {file_size:,} bytes written")
    print(f"üîç File exists check: {os.path.exists(saved_path)}")
    
    return engineered_df, feature_list, saved_path

# Execute feature engineering on uploaded cleaned data (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping feature engineering (no uploaded data).")
else:
    DATA_CLEANED_PATH = CLEANED_DATA_PATH
    engineered_data, all_features, engineered_file = run_feature_engineering(DATA_CLEANED_PATH)


üîß STEP 1: FEATURE ENGINEERING
----------------------------------------
üìä Loading data for feature engineering...
‚úÖ Data loaded: 525 rows, 16 columns

üöÄ Starting Complete Feature Engineering Pipeline
üîÑ Creating time-based features...
‚úÖ Created 13 time-based features
üîÑ Creating lag features for sales_qty...
‚úÖ Created 20 lag features
üîÑ Creating rolling features for sales_qty...


‚úÖ Created 60 rolling features
üîÑ Creating price-related features...


‚úÖ Created 6 price-related features
üîÑ Creating inventory features...
‚úÖ Created 6 inventory features
üîÑ Encoding categorical features...
‚úÖ Created 6 categorical features
üîÑ Creating interaction features...
‚úÖ Created 4 interaction features

‚úÖ Feature Engineering Complete!
üìä Total Features Created: 115
üìà Dataset Shape: (525, 67)
üíæ Saving to: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
üíæ Saving engineered dataset to F:\RetailSense_Lite\data\processed\data_with_all_features.csv...
‚úÖ Successfully saved 525 rows to F:\RetailSense_Lite\data\processed\data_with_all_features.csv
‚úÖ Engineered dataset and metadata saved successfully!
üìÅ CSV: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
üìÅ Metadata: F:\RetailSense_Lite\data\processed\feature_metadata.json
‚úÖ Feature engineering completed!
üìä Features created: 115
üìÅ Data saved to: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
‚úÖ File verification: 282,884 b

In [5]:
# Cell 4: Baseline Models Training and Forecast CSV Output (Robust)
import pandas as pd
import os

def run_baseline_models(data_path):
    """Train baseline ARIMA and Prophet models and save forecasting results."""
    print("\nüìà STEP 2: BASELINE MODELS (ARIMA & PROPHET)")
    print("-" * 50)
    
    baseline = components['baseline_models']
    
    # Load feature-engineered data
    baseline.load_data(data_path)
    
    # Train ARIMA and Prophet models
    arima_forecast = baseline.train_arima()
    prophet_forecast = baseline.train_prophet()
    
    # Helper function: convert any object to DataFrame with Predicted column
    def to_dataframe(pred, model_name):
        if isinstance(pred, pd.DataFrame):
            df = pred.copy()
        elif isinstance(pred, pd.Series):
            df = pd.DataFrame({'Predicted': pred.values}, index=pred.index)
        else:  # assume NumPy array
            df = pd.DataFrame({'Predicted': pred})
        df['Model'] = model_name
        return df
    
    # Convert forecasts to DataFrames
    arima_df = to_dataframe(arima_forecast, 'ARIMA')
    prophet_df = to_dataframe(prophet_forecast, 'Prophet')
    
    # Combine results
    results_df = pd.concat([arima_df, prophet_df], ignore_index=True)
    
    # Reset index to have a Date column if needed
    if 'index' in results_df.columns:
        results_df = results_df.reset_index().rename(columns={'index': 'Date'})
    
    # Add RMSE if available
    if hasattr(baseline, 'rmse_scores'):
        results_df['RMSE'] = results_df['Model'].map(baseline.rmse_scores)
    
    # Ensure outputs directory exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Save CSV for app
    forecast_csv_path = FORECAST_CSV
    results_df.to_csv(forecast_csv_path, index=False)
    
    # Optional: visualize forecasts
    baseline.plot_forecasts()
    
    print(f"‚úÖ Baseline models completed! Forecasting results saved to: {forecast_csv_path}")
    
    return results_df

# Run baseline models pipeline (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping baseline models (no uploaded data).")
else:
    baseline_results = run_baseline_models(engineered_file)


üìà STEP 2: BASELINE MODELS (ARIMA & PROPHET)
--------------------------------------------------
üìÇ Loading dataset from: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
‚úÖ Data loaded: 525 records, 67 columns
üîÑ Training ARIMA Model...


‚úÖ ARIMA trained for Apples
   AIC: 1225.18
   Next 4 weeks forecast: [218.13 217.87 217.87 217.87]
üîÑ Training Prophet Model...


23:16:42 - cmdstanpy - INFO - Chain [1] start processing


23:16:42 - cmdstanpy - INFO - Chain [1] done processing


‚úÖ Prophet trained for Apples
   Next 4 weeks forecast: [330.15 314.24 302.4  289.49]


‚úÖ Baseline models completed! Forecasting results saved to: F:\RetailSense_Lite\outputs\forecasting_results.csv


In [6]:
# Cell 5: Advanced Forecasting Models (XGBoost & LightGBM) with CSV export
import numpy as np
import pandas as pd
import joblib
import os

def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error (better than MAPE for 0 values)."""
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8)
    )

def resolve_current_product():
    env_p = os.getenv("PRODUCT_NAME")
    if env_p and env_p.strip():
        return env_p.strip()
    if 'PRODUCT_NAME' in globals() and PRODUCT_NAME not in (None, "", "None"):
        return str(PRODUCT_NAME)
    try:
        if 'engineered_data' in globals() and 'product_name' in engineered_data.columns:
            vals = engineered_data['product_name'].dropna().astype(str).unique().tolist()
            if len(vals) == 1:
                return vals[0]
    except Exception:
        pass
    return None

def run_advanced_forecasting(data_path):
    """Train XGBoost and LightGBM models with sMAPE evaluation and save CSV."""
    print("\nüöÄ STEP 3: ADVANCED FORECASTING (XGBOOST & LIGHTGBM)")
    print("-" * 60)

    forecaster = components['advanced_forecasting']

    # --- Load and prepare feature-engineered data ---
    forecaster.load_and_prepare_data(data_path)

    # --- Train models ---
    results = forecaster.train_models()

    # --- Compute sMAPE ---
    for model_name in ["xgb", "lgb"]:
        if f"{model_name}_pred" in results and "y_test" in results:
            results[f"{model_name}_metrics"]["sMAPE"] = smape(results["y_test"], results[f"{model_name}_pred"])
            results[f"{model_name}_metrics"].pop("MAPE", None)

    # --- Visualizations ---
    forecaster.plot_feature_importance()
    forecaster.plot_predictions()

    # --- Ensure outputs directory exists ---
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Save trained models ---
    if hasattr(forecaster, 'xgb_model'):
        joblib.dump(forecaster.xgb_model, os.path.join(OUTPUT_DIR, "xgboost_model.pkl"))
    if hasattr(forecaster, 'lgb_model'):
        joblib.dump(forecaster.lgb_model, os.path.join(OUTPUT_DIR, "lightgbm_model.pkl"))

    # --- Save CSV with metrics for dashboard ---
    current_product = resolve_current_product()
    metrics_df = pd.DataFrame({
        "Product": [current_product, current_product],
        "Model": ["XGBoost", "LightGBM"],
        "RMSE": [results.get("xgb_metrics", {}).get("RMSE", np.nan), results.get("lgb_metrics", {}).get("RMSE", np.nan)],
        "MAE": [results.get("xgb_metrics", {}).get("MAE", np.nan), results.get("lgb_metrics", {}).get("MAE", np.nan)],
        "sMAPE": [results.get("xgb_metrics", {}).get("sMAPE", np.nan), results.get("lgb_metrics", {}).get("sMAPE", np.nan)]
    })

    metrics_csv_path = FORECAST_CSV
    metrics_df.to_csv(metrics_csv_path, index=False)

    print(f"‚úÖ Advanced forecasting completed successfully! Results saved to: {metrics_csv_path}")
    if current_product:
        print(f"üìå Forecast results tagged for product: {current_product}")
    return results

# Run advanced forecasting pipeline (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping advanced forecasting (no uploaded data).")
else:
    forecasting_results = run_advanced_forecasting(engineered_file)


üöÄ STEP 3: ADVANCED FORECASTING (XGBOOST & LIGHTGBM)
------------------------------------------------------------
üìä Loading engineered dataset for advanced forecasting from: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
‚úÖ Data loaded: 525 records, 67 columns
üöÄ Training advanced forecasting models...
‚úÖ Features prepared: 58 numeric features
üìä Training set: 420, Test set: 105
üîÑ Training XGBoost with early stopping...


‚úÖ XGBoost - RMSE: 5.47, MAE: 3.78, MAPE: 4.0% (stopped at 221 iterations)
üîÑ Training LightGBM with early stopping...
‚ÑπÔ∏è LightGBM version: 4.6.0
‚úÖ LightGBM - RMSE: 9.40, MAE: 6.35, MAPE: 7.1% (stopped at 188 iterations)


‚úÖ Advanced forecasting completed successfully! Results saved to: F:\RetailSense_Lite\outputs\forecasting_results.csv


In [7]:
# Cell 6: Anomaly Detection Models (With Default Visualization)
import joblib
import os
import pandas as pd

def run_anomaly_detection(data_path, visualize=True):
    """Run anomaly detection pipeline and save anomalies.csv for dashboard."""
    print("\nüîç STEP 4: ANOMALY DETECTION")
    print("-" * 40)

    detector = components['anomaly_detection']  # Components already initialized

    # Load feature-engineered data
    detector.load_data(data_path)

    # Prepare features for anomaly detection
    X, X_scaled, X_pca, feature_names = detector.prepare_features()

    # Run anomaly detection models (Isolation Forest, One-Class SVM, etc.)
    detector.run_models(X_scaled, X_pca)

    # Optional visualization (default is True)
    if visualize:
        if hasattr(detector, "visualize_dashboard"):
            detector.visualize_dashboard()
        else:
            print("‚ö†Ô∏è No visualize_dashboard() method found in anomaly_detection component.")

    # Ensure outputs directory exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Build structured anomalies DataFrame ---
    anomalies_df = pd.DataFrame({
        "week_start": detector.df['week_start'] if 'week_start' in detector.df else detector.df.index,
        "sales_qty": detector.df['sales_qty'] if 'sales_qty' in detector.df else detector.df.iloc[:,0],
        "is_anomaly_iforest": detector.df['is_anomaly_iforest'] if 'is_anomaly_iforest' in detector.df else pd.Series([False]*len(detector.df))
    })

    # Save CSV for dashboard
    anomaly_csv_path = ANOMALIES_CSV
    anomalies_df.to_csv(anomaly_csv_path, index=False)

    # --- Save anomaly detection models safely ---
    if hasattr(detector, "save_models"):
        detector.save_models()
    else:
        if hasattr(detector, "iforest"):
            joblib.dump(detector.iforest, os.path.join(OUTPUT_DIR, "isolation_forest_model.pkl"))
        if hasattr(detector, "ocsvm"):
            joblib.dump(detector.ocsvm, os.path.join(OUTPUT_DIR, "ocsvm_model.pkl"))
        print("‚úÖ Models saved manually")

    # Build summary of results
    total_anomalies_iforest = anomalies_df['is_anomaly_iforest'].sum()
    total_records = anomalies_df.shape[0]

    results = {
        "features_used": feature_names,
        "total_records": int(total_records),
        "total_anomalies_iforest": int(total_anomalies_iforest)
    }

    print("\n‚úÖ Anomaly Detection Completed Successfully!")
    print(f"   ‚Ä¢ Isolation Forest anomalies: {results['total_anomalies_iforest']}")
    print(f"üìÅ Anomalies saved to: {anomaly_csv_path}")

    return results

# Run anomaly detection pipeline (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping anomaly detection (no uploaded data).")
else:
    anomaly_results = run_anomaly_detection(FEATURES_DATA_PATH)


üîç STEP 4: ANOMALY DETECTION
----------------------------------------
üìä Loading data for anomaly detection...
‚úÖ Data loaded: 525 records, 67 columns
üîÑ Preparing features for anomaly detection...
‚úÖ Features prepared: 59 numeric features ‚Üí 2 PCA components
üîÑ Training Isolation Forest...


üîÑ Training One-Class SVM...
‚úÖ Models trained and anomaly flags added
üìà Generating anomaly dashboard...


‚úÖ Anomaly dashboard saved to F:\RetailSense_Lite\outputs\anomaly_dashboard.png
‚úÖ Models saved to F:\RetailSense_Lite\outputs

‚úÖ Anomaly Detection Completed Successfully!
   ‚Ä¢ Isolation Forest anomalies: 27
üìÅ Anomalies saved to: F:\RetailSense_Lite\outputs\anomalies.csv


In [8]:
# -------------------
# Cell 7: Model Performance Comparison (Fixed)
# -------------------
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def compare_model_performance(forecasting_results, anomaly_results, visualize=True):
    print("\nüìä STEP 5: MODEL PERFORMANCE COMPARISON")
    print("-" * 45)
    
    performance_summary = {'Model Type': [], 'Model Name': [], 'RMSE': [], 'MAE': [], 'sMAPE': []}

    # Handle dict output from run_advanced_forecasting
    if isinstance(forecasting_results, dict):
        for model_key, display_name in [('xgb_metrics', 'XGBoost'), ('lgb_metrics', 'LightGBM')]:
            if model_key in forecasting_results:
                metrics = forecasting_results[model_key]
                performance_summary['Model Type'].append('Advanced Forecasting')
                performance_summary['Model Name'].append(display_name)
                performance_summary['RMSE'].append(metrics.get('RMSE', np.nan))
                performance_summary['MAE'].append(metrics.get('MAE', np.nan))
                performance_summary['sMAPE'].append(metrics.get('sMAPE', np.nan))

    # Handle DataFrame (optional, e.g., loaded from CSV)
    elif isinstance(forecasting_results, pd.DataFrame) and not forecasting_results.empty:
        for idx, row in forecasting_results.iterrows():
            performance_summary['Model Type'].append('Advanced Forecasting')
            performance_summary['Model Name'].append(row['Model'])
            performance_summary['RMSE'].append(row.get('RMSE', np.nan))
            performance_summary['MAE'].append(row.get('MAE', np.nan))
            performance_summary['sMAPE'].append(row.get('sMAPE', np.nan))

    perf_df = pd.DataFrame(performance_summary)

    if not perf_df.empty:
        print("\nüéØ FORECASTING MODEL PERFORMANCE:")
        print(perf_df.to_string(index=False))

        if visualize:
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))
            metrics = ['RMSE', 'MAE', 'sMAPE']
            for i, metric in enumerate(metrics):
                axes[i].bar(perf_df['Model Name'], perf_df[metric], alpha=0.7)
                axes[i].set_title(f'{metric} Comparison')
                axes[i].set_ylabel(metric)
                axes[i].tick_params(axis='x', rotation=45)
            plt.tight_layout()

            # Save figure
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            fig_path = os.path.join(OUTPUT_DIR, 'model_performance_comparison.png')
            plt.savefig(fig_path, dpi=300, bbox_inches='tight')
            plt.show()
            print(f"‚úÖ Performance comparison chart saved at: {fig_path}")

    # Anomaly summary
    if anomaly_results is not None:
        print("\nüö® ANOMALY DETECTION SUMMARY:")
        if 'total_anomalies_iforest' in anomaly_results:
            print(f"  ‚Ä¢ Isolation Forest: {anomaly_results['total_anomalies_iforest']} anomalies")
        if 'total_anomalies_ocsvm' in anomaly_results:
            print(f"  ‚Ä¢ One-Class SVM: {anomaly_results.get('total_anomalies_ocsvm', 0)} anomalies")

# Execute comparison with default plotting (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping model performance comparison (no uploaded data).")
else:
    compare_model_performance(forecasting_results, anomaly_results)


üìä STEP 5: MODEL PERFORMANCE COMPARISON
---------------------------------------------

üéØ FORECASTING MODEL PERFORMANCE:
          Model Type Model Name     RMSE      MAE    sMAPE
Advanced Forecasting    XGBoost 5.472354 3.784855 3.340859
Advanced Forecasting   LightGBM 9.396908 6.345216 5.656022


‚úÖ Performance comparison chart saved at: F:\RetailSense_Lite\outputs\model_performance_comparison.png

üö® ANOMALY DETECTION SUMMARY:
  ‚Ä¢ Isolation Forest: 27 anomalies


In [9]:
# -------------------
# Cell 8: Business Insights Generation (Updated)
# -------------------
import os
import joblib
import pandas as pd
from datetime import datetime
import numpy as np

def generate_business_insights():
    """Generate comprehensive business insights from all models"""
    print("\nüí° STEP 6: BUSINESS INSIGHTS GENERATION")
    print("-" * 55)

    insights = []

    # --- Forecasting insights ---
    if 'forecasting_results' in globals() and forecasting_results and 'xgb_metrics' in forecasting_results and 'lgb_metrics' in forecasting_results:
        xgb_smape = forecasting_results['xgb_metrics'].get('sMAPE', 999)
        lgb_smape = forecasting_results['lgb_metrics'].get('sMAPE', 999)

        best_model = "XGBoost" if xgb_smape < lgb_smape else "LightGBM"
        best_smape = min(xgb_smape, lgb_smape)

        insights.append(f"üéØ FORECASTING: {best_model} performed best with sMAPE={best_smape:.2f}%")

        if best_smape < 15:
            insights.append("‚úÖ Forecasting accuracy is EXCELLENT (sMAPE < 15%) - ready for production")
        elif best_smape < 25:
            insights.append("‚ö†Ô∏è Forecasting accuracy is GOOD (sMAPE < 25%) - some fine-tuning recommended")
        else:
            insights.append("üîß Forecasting accuracy is WEAK (sMAPE > 25%) - revisit feature engineering")

    # --- Anomaly insights ---
    if 'anomaly_results' in globals() and anomaly_results:
        if 'total_anomalies_iforest' in anomaly_results:
            insights.append(f"üö® Isolation Forest flagged {anomaly_results['total_anomalies_iforest']} unusual records")
        if 'total_anomalies_ocsvm' in anomaly_results:
            insights.append(f"üö® One-Class SVM flagged {anomaly_results['total_anomalies_ocsvm']} unusual records")

    # --- Feature importance insights ---
    try:
        xgb_model_path = os.path.join(OUTPUT_DIR, "xgboost_model.pkl")
        if os.path.exists(xgb_model_path) and 'all_features' in globals():
            xgb_model = joblib.load(xgb_model_path)
            feature_importance = pd.DataFrame({
                'feature': all_features,
                'importance': xgb_model.feature_importances_
            }).sort_values('importance', ascending=False)

            top_features = feature_importance.head(3)['feature'].tolist()
            insights.append(f"üîë TOP PREDICTIVE FEATURES: {', '.join(top_features)}")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not load feature importance: {e}")

    # --- Save insights ---
    insights_report = {
        'timestamp': datetime.now().isoformat(),
        'total_insights': len(insights),
        'insights': insights
    }

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_file = os.path.join(OUTPUT_DIR, "phase2_business_insights.csv")
    pd.DataFrame([insights_report]).to_csv(output_file, index=False)

    print("\nüíº BUSINESS INSIGHTS:")
    for i, insight in enumerate(insights, 1):
        print(f"  {i}. {insight}")

    print(f"\n‚úÖ Insights report saved at: {output_file}")
    return insights

# Run business insights generation (guarded)
if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
    print("‚è≠Ô∏è Skipping business insights (no uploaded data).")
else:
    business_insights = generate_business_insights()


üí° STEP 6: BUSINESS INSIGHTS GENERATION
-------------------------------------------------------
‚ö†Ô∏è Could not load feature importance: All arrays must be of the same length

üíº BUSINESS INSIGHTS:
  1. üéØ FORECASTING: XGBoost performed best with sMAPE=3.34%
  2. ‚úÖ Forecasting accuracy is EXCELLENT (sMAPE < 15%) - ready for production
  3. üö® Isolation Forest flagged 27 unusual records

‚úÖ Insights report saved at: F:\RetailSense_Lite\outputs\phase2_business_insights.csv


In [10]:
# -------------------
# Cell 9: Phase 2 Completion Summary (Enhanced)
# -------------------
import os
from datetime import datetime
import pandas as pd
import numpy as np

def phase2_completion_summary_enhanced():
    print("\n" + "="*60)

    # If no uploaded data, do not attempt to summarize anything
    if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
        print("‚ùå No uploaded data detected. Phase 2 summary is not generated to avoid stale results.")
        print("   Set 'UPLOADED_DATA_PATH' or place file at `data/uploaded/uploaded_data.csv` and rerun.")
        print("="*60)
        return
    
    # --- Dataset Info ---
    if 'engineered_data' in globals():
        shape = engineered_data.shape
        num_features = len(all_features) if 'all_features' in globals() else 'N/A'
        time_range = f"{engineered_data['week_start'].min()} ‚Üí {engineered_data['week_start'].max()}" if 'week_start' in engineered_data else "N/A"
        print(f"üìä DATASET PROCESSED:\n   ‚Ä¢ Engineered shape: {shape}\n   ‚Ä¢ Features created: {num_features}\n   ‚Ä¢ Time range: {time_range}\n")
    
    # --- Models Trained ---
    print("ü§ñ MODELS TRAINED:")
    print("   ‚úÖ Baseline Models: ARIMA, Prophet")
    print("   ‚úÖ Advanced Models: XGBoost, LightGBM")
    print("   ‚úÖ Anomaly Detection: Isolation Forest, One-Class SVM\n")
    
    # --- Best Forecasting Performance ---
    best_model, best_smape = "N/A", None

    # Prefer in-memory results if available
    if 'forecasting_results' in globals() and isinstance(forecasting_results, dict):
        xgb_smape = forecasting_results.get('xgb_metrics', {}).get('sMAPE', np.inf)
        lgb_smape = forecasting_results.get('lgb_metrics', {}).get('sMAPE', np.inf)
        if np.isfinite(xgb_smape) or np.isfinite(lgb_smape):
            if xgb_smape < lgb_smape:
                best_model, best_smape = "XGBoost", xgb_smape
            else:
                best_model, best_smape = "LightGBM", lgb_smape

    # Fallback: try reading metrics CSV if memory object not present
    if best_smape is None:
        try:
            if 'FORECAST_CSV' in globals() and os.path.exists(FORECAST_CSV):
                df_metrics = pd.read_csv(FORECAST_CSV)
                if {'Model','sMAPE'}.issubset(df_metrics.columns) and not df_metrics.empty:
                    row = df_metrics.loc[df_metrics['sMAPE'].astype(float).idxmin()]
                    best_model = str(row['Model'])
                    best_smape = float(row['sMAPE'])
        except Exception:
            pass

    if best_smape is not None and np.isfinite(best_smape):
        print(f"üéØ BEST FORECASTING PERFORMANCE:\n   ‚Ä¢ {best_model} with {best_smape:.2f}% sMAPE\n")
    else:
        print("üéØ BEST FORECASTING PERFORMANCE:\n   ‚Ä¢ N/A\n")
    
    # --- Anomaly Detection Summary ---
    iso_anom = anomaly_results.get('total_anomalies_iforest', 'N/A') if 'anomaly_results' in globals() else 'N/A'
    svm_anom = anomaly_results.get('total_anomalies_ocsvm', 'N/A') if 'anomaly_results' in globals() else 'N/A'
    print("üö® ANOMALY DETECTION SUMMARY:")
    print(f"   ‚Ä¢ Isolation Forest: {iso_anom} anomalies")
    print(f"   ‚Ä¢ One-Class SVM: {svm_anom} anomalies\n")
    
    # --- Generated Files ---
    print("üìÅ FILES GENERATED:")
    outputs_dir = OUTPUT_DIR
    processed_file = FEATURES_DATA_PATH
    files_list = [
        processed_file,
        os.path.join(outputs_dir, "model_performance_comparison.png"),
        os.path.join(outputs_dir, "phase2_business_insights.csv"),
        os.path.join(outputs_dir, "forecasting_results.csv"),
        os.path.join(outputs_dir, "anomalies.csv")
    ]
    for f in files_list:
        status = "‚úÖ" if os.path.exists(f) else "‚ö†Ô∏è Missing"
        print(f"   {status} {f}")
    
    print("\n‚úÖ PHASE 2 COMPLETED SUCCESSFULLY!")
    print(f"üìÑ Summary generated at: {outputs_dir}\n")
    print("="*60)

# Run enhanced Phase 2 summary
phase2_completion_summary_enhanced()


üìä DATASET PROCESSED:
   ‚Ä¢ Engineered shape: (525, 67)
   ‚Ä¢ Features created: 115
   ‚Ä¢ Time range: 2023-10-30 00:00:00 ‚Üí 2025-10-27 00:00:00

ü§ñ MODELS TRAINED:
   ‚úÖ Baseline Models: ARIMA, Prophet
   ‚úÖ Advanced Models: XGBoost, LightGBM
   ‚úÖ Anomaly Detection: Isolation Forest, One-Class SVM

üéØ BEST FORECASTING PERFORMANCE:
   ‚Ä¢ XGBoost with 3.34% sMAPE

üö® ANOMALY DETECTION SUMMARY:
   ‚Ä¢ Isolation Forest: 27 anomalies
   ‚Ä¢ One-Class SVM: N/A anomalies

üìÅ FILES GENERATED:
   ‚úÖ F:\RetailSense_Lite\data\processed\data_with_all_features.csv
   ‚úÖ F:\RetailSense_Lite\outputs\model_performance_comparison.png
   ‚úÖ F:\RetailSense_Lite\outputs\phase2_business_insights.csv
   ‚úÖ F:\RetailSense_Lite\outputs\forecasting_results.csv
   ‚úÖ F:\RetailSense_Lite\outputs\anomalies.csv

‚úÖ PHASE 2 COMPLETED SUCCESSFULLY!
üìÑ Summary generated at: F:\RetailSense_Lite\outputs



In [11]:
# -------------------
# Cell 10: Phase 2 Validation and Testing (Corrected)
# -------------------
import os
from datetime import datetime
import pandas as pd

def validate_phase2_outputs_enhanced():
    """Validate all Phase 2 outputs before moving to Phase 3"""
    print("\nüîç PHASE 2 VALIDATION")
    print("-" * 60)

    # Short-circuit when no uploaded data is present
    if 'SKIP_PHASE2' in globals() and SKIP_PHASE2:
        print("‚ùå Validation skipped: no uploaded data file detected.")
        print("   Set 'UPLOADED_DATA_PATH' or place file at `data/uploaded/uploaded_data.csv` and rerun Phase 2.")
        print("-"*60)
        print(f"üìÖ Validation completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        return False

    outputs_dir = OUTPUT_DIR
    processed_dir = PROCESSED_DIR

    validation_results = {
        "feature_engineering": False,
        "baseline_models": False,
        "advanced_forecasting": False,
        "anomaly_detection": False,
        "outputs": False
    }

    # --- Feature engineering validation ---
    fe_file = FEATURES_DATA_PATH
    if os.path.exists(fe_file):
        validation_results["feature_engineering"] = True
        print(f"‚úÖ Feature engineering file found: {fe_file}")

    # --- Baseline models validation ---
    if 'baseline_results' in globals() and isinstance(baseline_results, pd.DataFrame) and not baseline_results.empty:
        validation_results["baseline_models"] = True
        print("‚úÖ Baseline models ran successfully (ARIMA, Prophet)")

    # --- Advanced forecasting validation ---
    if 'forecasting_results' in globals() and isinstance(forecasting_results, dict) and "xgb_metrics" in forecasting_results:
        validation_results["advanced_forecasting"] = True
        print("‚úÖ Advanced forecasting models ran successfully (XGBoost, LightGBM)")

    # --- Anomaly detection validation ---
    if 'anomaly_results' in globals() and isinstance(anomaly_results, dict) and 'total_anomalies_iforest' in anomaly_results:
        validation_results["anomaly_detection"] = True
        print("‚úÖ Anomaly detection models ran successfully (Isolation Forest, One-Class SVM)")

    # --- Outputs validation ---
    output_files = [
        os.path.join(outputs_dir, "model_performance_comparison.png"),
        os.path.join(outputs_dir, "phase2_business_insights.csv"),
        os.path.join(outputs_dir, "forecasting_results.csv"),
        os.path.join(outputs_dir, "anomalies.csv")
    ]
    missing_files = [f for f in output_files if not os.path.exists(f)]
    if not missing_files:
        validation_results["outputs"] = True
        print("‚úÖ All key output files generated")
    else:
        print("‚ö†Ô∏è Missing output files:")
        for f in missing_files:
            print(f"   ‚ùå {f}")

    # --- Final check ---
    all_passed = all(validation_results.values())
    print("\n" + "-"*60)
    if all_passed:
        print("üéâ ALL VALIDATIONS PASSED! ‚úÖ")
        print("üöÄ Ready to proceed to Phase 3: Business Layer & Dashboard")
    else:
        print("‚ö†Ô∏è Some validations failed:")
        for comp, status in validation_results.items():
            if not status:
                print(f"   ‚ùå {comp} FAILED")
    print("-"*60)
    print(f"üìÖ Validation completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    return all_passed

# --- Run enhanced validation ---
validation_passed = validate_phase2_outputs_enhanced()


üîç PHASE 2 VALIDATION
------------------------------------------------------------
‚úÖ Feature engineering file found: F:\RetailSense_Lite\data\processed\data_with_all_features.csv
‚úÖ Baseline models ran successfully (ARIMA, Prophet)
‚úÖ Advanced forecasting models ran successfully (XGBoost, LightGBM)
‚úÖ Anomaly detection models ran successfully (Isolation Forest, One-Class SVM)
‚úÖ All key output files generated

------------------------------------------------------------
üéâ ALL VALIDATIONS PASSED! ‚úÖ
üöÄ Ready to proceed to Phase 3: Business Layer & Dashboard
------------------------------------------------------------
üìÖ Validation completed at: 2025-11-01 23:16:49

