In [None]:
# -*- coding: utf-8 -*-
"""
Inventory Demand Forecasting Project

This script builds a complete machine learning pipeline to forecast sales demand
for different store-item combinations.

It covers the following steps:
1.  Data Loading and Preprocessing: Loads the dataset and creates time-based,
    lag, and rolling window features.
2.  Train/Validation Split: Splits the data into training and validation sets
    based on a time threshold.
3.  Model Training: Implements and trains three different regression models:
    - RandomForestRegressor
    - XGBRegressor (with a robust, version-agnostic failsafe)
    - LGBMRegressor (with early stopping)
4.  Model Evaluation: Evaluates the models on the validation set using
    Root Mean Squared Error (RMSE) and Mean Absolute Error (MAE).
5.  Visualization: Plots actual vs. predicted sales and feature importance
    for the boosting models.
"""

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# XGBoost imports (both high-level wrapper and low-level API)
import xgboost as xgb
from xgboost import XGBRegressor

# LightGBM imports
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping as lgb_early_stopping

# --- Configuration & Styling ---
# Module-level side effects: these run at import time and set the global
# Matplotlib style sheet and seaborn colour palette used by every plot that the
# plotting helpers in this file save to disk.
# NOTE(review): the 'seaborn-v0_8-*' style-sheet names presumably only exist in
# recent Matplotlib releases — confirm against the pinned Matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")


def load_and_preprocess(filepath: str, lags: tuple = (7, 14, 30),
                        windows: tuple = (7, 30)) -> pd.DataFrame:
    """
    Load the sales CSV and engineer calendar, lag, and rolling-mean features.

    The CSV must provide `date`, `store`, `item` and `sales` columns. Rows are
    sorted by (store, item, date) so that every lag and rolling feature is
    computed along one store-item time series.

    Args:
        filepath: Path to the CSV file to read.
        lags: Row offsets (in rows of the sorted series) for which a
            `sales_lag_{lag}` column is created.
        windows: Window lengths for which a `sales_rollmean_{window}` column is
            created. Each window is shifted by one row so it never contains
            the current row's sales.

    Returns:
        The feature-enriched DataFrame with every row containing a NaN removed,
        or an empty DataFrame if `filepath` does not exist.
    """
    print("Step 1: Loading and preprocessing data...")
    start_time = time.time()

    try:
        df = pd.read_csv(filepath, parse_dates=['date'])
    except FileNotFoundError:
        print(f"Error: The file {filepath} was not found.")
        return pd.DataFrame()

    # Sort for proper lag/rolling creation
    df = df.sort_values(by=['store', 'item', 'date']).reset_index(drop=True)

    # Time-based features
    dates = df['date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['weekofyear'] = dates.isocalendar().week.astype(int)

    # Lag features (within each store-item series)
    grouped = df.groupby(['store', 'item'])['sales']
    for lag in lags:
        df[f'sales_lag_{lag}'] = grouped.shift(lag)

    # Rolling mean features (shifted so they don't include current day).
    # Both the shift AND the rolling window run inside each group: calling
    # `.rolling()` on the result of `grouped.shift(1)` would roll over the whole
    # frame and blend the tail of one store-item series into the head of the
    # next one.
    for window in windows:
        df[f'sales_rollmean_{window}'] = grouped.transform(
            lambda s, w=window: s.shift(1).rolling(window=w, min_periods=1).mean()
        )

    # Drop rows with NaNs introduced by lags/rolls
    before = df.shape[0]
    df = df.dropna()
    after = df.shape[0]

    end_time = time.time()
    print(f"Preprocessing done in {end_time - start_time:.2f}s — dropped {before - after} rows (NaNs).")
    print(f"Data shape after preprocessing: {df.shape}")
    return df


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, model_name: str):
    """
    Score predictions against the ground truth and report the result.

    Prints a small report for `model_name` and returns the same numbers as a
    dict with the keys 'RMSE' (square root of the mean squared error) and
    'MAE' (mean absolute error).
    """
    scores = {
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
    }
    # One print call, one report line per argument; the trailing "\n" on the
    # last line leaves a blank line after the report.
    print(
        f"--- {model_name} Evaluation ---",
        f"RMSE: {scores['RMSE']:.4f}",
        f"MAE : {scores['MAE']:.4f}\n",
        sep="\n",
    )
    return scores


def plot_actual_vs_predicted(df_val: pd.DataFrame, y_pred: np.ndarray, model_name: str,
                             store_id: int = 1, item_id: int = 1):
    """
    Save a line chart comparing actual and predicted sales for one store-item pair.

    `y_pred` must have one value per row of `df_val`, in the same row order.
    `df_val` itself is not modified. If the validation frame has no rows for
    the requested pair, a message is printed and nothing is plotted. The chart
    is written as a PNG into the current working directory.
    """
    # Work on a copy that carries the predictions as an extra column.
    scored = df_val.assign(predicted_sales=np.asarray(y_pred).reshape(-1))
    wanted = (scored['store'] == store_id) & (scored['item'] == item_id)
    series = scored[wanted]

    if len(series) == 0:
        print(f"No data for store={store_id}, item={item_id} in validation set — skipping plot.")
        return

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(series['date'], series['sales'], label='Actual', linewidth=2)
    ax.plot(series['date'], series['predicted_sales'], label='Predicted', linestyle='--', linewidth=2)
    ax.set_title(f"{model_name}: Actual vs Predicted (Store {store_id}, Item {item_id})")
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales")
    ax.legend()
    ax.grid(True)

    filename = f"{model_name}_actual_vs_predicted_store{store_id}_item{item_id}.png"
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved plot: {filename}")


def plot_feature_importance(model, features: list, model_name: str):
    """
    Save a bar chart of the (up to) 15 most important features of `model`.

    Two kinds of model are supported:
    - anything exposing `feature_importances_` (the sklearn-style estimators
      used in this file), whose values are paired positionally with `features`;
    - a low-level `xgboost` Booster, whose gain scores come from `get_score`.

    Args:
        model: Fitted model to inspect.
        features: Feature names, in the column order the model was trained on.
        model_name: Label used in the chart title and in the output file name.

    Returns:
        None. On success `{model_name}_feature_importance.png` is written to the
        current directory; unsupported models and extraction errors are
        reported on stdout and no file is written.
    """
    # sklearn-style (RandomForest, LGBM, XGBRegressor)
    if hasattr(model, 'feature_importances_'):
        imp = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False).head(15)
    else:
        # xgboost Booster fallback
        try:
            if isinstance(model, xgb.core.Booster) or model.__class__.__name__ == 'Booster':
                score = model.get_score(importance_type='gain')  # keys like 'f0', 'f1' or feature names
                # Features absent from `score` keep an importance of 0.
                imp_map = {f: 0.0 for f in features}
                for k, v in score.items():
                    # Real feature names take priority. Testing the 'f' prefix
                    # first would silently discard any feature whose name merely
                    # starts with "f" (e.g. "festival").
                    if k in imp_map:
                        imp_map[k] = v
                        continue
                    if not k.startswith('f'):
                        continue
                    # Positional key: 'f{index}' -> index position in features
                    try:
                        idx = int(k[1:])
                    except ValueError:
                        # unexpected formatting; ignore
                        continue
                    if 0 <= idx < len(features):
                        imp_map[features[idx]] = v
                imp = pd.Series(imp_map).sort_values(ascending=False).head(15)
            else:
                print(f"Model type {type(model)} not supported for feature importance plotting.")
                return
        except Exception as e:
            # Best effort: a failed importance plot must not abort the pipeline.
            print("Error extracting feature importance:", e)
            return

    plt.figure(figsize=(10, 6))
    sns.barplot(x=imp.values, y=imp.index)
    plt.title(f"Top Feature Importances: {model_name}")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    fname = f"{model_name}_feature_importance.png"
    plt.savefig(fname, bbox_inches='tight')
    plt.close()
    print(f"Saved feature importance: {fname}")


def _fit_xgboost(X_train, y_train, X_val, y_val, features):
    """Train XGBoost with early stopping on the validation set; return (model, predictions)."""
    xgb_params = dict(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        tree_method="hist"
    )
    eval_set = [(X_val, y_val)]

    try:
        model = XGBRegressor(**xgb_params)
        try:
            # Older xgboost releases take the early stopping settings in fit().
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                early_stopping_rounds=20,
                eval_metric="rmse",
                verbose=False
            )
            print("XGBRegressor.fit(...) succeeded with early stopping.")
        except TypeError:
            # Newer xgboost releases reject these fit() keywords and expect them
            # on the estimator instead. Rebuild the model that way so early
            # stopping is kept rather than training all 1000 rounds blindly.
            print("XGBRegressor.fit(...) refused early stopping args — retrying with constructor-level early stopping.")
            model = XGBRegressor(**xgb_params, early_stopping_rounds=20, eval_metric="rmse")
            model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
        return model, model.predict(X_val)
    except Exception as high_err:
        # If sklearn wrapper completely fails, fall back to low-level xgb.train API using DMatrix.
        print("High-level XGBRegressor failed:", repr(high_err))
        print("Falling back to low-level xgboost.train() with DMatrix and early stopping.")
        try:
            dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
            dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
            params = {
                "objective": "reg:squarederror",
                "tree_method": "hist",
                "eta": 0.05,
                "max_depth": 7,
                "subsample": 0.8,
                "colsample_bytree": 0.8,
                "eval_metric": "rmse",
                "seed": 42
            }
            booster = xgb.train(
                params,
                dtrain,
                num_boost_round=1000,
                evals=[(dtrain, "train"), (dval, "eval")],
                early_stopping_rounds=20,
                verbose_eval=False
            )
            preds = booster.predict(dval)
            print("Low-level xgboost.train() completed with early stopping.")
            return booster, preds  # note: Booster object (not sklearn wrapper)
        except Exception as low_err:
            print("Low-level xgboost.train() also failed:", repr(low_err))
            raise RuntimeError("XGBoost training failed in both sklearn wrapper and low-level API.") from low_err


def _fit_lightgbm(X_train, y_train, X_val, y_val):
    """Train LightGBM with early stopping on the validation set; return (model, predictions)."""
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        n_jobs=-1,
        random_state=42
    )
    eval_set = [(X_val, y_val)]
    # Use callback early stopping if supported
    try:
        model.fit(
            X_train, y_train,
            eval_set=eval_set,
            eval_metric='rmse',
            callbacks=[lgb_early_stopping(stopping_rounds=20, verbose=False)],
        )
    except TypeError:
        # Some versions accept early_stopping_rounds directly instead of callbacks
        print("LightGBM.fit(...) refused callbacks argument — retrying with early_stopping_rounds.")
        model.fit(
            X_train, y_train,
            eval_set=eval_set,
            eval_metric='rmse',
            early_stopping_rounds=20
        )
    return model, model.predict(X_val)


def main(filepath: str = 'train.csv', split_date: str = '2017-10-01'):
    """
    Run the full forecasting pipeline: preprocess, split, train, evaluate, plot.

    Args:
        filepath: CSV file handed to `load_and_preprocess`.
        split_date: First date of the validation period; rows strictly before
            it are used for training, rows on or after it for validation.

    Returns:
        None. Metrics are printed and plots are saved to the current directory.
        The function returns early if no data could be loaded or if either side
        of the split is empty.

    Raises:
        RuntimeError: If XGBoost fails in both the sklearn wrapper and the
            low-level API.

    Note:
        The boosted models use the validation set for early stopping and are
        then scored on that same set, so their reported errors are somewhat
        optimistic compared with a truly unseen hold-out period.
    """
    # Load & preprocess
    data = load_and_preprocess(filepath)
    if data.empty:
        return

    # Time-based train/validation split
    train_df = data[data['date'] < split_date].copy()
    val_df = data[data['date'] >= split_date].copy()
    if train_df.empty or val_df.empty:
        # Fitting or scoring on an empty frame would only fail later with a
        # much less helpful error.
        print(f"Error: splitting at {split_date} left an empty train or validation set.")
        return

    # Features & target
    features = [c for c in data.columns if c not in ['date', 'sales']]
    target = 'sales'

    X_train = train_df[features]
    y_train = train_df[target]
    X_val = val_df[features]
    y_val = val_df[target]

    print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

    models = {}
    predictions = {}
    results = {}

    # -----------------------------
    # Random Forest (baseline)
    # -----------------------------
    print("\nTraining RandomForestRegressor...")
    t0 = time.time()
    rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=12,
        min_samples_leaf=5,
        n_jobs=-1,
        random_state=42
    )
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_val)
    t1 = time.time()
    print(f"RandomForest done in {t1 - t0:.2f}s")
    models['RandomForest'] = rf
    predictions['RandomForest'] = rf_pred
    results['RandomForest'] = evaluate_model(y_val, rf_pred, 'RandomForest')

    # -----------------------------
    # XGBoost (robust, failsafe)
    # -----------------------------
    print("\nTraining XGBoost (robust fallback implementation)...")
    t0 = time.time()
    xgb_model, xgb_preds = _fit_xgboost(X_train, y_train, X_val, y_val, features)
    t1 = time.time()
    print(f"XGBoost done in {t1 - t0:.2f}s")
    models['XGBoost'] = xgb_model
    predictions['XGBoost'] = np.array(xgb_preds).flatten()
    results['XGBoost'] = evaluate_model(y_val, predictions['XGBoost'], 'XGBoost')

    # -----------------------------
    # LightGBM
    # -----------------------------
    print("\nTraining LightGBMRegressor...")
    t0 = time.time()
    lgbm_model, lgb_preds = _fit_lightgbm(X_train, y_train, X_val, y_val)
    t1 = time.time()
    print(f"LightGBM done in {t1 - t0:.2f}s")
    models['LightGBM'] = lgbm_model
    predictions['LightGBM'] = np.array(lgb_preds).flatten()
    results['LightGBM'] = evaluate_model(y_val, predictions['LightGBM'], 'LightGBM')

    # -----------------------------
    # Compare results
    # -----------------------------
    results_df = pd.DataFrame(results).T
    print("\n--- Model Performance Summary ---")
    print(results_df.sort_values(by='RMSE'))

    best_model_name = results_df['RMSE'].idxmin()
    print(f"Best model by RMSE: {best_model_name}")

    # -----------------------------
    # Visualizations
    # -----------------------------
    print("\nGenerating visualizations for the best model and feature importances...")
    # Plot actual vs predicted for a sample store-item
    plot_actual_vs_predicted(val_df, predictions[best_model_name], best_model_name, store_id=1, item_id=1)

    # Feature importance plots for every trained model
    for name in ('XGBoost', 'LightGBM', 'RandomForest'):
        plot_feature_importance(models[name], features, name)

    print("\nAll done. Artifacts (plots) saved in the current directory.")


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Step 1: Loading and preprocessing data...
Preprocessing done in 3.59s — dropped 15000 rows (NaNs).
Data shape after preprocessing: (898000, 14)
Train shape: (852000, 12), Validation shape: (46000, 12)

Training RandomForestRegressor...
RandomForest done in 158.29s
--- RandomForest Evaluation ---
RMSE: 8.2027
MAE : 6.2737


Training XGBoost (robust fallback implementation)...
XGBRegressor.fit(...) refused early stopping args — retrying without them.
XGBoost done in 71.12s
--- XGBoost Evaluation ---
RMSE: 7.6418
MAE : 5.9015


Training LightGBMRegressor...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1254
[LightGBM] [Info] Number of data points in the train set: 852000, number of used features: 12
[LightGBM] [Info] Start training from score 52.522494
LightGBM done in 42.44s
--- Lig