In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna



# -----------------------------
# Dataset
# -----------------------------

In [None]:
# Load the prepared sales table.
df = pd.read_csv("sales_data_ready_dynamic.csv")

# Date features: when a "date" column is present, expand it into calendar
# components and discard the raw timestamp column.
if "date" in df.columns:
    parsed_dates = pd.to_datetime(df["date"])
    df = df.assign(
        year=parsed_dates.dt.year,
        month=parsed_dates.dt.month,
        day=parsed_dates.dt.day,
        dayofweek=parsed_dates.dt.dayofweek,
    ).drop(columns=["date"])

# Encode categorical: every object-dtype column except the target is cast to
# string and replaced by integer codes from its own freshly fitted LabelEncoder.
categorical_cols = [
    name for name in df.select_dtypes(include=["object"]).columns if name != "sales"
]
for name in categorical_cols:
    df[name] = LabelEncoder().fit_transform(df[name].astype(str))
        

# -----------------------------
# Feature Engineering: Lag / Rolling
# -----------------------------

In [None]:
# Put rows in chronological order so that "shift by N rows" means "N observations
# earlier". The calendar columns only exist when the raw file had a "date" column
# (the loading step treats it as optional); without them the file's row order is kept.
date_parts = ["year", "month", "day"]
if all(part in df.columns for part in date_parts):
    df = df.sort_values(date_parts)
df = df.reset_index(drop=True)

# Lag / rolling features built strictly from earlier rows.
# NOTE(review): assumes one row per time step (a single series) -- confirm; with
# several series per day these should be computed per group.
previous_sales = df["sales"].shift(1)
# `.bfill()` replaces the deprecated `fillna(method="bfill")`. Warm-up rows that
# have no history yet are still back-filled from the first available value.
df["lag_1"] = previous_sales.bfill()
df["lag_7"] = df["sales"].shift(7).bfill()
# Roll over the shifted series: a window over the unshifted target would include
# the current row's own sales value, leaking the label into its feature.
df["rolling_7"] = previous_sales.rolling(7, min_periods=1).mean().bfill()

# Separate the predictors from the target.
X = df.drop(columns=["sales"])
y = df["sales"]

# Chronological hold-out: shuffle=False keeps the last 20% of rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

# Scaling: statistics are learned on the training rows only and reused on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# -----------------------------
# Optuna RF Tuning
# -----------------------------

In [None]:
def objective_rf(trial):
    """Optuna objective for the random forest.

    Samples one hyperparameter configuration from ``trial``, then scores a
    RandomForestRegressor built from it with 3-fold cross-validation on
    ``X_train_scaled`` / ``y_train``.

    Returns:
        The mean R² over the three folds (higher is better).
    """
    forest = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 300, 700),
        max_depth=trial.suggest_int("max_depth", 5, 12),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 6),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        random_state=42,
        n_jobs=1,
    )
    fold_r2 = cross_val_score(forest, X_train_scaled, y_train, cv=3, scoring="r2")
    return fold_r2.mean()

# Seed the sampler so the search proposes the same trials on every
# Restart & Run All; without it the tuned hyperparameters change between runs
# even though every model is created with random_state=42.
study_rf = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=20)

print("\n✅ RF Best Hyperparameters:", study_rf.best_params)

# Rebuild the forest from the best trial. best_params holds only the suggested
# values, so the fixed settings (seed, n_jobs) are passed again explicitly.
# NOTE(review): StackingRegressor is documented to fit clones of its base
# estimators, so this fit presumably only matters if rf_model is used on its own.
rf_model = RandomForestRegressor(**study_rf.best_params, random_state=42, n_jobs=1)
rf_model.fit(X_train_scaled, y_train)



# -----------------------------
# XGB Model
# -----------------------------

In [None]:
# XGBoost regressor with fixed (untuned) hyperparameters, fitted on the scaled
# training data.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="rmse",
    n_jobs=1,
)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)


# -----------------------------
# LGBM Model
# -----------------------------

In [None]:
# LightGBM regressor with fixed (untuned) hyperparameters, fitted on the scaled
# training data.
lgbm_params = dict(
    n_estimators=600,
    num_leaves=64,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=1,
)
lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_model.fit(X_train_scaled, y_train)


# -----------------------------
# Optuna for ElasticNet Meta Learner
# -----------------------------

In [None]:
def objective_meta(trial):
    """Optuna objective for the ElasticNet meta learner.

    Samples ``alpha`` (log scale) and ``l1_ratio`` from ``trial``, wraps the three
    base models (rf, xgb, lgbm) in a StackingRegressor that uses the sampled
    ElasticNet as final estimator, and cross-validates that stack on
    ``X_train_scaled`` / ``y_train``.

    Returns:
        The mean R² over the three outer folds (higher is better).
    """
    meta = ElasticNet(
        alpha=trial.suggest_float("alpha", 0.001, 1.0, log=True),
        l1_ratio=trial.suggest_float("l1_ratio", 0.0, 1.0),
        random_state=42,
        max_iter=5000,
    )
    base_learners = [("rf", rf_model), ("xgb", xgb_model), ("lgbm", lgbm_model)]
    candidate_stack = StackingRegressor(
        estimators=base_learners,
        final_estimator=meta,
        cv=3,  # inner folds used to build the meta-features
        n_jobs=1,
    )
    fold_r2 = cross_val_score(candidate_stack, X_train_scaled, y_train, cv=3, scoring="r2")
    return fold_r2.mean()
# Seed the sampler so the meta-learner search is reproducible across
# Restart & Run All; an unseeded study proposes different trials on every run.
study_meta = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_meta.optimize(objective_meta, n_trials=15)

print("\n✅ ElasticNet Best Hyperparameters:", study_meta.best_params)

# Final Stacking Model: ElasticNet built from the best trial (alpha, l1_ratio)
# on top of the three base regressors.
# NOTE(review): the search above built its stacks with cv=3 while the final
# stack uses cv=5 -- confirm this difference is intentional.
final_meta = ElasticNet(**study_meta.best_params, random_state=42, max_iter=5000)
stack_model = StackingRegressor(
    estimators=[("rf", rf_model), ("xgb", xgb_model), ("lgbm", lgbm_model)],
    final_estimator=final_meta,
    cv=5,
    n_jobs=1
)
stack_model.fit(X_train_scaled, y_train)


# -----------------------------
# Predictions
# -----------------------------

In [None]:
# Score both partitions with the fitted stack: training rows first, then the
# held-out test rows.
y_pred_train, y_pred_test = (
    stack_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)


# -----------------------------
# Metrics
# -----------------------------

In [None]:
# Report fit quality: R² on both partitions, MAE and RMSE on the test set only.
print("\n📊 Final Ensemble Performance:")
report = [
    ("Train R²:", r2_score(y_train, y_pred_train)),
    ("Test R² :", r2_score(y_test, y_pred_test)),
    ("MAE:", mean_absolute_error(y_test, y_pred_test)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test))),
]
for label, value in report:
    print(label, value)


In [None]:
# Preview the engineered frame; show only the first rows instead of dumping the
# whole table into the cell output.
df.head()