# Retail Sales Forecasting Model
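This notebook loads pre-engineered train and test feature tables from Snowflake, trains an XGBoost regression model to predict sales, evaluates the predictions (RMSE, MAE, R²), plots actual vs. predicted values, and writes the predictions back to Snowflake.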


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

# `get_active_session` is not imported anywhere else in this notebook, so it
# must be brought into scope here; without this import the call below raises
# NameError on a fresh kernel (Restart & Run All).
from snowflake.snowpark.context import get_active_session

# Snowpark session used by the later cells to read the feature tables and to
# write the predictions back.
session = get_active_session()


In [None]:
# Materialise the two Snowflake tables as local pandas DataFrames.
# The generator is consumed in order, so RETAIL_TRAIN is fetched first and
# RETAIL_TEST second.
train_df, test_df = (
    session.table(table_name).to_pandas()
    for table_name in ("RETAIL_TRAIN", "RETAIL_TEST")
)

# Preview the first rows of both frames as the cell output.
train_df.head(), test_df.head()


In [None]:
# Column the model is trained to predict.
TARGET_COL = "SALES"

# Columns deliberately excluded from the model inputs.
# Judging by their names, SALES_SCALED and LOG_SALES are same-row transforms
# of the SALES target; using them as features would leak the answer into the
# inputs and make the test metrics meaningless.
# NOTE(review): the feature-engineering job is not visible from this notebook;
# confirm these two are derived from the current row's SALES.
LEAKY_COLS = ["SALES_SCALED", "LOG_SALES"]

# Model inputs: promotion/family encodings, lagged and rolling statistics,
# calendar features, and the promo interaction term.
# NOTE(review): verify that LAG_*, ROLLING_* and PROMO_EFFECT are computed
# only from information available before the row being predicted.
feature_cols = [
    "PROMOTION_SCALED",
    "FAMILY_ENCODED",
    "LAG_1", "LAG_7", "LAG_30",
    "ROLLING_AVG_7", "ROLLING_AVG_30", "ROLLING_STD_30",
    "DAY_OF_WEEK", "MONTH", "WEEK_OF_YEAR",
    "PROMO_EFFECT",
]

# Guard against the target (or a known transform of it) being re-added to the
# feature list by a later edit.
leaked_cols = sorted(set(feature_cols) & {TARGET_COL, *LEAKY_COLS})
assert not leaked_cols, f"Target-derived columns used as features: {leaked_cols}"

X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]
y_test = test_df[TARGET_COL]


In [None]:
# Hyperparameters for the gradient-boosted regressor, kept in one place so
# they are easy to inspect and tweak.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "random_state": 42,  # fixed seed for repeatable runs
}

model = XGBRegressor(**xgb_params)

# Train on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)


In [None]:
# Score the held-out test split with the fitted model.
# `np` and the sklearn metric functions come from the imports cell at the top
# of the notebook; they are intentionally not re-imported here.
preds = model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is in the same
# units as the target.
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae  = mean_absolute_error(y_test, preds)
r2   = r2_score(y_test, preds)

# Show the three metrics as the cell output.
rmse, mae, r2


In [None]:
# Human-readable summary of the metrics computed in the evaluation cell.
# The emoji and the superscript two were previously stored as mojibake
# (UTF-8 bytes decoded as cp1252); they are restored to the intended characters.
print("\n🎯 MODEL PERFORMANCE REPORT\n")
print(f"📌 RMSE:      {rmse:,.4f}")
print(f"📌 MAE:       {mae:,.4f}")
print(f"📌 R² Score:  {r2:,.4f}")
print("\n(Closer to 1 is better for R², lower is better for RMSE/MAE)")



In [None]:
# Quick visual check: overlay actual and predicted values for the first 300
# test rows. `plt` comes from the imports cell at the top of the notebook.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values[:300], label="Actual")
ax.plot(preds[:300], label="Predicted")
# The em dash in the title was previously stored as mojibake ("â€”").
ax.set_title("Actual vs Predicted Sales — Sample 300 Points")
# Axis labels so the figure stands on its own when the notebook is skimmed.
ax.set_xlabel("Time Index (first 300 points)")
ax.set_ylabel("Sales Units")
ax.legend()
plt.show()



In [None]:
# `pred_df` was never defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Build it here from the test rows plus the model
# output so each prediction stays next to the row it was made for.
# NOTE(review): the originally intended schema of PREDICTIONS_RETAIL is not
# visible from this notebook; confirm the column set (and the
# PREDICTED_SALES column name) against the target table.
pred_df = test_df.assign(PREDICTED_SALES=preds)

# Persist the predictions to Snowflake.
session.write_pandas(pred_df, "PREDICTIONS_RETAIL")



In [None]:
import matplotlib.pyplot as plt

# Presentation-quality comparison of actual vs predicted sales for the first
# 200 test rows, drawn through the explicit Figure/Axes interface.
N_POINTS = 200
fig, ax = plt.subplots(figsize=(16, 8), dpi=150)

# Options shared by both lines.
line_style = {"linewidth": 2.5, "alpha": 0.9}

ax.plot(
    y_test.values[:N_POINTS],
    label="Actual Sales",
    color="tab:blue",
    **line_style,
)
ax.plot(
    preds[:N_POINTS],
    label="Predicted Sales",
    linestyle="--",
    color="tab:orange",
    **line_style,
)

ax.set_title(
    "Actual vs Predicted Retail Sales\n(First 200 Time Steps)",
    fontsize=18,
    fontweight="bold",
)
ax.set_xlabel("Time Index (first 200 points)", fontsize=14)
ax.set_ylabel("Sales Units", fontsize=14)

# Light dashed grid to make the two series easier to compare.
ax.grid(True, linestyle="--", alpha=0.5)
ax.legend(fontsize=14)

fig.tight_layout()
plt.show()
