In [None]:
# 📦 Step 1: Imports
# Every dependency used by the later cells is imported once, here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import shap
import warnings
# Installs a blanket "ignore" filter, so no warning category is shown for the
# rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# above — consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')
print("✅ Step 1: Libraries imported.")

In [None]:
# 📂 Step 2: Load data
# Read the four input CSVs in one pass; the order of the paths below matches
# the order of the names on the left-hand side.
train, test, features, stores = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/retail-forecasting-dataset/train.csv",
        "/kaggle/input/retail-forecasting-dataset/test.csv",
        "/kaggle/input/retail-forecasting-dataset/features.csv",
        "/kaggle/input/retail-forecasting-dataset/stores.csv",
    )
)
print("✅ Step 2: Data loaded.")

In [None]:
# 🧹 Step 3: Merge datasets
# Both sales frames receive the same enrichment: first the per-store/per-date
# rows from `features`, then the per-store rows from `stores`. Left joins keep
# every sales row even when no match exists on the right.
train = (
    train
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
    .merge(stores, how='left', on='Store')
)
test = (
    test
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
    .merge(stores, how='left', on='Store')
)
print("✅ Step 3: Data merged.")

In [None]:
# 🗓️ Step 4: Add date-based features
# The same calendar columns are added in place to both frames so that the
# model sees identical feature definitions at training and prediction time.
for df in [train, test]:
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
    # Despite its name, 'Day' holds the day of the week (Monday=0 ... Sunday=6).
    df['Day'] = df['Date'].dt.dayofweek
    df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
    df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
    # Day-of-month is 1-based, so subtract 1 before bucketing into 7-day
    # blocks: days 1-7 -> week 1, 8-14 -> week 2, ..., 29-31 -> week 5.
    # (Without the -1, day 7 would fall into week 2 and day 28 into week 5.)
    df['WeekOfMonth'] = (df['Date'].dt.day - 1) // 7 + 1
    df['IsWeekend'] = df['Day'].isin([5, 6]).astype(int)
print("✅ Step 4: Date features added.")


In [None]:
# 🧠 Step 5: Create lag and rolling features
# Rows must be in chronological order inside each (Store, Dept) series so that
# shift(k) means "k rows earlier in that series".
train = train.sort_values(['Store', 'Dept', 'Date'])
sales_by_series = train.groupby(['Store', 'Dept'])['Weekly_Sales']

train['Lag_1'] = sales_by_series.shift(1)
train['Lag_4'] = sales_by_series.shift(4)
train['Lag_52'] = sales_by_series.shift(52)

# shift(1) before rolling keeps the current row's own sales out of its feature.
train['Rolling_4'] = sales_by_series.transform(lambda x: x.shift(1).rolling(4).mean())
train['Sales_Change'] = train['Lag_1'] - train['Lag_4']

# Drop only the rows whose target or engineered lag features are undefined
# (the first rows of every series). A bare dropna() would also discard rows
# that merely have a NaN in some unrelated column merged in Step 3, including
# columns the model never uses. NaNs left in other feature columns are
# handled natively by XGBoost as missing values.
lag_feature_cols = ['Lag_1', 'Lag_4', 'Lag_52', 'Rolling_4', 'Sales_Change']
train = train.dropna(subset=['Weekly_Sales'] + lag_feature_cols)
print("✅ Step 5: Lag and rolling features created.")


In [None]:
# 🧾 Step 6: Feature list
# Column order is significant: it is the order in which the model receives
# its inputs, and the same list is reused to build the test matrix.
features_cols = [
    # series identifiers
    'Store', 'Dept',
    # covariates merged in from the features / stores tables
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size',
    'IsHoliday',
    # calendar features derived from Date
    'Year', 'Month', 'Week', 'Day',
    'IsMonthStart', 'IsMonthEnd', 'WeekOfMonth', 'IsWeekend',
    # sales-history features
    'Lag_1', 'Lag_4', 'Lag_52', 'Rolling_4', 'Sales_Change',
]

# Design matrix and target, both taken from the prepared training frame.
X, y = train[features_cols], train['Weekly_Sales']
print("✅ Step 6: Features and target selected.")


In [None]:
# 🧪 Step 7: Train/validation split (chronological hold-out)
# `train` is sorted by Store, then Dept, then Date, so an unshuffled positional
# split would hold out the *last stores* rather than the *most recent weeks*.
# For a forecasting model the validation set should lie in the future of the
# training set, so hold out the latest ~20% of distinct dates instead.
VAL_FRACTION = 0.2
unique_dates = train['Date'].drop_duplicates().sort_values()
n_val_dates = max(1, int(round(len(unique_dates) * VAL_FRACTION)))
val_start = unique_dates.iloc[-n_val_dates]

# X and y were sliced from `train`, so a positional boolean mask built from
# train['Date'] lines up with them row for row.
is_val = (train['Date'] >= val_start).to_numpy()
assert len(is_val) == len(X) == len(y), "X/y are out of sync with train; re-run the feature-selection cell."
assert is_val.any() and not is_val.all(), "Chronological split left one side empty."

X_train, X_val = X.loc[~is_val], X.loc[is_val]
y_train, y_val = y.loc[~is_val], y.loc[is_val]
print("✅ Step 7: Data split.")


In [None]:
# 🤖 Step 8: Train model
# Gradient-boosted regression trees; n_jobs=-1 uses all available cores.
model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    n_jobs=-1,
)
model.fit(X_train, y_train)
print("✅ Step 8: Model trained.")


In [None]:
# 📉 Step 9: Evaluate model
# Score the held-out rows and report the root of the mean squared error,
# which is expressed in the same units as Weekly_Sales.
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(val_mse)
print(f"✅ Step 9: Model evaluated. Validation RMSE: {rmse:.2f}")


In [None]:
# 📊 Step 10: Plot actual vs predicted
# Overlay the first 100 validation rows (in frame order) for a quick visual
# check; drawn through an explicit Axes instead of the pyplot state machine.
sample = slice(100)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_val.values[sample], label="Actual")
ax.plot(val_preds[sample], label="Predicted")
ax.set_title("Actual vs Predicted Sales (Sample)")
ax.legend()
plt.show()
print("✅ Step 10: Plot displayed.")

In [None]:
# 🔍 Step 11: SHAP analysis
# Explain only the first 200 validation rows to keep the cell fast.
explainer = shap.Explainer(model)
shap_sample = X_val.iloc[:200]
shap_values = explainer(shap_sample)
shap.plots.beeswarm(shap_values)
print("✅ Step 11: SHAP analysis done.")

In [None]:
# 🚀 Step 12: Predict on test set (lag features from each series' latest history)
# The test frame has no Weekly_Sales, so the lag features cannot be computed
# the way they are for training. Filling them with one global median gives
# every test row identical lag inputs regardless of Store/Dept; instead, use
# each (Store, Dept) series' most recent known sales from `train`, and fall
# back to the training median only where a series has no or too little history.
# NOTE: the per-series values are the ones valid for the first week after the
# training data ends and are held constant across the whole test horizon —
# an approximation, not a recursive multi-step forecast.
series_keys = ['Store', 'Dept']
lag_cols = ['Lag_1', 'Lag_4', 'Lag_52', 'Rolling_4', 'Sales_Change']
sales_history = train.sort_values(series_keys + ['Date']).groupby(series_keys)['Weekly_Sales']


def _nth_from_end(k):
    """Build an aggregator returning the k-th most recent value of a series (NaN if it has fewer than k rows)."""
    return lambda sales: sales.iloc[-k] if len(sales) >= k else np.nan


# One row per (Store, Dept): the values each lag feature would take for the
# week immediately following that series' last training observation.
latest = pd.DataFrame({
    'Lag_1': sales_history.agg(_nth_from_end(1)),
    'Lag_4': sales_history.agg(_nth_from_end(4)),
    'Lag_52': sales_history.agg(_nth_from_end(52)),
    # Mirror rolling(4): undefined unless four prior observations exist.
    'Rolling_4': sales_history.agg(lambda sales: sales.tail(4).mean() if len(sales) >= 4 else np.nan),
})
latest['Sales_Change'] = latest['Lag_1'] - latest['Lag_4']

# Look up each test row's series; rows whose series is unknown (or too short)
# come back as NaN and receive the training-set median, as before.
test_series = pd.MultiIndex.from_frame(test[series_keys])
for col in lag_cols:
    per_row = latest[col].reindex(test_series)
    test[col] = per_row.fillna(train[col].median()).to_numpy()

X_test = test[features_cols]
test['Weekly_Sales'] = model.predict(X_test)
print("✅ Step 12: Predictions made on test set.")

In [None]:
# 💾 Step 13: Save results
# Keep only the identifying columns plus the prediction, then write them to
# the working directory without the frame's index.
submission_columns = ['Store', 'Dept', 'Date', 'Weekly_Sales']
submission = test.loc[:, submission_columns]
submission.to_csv("enhanced_sales_predictions.csv", index=False)
print("✅ Step 13: Submission file saved as 'enhanced_sales_predictions.csv'")
