In [2]:
# notebooks/4_model_optimization.ipynb
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Hyper-parameter search for an XGBoost regressor on Weekly_Sales.
# Pipeline: load CSV -> select features -> train/test split -> standardize ->
# grid search (3-fold CV, MAE) -> evaluate on the held-out split -> persist
# the refit best model and the fitted scaler.

# 0. Configuration (everything a reader might want to tune)
DATA_PATH = "../data/processed/walmart_final.csv"
MODEL_PATH = '../models/xgb_optimized.pkl'
SCALER_PATH = '../models/scaler.pkl'
NON_FEATURE_COLS = ['Date', 'Weekly_Sales', 'Store']
TARGET_COL = 'Weekly_Sales'
SEED = 42
TEST_SIZE = 0.2

# Make sure the artifact directory exists *before* the expensive search, so a
# bad path or permission problem fails immediately instead of after all fits
# have completed and the result would be lost.
for artifact_path in (MODEL_PATH, SCALER_PATH):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

# 1. Load Data
df = pd.read_csv(DATA_PATH)

# 2. Prepare Features/Target
# Every column except the ones listed in NON_FEATURE_COLS is used as a feature.
features = [col for col in df.columns if col not in NON_FEATURE_COLS]
X = df[features]
y = df[TARGET_COL]

# 3. Train-Test Split
# NOTE(review): train_test_split shuffles rows by default. The dataset has a
# 'Date' column, which suggests time-ordered observations; if so, a shuffled
# split can leak future information into training. Confirm whether a
# chronological split is more appropriate here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# 4. Feature Scaling
# The scaler is fit on the training split only and then applied unchanged to
# the test split, so test-set statistics never influence the transformation.
# NOTE(review): the scaler is fit once before cross-validation, so each CV
# validation fold is scaled with statistics that include that fold.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Parameter Grid
# 3 * 3 * 3 * 2 * 2 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 6. Initialize and Fit GridSearch
# GridSearchCV maximizes its score, hence the negated MAE. With cv=3 this
# runs 108 * 3 = 324 fits, parallelized across all cores (n_jobs=-1).
xgb = XGBRegressor(random_state=SEED)
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

# 7. Evaluate Best Model
# best_estimator_ is the winning configuration refit on the full training
# split. The MAE printed below is measured on the held-out test split; it is
# not the cross-validation score (that one is -grid_search.best_score_).
best_xgb = grid_search.best_estimator_
test_pred = best_xgb.predict(X_test_scaled)
mae = mean_absolute_error(y_test, test_pred)

print(f"Best MAE: {mae:,.0f}")
print("Best Parameters:", grid_search.best_params_)

# 8. Save Artifacts
# The model was trained on scaled inputs, so the fitted scaler is saved with it
# and must be applied to any data passed to the model at inference time.
joblib.dump(best_xgb, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best MAE: 77,748
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}


['../models/scaler.pkl']