In [1]:
# ## 04_model_training.ipynb (Without XGBoost)

# ### 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import os

# ### 2. Load Data
features_path = "../data/processed/walmart_with_features.csv"
df = pd.read_csv(features_path, parse_dates=['Date'])

print("✅ Data Loaded:", df.shape)
print(df.head())

if 'Dept' not in df.columns:
    df['Dept'] = 1
    print("\n⚠️ 'Dept' column not found in data. A dummy column has been created for training.")

# ### 3. Prepare Data: Define Features and Target, and handle data types
df['Store'] = df['Store'].astype(object)
df['Dept'] = df['Dept'].astype(object)

X = df.drop(columns=["Weekly_Sales", "Date"])
y = df["Weekly_Sales"]

print("\nFeatures (X) head:")
print(X.head())
print("\nTarget (y) head:")
print(y.head())

# ### 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("\nTraining set:", X_train.shape, "Testing set:", X_test.shape)

# ### 5. Train Robust Random Forest Pipeline
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test  = X_test.replace([np.inf, -np.inf], np.nan)

if hasattr(y_train, "isna"):
    _mask = ~y_train.isna()
    X_train = X_train.loc[_mask]
    y_train = y_train.loc[_mask]

cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns identified: {cat_cols}")
print(f"Numerical columns identified: {num_cols}")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]), cat_cols),
    ],
    remainder="drop",
)

rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

model = Pipeline(steps=[("prep", preprocess), ("rf", rf_model)])

model.fit(X_train, y_train)
print(f"\n✅ Training complete. Used {len(num_cols)} numeric and {len(cat_cols)} categorical features.")

# ### 6. Predictions (pipeline-safe)
y_pred = model.predict(X_test)
print("✅ Predictions shape:", y_pred.shape)

# ### 7. Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n📊 Model Evaluation")
print(f"MAE : {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²  : {r2:.4f}")

# ### 8. Save Pipeline
save_path = "../models/rf_sales_pipeline.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(model, save_path)
print(f"💾 Saved pipeline to: {save_path}")

✅ Data Loaded: (6435, 21)
   Store       Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
0      1 2010-02-05      1.074309             0    -0.995136   -1.713800   
1      1 2010-02-12      1.071198             1    -1.201170   -1.766089   
2      1 2010-02-19      1.017382             0    -1.124178   -1.840166   
3      1 2010-02-26      0.654458             0    -0.760907   -1.737766   
4      1 2010-03-05      0.914805             0    -0.767955   -1.598328   

        CPI  Unemployment  Dept  Year  ...  Week  Day  day_of_week  \
0  1.004175      0.056964     1  2010  ...     5    5            4   
1  1.007880      0.056964     1  2010  ...     6   12            4   
2  1.009074      0.056964     1  2010  ...     7   19            4   
3  1.009849      0.056964     1  2010  ...     8   26            4   
4  1.010624      0.056964     1  2010  ...     9    5            4   

   is_weekend  sales_lag_1  sales_lag_2  sales_lag_4  rolling_mean_4w  \
0           0    -1.845

  X_train = X_train.replace([np.inf, -np.inf], np.nan)
  X_test  = X_test.replace([np.inf, -np.inf], np.nan)



✅ Training complete. Used 19 numeric and 0 categorical features.
✅ Predictions shape: (1287,)

📊 Model Evaluation
MAE : 0.09
RMSE: 0.14
R²  : 0.9795
💾 Saved pipeline to: ../models/rf_sales_pipeline.pkl


In [21]:

# --- NEW ADDITION TO HANDLE MISSING DEPT COLUMN ---
if 'Dept' not in df.columns:
    df['Dept'] = 1
    print("\n 'Dept' column not found in data. A dummy column has been created for training.")
# ----------------------------------------------------

# ### 3. Prepare Data: Define Features and Target, and handle data types
df['Store'] = df['Store'].astype(object)
df['Dept'] = df['Dept'].astype(object)

X = df.drop(columns=["Weekly_Sales", "Date"])
y = df["Weekly_Sales"]

print("\nFeatures (X) head:")
print(X.head())
print("\nTarget (y) head:")
print(y.head())


Features (X) head:
  Store  Holiday_Flag  Temperature  Fuel_Price       CPI  Unemployment  year  \
0     1             0    -0.767955   -1.598328  1.010624      0.056964  2010   
1     1             0    -0.155815   -1.506821  1.011399      0.056964  2010   
2     1             0    -0.329861   -1.391349  1.007206      0.056964  2010   
3     1             0    -0.499568   -1.365204  1.002185      0.056964  2010   
4     1             0     0.087089   -1.393527  0.997164     -0.101907  2010   

   month  week  day_of_week  is_weekend  sales_lag_1  sales_lag_2  \
0      3     9            4           0     0.654458     1.017382   
1      3    10            4           0     0.914805     0.654458   
2      3    11            4           0     0.707959     0.914805   
3      3    12            4           0     0.767132     0.707959   
4      4    13            4           0     0.644951     0.767132   

   sales_lag_4  rolling_mean_4w  rolling_std_4w Dept  
0     1.074309         0.9543

In [22]:
 ### 3. Prepare Data: Define Features and Target, and handle data types
# Convert 'Store' and 'Dept' to object type if they are numeric,
# so they are correctly identified as categorical by the ColumnTransformer.
# This ensures consistency for OrdinalEncoder.
df['Store'] = df['Store'].astype(object)
df['Dept'] = df['Dept'].astype(object)

# Drop the 'Date' column as its components (Year, Month, etc.) are already features.
# 'Weekly_Sales' is the target.
X = df.drop(columns=["Weekly_Sales", "Date"])
y = df["Weekly_Sales"]

print("\nFeatures (X) head:")
print(X.head())
print("\nTarget (y) head:")
print(y.head())


Features (X) head:
  Store  Holiday_Flag  Temperature  Fuel_Price       CPI  Unemployment  year  \
0     1             0    -0.767955   -1.598328  1.010624      0.056964  2010   
1     1             0    -0.155815   -1.506821  1.011399      0.056964  2010   
2     1             0    -0.329861   -1.391349  1.007206      0.056964  2010   
3     1             0    -0.499568   -1.365204  1.002185      0.056964  2010   
4     1             0     0.087089   -1.393527  0.997164     -0.101907  2010   

   month  week  day_of_week  is_weekend  sales_lag_1  sales_lag_2  \
0      3     9            4           0     0.654458     1.017382   
1      3    10            4           0     0.914805     0.654458   
2      3    11            4           0     0.707959     0.914805   
3      3    12            4           0     0.767132     0.707959   
4      4    13            4           0     0.644951     0.767132   

   sales_lag_4  rolling_mean_4w  rolling_std_4w Dept  
0     1.074309         0.9543

In [23]:
# ### 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Training set:", X_train.shape, "Testing set:", X_test.shape)


Training set: (5004, 17) Testing set: (1251, 17)


In [24]:
 ### 5. Train Robust Random Forest Pipeline
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test  = X_test.replace([np.inf, -np.inf], np.nan)

if hasattr(y_train, "isna"):
    _mask = ~y_train.isna()
    X_train = X_train.loc[_mask]
    y_train = y_train.loc[_mask]

cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns identified: {cat_cols}")
print(f"Numerical columns identified: {num_cols}")

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]), cat_cols),
    ],
    remainder="drop",
)

rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

model = Pipeline(steps=[("prep", preprocess), ("rf", rf_model)])

model.fit(X_train, y_train)
print(f"\n Training complete. Used {len(num_cols)} numeric and {len(cat_cols)} categorical features.")


  X_train = X_train.replace([np.inf, -np.inf], np.nan)
  X_test  = X_test.replace([np.inf, -np.inf], np.nan)



Categorical columns identified: []
Numerical columns identified: ['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'year', 'month', 'week', 'day_of_week', 'is_weekend', 'sales_lag_1', 'sales_lag_2', 'sales_lag_4', 'rolling_mean_4w', 'rolling_std_4w', 'Dept']

 Training complete. Used 17 numeric and 0 categorical features.


In [18]:
# ### 6. Predictions (pipeline-safe)
y_pred = model.predict(X_test)
print("✅ Predictions shape:", y_pred.shape)


✅ Predictions shape: (1287,)


In [17]:
 ### 8. Save Pipeline
save_path = "../models/rf_sales_pipeline.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(model, save_path)
print(f" Saved pipeline to: {save_path}")







 Saved pipeline to: ../models/rf_sales_pipeline.pkl


In [16]:
# ### 8. Save Pipeline
import os, joblib

save_path = "../models/rf_sales_pipeline.pkl"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(model, save_path)
print(f" Saved pipeline to: {save_path}")


 Saved pipeline to: ../models/rf_sales_pipeline.pkl
