In [12]:
# --- Step 1: Import Libraries --- #
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from prophet import Prophet

print("Libraries imported successfully.")

Libraries imported successfully.


In [13]:
# --- Step 2: Load and Merge Data ---
# Make sure the paths are correct for your project structure
try:
    df_train = pd.read_csv('../datasets/train.csv', low_memory=False, parse_dates=['Date'])
    df_store = pd.read_csv('../datasets/store.csv', low_memory=False)
    print("Files loaded successfully.")
except FileNotFoundError:
    print("Error: Make sure 'train.csv' and 'store.csv' are in your datasets folder.")
    exit()

# Merge the datasets
df = pd.merge(df_train, df_store, how='left', on='Store')
print("Datasets merged successfully.")

Files loaded successfully.
Datasets merged successfully.


In [14]:
# --- Step 3: Full Data Preparation (for both models) ---
print("Preparing data...")
# 1. Filter out days when stores were closed
df = df[df['Open'] == 1].copy()
df = df.drop('Open', axis=1)

# 2. Rename columns for Prophet
df = df.rename(columns={'Date': 'ds', 'Sales': 'y'})

# 3. Engineer Date Features
df['Year'] = df['ds'].dt.year
df['Month'] = df['ds'].dt.month
df['Day'] = df['ds'].dt.day
df['DayOfWeek'] = df['ds'].dt.dayofweek

# 4. Handle 'StateHoliday'
df['StateHoliday'] = df['StateHoliday'].replace({0: '0'})

# 5. One-Hot Encode Categorical Features
categorical_features = ['StoreType', 'Assortment', 'StateHoliday']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# 6. Fill Missing Values
imputer = SimpleImputer(strategy='median')
df['CompetitionDistance'] = imputer.fit_transform(df[['CompetitionDistance']])

# 7. Drop unnecessary columns
df = df.drop(['Customers', 'Store', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'], axis=1)

print("Data preparation and feature engineering complete.")

Preparing data...
Data preparation and feature engineering complete.


In [15]:
# --- Step 4: Split Data into Train and Test Sets ---
# For time series, we must split by date. We'll use the last 6 weeks as our test set.
cutoff_date = df['ds'].max() - pd.to_timedelta('42 days')
df_train = df[df['ds'] <= cutoff_date].copy()
df_test = df[df['ds'] > cutoff_date].copy()

print(f"Training set: {df_train.shape[0]} records")
print(f"Testing set:  {df_test.shape[0]} records")

Training set: 804110 records
Testing set:  40282 records


In [16]:
# --- Step 5: Engineer and Clean All Features (The "Regressors") ---
print("Performing feature engineering...")
# Extract time-based features
df['Year'] = df['ds'].dt.year
df['Month'] = df['ds'].dt.month
df['Day'] = df['ds'].dt.day
df['DayOfWeek'] = df['ds'].dt.dayofweek

# Handle the 'StateHoliday' column (convert 0 to '0')
df['StateHoliday'] = df['StateHoliday'].replace({0: '0'})

# One-Hot Encode all categorical features
categorical_features = ['StoreType', 'Assortment', 'StateHoliday']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# *** THIS IS THE FIX ***
# Handle ALL known missing numerical columns
cols_to_impute = ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']
imputer = SimpleImputer(strategy='median')
df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
# *** END OF FIX ***

# Drop columns we won't use
df = df.drop(['Customers', 'Store', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'], axis=1)
print("Feature engineering and missing value handling complete.")

Performing feature engineering...


KeyError: 'StateHoliday'

In [None]:
# --- Step 6: Split Data into Train and Test Sets ---
cutoff_date = df['ds'].max() - pd.to_timedelta('42 days')
df_train = df[df['ds'] <= cutoff_date].copy()
df_test = df[df['ds'] > cutoff_date].copy()

print(f"Training set: {df_train.shape[0]} records")
print(f"Testing set:  {df_test.shape[0]} records")

In [None]:
# --- Step 7: The Rematch - Prophet vs. Random Forest ---

# --- Model 1: Random Forest (Our Champion) ---
print("\n--- Training Random Forest Champion ---")
y_train_rf = df_train['y']
X_train_rf = df_train.drop(['ds', 'y'], axis=1)
y_test_rf = df_test['y']
X_test_rf = df_test.drop(['ds', 'y'], axis=1)

scaler = StandardScaler()
X_train_rf_scaled = scaler.fit_transform(X_train_rf)
X_test_rf_scaled = scaler.transform(X_test_rf)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_features=0.5)
rf_model.fit(X_train_rf_scaled, y_train_rf)
rf_predictions = rf_model.predict(X_test_rf_scaled)
rf_mae = mean_absolute_error(y_test_rf, rf_predictions)

print(f"Random Forest MAE: €{rf_mae:,.2f}")

# --- Model 2: Prophet with Extra Regressors (The Challenger) ---
print("\n--- Training Prophet Challenger (This may take a few minutes)... ---")
regressor_names = list(df_train.drop(columns=['ds', 'y']).columns)

prophet_model_adv = Prophet(daily_seasonality=False)

for regressor in regressor_names:
    prophet_model_adv.add_regressor(regressor)

# Train the model
prophet_model_adv.fit(df_train)

# Make predictions on the test set
prophet_predictions_adv = prophet_model_adv.predict(df_test)
prophet_mae_adv = mean_absolute_error(df_test['y'], prophet_predictions_adv['yhat'])

print(f"Advanced Prophet MAE: €{prophet_mae_adv:,.2f}")

# --- Step 8: The Final Verdict ---
print("\n--- The Rematch: Final Results ---")
print(f"Random Forest MAE:     €{rf_mae:,.2f}")
print(f"Advanced Prophet MAE:  €{prophet_mae_adv:,.2f}")

if prophet_mae_adv < rf_mae:
    print("\nNew Champion! The tuned Prophet model wins!")
else:
    print("\nThe reigning champion, Random Forest, holds its ground!")