In [1]:
# --- Step 1: Import Libraries ---
# All third-party dependencies for the notebook are pulled in here so that a
# fresh kernel can run every later cell top to bottom.
# Data handling: pandas loads and merges the CSVs and builds the features.
import pandas as pd
# NOTE(review): `np` is not referenced in the cells shown here.
import numpy as np
import xgboost as xgb  # Import the XGBoost library
# NOTE(review): `train_test_split` is imported but the cells shown here split
# by date (Step 4) rather than calling it.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# Preprocessing helpers: the scaler is used in Step 5, the imputer in Step 3.
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Evaluation metric used to compare the two models in Step 6.
from sklearn.metrics import mean_absolute_error
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- Step 2: Load and Merge Data ---
# Reads the daily sales table and the per-store metadata table, then attaches
# the store metadata to every sales row.
# Make sure the paths are correct for your project structure
try:
    df_train = pd.read_csv('../datasets/train.csv', low_memory=False, parse_dates=['Date'])
    df_store = pd.read_csv('../datasets/store.csv', low_memory=False)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down (discarding all state) rather than showing a
    # traceback, which makes the failure hard to diagnose.
    raise FileNotFoundError(
        "Error: Make sure 'train.csv' and 'store.csv' are in your datasets folder."
    ) from err
print("Files loaded successfully.")

# Merge the datasets.
# how='left' keeps every sales row even if a store has no metadata row;
# validate='many_to_one' raises if 'Store' is duplicated in df_store, which
# would otherwise silently duplicate sales rows in the merged frame.
df = pd.merge(df_train, df_store, how='left', on='Store', validate='many_to_one')
print("Datasets merged successfully.")

Files loaded successfully.
Datasets merged successfully.


In [3]:
# --- Step 3: Full Data Preparation (Our "Champion" Pipeline) ---
# Turns the merged frame `df` into a purely numeric feature table plus the
# date column 'ds' and the target column 'y'.
# NOTE: this cell rebinds `df`; re-run Step 2 before re-running it, because
# the 'Open' column it filters on is dropped below.
print("Preparing all features...")

# 1. Filter out days when stores were closed
df = df[df['Open'] == 1].copy()
df = df.drop('Open', axis=1)

# 2. Rename columns
df = df.rename(columns={'Date': 'ds', 'Sales': 'y'})

# 3. Engineer Date Features
df['Year'] = df['ds'].dt.year
df['Month'] = df['ds'].dt.month
df['Day'] = df['ds'].dt.day
# Overwrites any existing 'DayOfWeek' column with pandas' 0=Monday..6=Sunday.
df['DayOfWeek'] = df['ds'].dt.dayofweek

# 4. Handle 'StateHoliday'
# Map integer 0 to the string '0' so the column has a single category for
# "no holiday" before one-hot encoding.
df['StateHoliday'] = df['StateHoliday'].replace({0: '0'})

# 5. One-Hot Encode Categorical Features
categorical_features = ['StoreType', 'Assortment', 'StateHoliday']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# 6. Fill Missing Values
cols_to_impute = ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']
imputer = SimpleImputer(strategy='median')
# Learn the medians from the training period only, then apply them to every
# row. Fitting on the full frame would let the final 42 days (the hold-out
# window used by the train/test split in Step 4) influence preprocessing,
# unlike the scaler in Step 5, which is fitted on training data only.
imputer_fit_cutoff = df['ds'].max() - pd.to_timedelta('42 days')
imputer.fit(df.loc[df['ds'] <= imputer_fit_cutoff, cols_to_impute])
df[cols_to_impute] = imputer.transform(df[cols_to_impute])

# 7. Drop unnecessary columns
# errors='ignore' tolerates any of these columns being absent.
df = df.drop(['Customers', 'Store', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'], axis=1, errors='ignore')

print("Data preparation and feature engineering complete.")

Preparing all features...
Data preparation and feature engineering complete.


In [4]:
# --- Step 4: Split Data into Train and Test Sets ---
# Chronological hold-out: everything dated within the final 42 days (6 weeks)
# of the data becomes the test set, the rest is used for training.
cutoff_date = df['ds'].max() - pd.to_timedelta('42 days')
df_train = df.loc[df['ds'].le(cutoff_date)].copy()
df_test = df.loc[df['ds'].gt(cutoff_date)].copy()

# Separate the target from the features; the date column is not a feature.
non_feature_cols = ['ds', 'y']
X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train['y']

# The test matrix is forced onto the training column layout (same columns,
# same order); any column missing from the test frame is filled with 0.
train_cols = X_train.columns
X_test = df_test.drop(columns=non_feature_cols).reindex(columns=train_cols, fill_value=0)
y_test = df_test['y']

print(f"Training set: {X_train.shape[0]} records")
print(f"Testing set:  {X_test.shape[0]} records")

Training set: 804110 records
Testing set:  40282 records


In [5]:
# --- Step 5: Scale the Features ---
# Standardisation statistics are learned from the training matrix only and
# then applied unchanged to both the training and the test matrix.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(features) for features in (X_train, X_test))

print("Data scaling complete.")

Data scaling complete.


In [6]:
# --- Step 6: The Showdown - Train and Evaluate Both Models ---


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on the training data and score it on the evaluation data.

    Returns a `(predictions, mae)` tuple, where `predictions` is the output
    of `model.predict(X_eval)` and `mae` is the mean absolute error of those
    predictions against `y_eval`.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, mean_absolute_error(y_eval, predictions)


# --- Model 1: Random Forest (Our Champion) ---
print("\n--- Training Random Forest Champion ---")
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    max_features=0.5,
)
rf_predictions, rf_mae = _fit_and_score(
    rf_model, X_train_scaled, y_train, X_test_scaled, y_test
)

print(f"Random Forest MAE: €{rf_mae:,.2f}")

# --- Model 2: XGBoost (The Challenger) ---
print("\n--- Training XGBoost Challenger ---")
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
)
xgb_predictions, xgb_mae = _fit_and_score(
    xgb_model, X_train_scaled, y_train, X_test_scaled, y_test
)

print(f"XGBoost MAE:       €{xgb_mae:,.2f}")


--- Training Random Forest Champion ---
Random Forest MAE: €798.54

--- Training XGBoost Challenger ---
XGBoost MAE:       €1,036.54


In [7]:
# --- Step 7: The Final Verdict ---
# Repeat both scores side by side, then announce the winner. XGBoost only
# takes the title with a strictly lower MAE; a tie goes to Random Forest.
print("\n--- The Showdown: Final Results ---")
print(f"Random Forest MAE: €{rf_mae:,.2f}")
print(f"XGBoost MAE:       €{xgb_mae:,.2f}")

verdict = (
    "\nWE HAVE A NEW CHAMPION! XGBoost wins!"
    if xgb_mae < rf_mae
    else "\nThe reigning champion, Random Forest, holds its ground!"
)
print(verdict)


--- The Showdown: Final Results ---
Random Forest MAE: €798.54
XGBoost MAE:       €1,036.54

The reigning champion, Random Forest, holds its ground!
