In [19]:
# --- Step 1: Import Libraries --- #
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

In [20]:
# --- Step 2: Load and Merge Data --- #
# Paths are relative to the notebook's working directory; adjust them for
# your project structure.
try:
    df_train = pd.read_csv('../datasets/train.csv', low_memory=False)
    df_store = pd.read_csv('../datasets/store.csv', low_memory=False)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the kernel/interpreter
    # session, whereas an exception stops this cell with a readable traceback
    # and keeps the name of the missing file available via the chained error.
    raise FileNotFoundError(
        "Error: Make sure 'train.csv' and 'store.csv' are in your datasets folder."
    ) from err
print("Files loaded successfully.")

# Left join: every training row is kept and gains the columns of its store.
# validate='many_to_one' makes pandas raise a MergeError if 'Store' is not
# unique in df_store, which would otherwise silently duplicate training rows.
df = pd.merge(df_train, df_store, how='left', on='Store', validate='many_to_one')
print("Datasets merged successfully.")

Files loaded successfully.
Datasets merged successfully.


In [21]:
# --- Step 3: Initial Cleaning & Filtering (Lessons from EDA) --- #
# One chained expression that yields a fresh frame:
#   1. keep only the rows where the store was open (Open == 1),
#   2. drop 'Open', which is constant after that filter,
#   3. parse 'Date' into a proper datetime column.
is_open = df['Open'] == 1
df = (
    df.loc[is_open]
    .drop(columns='Open')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
print("Initial cleaning and filtering complete.")

Initial cleaning and filtering complete.


In [22]:
# --- Step 4: Feature Engineering --- #
print("Performing feature engineering on the Date column...")
# Derive calendar features from 'Date', then discard the raw column since the
# model only consumes the numeric parts.
date_parts = df['Date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    DayOfWeek=date_parts.dayofweek,  # Monday=0, Sunday=6
).drop(columns='Date')

# 'StateHoliday' can hold both the integer 0 and the string '0'; map the
# integer onto the string so both are encoded as the same category.
df['StateHoliday'] = df['StateHoliday'].replace({0: '0'})

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, leaving it as the implicit baseline.
categorical_features = ['StoreType', 'Assortment', 'StateHoliday']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
print("Feature engineering complete.")

Performing feature engineering on the Date column...
Feature engineering complete.


In [23]:
# --- Step 5: Final Data Preparation for Modeling --- #
# Define our target (y) and initial features (X)
y = df['Sales']
# We drop columns that are either results, IDs, or too complex for this model
X = df.drop(['Sales', 'Customers', 'Store', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'], axis=1)

# One-Hot Encode all remaining text columns
X = pd.get_dummies(X, drop_first=True)

# Split the data into training (80%) and testing (20%) sets BEFORE fitting any
# preprocessing step, so that statistics learned below (medians, means,
# standard deviations) come from the training rows only and nothing about the
# test set leaks into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values (e.g. 'CompetitionDistance') with the per-column
# median of the training set. The frames are rebuilt with their original
# index so X_train/X_test (and X) stay label-aligned with y_train/y_test/y.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)
# Keep X itself imputed as well, in case later cells use the full feature
# matrix; it is transformed with the train-fitted imputer.
X = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)

# Scale the numerical features (fit on train, apply the same transform to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# --- Step 6: Train and Evaluate the Model --- #
print("\n--- Training the Random Forest Regressor (This may take a few minutes)... ---")

# Hyperparameters collected in one place so they are easy to read and tune.
rf_params = {
    'n_estimators': 100,
    'max_features': 0.5,
    'n_jobs': -1,
    'random_state': 42,  # fixed seed so repeated runs build the same forest
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train_scaled, y_train)

print("--- Model training complete. Evaluating... ---")

# Score the held-out test set: predict, then compare against the true sales.
predictions = model.predict(X_test_scaled)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

print("\n--- Final Model Evaluation ---")
print(f"Mean Absolute Error (MAE): €{mae:,.2f}")


--- Training the Random Forest Regressor (This may take a few minutes)... ---
--- Model training complete. Evaluating... ---

--- Final Model Evaluation ---
Mean Absolute Error (MAE): €789.71
