In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Point MLflow tracking at the DagsHub-hosted server for this repository and
# select the experiment every run below is logged under.
dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)
mlflow.set_experiment("RandomForest_Iterative_Baseline")

# Raw input tables.
features_data = pd.read_csv('data/features.csv')
train_data = pd.read_csv('data/train.csv')
stores = pd.read_csv('data/stores.csv')

# Sales rows joined with the per-(Store, Date) features, then with the
# per-Store metadata. Inner joins drop rows without a match on either side.
df = (
    train_data
    .merge(features_data, on=['Store', 'Date'], how='inner')
    .merge(stores, on=['Store'], how='inner')
)

# If the merge produced suffixed IsHoliday_x / IsHoliday_y columns, keep the
# _x one under the plain name and discard the _y duplicate.
if 'IsHoliday_y' in df.columns:
    df = (
        df
        .drop(columns=['IsHoliday_y'])
        .rename(columns={'IsHoliday_x': 'IsHoliday'})
    )

# Parse dates, order rows per store/department chronologically, and keep only
# rows with non-negative weekly sales.
df = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(by=['Store', 'Dept', 'Date'])
df = df[df['Weekly_Sales'].ge(0)]

target = 'Weekly_Sales'

# Step 0: identifiers and the holiday flag only.
features_step0 = ['Store', 'Dept', 'IsHoliday']

# Chronological hold-out: rows dated at or before the 0.8 quantile of Date are
# used for training, later rows for validation.
split_date = df['Date'].quantile(0.8)
train_df = df[df['Date'] <= split_date]
val_df = df[df['Date'] > split_date]

X_train = train_df[features_step0].copy()
y_train = train_df[target].copy()
X_val = val_df[features_step0].copy()
y_val = val_df[target].copy()

for col in ['Store', 'Dept']:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # Encode validation values with a lookup built once from the fitted
    # classes (position in le.classes_ is the code fit_transform assigned).
    # Values never seen in training map to -1. This replaces a per-row
    # le.transform([x]) call, which is needlessly slow on a large frame.
    code_by_value = {value: code for code, value in enumerate(le.classes_)}
    X_val[col] = X_val[col].map(code_by_value).fillna(-1).astype('int64')

with mlflow.start_run(run_name="Step0_Baseline"):
    params = {
        "n_estimators": 100,
        "max_depth": 10,
        "min_samples_split": 2,
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    # Score on the chronological hold-out.
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)

    mlflow.log_params(params)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(model, "model")

    print(f"RandomForest Step 0 MAE: {mae:.4f}")

# Calendar features derived from Date. They are added to df itself so later
# steps (and the CSV written at the end of the cell) can reuse them.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Week"] = df["Date"].dt.isocalendar().week.astype(int)
df["Quarter"] = df["Date"].dt.quarter
# NOTE(review): if Date only ever falls on one weekday (weekly data), DayOfWeek
# and IsWeekend are constant columns and carry no signal — confirm.
df["DayOfWeek"] = df["Date"].dt.weekday
df["IsWeekend"] = df["DayOfWeek"].isin([5, 6]).astype(int)

features_step1 = [
    'Store', 'Dept', 'IsHoliday',
    'Year', 'Month', 'Week', 'Quarter',
    'DayOfWeek', 'IsWeekend'
]

# Re-split with the same cut-off so the new columns are present in both parts.
train_df = df[df['Date'] <= split_date]
val_df = df[df['Date'] > split_date]

X_train = train_df[features_step1].copy()
y_train = train_df[target].copy()
X_val = val_df[features_step1].copy()
y_val = val_df[target].copy()

# Encode categorical features
for col in ['Store', 'Dept']:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # Lookup built once from the fitted classes; values unseen in training
    # become -1. Replaces a per-row le.transform([x]) call, which is
    # needlessly slow on a large frame.
    code_by_value = {value: code for code, value in enumerate(le.classes_)}
    X_val[col] = X_val[col].map(code_by_value).fillna(-1).astype('int64')

with mlflow.start_run(run_name="Step1_TimeBasedFeatures"):
    # NOTE(review): n_estimators and max_depth are larger than in the
    # Step0_Baseline run, so the MAE change between the two runs is not
    # attributable to the added features alone.
    params = {
        "n_estimators": 150,
        "max_depth": 15,
        "min_samples_split": 2,
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)

    mlflow.log_params(params)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(model, "model")

    print(f"RandomForest Step 1 MAE: {mae:.4f}")

# Dates flagged as each named holiday week.
superbowl_dates = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
laborday_dates = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
thanksgiving_dates = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
christmas_dates = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

# One 0/1 indicator column per holiday: 1 where the row's Date is one of the
# listed dates, 0 otherwise. Insertion order fixes the column order in df.
holiday_weeks = {
    'Is_SuperBowl': superbowl_dates,
    'Is_LaborDay': laborday_dates,
    'Is_Thanksgiving': thanksgiving_dates,
    'Is_Christmas': christmas_dates,
}
for flag_col, holiday_dates in holiday_weeks.items():
    df[flag_col] = df['Date'].isin(holiday_dates).astype('int64')

features_step2 = [
    'Store', 'Dept', 'IsHoliday',
    'Month', 'Year', 'Week', 'Quarter',
    'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas'
]

# Re-split with the same cut-off so the new columns are present in both parts.
train_df = df[df['Date'] <= split_date]
val_df = df[df['Date'] > split_date]

X_train = train_df[features_step2].copy()
y_train = train_df[target].copy()
X_val = val_df[features_step2].copy()
y_val = val_df[target].copy()

for col in ['Store', 'Dept']:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # Lookup built once from the fitted classes; values unseen in training
    # become -1. Replaces a per-row le.transform([x]) call, which is
    # needlessly slow on a large frame.
    code_by_value = {value: code for code, value in enumerate(le.classes_)}
    X_val[col] = X_val[col].map(code_by_value).fillna(-1).astype('int64')

with mlflow.start_run(run_name="Step2_HolidayFeatures") as run:
    # Kept so the logged model can be registered afterwards without copying
    # the run ID by hand from the run URL.
    step2_run_id = run.info.run_id

    # NOTE(review): n_estimators and max_depth are larger than in the earlier
    # runs of this cell, so the MAE change is not attributable to the added
    # features alone.
    params = {
        "n_estimators": 200,
        "max_depth": 20,
        "min_samples_split": 2,
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)

    mlflow.log_params(params)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(model, "model")

    print(f"RandomForest Step 2 MAE: {mae:.4f}")

# Write the merged frame, including every feature column added above, to disk
# without the index column.
processed_path = "train_merged_full_rf.csv"
df.to_csv(processed_path, index=False)
print("Saved processed data for inference")

from mlflow.tracking import MlflowClient

def log_the_model(run_id, description, model_name="Best_RandomForest_Model"):
    """Register the model logged by an MLflow run as a new model version.

    Creates the registered model ``model_name`` if it does not exist yet, then
    adds a version whose source is the ``model`` artifact of ``run_id``.

    Args:
        run_id: ID of the MLflow run whose ``model`` artifact is registered.
        description: Free-text description stored on the new model version.
        model_name: Name of the registered model the version is added to.

    Raises:
        mlflow.exceptions.MlflowException: If creating the registered model
            fails for any reason other than the name already existing.
            Errors raised while creating the model version propagate
            unchanged.
    """
    client = MlflowClient()
    model_uri = f"runs:/{run_id}/model"

    try:
        client.create_registered_model(model_name)
    except mlflow.exceptions.MlflowException as exc:
        # "Already exists" is the expected outcome on every call after the
        # first. Any other failure (authentication, connectivity, ...) must
        # surface here instead of being silently ignored.
        if getattr(exc, "error_code", None) != "RESOURCE_ALREADY_EXISTS":
            raise

    client.create_model_version(
        name=model_name,
        source=model_uri,
        run_id=run_id,
        description=description
    )




RandomForest Step 0 MAE: 5749.0643
🏃 View run Step0_Baseline at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25/runs/4048c92c3d3b4fe7b838f2f76251b380
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25




RandomForest Step 1 MAE: 2644.1363
🏃 View run Step1_TimeBasedFeatures at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25/runs/71a96c838124401197b7adf9749ca957
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25




RandomForest Step 2 MAE: 1594.0438
🏃 View run Step2_HolidayFeatures at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25/runs/d2c1072a85e5431b9b07d965bbee705f
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/25
Saved processed data for inference


In [3]:
# NOTE(review): this run ID is hard-coded from the Step2_HolidayFeatures run
# shown in the previous cell's output. Re-running that cell creates a new run
# with a different ID, so this value must be updated to match.
log_the_model("d2c1072a85e5431b9b07d965bbee705f", "Best random forest model")

2025/08/03 10:57:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best_RandomForest_Model, version 1
