In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import mlflow
import dagshub

dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)
mlflow.set_experiment("XGBoost_Iterative_Baseline")

2025/08/01 18:57:23 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Iterative_Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/f2383a4b0e2f4b66bea409a00065738a', creation_time=1754060243516, experiment_id='10', last_update_time=1754060243516, lifecycle_stage='active', name='XGBoost_Iterative_Baseline', tags={}>

# Load and preprocess data

In [4]:
features_data = pd.read_csv('data/features.csv')
train_data = pd.read_csv('data/train.csv')
stores = pd.read_csv('data/stores.csv')

df = train_data.merge(features_data, on=['Store', 'Date'], how='inner').merge(stores, on=['Store'], how='inner')

if 'IsHoliday_y' in df.columns:
    df.drop(['IsHoliday_y'], axis=1, inplace=True)
    df.rename(columns={'IsHoliday_x': 'IsHoliday'}, inplace=True)

df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['Store', 'Dept', 'Date'])
df = df[df['Weekly_Sales'] >= 0]

In [7]:
df=df.sort_values('Date')

# Step 0, use a few features

In [9]:
features_step0 = ['Store', 'Dept', 'IsHoliday']
target = 'Weekly_Sales'

split_date = df['Date'].quantile(0.8)
train_df = df[df['Date'] <= split_date]
val_df = df[df['Date'] > split_date]

X_train = train_df[features_step0]
y_train = train_df[target]

X_val = val_df[features_step0]
y_val = val_df[target]

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "seed": 42,
    "verbosity": 0,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

evals = [(dtrain, 'train'), (dval, 'val')]

with mlflow.start_run(run_name="Step0_Baseline"):
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        early_stopping_rounds=50,
        evals=evals,
        verbose_eval=50
    )

    y_pred = model.predict(dval)
    mae = mean_absolute_error(y_val, y_pred)

    mlflow.log_params(params)
    mlflow.log_metric("mae", mae)
    mlflow.xgboost.log_model(model, "model")

    print(f"XGBoost Step 0 MAE: {mae:.4f}")

[0]	train-mae:14371.87819	val-mae:14261.70120
[50]	train-mae:6752.62971	val-mae:6501.82036
[100]	train-mae:5940.54401	val-mae:5695.87137
[150]	train-mae:5556.52738	val-mae:5325.77721
[200]	train-mae:5269.94591	val-mae:5044.25628
[250]	train-mae:5070.53181	val-mae:4855.45118
[300]	train-mae:4852.46346	val-mae:4636.10713
[350]	train-mae:4689.13335	val-mae:4476.92122
[400]	train-mae:4518.32036	val-mae:4308.65365
[450]	train-mae:4354.25654	val-mae:4145.27780
[500]	train-mae:4247.63852	val-mae:4041.86393
[550]	train-mae:4186.44808	val-mae:3979.71676
[600]	train-mae:4125.29161	val-mae:3920.19380
[650]	train-mae:4051.45508	val-mae:3847.44046
[700]	train-mae:4010.94736	val-mae:3805.82071
[750]	train-mae:3939.11687	val-mae:3737.81563
[800]	train-mae:3883.19749	val-mae:3682.97217
[850]	train-mae:3837.20776	val-mae:3639.04430
[900]	train-mae:3800.63370	val-mae:3603.48100
[950]	train-mae:3747.12578	val-mae:3552.11011
[999]	train-mae:3725.87524	val-mae:3531.67900




XGBoost Step 0 MAE: 3531.6790
🏃 View run Step0_Baseline at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10/runs/60254500012f4a688d4feb47a47c56f7
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10


# Step 2, add time based features

In [11]:
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Week"] = df["Date"].dt.isocalendar().week.astype(int)
df["Quarter"] = df["Date"].dt.quarter
df["DayOfWeek"] = df["Date"].dt.weekday
df["IsWeekend"] = df["DayOfWeek"].isin([5, 6]).astype(int)

In [12]:
features_step1 = [
    'Store', 'Dept', 'IsHoliday',
    'Year', 'Month', 'Week', 'Quarter',
    'DayOfWeek', 'IsWeekend'
]

split_date = df['Date'].quantile(0.8)
train_df = df[df['Date'] <= split_date]
val_df = df[df['Date'] > split_date]

X_train = train_df[features_step1]
y_train = train_df[target]

X_val = val_df[features_step1]
y_val = val_df[target]

train_set = xgb.DMatrix(X_train, label=y_train)
val_set = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "seed": 42,
    "learning_rate": 0.05,
    "max_depth": 6,
}

with mlflow.start_run(run_name="Step1_TimeBasedFeatures") as run:
    model = xgb.train(
        params,
        train_set,
        num_boost_round=1000,
        evals=[(train_set, "train"), (val_set, "val")],
        early_stopping_rounds=50,
        verbose_eval=50,
    )

    y_pred = model.predict(val_set)
    val_mae = mean_absolute_error(y_val, y_pred)
    mlflow.log_metric("val_mae", val_mae)
    mlflow.xgboost.log_model(model, artifact_path="model")
    print(f"XGBoost Step 1 MAE: {val_mae:.4f}")

[0]	train-mae:14758.75129	val-mae:14661.30299
[50]	train-mae:7878.83811	val-mae:7652.37068
[100]	train-mae:6339.36236	val-mae:6104.68396
[150]	train-mae:5584.70972	val-mae:5389.06671
[200]	train-mae:5090.98147	val-mae:4930.35533
[250]	train-mae:4727.27543	val-mae:4616.79250
[300]	train-mae:4461.36249	val-mae:4385.11416
[350]	train-mae:4287.14810	val-mae:4240.85322
[400]	train-mae:4145.06392	val-mae:4107.50270
[450]	train-mae:4022.41472	val-mae:3995.00074
[500]	train-mae:3913.80855	val-mae:3891.28828
[550]	train-mae:3820.18046	val-mae:3804.45314
[600]	train-mae:3733.05636	val-mae:3738.77330
[650]	train-mae:3645.79043	val-mae:3654.19788
[700]	train-mae:3554.25405	val-mae:3577.80468
[750]	train-mae:3481.91772	val-mae:3517.30652
[800]	train-mae:3423.02505	val-mae:3469.42542
[850]	train-mae:3352.00702	val-mae:3403.62442
[900]	train-mae:3287.04091	val-mae:3350.60906
[950]	train-mae:3227.57550	val-mae:3297.56515
[999]	train-mae:3150.75069	val-mae:3225.32726




XGBoost Step 1 MAE: 3225.3273
🏃 View run Step1_TimeBasedFeatures at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10/runs/78a914467b4446aab1b7285d77b1ef07
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10


# step 2, add holiday based features

In [14]:
superbowl_dates = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
laborday_dates = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
thanksgiving_dates = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
christmas_dates = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

df['Is_SuperBowl'] = 0
df['Is_LaborDay'] = 0
df['Is_Thanksgiving'] = 0
df['Is_Christmas'] = 0

df.loc[df['Date'].isin(superbowl_dates), 'Is_SuperBowl'] = 1
df.loc[df['Date'].isin(laborday_dates), 'Is_LaborDay'] = 1
df.loc[df['Date'].isin(thanksgiving_dates), 'Is_Thanksgiving'] = 1
df.loc[df['Date'].isin(christmas_dates), 'Is_Christmas'] = 1

In [16]:
from sklearn.model_selection import train_test_split

features_step2 = [
    'Store', 'Dept', 'IsHoliday',    
    'Month', 'Year', 'Week', 'Quarter',
    'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas'
]

X = df[features_step2]
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

train_set = xgb.DMatrix(X_train, label=y_train)
val_set = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "tree_method": "hist",
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

with mlflow.start_run(run_name="XGBoost_Step2"):
    model = xgb.train(
        params,
        train_set,
        num_boost_round=1000,
        evals=[(train_set, "train"), (val_set, "val")],
        early_stopping_rounds=100,
        verbose_eval=50
    )

    y_pred = model.predict(val_set)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"XGBoost Step 2 MAE: {mae:.4f}")

    mlflow.log_params(params)
    mlflow.log_metric("val_mae", mae)
    mlflow.xgboost.log_model(model, "model")

[0]	train-mae:15205.47537	val-mae:15121.62524
[50]	train-mae:8123.57494	val-mae:7965.38167
[100]	train-mae:6505.29451	val-mae:6348.98331
[150]	train-mae:5931.65595	val-mae:5809.42671
[200]	train-mae:5502.61521	val-mae:5412.91517
[250]	train-mae:5255.78151	val-mae:5181.57143
[300]	train-mae:4988.95534	val-mae:4933.28736
[350]	train-mae:4783.03134	val-mae:4759.39461
[400]	train-mae:4637.38041	val-mae:4621.04423
[450]	train-mae:4524.19314	val-mae:4530.76180
[500]	train-mae:4359.18704	val-mae:4382.16156
[550]	train-mae:4214.37021	val-mae:4254.12737
[600]	train-mae:4124.07471	val-mae:4174.28148
[650]	train-mae:4027.23287	val-mae:4091.75994
[700]	train-mae:3925.04031	val-mae:3999.90237
[750]	train-mae:3859.33087	val-mae:3934.39573
[800]	train-mae:3759.81081	val-mae:3836.26341
[850]	train-mae:3695.86049	val-mae:3785.67720
[900]	train-mae:3627.13050	val-mae:3735.01717
[950]	train-mae:3591.55512	val-mae:3704.05741
[999]	train-mae:3529.06669	val-mae:3650.19438
XGBoost Step 2 MAE: 3650.1944




🏃 View run XGBoost_Step2 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10/runs/6f764691bb734ebb8f1bf93dbdf12e84
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10


# step 3, create lag features

In [21]:
LAG_WEEKS = [1, 2, 3, 4, 52]
for lag in LAG_WEEKS:
    df[f"Sales_Lag_{lag}"] = df.groupby(["Store", "Dept"])["Weekly_Sales"].shift(lag)

In [22]:
df_lagged = df.dropna(subset=[f"Sales_Lag_{lag}" for lag in LAG_WEEKS])

In [25]:
features_step3 = [
    'Store', 'Dept', 'IsHoliday', 'Month', 'Year', 'Week', 'Quarter',
    'Temperature', 'Fuel_Price', 'Size', 'Type',
    'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas',
    'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_52'
]
target = 'Weekly_Sales'

X = df_lagged[features_step3]
y = df_lagged[target]

for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category').cat.codes

X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False, test_size=0.2)

model = xgb.train(
    {
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "seed": 42
    },
    dtrain=xgb.DMatrix(X_train, label=y_train),
    num_boost_round=1000,
    evals=[(xgb.DMatrix(X_train, y_train), "train"), (xgb.DMatrix(X_val, y_val), "val")],
    early_stopping_rounds=50,
    verbose_eval=50
)

y_pred = model.predict(xgb.DMatrix(X_val))
mae = mean_absolute_error(y_val, y_pred)
print(f"XGBoost Step 3 MAE: {mae:.4f}")

with mlflow.start_run(run_name="XGBoost_Step3"):
    mlflow.log_params({"model": "xgboost", "step": "3", "lags": str(LAG_WEEKS)})
    mlflow.log_metric("mae", mae)
    mlflow.xgboost.log_model(model, "model")

[0]	train-mae:10899.10236	val-mae:10718.31034


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category').cat.codes


[50]	train-mae:1278.07373	val-mae:1315.98185
[67]	train-mae:1243.95762	val-mae:1313.98138
XGBoost Step 3 MAE: 1313.9814




🏃 View run XGBoost_Step3 at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10/runs/991ef63ee4f04aa79c8c55cc88e0afb8
🧪 View experiment at: https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow/#/experiments/10


# store merged train DF

In [26]:
import os

df_path = "train_merged_full.csv"
df.to_csv(df_path, index=False)

print(f"Saved and logged train merged dataframe at {df_path}")

Saved and logged train merged dataframe at train_merged_full.csv


# Log the best model

In [27]:
from mlflow.tracking import MlflowClient

def log_the_model(run_id, description):
    client = MlflowClient()
    
    model_uri = f"runs:/{run_id}/model"
    model_name = "Best_XGBoost_Model"
    
    try:
        client.create_registered_model(model_name)
    except mlflow.exceptions.RestException:
        pass  
    
    client.create_model_version(
        name=model_name,
        source=model_uri,
        run_id=run_id,
        description=description
    )

In [28]:
log_the_model("991ef63ee4f04aa79c8c55cc88e0afb8", "Step 3 XGBoost Model")

2025/08/01 19:24:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best_XGBoost_Model, version 1
