In [1]:
import pandas as pd
# Load the raw CSV and interpret TIMESTAMP as seconds since the Unix epoch;
# values that cannot be parsed become NaT instead of raising.
df = pd.read_csv("data/raw_conveyor_cart_data.csv")
df["TIMESTAMP"] = pd.to_datetime(df["TIMESTAMP"], unit="s", errors="coerce")

# Put the readings on a regular 1-minute grid: keep the first reading of each
# minute, forward-fill minutes without a reading, and drop rows that still
# contain missing values (e.g. leading rows with nothing to fill from).
df_resampled = (
    df.set_index("TIMESTAMP")
    .resample("1min")
    .first()
    .ffill()
    .dropna()
    .reset_index()
)
df_resampled.head()


Unnamed: 0,TIMESTAMP,CONVEYOR_STATUS_16,CONVEYOR_STATUS_03,CONVEYOR_STATUS_04,CONVEYOR_STATUS_01,CONVEYOR_STATUS_06,CONVEYOR_STATUS_08,CONVEYOR_STATUS_05,CONVEYOR_STATUS_12,CONVEYOR_STATUS_09,...,CONVEYOR_STATUS_11,CONVEYOR_STATUS_20,CONVEYOR_STATUS_19,CONVEYOR_STATUS_21,CONVEYOR_STATUS_25,CONVEYOR_STATUS_23,CONVEYOR_STATUS_24,CONVEYOR_STATUS_27,CONVEYOR_STATUS_26,CONVEYOR_STATUS_29
0,2026-01-17 16:55:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
1,2026-01-17 16:56:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
2,2026-01-17 16:57:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,2026-01-17 16:58:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,2026-01-17 16:59:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [2]:
# Display the column labels of the resampled frame.
df_resampled.columns

Index(['TIMESTAMP', 'CONVEYOR_STATUS_16', 'CONVEYOR_STATUS_03',
       'CONVEYOR_STATUS_04', 'CONVEYOR_STATUS_01', 'CONVEYOR_STATUS_06',
       'CONVEYOR_STATUS_08', 'CONVEYOR_STATUS_05', 'CONVEYOR_STATUS_12',
       'CONVEYOR_STATUS_09', 'CONVEYOR_STATUS_07', 'CONVEYOR_STATUS_02',
       'CONVEYOR_STATUS_13', 'CONVEYOR_STATUS_17', 'CONVEYOR_STATUS_22',
       'CONVEYOR_STATUS_14', 'CONVEYOR_STATUS_11', 'CONVEYOR_STATUS_20',
       'CONVEYOR_STATUS_19', 'CONVEYOR_STATUS_21', 'CONVEYOR_STATUS_25',
       'CONVEYOR_STATUS_23', 'CONVEYOR_STATUS_24', 'CONVEYOR_STATUS_27',
       'CONVEYOR_STATUS_26', 'CONVEYOR_STATUS_29'],
      dtype='object')

In [3]:
# Display the earliest and latest timestamp present after resampling.
df_resampled["TIMESTAMP"].min(), df_resampled["TIMESTAMP"].max()

(Timestamp('2026-01-17 16:55:00'), Timestamp('2026-02-05 00:28:00'))

In [6]:
# Collect every column whose label begins with "CONVEYOR_"; TIMESTAMP and any
# other column without that prefix is left out.
conveyor_cols = [
    label
    for label in df_resampled.columns.tolist()
    if label.startswith("CONVEYOR_")
]

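## Engineered features function

Engineered features (the three base cart counts plus, for each count, a 1-step lag, a rolling mean, a rolling standard deviation, and a first difference):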

feature_cols = [
    
    'num_carts_full',
    'num_carts_empty',
    'num_carts_maintenance',

    'num_carts_full_lag1',
    'num_carts_empty_lag1',
    'num_carts_maintenance_lag1',

    'num_carts_full_roll_mean',
    'num_carts_empty_roll_mean',
    'num_carts_maintenance_roll_mean',

    'num_carts_full_roll_std',
    'num_carts_empty_roll_std',
    'num_carts_maintenance_roll_std',

    'num_carts_full_diff1',
    'num_carts_empty_diff1',
    'num_carts_maintenance_diff1'
]


In [7]:
# Per row, count how many conveyor status columns hold each status code.
# The target column names define the mapping used here:
# 0 -> "full", 1 -> "empty", 2 -> "maintenance".
status_codes = df_resampled[conveyor_cols]
df_resampled["num_carts_full"] = status_codes.eq(0).sum(axis=1)
df_resampled["num_carts_empty"] = status_codes.eq(1).sum(axis=1)
df_resampled["num_carts_maintenance"] = status_codes.eq(2).sum(axis=1)

In [8]:
import pandas as pd
import numpy as np


def create_cart_features(df, horizon=15, roll_window=5):
    """
    Create time-series features and a forward-looking target for cart
    availability prediction.

    The input is sorted by ``TIMESTAMP`` and never modified; all new columns
    are added to a sorted copy.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with columns:
        ['TIMESTAMP', 'num_carts_full', 'num_carts_empty', 'num_carts_maintenance']

    horizon : int
        Prediction horizon, expressed as a number of rows to look ahead.
        It equals minutes (e.g., 15, 30, 45, 60) only when ``df`` holds
        exactly one row per minute. Must be >= 1.

    roll_window : int
        Rolling window size, expressed as a number of rows (minutes on a
        1-minute grid). Must be >= 1.

    Returns
    -------
    feature_df : pd.DataFrame
        Sorted copy of ``df`` with, for each base count column, the extra
        columns ``<col>_lag1``, ``<col>_roll_mean``, ``<col>_roll_std`` and
        ``<col>_diff1``, plus the target ``y_carts_empty_<horizon>min``.
        Rows without a target (the last ``horizon`` rows) are dropped.
        The first rows keep NaN in the lag / rolling / diff columns.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    ValueError
        If ``horizon`` or ``roll_window`` is smaller than 1.
    """
    base_cols = ["num_carts_full", "num_carts_empty", "num_carts_maintenance"]

    missing = [name for name in ["TIMESTAMP", *base_cols] if name not in df.columns]
    if missing:
        raise KeyError(f"Input dataframe is missing required columns: {missing}")

    # A horizon of 0 would make the target identical to the current value and
    # a negative one would point into the past; both silently leak the label.
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1, got {horizon}")
    if roll_window < 1:
        raise ValueError(f"roll_window must be >= 1, got {roll_window}")

    df = df.sort_values("TIMESTAMP").reset_index(drop=True).copy()

    # -------------------------
    # Lag Features
    # -------------------------
    for col in base_cols:
        df[f"{col}_lag1"] = df[col].shift(1)

    # -------------------------
    # Rolling Mean + Std
    # -------------------------
    for col in base_cols:
        window = df[col].rolling(roll_window)
        df[f"{col}_roll_mean"] = window.mean()
        df[f"{col}_roll_std"] = window.std()

    # -------------------------
    # Difference / Momentum
    # -------------------------
    for col in base_cols:
        df[f"{col}_diff1"] = df[col].diff(1)

    # -------------------------
    # Target Creation
    # -------------------------
    target_col = f"y_carts_empty_{horizon}min"
    # Negative shift pulls the value from `horizon` rows ahead onto this row.
    df[target_col] = df["num_carts_empty"].shift(-horizon)

    # Drop rows where target not available
    df = df.dropna(subset=[target_col])

    return df


In [11]:
# One feature frame per forecast horizon; each (horizon, roll_window) pair
# below is passed straight to create_cart_features.
feature_df_15, feature_df_30, feature_df_45, feature_df_60 = (
    create_cart_features(df_resampled, horizon=steps_ahead, roll_window=window_size)
    for steps_ahead, window_size in ((15, 3), (30, 5), (45, 10), (60, 15))
)


## Train the model

In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import plotly.graph_objects as go


def train_xgb_model(
    feature_df,
    horizon_min,
    feature_cols,
    model_dir="model_files",
    plot=True,
    target_prefix="y_carts_empty"
):
    """
    Train, evaluate, save and optionally plot an XGBoost regressor.

    Rows are split by position: the first 80 % train the model and the last
    20 % are used for testing, so ``feature_df`` is expected to already be
    in chronological order.

    Parameters
    ----------
    feature_df : pd.DataFrame
        Frame containing ``TIMESTAMP``, every column in ``feature_cols`` and
        the target column ``f"{target_prefix}_{horizon_min}min"``.
    horizon_min : int
        Horizon label (15 / 30 / 45 / 60) used to build the target column
        name, the saved file name and the printed / plotted titles.
    feature_cols : list of str
        Names of the columns used as model inputs.
    model_dir : str or Path
        Directory the model JSON file is written to; created if missing.
    plot : bool
        When True, show a Plotly chart of actual, predicted and error series.
    target_prefix : str
        Prefix of the target column name.

    Returns
    -------
    model : XGBRegressor
        The fitted model.
    results_df : pd.DataFrame
        Test-set rows with columns ``TIMESTAMP``, ``ACTUAL`` and
        ``PREDICTED`` (predictions rounded to the nearest integer).

    Raises
    ------
    KeyError
        If ``TIMESTAMP``, the target column or any feature column is missing.
    ValueError
        If ``feature_df`` has fewer than 2 rows, which would leave the
        training split empty.
    """
    target_col = f"{target_prefix}_{horizon_min}min"

    # Fail early with a readable message instead of deep inside fit/plot.
    required = ["TIMESTAMP", target_col, *feature_cols]
    missing = [name for name in required if name not in feature_df.columns]
    if missing:
        raise KeyError(f"feature_df is missing required columns: {missing}")
    if len(feature_df) < 2:
        raise ValueError(
            f"feature_df needs at least 2 rows for an 80/20 split, got {len(feature_df)}"
        )

    print(f"\nTraining XGBoost model for {horizon_min} min horizon...")
    # ========================
    # MODEL
    # ========================
    model = XGBRegressor(
        n_estimators=600,
        max_depth=3,
        learning_rate=0.04,
        subsample=0.65,
        colsample_bytree=0.65,
        min_child_weight=5,
        gamma=0.5,
        reg_alpha=0.1,
        reg_lambda=1.0,
        objective="reg:squarederror",
        random_state=42
    )

    # -------- 80 / 20 TIME SPLIT --------
    # NOTE(review): the last `horizon_min` training targets come from rows
    # that fall inside the test period; consider a gap between the splits.
    split_index = int(len(feature_df) * 0.8)

    train_df = feature_df.iloc[:split_index].copy()
    test_df = feature_df.iloc[split_index:].copy()

    Xtrain = train_df[feature_cols]
    ytrain = train_df[target_col]

    Xtest = test_df[feature_cols]
    ytest = test_df[target_col]

    print(f"Training samples: {len(Xtrain)}, Testing samples: {len(Xtest)} for {horizon_min} min horizon")
    model.fit(Xtrain, ytrain)

    # ========================
    # PREDICTIONS
    # ========================
    y_pred = model.predict(Xtest)
    y_pred_round = np.rint(y_pred).astype(int)

    # ========================
    # METRICS
    # ========================
    # Metrics use the raw (unrounded) predictions; the plot and the returned
    # frame use the rounded ones.
    mae = mean_absolute_error(ytest, y_pred)
    r2_train = r2_score(ytrain, model.predict(Xtrain))
    r2_test = r2_score(ytest, y_pred)

    print(f"\n===== {horizon_min} MIN MODEL =====")
    print("MAE:", mae)
    print("R2 Train:", r2_train)
    print("R2 Test:", r2_test)

    # ========================
    # SAVE MODEL
    # ========================
    # parents=True so a nested model_dir such as "artifacts/models" works.
    Path(model_dir).mkdir(parents=True, exist_ok=True)

    model_path = Path(model_dir) / f"xgb_regressor_cart{horizon_min}min.json"
    model.save_model(model_path)

    print(f"Model saved at: {model_path}")

    # ========================
    # PLOT
    # ========================
    if plot:
        _plot_forecast(test_df["TIMESTAMP"], ytest, y_pred_round, horizon_min)

    # ========================
    # RETURN RESULTS
    # ========================
    results_df = pd.DataFrame({
        "TIMESTAMP": test_df["TIMESTAMP"],
        "ACTUAL": ytest,
        "PREDICTED": y_pred_round
    })
    print(f"Training and evaluation completed for {horizon_min} min horizon.\n\n")
    return model, results_df


def _plot_forecast(timestamps, actual, predicted, horizon_min):
    """Show actual, predicted and error (actual - predicted) lines in one Plotly figure."""
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=timestamps,
            y=actual,
            mode="lines",
            name="Actual",
            line=dict(width=2)
        )
    )

    fig.add_trace(
        go.Scatter(
            x=timestamps,
            y=predicted,
            mode="lines",
            name="Predicted",
            line=dict(width=2)
        )
    )

    # `.values` strips the index so the subtraction is purely positional.
    error = actual.values - predicted

    fig.add_trace(
        go.Scatter(
            x=timestamps,
            y=error,
            mode="lines",
            name="Error"
        )
    )

    fig.update_layout(
        title=f"Actual vs Predicted ({horizon_min} min ahead)",
        xaxis_title="Timestamp",
        yaxis_title="Target",
        template="plotly_white"
    )

    fig.show()


In [14]:
# Model inputs, grouped by transformation: the raw counts first, then the
# lag, rolling mean, rolling std and first difference of each count.
count_names = ("num_carts_full", "num_carts_empty", "num_carts_maintenance")
feature_suffixes = ("", "_lag1", "_roll_mean", "_roll_std", "_diff1")
feature_cols = [
    f"{count_name}{suffix}"
    for suffix in feature_suffixes
    for count_name in count_names
]

# Train one model per horizon, shortest horizon first; every call uses the
# same feature list and the function's default model_dir / plot settings.
model15, res15 = train_xgb_model(feature_df_15, horizon_min=15, feature_cols=feature_cols)
model30, res30 = train_xgb_model(feature_df_30, horizon_min=30, feature_cols=feature_cols)
model45, res45 = train_xgb_model(feature_df_45, horizon_min=45, feature_cols=feature_cols)
model60, res60 = train_xgb_model(feature_df_60, horizon_min=60, feature_cols=feature_cols)



Training XGBoost model for 15 min horizon...
Training samples: 21087, Testing samples: 5272 for 15 min horizon

===== 15 MIN MODEL =====
MAE: 0.6240860220152257
R2 Train: 0.898376178752355
R2 Test: 0.8666816531294459
Model saved at: model_files\xgb_regressor_cart15min.json


Training and evaluation completed for 15 min horizon.



Training XGBoost model for 30 min horizon...
Training samples: 21075, Testing samples: 5269 for 30 min horizon

===== 30 MIN MODEL =====
MAE: 0.9034027876811409
R2 Train: 0.8347344402961844
R2 Test: 0.7911839874457194
Model saved at: model_files\xgb_regressor_cart30min.json


Training and evaluation completed for 30 min horizon.



Training XGBoost model for 45 min horizon...
Training samples: 21063, Testing samples: 5266 for 45 min horizon

===== 45 MIN MODEL =====
MAE: 1.159560230173994
R2 Train: 0.7680681239853304
R2 Test: 0.6891321596139075
Model saved at: model_files\xgb_regressor_cart45min.json


Training and evaluation completed for 45 min horizon.



Training XGBoost model for 60 min horizon...
Training samples: 21051, Testing samples: 5263 for 60 min horizon

===== 60 MIN MODEL =====
MAE: 1.3208203122753075
R2 Train: 0.7173627333655977
R2 Test: 0.6085587495238642
Model saved at: model_files\xgb_regressor_cart60min.json


Training and evaluation completed for 60 min horizon.


