# 3. Model Training: RF vs XGBoost

**Models**:
- RandomForest (baseline)
- XGBoost (primary model)

## Load Libraries and Data

In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully.")

Libraries loaded successfully.


In [23]:
# Load processed data
# Read the feature table and convert booking_date from text to timestamps
# in a single chained expression.
df_clean = pd.read_csv('../data/processed_features.csv').assign(
    booking_date=lambda frame: pd.to_datetime(frame['booking_date'])
)

# Report size and covered date span as a sanity check on the load
print(f"Loaded {len(df_clean):,} rows")
print(f"Date range: {df_clean['booking_date'].min().date()} to {df_clean['booking_date'].max().date()}")
print(f"\nColumns: {df_clean.shape[1]}")

Loaded 14,551 rows
Date range: 2021-01-07 to 2025-12-30

Columns: 26


In [24]:
# Load preprocessing configuration
# The pickle holds the feature list, the target names and the label encoder
# that are used throughout the rest of this notebook.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# artifacts that were produced by this project.
with open('../data/preprocessing_config.pkl', 'rb') as config_file:
    config = pickle.load(config_file)

feature_cols, target_columns, le = (
    config[key] for key in ('feature_cols', 'target_columns', 'label_encoder')
)

print(f"Features loaded: {len(feature_cols)}")
print(f"Target columns: {target_columns}")
print(f"\nFeature list: {feature_cols}")

Features loaded: 16
Target columns: ['Air', 'Express', 'International', 'Surface']

Feature list: ['lag_1_Air', 'lag_7_Air', 'roll_7_Air', 'lag_1_Express', 'lag_7_Express', 'roll_7_Express', 'lag_1_International', 'lag_7_International', 'roll_7_International', 'lag_1_Surface', 'lag_7_Surface', 'roll_7_Surface', 'day_of_week', 'day', 'month', 'company_encoded']


---
## Part 1: Initial Model Training

Standard train/test split using November 2025 as the cutoff.

In [25]:
# Split data at November 2025
# Rows strictly before the cutoff are used for training; the cutoff date and
# everything after it form the hold-out set.
split_date = '2025-11-01'
before_cutoff = df_clean['booking_date'] < split_date
from_cutoff = df_clean['booking_date'] >= split_date

train_df = df_clean.loc[before_cutoff]
# Copy the hold-out rows: prediction columns are added to test_df later on
test_df = df_clean.loc[from_cutoff].copy()

print(f"Training set: {len(train_df):,} rows ({train_df['booking_date'].min().date()} to {train_df['booking_date'].max().date()})")
print(f"Test set: {len(test_df):,} rows ({test_df['booking_date'].min().date()} to {test_df['booking_date'].max().date()})")

Training set: 14,071 rows (2021-01-07 to 2025-10-31)
Test set: 480 rows (2025-11-01 to 2025-12-30)


In [26]:
# Initialize model dictionaries
# One RandomForest and one XGBoost regressor are fitted per shipment type.
# evals_results keeps XGBoost's per-iteration metrics on the train and test
# sets; the test set is only monitored here, it is not used for fitting.
rf_models, xgb_models, evals_results = {}, {}, {}

print("Training models for each shipment type...\n")
print("="*70)

# The feature matrices are identical for every target, so slice them once
X_train = train_df[feature_cols]
X_test = test_df[feature_cols]

for target in target_columns:
    y_train = train_df[f'target_{target}']
    y_test = test_df[f'target_{target}']

    print(f"\nTraining models for: {target}")
    print("-" * 40)

    # RandomForest
    print("  Training RandomForest...")
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_models[target] = rf
    print("  ✓ RandomForest trained")

    # XGBoost with evaluation tracking
    print("  Training XGBoost...")
    xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
    xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False
    )
    xgb_models[target] = xgb
    evals_results[target] = xgb.evals_result()
    print("  ✓ XGBoost trained")

print("\n" + "="*70)
print("✓ ALL MODELS TRAINED SUCCESSFULLY")
print("="*70)
print(f"\nRandomForest models: {list(rf_models.keys())}")
print(f"XGBoost models: {list(xgb_models.keys())}")

Training models for each shipment type...


Training models for: Air
----------------------------------------
  Training RandomForest...
  ✓ RandomForest trained
  Training XGBoost...
  ✓ XGBoost trained

Training models for: Express
----------------------------------------
  Training RandomForest...
  ✓ RandomForest trained
  Training XGBoost...
  ✓ XGBoost trained

Training models for: International
----------------------------------------
  Training RandomForest...
  ✓ RandomForest trained
  Training XGBoost...
  ✓ XGBoost trained

Training models for: Surface
----------------------------------------
  Training RandomForest...
  ✓ RandomForest trained
  Training XGBoost...
  ✓ XGBoost trained

✓ ALL MODELS TRAINED SUCCESSFULLY

RandomForest models: ['Air', 'Express', 'International', 'Surface']
XGBoost models: ['Air', 'Express', 'International', 'Surface']


## Quick Validation on Test Set

In [27]:
# Make predictions on test set
# Scores only the XGBoost models; predictions are stored on test_df as
# pred_xgb_<target> columns next to the true target_<target> columns.
print("Making predictions on test set...\n")

for target in target_columns:
    pred_col = f'pred_xgb_{target}'
    true_col = f'target_{target}'

    # XGBoost predictions
    test_df[pred_col] = xgb_models[target].predict(test_df[feature_cols])

    # Calculate MAE
    mae = mean_absolute_error(test_df[true_col], test_df[pred_col])
    print(f"{target:15s} - Test MAE: {mae:.2f}")

print("\n✓ Initial validation complete")

Making predictions on test set...

Air             - Test MAE: 1.94
Express         - Test MAE: 1.41
International   - Test MAE: 0.69
Surface         - Test MAE: 1.98

✓ Initial validation complete


---
## Part 2: Backtest - The Rigorous Seasonality Check

**Objective**: Validate model performance by:
1. Training ONLY on pre-2025 data
2. Recursively forecasting ALL of 2025 (365 days)
3. Comparing predictions against actual 2025 values

**Key Feature**: Robust recursive loop that recalculates features at each step.

### Step 1: Train Backtest Models (Pre-2025 Data Only)

In [28]:
# Train on data before 2025
# Only rows whose booking year is earlier than 2025 are used for fitting.
pre_2025 = df_clean['booking_date'].dt.year < 2025
backtest_train = df_clean.loc[pre_2025].copy()
backtest_models = {}

print(f"Backtest training set: {len(backtest_train):,} rows")
print(f"Date range: {backtest_train['booking_date'].min().date()} to {backtest_train['booking_date'].max().date()}")
print("\nTraining backtest models...\n")

# Same feature matrix for every target
X_backtest = backtest_train[feature_cols]

for target in target_columns:
    print(f"  Training {target}...", end=" ")
    # fit() returns the fitted estimator, so it can be stored directly
    backtest_models[target] = XGBRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42
    ).fit(X_backtest, backtest_train[f'target_{target}'])
    print("✓")

print("\n✓ Backtest models trained on pre-2025 data")

Backtest training set: 11,639 rows
Date range: 2021-01-07 to 2024-12-31

Training backtest models...

  Training Air... ✓
  Training Express... ✓
  Training International... ✓
  Training Surface... ✓

✓ Backtest models trained on pre-2025 data


### Step 2: Recursive 2025 Forecast with Dynamic Feature Recalculation

**Critical Loop Logic**:
- For each day in 2025:
  1. Take the most recent history (the last 10 rows per company — enough to cover the 7-step lag and the 7-row rolling mean)
  2. Create placeholder rows for today
  3. **Recalculate ALL features dynamically** (lags, rolling means, calendar)
  4. Make predictions
  5. **Append predictions to history** (crucial for next iteration)

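Because every lag and rolling feature is computed only from rows before the current date — and, within 2025, those rows hold the model's own earlier predictions rather than actuals — the model uses only information available at prediction time.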

In [30]:
# Load original daily data (needed for maintaining full history)
df = pd.read_csv('../data/shipment_booking_data_2021_2025.csv')
df['booking_date'] = pd.to_datetime(df['booking_date'])

# Recreate df_daily for simulation:
# count bookings per (company, calendar day, shipment type), then pivot the
# shipment types into columns, filling absent combinations with 0.
daily_counts = (
    df.set_index('booking_date')
      .groupby(['company_name', pd.Grouper(freq='D')])['shipment_type']
      .value_counts()
      .unstack(fill_value=0)
)
df_daily = daily_counts.reset_index()
df_daily.columns.name = None
df_daily = df_daily.sort_values(['company_name', 'booking_date']).reset_index(drop=True)

# Ensure all target columns exist (a type absent from the raw data gets zeros)
missing_targets = [col for col in target_columns if col not in df_daily.columns]
for col in missing_targets:
    df_daily[col] = 0

# Add encoded company column (REQUIRED for simulation loop)
df_daily['company_encoded'] = le.transform(df_daily['company_name'])

print("df_daily recreated for simulation")
print(f"Shape: {df_daily.shape}")

df_daily recreated for simulation
Shape: (14607, 7)


In [31]:
# Define simulation parameters
# The backtest covers every calendar day from start_sim to end_sim inclusive.
start_sim, end_sim = pd.Timestamp('2025-01-01'), pd.Timestamp('2025-12-31')
sim_dates = pd.date_range(start=start_sim, end=end_sim, freq='D')

# Every company that appears in the raw booking data gets a forecast each day
companies = df['company_name'].unique()

print(f"Simulating {len(sim_dates)} days of 2025")
print(f"Companies: {len(companies)}")
print("\nStarting recursive forecast loop...\n")

Simulating 365 days of 2025
Companies: 8

Starting recursive forecast loop...



In [32]:
# Initialize simulation history with pre-2025 data
# Keep only the identifying columns and the raw daily counts; the lag and
# rolling features are recomputed from these inside the forecast loop.
history_columns = ['booking_date', 'company_name', 'company_encoded'] + target_columns
simulation_history = df_daily.loc[df_daily['booking_date'] < start_sim, history_columns].copy()

# Collects one record per (date, company) as the loop runs
predictions_2025 = []

print(f"Initial history: {len(simulation_history):,} rows")
print(f"Date range: {simulation_history['booking_date'].min().date()} to {simulation_history['booking_date'].max().date()}")

Initial history: 11,687 rows
Date range: 2021-01-01 to 2024-12-31


In [33]:
# MAIN RECURSIVE LOOP
# Walks through sim_dates one day at a time. For each day the lag/rolling/
# calendar features are rebuilt from simulation_history, every company is
# predicted in one batch per target, and the predictions are appended to
# simulation_history so that later days use them as lag/rolling inputs.
#
# NOTE: this cell appends to simulation_history and predictions_2025. Re-run
# the initialization cell before executing it a second time, otherwise the
# forecast is stacked on top of the previous run.

history_cols = ['booking_date', 'company_name', 'company_encoded'] + target_columns

# The company -> encoded id mapping never changes between days: build it once
# instead of calling the label encoder for every company on every day.
company_codes = dict(zip(companies, le.transform(companies)))

for i, cur_date in enumerate(sim_dates):
    if i % 30 == 0:
        print(f"Processing Month {cur_date.month} (Day {i+1}/{len(sim_dates)})...")

    # 1. Get recent history (last 10 rows per company). The 7-step lag and the
    #    7-row rolling mean need at most 7 earlier rows, so 10 is sufficient.
    recent = simulation_history.groupby('company_name').tail(10).copy()

    # 2. Create placeholder rows for current date. The zero targets are never
    #    read for today's features because every feature is shifted by >= 1 row.
    temp_rows = [
        {
            'booking_date': cur_date,
            'company_name': comp,
            'company_encoded': company_codes[comp],
            **{t: 0 for t in target_columns},  # Placeholder values
        }
        for comp in companies
    ]

    temp_df = pd.DataFrame(temp_rows)
    combined = pd.concat([recent, temp_df], ignore_index=True)
    combined = combined.sort_values(['company_name', 'booking_date'])

    # 3. RECALCULATE FEATURES DYNAMICALLY
    for col in target_columns:
        # Lag features
        combined[f'lag_1_{col}'] = combined.groupby('company_name')[col].shift(1)
        combined[f'lag_7_{col}'] = combined.groupby('company_name')[col].shift(7)
        # Rolling mean (shift(1) to avoid data leakage)
        combined[f'roll_7_{col}'] = combined.groupby('company_name')[col].transform(
            lambda x: x.shift(1).rolling(7, min_periods=1).mean()
        )

    # Calendar features
    combined['day_of_week'] = combined['booking_date'].dt.dayofweek
    combined['day'] = combined['booking_date'].dt.day
    combined['month'] = combined['booking_date'].dt.month

    # 4. Extract today's rows and predict all companies in one call per target
    todays_data = combined[combined['booking_date'] == cur_date].copy()
    X_today = todays_data[feature_cols]

    # Rows that will be appended to the history, with predictions as "actuals"
    preds_to_store = todays_data[history_cols].copy()

    day_preds = {}
    for t in target_columns:
        # Cast to float64 and clip at zero (non-negative constraint).
        day_preds[t] = backtest_models[t].predict(X_today).astype('float64').clip(min=0)
        # Replace the whole column rather than writing floats element-wise
        # into the integer placeholder column: element-wise .loc assignment
        # relies on implicit upcasting, which newer pandas versions deprecate.
        preds_to_store[t] = day_preds[t]

    # One output record per company, in the same (sorted) row order as todays_data
    for pos, comp in enumerate(todays_data['company_name']):
        res_row = {'Date': cur_date, 'Company': comp}
        for t in target_columns:
            res_row[f'Pred_{t}'] = float(day_preds[t][pos])
        predictions_2025.append(res_row)

    # 5. APPEND PREDICTIONS TO HISTORY (critical for next iteration)
    simulation_history = pd.concat([simulation_history, preds_to_store], ignore_index=True)

print("\n✓ Recursive forecast complete!")
print(f"Total predictions generated: {len(predictions_2025):,}")

Processing Month 1 (Day 1/365)...
Processing Month 1 (Day 31/365)...
Processing Month 3 (Day 61/365)...
Processing Month 4 (Day 91/365)...
Processing Month 5 (Day 121/365)...
Processing Month 5 (Day 151/365)...
Processing Month 6 (Day 181/365)...
Processing Month 7 (Day 211/365)...
Processing Month 8 (Day 241/365)...
Processing Month 9 (Day 271/365)...
Processing Month 10 (Day 301/365)...
Processing Month 11 (Day 331/365)...
Processing Month 12 (Day 361/365)...

✓ Recursive forecast complete!
Total predictions generated: 2,920


### Step 3: Compare Predictions vs Actuals

In [34]:
# Convert predictions to DataFrame
# predictions_2025 is a list of per-(date, company) dicts filled by the loop
pred_2025_df = pd.DataFrame(data=predictions_2025)

print(f"Predictions DataFrame shape: {pred_2025_df.shape}")
print(f"\nFirst few predictions:")
preview_rows = pred_2025_df.head(10)
display(preview_rows)

Predictions DataFrame shape: (2920, 6)

First few predictions:


Unnamed: 0,Date,Company,Pred_Air,Pred_Express,Pred_International,Pred_Surface
0,2025-01-01,BlueDart,3.592252,2.214094,0.905814,4.060779
1,2025-01-01,DHL Express,6.619059,3.559899,0.639675,7.229125
2,2025-01-01,DTDC,5.131885,2.686127,0.639777,6.09319
3,2025-01-01,Delhivery,6.774354,3.61094,1.250472,9.052608
4,2025-01-01,Ecom Express,5.027586,2.758713,0.718993,5.806388
5,2025-01-01,FedEx India,3.813979,2.078106,0.812825,4.170038
6,2025-01-01,Shadowfax,2.113458,1.512651,0.58458,2.226686
7,2025-01-01,XpressBees,3.585638,1.599782,0.393952,2.744924
8,2025-01-02,BlueDart,2.788623,1.878695,0.616988,4.658002
9,2025-01-02,DHL Express,6.066511,3.220142,0.752716,6.317264


In [35]:
# Get actual 2025 data
# Select the 2025 rows and the true target columns in one step, then rename
# the key columns so they line up with the prediction frame for merging.
actual_target_cols = [f'target_{t}' for t in target_columns]
in_2025 = df_clean['booking_date'].dt.year == 2025

actual_2025 = df_clean.loc[in_2025, ['booking_date', 'company_name'] + actual_target_cols].rename(
    columns={'booking_date': 'Date', 'company_name': 'Company'}
)

print(f"Actual 2025 data shape: {actual_2025.shape}")
print(f"\nFirst few actuals:")
display(actual_2025.head(10))

Actual 2025 data shape: (2912, 6)

First few actuals:


Unnamed: 0,Date,Company,target_Air,target_Express,target_International,target_Surface
1455,2025-01-01,BlueDart,3.0,0.0,0.0,4.0
1456,2025-01-02,BlueDart,2.0,2.0,0.0,3.0
1457,2025-01-03,BlueDart,8.0,1.0,1.0,7.0
1458,2025-01-04,BlueDart,6.0,6.0,0.0,3.0
1459,2025-01-05,BlueDart,3.0,3.0,1.0,5.0
1460,2025-01-06,BlueDart,2.0,0.0,0.0,5.0
1461,2025-01-07,BlueDart,2.0,2.0,0.0,0.0
1462,2025-01-08,BlueDart,6.0,3.0,0.0,3.0
1463,2025-01-09,BlueDart,6.0,2.0,0.0,1.0
1464,2025-01-10,BlueDart,3.0,4.0,2.0,7.0


In [36]:
# Merge predictions with actuals
# Inner join on (Date, Company): only pairs present in both frames are kept,
# so predictions without a matching actual row are dropped.
join_keys = ['Date', 'Company']
comparison_2025 = pred_2025_df.merge(actual_2025, on=join_keys, how='inner')

print(f"Comparison DataFrame shape: {comparison_2025.shape}")
print(f"\nColumns: {list(comparison_2025.columns)}")
print(f"\nFirst few comparisons:")
display(comparison_2025.head(10))

Comparison DataFrame shape: (2912, 10)

Columns: ['Date', 'Company', 'Pred_Air', 'Pred_Express', 'Pred_International', 'Pred_Surface', 'target_Air', 'target_Express', 'target_International', 'target_Surface']

First few comparisons:


Unnamed: 0,Date,Company,Pred_Air,Pred_Express,Pred_International,Pred_Surface,target_Air,target_Express,target_International,target_Surface
0,2025-01-01,BlueDart,3.592252,2.214094,0.905814,4.060779,3.0,0.0,0.0,4.0
1,2025-01-01,DHL Express,6.619059,3.559899,0.639675,7.229125,4.0,4.0,1.0,9.0
2,2025-01-01,DTDC,5.131885,2.686127,0.639777,6.09319,7.0,3.0,0.0,3.0
3,2025-01-01,Delhivery,6.774354,3.61094,1.250472,9.052608,12.0,3.0,0.0,3.0
4,2025-01-01,Ecom Express,5.027586,2.758713,0.718993,5.806388,4.0,4.0,0.0,6.0
5,2025-01-01,FedEx India,3.813979,2.078106,0.812825,4.170038,1.0,3.0,0.0,3.0
6,2025-01-01,Shadowfax,2.113458,1.512651,0.58458,2.226686,3.0,1.0,0.0,5.0
7,2025-01-01,XpressBees,3.585638,1.599782,0.393952,2.744924,4.0,2.0,1.0,3.0
8,2025-01-02,BlueDart,2.788623,1.878695,0.616988,4.658002,2.0,2.0,0.0,3.0
9,2025-01-02,DHL Express,6.066511,3.220142,0.752716,6.317264,5.0,3.0,0.0,7.0


In [37]:
# Quick metrics check
# Mean absolute error of the recursive forecast against the merged actuals,
# reported separately for each shipment type.
print("\nQuick MAE check for backtest:")
for target in target_columns:
    y_true = comparison_2025[f'target_{target}']
    y_pred = comparison_2025[f'Pred_{target}']
    mae = mean_absolute_error(y_true, y_pred)
    print(f"  {target:15s}: MAE = {mae:.2f}")


Quick MAE check for backtest:
  Air            : MAE = 1.79
  Express        : MAE = 1.37
  International  : MAE = 0.68
  Surface        : MAE = 1.86


## Save Models and Results

In [38]:
# Save models
# Bundle every fitted model and the XGBoost evaluation histories into a single
# pickle. pickle is already imported in the notebook's first cell, so it is
# not re-imported here.
models_path = '../models/trained_models.pkl'

models_to_save = {
    'rf_models': rf_models,
    'xgb_models': xgb_models,
    'backtest_models': backtest_models,
    'evals_results': evals_results
}

with open(models_path, 'wb') as f:
    pickle.dump(models_to_save, f)

# Report the same path that was written, so message and file cannot drift apart
print(f"Models saved to: {models_path}")

Models saved to: ../models/trained_models.pkl


In [39]:
# Save comparison data for evaluation
# Written without the index so the CSV contains only the merged columns.
comparison_2025.to_csv(path_or_buf='../data/backtest_comparison_2025.csv', index=False)
print("Backtest comparison saved to: backtest_comparison_2025.csv")

Backtest comparison saved to: backtest_comparison_2025.csv


---
## Training Summary

In [40]:
# Recap of the artifacts produced by this notebook; every count is read from
# the objects built above rather than hard-coded.
banner = "="*70

print(banner)
print("TRAINING SUMMARY")
print(banner)

# Section 1: models from the November 2025 split
print("\n1. INITIAL MODELS (Train/Test Split at Nov 2025)")
print(f"   - RandomForest models: {len(rf_models)}")
print(f"   - XGBoost models: {len(xgb_models)}")
print(f"   - Training curves saved: {len(evals_results)} shipment types")

# Section 2: models fitted on pre-2025 data only
print("\n2. BACKTEST MODELS (Trained on pre-2025 data)")
print(f"   - XGBoost models: {len(backtest_models)}")
print(f"   - Training period: 2021-2024")
print(f"   - Validation period: 2025 (365 days)")

# Section 3: size of the recursive simulation
print("\n3. RECURSIVE FORECAST")
print(f"   - Days simulated: {len(sim_dates)}")
print(f"   - Companies: {len(companies)}")
print(f"   - Total predictions: {len(predictions_2025):,}")

# Section 4: merged prediction/actual table
print("\n4. COMPARISON DATA")
print(f"   - Merged records: {len(comparison_2025):,}")
print(f"   - Columns: Predictions + Actuals for {len(target_columns)} shipment types")

print("\n" + banner)
print("✓ TRAINING PHASE COMPLETE")
print(banner)

TRAINING SUMMARY

1. INITIAL MODELS (Train/Test Split at Nov 2025)
   - RandomForest models: 4
   - XGBoost models: 4
   - Training curves saved: 4 shipment types

2. BACKTEST MODELS (Trained on pre-2025 data)
   - XGBoost models: 4
   - Training period: 2021-2024
   - Validation period: 2025 (365 days)

3. RECURSIVE FORECAST
   - Days simulated: 365
   - Companies: 8
   - Total predictions: 2,920

4. COMPARISON DATA
   - Merged records: 2,912
   - Columns: Predictions + Actuals for 4 shipment types

✓ TRAINING PHASE COMPLETE
