# Predictive Freight Volume Forecasting

This project demonstrates time-series forecasting for freight volume prediction using machine learning. We generate synthetic freight data with seasonal patterns, economic indicators, and external factors, then train and evaluate multiple regression models to predict daily freight volumes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime, timedelta
import warnings

# Plot styling. The 'seaborn-v0_8-*' style names only exist in matplotlib >= 3.6;
# older releases ship the un-prefixed 'seaborn-*' names instead. Fall back to the
# legacy name rather than failing when the newer one is not registered.
_plot_style = 'seaborn-v0_8-darkgrid'
if _plot_style not in plt.style.available:
    _plot_style = 'seaborn-darkgrid'
plt.style.use(_plot_style)

# Suppress all warnings to keep the notebook output readable.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from the libraries used below are hidden too; narrow it when debugging.
warnings.filterwarnings('ignore')

## Generate Synthetic Freight Data

In [None]:
# Seed the global NumPy RNG so every run produces the same synthetic dataset.
# The order of the random draws below (noise, fuel, economy, weather) matters
# for reproducibility.
np.random.seed(42)

# Daily calendar from 2022-01-01 through 2024-12-31 (three years).
dates = pd.date_range('2022-01-01', '2024-12-31', freq='D')
n_days = dates.size
t = np.arange(n_days)  # running day counter: 0, 1, 2, ...

# --- Volume components ---
base_volume = 2000

# Yearly cycle modelled as a sine wave with a 365-day period.
seasonal = 500 * np.sin(2 * np.pi * t / 365)

# Saturdays and Sundays (dayofweek 5 and 6) carry a fixed penalty.
day_of_week = dates.dayofweek
weekly_pattern = np.where(day_of_week >= 5, -150, 0)

# Slow linear growth of about 2 units per year.
trend = 2 * t / 365

# Gaussian day-to-day noise (standard deviation 80).
noise = np.random.normal(0, 80, n_days)

# Sum the components, then floor at 100 so volumes stay positive.
volume = base_volume + seasonal + weekly_pattern + trend + noise
volume = np.maximum(volume, 100)

# --- Exogenous features ---
# Fuel price: random walk around 100 with a slight upward drift, floored at 80.
fuel_steps = np.random.randn(n_days) * 0.5
fuel_price_index = np.maximum(100 + np.cumsum(fuel_steps) + 0.1 * t / 365, 80)

# Economic indicator: a smoother random walk around 100, floored at 70.
economy_steps = np.random.randn(n_days) * 0.3
economic_indicator = np.maximum(100 + np.cumsum(economy_steps) + 0.05 * t / 365, 70)

# Weather severity: independent uniform draws on [0, 10).
weather_severity = np.random.uniform(0, 10, n_days)

# Holiday flag from a simplified, fixed month-day list of major US holidays.
holidays = ['01-01', '07-04', '12-25', '11-24', '11-25', '12-31']
month_day = dates.strftime('%m-%d')
is_holiday = month_day.isin(holidays).astype(int)

# Assemble everything into a single frame, one row per day.
df = pd.DataFrame(dict(
    date=dates,
    volume=volume,
    fuel_price_index=fuel_price_index,
    economic_indicator=economic_indicator,
    weather_severity=weather_severity,
    day_of_week=day_of_week,
    month=dates.month,
    is_holiday=is_holiday,
))

# Quick look at the first rows and the column dtypes.
print(df.head())
print()
df.info()

## Exploratory Data Analysis

In [None]:
# Four-panel overview of the raw data on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_series, ax_monthly), (ax_weekday, ax_hist) = axes

# Panel 1: the full daily volume series as a thin line.
ax_series.plot(df['date'], df['volume'], linewidth=0.8, alpha=0.8)
ax_series.set(title='Freight Volume Over Time', xlabel='Date', ylabel='Volume')

# Panel 2: mean volume per calendar month, pooled across years.
monthly_avg = df.groupby('month')['volume'].mean()
ax_monthly.bar(monthly_avg.index, monthly_avg.values, color='steelblue', edgecolor='black')
ax_monthly.set(title='Monthly Average Volume', xlabel='Month', ylabel='Average Volume')

# Panel 3: volume distribution per weekday.
# The weekday name is stored on df itself; an ordered categorical on a sorted
# copy keeps the boxes in Monday-to-Sunday order.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_name'] = df['date'].dt.day_name()
df_sorted = df.assign(
    day_name=pd.Categorical(df['day_name'], categories=day_order, ordered=True)
).sort_values('day_name')
sns.boxplot(data=df_sorted, x='day_name', y='volume', ax=ax_weekday)
ax_weekday.set(title='Volume by Day of Week', xlabel='Day of Week', ylabel='Volume')
ax_weekday.tick_params(axis='x', rotation=45)

# Panel 4: histogram of all daily volumes.
ax_hist.hist(df['volume'], bins=50, edgecolor='black', alpha=0.7)
ax_hist.set(title='Volume Distribution', xlabel='Volume', ylabel='Frequency')

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('eda_plots.png', dpi=150, bbox_inches='tight')
plt.show()

## Feature Engineering

In [None]:
print('Shape before feature engineering:', df.shape)

# Lag features: the volume observed 7, 14 and 28 days before each row's date.
for lag in [7, 14, 28]:
    df[f'volume_lag_{lag}'] = df['volume'].shift(lag)

# Rolling statistics must describe the past only. Rolling directly over
# `volume` would place the current day's value (the prediction target) inside
# every window and leak it into the features. Shifting by one day first makes
# each window end at the previous day.
past_volume = df['volume'].shift(1)

# Rolling means over the previous 7, 14 and 30 days.
for window in [7, 14, 30]:
    df[f'volume_rolling_mean_{window}'] = past_volume.rolling(window=window).mean()

# Rolling standard deviation over the previous 7 days.
df['volume_rolling_std_7'] = past_volume.rolling(window=7).std()

# Cyclical month encoding on a 12-month circle, so December and January
# end up adjacent rather than 11 units apart.
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# The first rows have no complete history for the lag/rolling features; drop them.
df = df.dropna()

print('Shape after feature engineering:', df.shape)

## Model Training & Evaluation

In [None]:
# Chronological hold-out: the final 90 rows form the test window and
# everything before them is used for training.
test_size = 90
train_df, test_df = df.iloc[:-test_size], df.iloc[-test_size:]

# Every column except the timestamp, the target and the weekday label is a feature.
exclude_cols = ['date', 'volume', 'day_name']
feature_cols = [col for col in df.columns if col not in exclude_cols]

X_train, y_train = train_df[feature_cols], train_df['volume']
X_test, y_test = test_df[feature_cols], test_df['volume']

# Candidate regressors; the tree ensembles use a fixed seed for repeatability.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Fit each model on the training window and score it on the held-out window.
results, predictions = {}, {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    predictions[name] = y_pred

    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R²': r2_score(y_test, y_pred),
        'MAPE (%)': np.mean(np.abs((y_test - y_pred) / y_test)) * 100,
    }

# Side-by-side metric table: one row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df.round(4))

# The model with the lowest test RMSE is reused by the later cells.
best_model_name = results_df['RMSE'].idxmin()

## Predictions vs Actuals

In [None]:
# Forecast of the best (lowest-RMSE) model and the spread of its test errors.
y_pred_best = predictions[best_model_name]
residuals = y_test.values - y_pred_best
residual_std = residuals.std()

# Shared x-axis values and the +/- one-standard-deviation band around the forecast.
test_dates = test_df['date'].values
band_low = y_pred_best - residual_std
band_high = y_pred_best + residual_std

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_dates, y_test.values, label='Actual', color='blue', linewidth=2)
ax.plot(
    test_dates,
    y_pred_best,
    label=f'Predicted ({best_model_name})',
    color='orange',
    linewidth=1.5,
    linestyle='--',
)
ax.fill_between(test_dates, band_low, band_high, alpha=0.3, color='orange', label='±1 Std Confidence')
ax.set(title='Predictions vs Actuals (Test Set)', xlabel='Date', ylabel='Volume')
ax.legend()

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('predictions_vs_actuals.png', dpi=150, bbox_inches='tight')
plt.show()

## Feature Importance

In [None]:
# Feature importances are only exposed by the tree-based models. Use the best
# model when it is tree-based; otherwise fall back to the Gradient Boosting
# model that was already fitted during training (no refit needed).
tree_models = ['Random Forest', 'Gradient Boosting']
if best_model_name in tree_models:
    best_tree = best_model_name
else:
    print(f'{best_model_name} does not have feature_importances_. Using Gradient Boosting for importance plot.')
    best_tree = 'Gradient Boosting'
model_for_importance = models[best_tree]

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart, then keep the 15 largest.
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_for_importance.feature_importances_
}).sort_values('importance', ascending=True)
top15 = importance.tail(15)

plt.figure(figsize=(10, 8))
plt.barh(top15['feature'], top15['importance'], color='steelblue', edgecolor='black')
plt.xlabel('Importance')
plt.title(f'Top 15 Feature Importance ({best_tree})')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## Conclusions

- **Model Performance**: The best-performing model (based on RMSE) provides reliable freight volume predictions. Tree-based models (Random Forest, Gradient Boosting) typically outperform linear regression when capturing non-linear relationships and interactions.

- **Key Drivers**: Lag features (recent volume history) and rolling statistics are among the most important predictors. Seasonal patterns (month encoding) and day-of-week effects also contribute significantly to forecast accuracy.

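- **Recommendations**: Consider retraining models periodically as new data becomes available. Explore additional real-world features — for example actual weather forecasts, published economic indicators, and supply chain events — to replace or complement the synthetic fuel, economic, and weather proxies used here. For production deployment, use time-ordered train/validation/test splits and time-series cross-validation (e.g., expanding-window splits) rather than random shuffling, to avoid overfitting and look-ahead leakage.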