# 🚢 Global Supply Chain Delay & Disruption Risk Prediction

This notebook builds a binary classification model to predict whether a shipment will be **delayed or on-time**.
It covers the full ML pipeline: data loading → EDA → feature engineering → model training → evaluation → export.

## 1. Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
import joblib

## 2. Load Data

In [None]:
# Read the raw shipment records into `df`.
# NOTE(review): the absolute `/content/...` location looks like a Colab upload
# directory — adjust the path when running elsewhere.
csv_path = '/content/global_supply_chain_disruption_v1.csv'
df = pd.read_csv(csv_path)
df.head()

## 3. Exploratory Data Analysis

Quick inspection of the dataset: shape, types, summary statistics, and column names.

In [None]:
# Size of the frame as (rows, columns), followed by the list of column names.
print('Shape:', df.shape)
print('\nColumn names:', list(df.columns), sep='\n')

In [None]:
# Per-column dtype and non-null count, to spot missing values and mistyped columns.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## 4. Feature Engineering

- Parse `Order_Date` into year, month, day, and day-of-week components.
- Drop `Order_ID` (non-informative identifier).
- Create binary target `Is_Delayed` (1 if `Delay_Days > 0`, else 0).
- Remove **data-leaky columns** that would not be available at prediction time.

In [None]:
# Convert `Order_Date` to datetime and derive calendar features from it,
# then drop the `Order_ID` identifier column.
order_dt = pd.to_datetime(df['Order_Date'])

df = (
    df
    .assign(
        Order_Date=order_dt,
        Order_Year=order_dt.dt.year,
        Order_Month=order_dt.dt.month,
        Order_Day=order_dt.dt.day,
        Order_DayOfweek=order_dt.dt.dayofweek,  # Monday = 0 ... Sunday = 6
    )
    .drop(columns='Order_ID')
)
df.head()

In [None]:
# Binary target: 1 when `Delay_Days` is strictly positive, otherwise 0.
# The comparison is False for missing values, so those rows map to 0.
df['Is_Delayed'] = df['Delay_Days'].gt(0).astype('int64')

# Share of each class, to see how imbalanced the target is.
target_share = df['Is_Delayed'].value_counts(normalize=True).round(3)
print('Target distribution (normalised):')
print(target_share)

In [None]:
# Columns treated as leakage (known only AFTER delivery) and therefore removed
# before modelling. `Delay_Days` is the column the target was derived from.
leaky_cols = [
    'Delay_Days', 'Actual_Lead_Time_Days', 'Delivery_Status',
    'Disruption_Event', 'Mitigation_Action_Taken',
]
df = df.drop(columns=leaky_cols)
df.head()

## 5. Data Preprocessing

- Select safe (non-leaky) features.
- Label-encode categorical columns.
- Split into train / test sets (80 / 20).
- Scale features with `StandardScaler` for the scale-sensitive models (Logistic Regression, SVM, KNN); the tree-based models use the unscaled features.

In [None]:
# Feature whitelist: only these columns are passed to the models.
safe_cols = [
    # route / shipment descriptors
    'Origin_City',
    'Destination_City',
    'Route_Type',
    'Transportation_Mode',
    'Product_Category',
    # planned lead times
    'Base_Lead_Time_Days',
    'Scheduled_Lead_Time_Days',
    # risk and cost indicators
    'Geopolitical_Risk_Index',
    'Weather_Severity_Index',
    'Inflation_Rate_Pct',
    'Shipping_Cost_USD',
    'Order_Weight_Kg',
    # calendar features derived from Order_Date
    'Order_Year',
    'Order_Month',
    'Order_Day',
    'Order_DayOfweek',
]

# Copies, so later encoding of `x` does not write back into `df`.
x = df.loc[:, safe_cols].copy()
y = df.loc[:, 'Is_Delayed'].copy()

print('Features:', list(x.columns))
print('\nTarget distribution:', y.value_counts(normalize=True).round(3), sep='\n')

In [None]:
# Label-encode categorical columns.
# A separate fitted encoder is kept per column in `label_encoders`. Re-fitting
# one shared LabelEncoder overwrites its `classes_` on every iteration, so only
# the last column's mapping would survive and the other columns could neither
# be decoded nor encoded consistently for new data at prediction time.
cat_cols = x.select_dtypes(include='object').columns

label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    x[col] = label_encoders[col].fit_transform(x[col])

# `le` stays bound to the last-fitted encoder (as it was before this change),
# in case another cell still refers to it.
le = label_encoders[cat_cols[-1]] if len(cat_cols) else LabelEncoder()
x.head()

In [None]:
# Train / test split (80 / 20).
# `stratify=y` keeps the delayed / on-time ratio the same in both splits; with
# an imbalanced target a purely random split can leave the test set with a
# different class mix, which distorts the reported metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)

print('Train shape:', x_train.shape)
print('Test shape: ', x_test.shape)

# Sanity check: both distributions should now match closely.
print('\nTrain target distribution:')
print(y_train.value_counts(normalize=True).round(3))
print('\nTest target distribution:')
print(y_test.value_counts(normalize=True).round(3))

In [None]:
# Standardise features for the scale-sensitive models (LR, SVM, KNN).
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to both splits so no test-set statistics leak into training.
sc = StandardScaler().fit(x_train)
x_train_sc = sc.transform(x_train)
x_test_sc = sc.transform(x_test)

## 6. Model Training

Five classifiers are trained and compared:
1. Logistic Regression
2. Random Forest
3. XGBoost
4. Support Vector Machine (SVM)
5. K-Nearest Neighbours (KNN)

In [None]:
# 1. Logistic Regression — trained on the scaled features.
# class_weight='balanced' re-weights classes inversely to their frequency.
lr = LogisticRegression(class_weight='balanced', max_iter=5000).fit(x_train_sc, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred_lr = lr.predict(x_test_sc)
y_prob_lr = lr.predict_proba(x_test_sc)[:, 1]

In [None]:
# 2. Random Forest — trained on the unscaled features.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=23,
).fit(x_train, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred_rf = rf.predict(x_test)
y_prob_rf = rf.predict_proba(x_test)[:, 1]

In [None]:
# 3. XGBoost — trained on the unscaled features.
# scale_pos_weight is derived from the training labels (negatives / positives)
# rather than hard-coded, so it tracks the actual class imbalance of the data
# and stays correct if the dataset or the split changes.
train_class_counts = y_train.value_counts()
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    scale_pos_weight=train_class_counts[0] / train_class_counts[1],
    eval_metric='logloss',
    random_state=23
)
xgb.fit(x_train, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred_xgb = xgb.predict(x_test)
y_prob_xgb = xgb.predict_proba(x_test)[:, 1]

In [None]:
# 4. Support Vector Machine — RBF kernel, trained on the scaled features.
# probability=True fits an internal cross-validated calibration whose data
# shuffling is random; fixing random_state makes the predicted probabilities
# (and therefore the reported ROC-AUC) reproducible across runs.
svm = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=23)
svm.fit(x_train_sc, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred_svm = svm.predict(x_test_sc)
y_prob_svm = svm.predict_proba(x_test_sc)[:, 1]

In [None]:
# 5. K-Nearest Neighbours (k = 5) — trained on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train_sc, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred_knn = knn.predict(x_test_sc)
y_prob_knn = knn.predict_proba(x_test_sc)[:, 1]

## 7. Model Evaluation & Comparison

All five models are evaluated on Accuracy and ROC-AUC, and a per-class classification report (precision, recall, F1) is printed for each.

In [None]:
# Map each model's short name to (hard predictions, positive-class probabilities).
models = dict(
    LR=(y_pred_lr, y_prob_lr),
    RF=(y_pred_rf, y_prob_rf),
    XGB=(y_pred_xgb, y_prob_xgb),
    SVM=(y_pred_svm, y_prob_svm),
    KNN=(y_pred_knn, y_prob_knn),
)

# Print accuracy, ROC-AUC and the per-class report for every model in turn.
for name in models:
    pred, prob = models[name]
    print(f'\n==== {name} ====')
    acc_value = accuracy_score(y_test, pred)
    print('Accuracy:', acc_value)
    auc_value = roc_auc_score(y_test, prob)  # AUC is computed from probabilities, not labels
    print('ROC-AUC:', auc_value)
    print(classification_report(y_test, pred))

In [None]:
# Summary comparison table: one row per model, ranked by ROC-AUC (best first).
summary_inputs = [
    ('Logistic Regression', y_pred_lr,  y_prob_lr),
    ('Random Forest',       y_pred_rf,  y_prob_rf),
    ('XGBoost',             y_pred_xgb, y_prob_xgb),
    ('SVM',                 y_pred_svm, y_prob_svm),
    ('KNN',                 y_pred_knn, y_prob_knn),
]

model_results = pd.DataFrame(
    [
        {
            'Model': label,
            'Accuracy': accuracy_score(y_test, preds),
            'ROC_AUC': roc_auc_score(y_test, probs),
        }
        for label, preds, probs in summary_inputs
    ]
).sort_values(by='ROC_AUC', ascending=False)

print(model_results)

In [None]:
# Feature importance — Random Forest
# Importances are labelled with the training column names and ranked from
# most to least important.
feat_imp_rf = (
    pd.Series(rf.feature_importances_, index=x_train.columns)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(6, 4))
feat_imp_rf.head(15).plot.barh(ax=ax)
ax.set_title('Top 15 Feature Importances (Random Forest)')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

## 8. Best Model — XGBoost (Tuned)

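Re-train XGBoost with adjusted hyperparameters (row and column subsampling, plus a `scale_pos_weight` computed from the training-set class ratio), evaluate with a confusion matrix, and inspect feature importances. Note that these values are set by hand; no hyperparameter search is run in this notebook.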

In [None]:
# Final XGBoost model
# The positive-class weight is the negative/positive count ratio of the
# training labels, which offsets the class imbalance.
train_counts = y_train.value_counts()
neg_to_pos_ratio = train_counts[0] / train_counts[1]

xgb_final = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,          # rows sampled per tree
    colsample_bytree=0.8,   # columns sampled per tree
    scale_pos_weight=neg_to_pos_ratio,
    eval_metric='logloss',
    random_state=42,
)
xgb_final.fit(x_train, y_train)

# Hard labels and positive-class (column 1) probabilities for the test split.
y_pred = xgb_final.predict(x_test)
y_prob = xgb_final.predict_proba(x_test)[:, 1]

final_accuracy = round(accuracy_score(y_test, y_pred), 4)
final_roc_auc = round(roc_auc_score(y_test, y_prob), 4)

print('=== FINAL XGBOOST MODEL ===')
print('Accuracy:', final_accuracy)
print('ROC-AUC :', final_roc_auc)
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the final XGBoost model: rows are the actual classes,
# columns the predicted ones.
class_names = ['On-Time', 'Delayed']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',          # integer counts in each cell
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix — XGBoost')
fig.tight_layout()
plt.show()

In [None]:
# Feature importances — XGBoost
# Importances of the final model, labelled with the training column names,
# ranked in descending order and limited to the top 15.
importance_by_feature = pd.Series(xgb_final.feature_importances_, index=x_train.columns)
feat_imp_xgb = importance_by_feature.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(6, 4))
feat_imp_xgb.plot.bar(ax=ax)
ax.set_title('Top 15 Feature Importances — XGBoost')
ax.set_ylabel('Importance Score')
fig.tight_layout()
plt.show()

## 9. Save Model

Serialise the trained XGBoost model to disk using `joblib`.

In [None]:
# Persist the fitted final model next to the notebook (path is relative to the
# current working directory).
model_path = 'xgb_supply_chain_model.pkl'
joblib.dump(xgb_final, model_path)
print('Model saved → ' + model_path)