In [1]:
import joblib

In [2]:
# ============================================
# SMART LOGISTICS DECISION SYSTEM
# Phase 2: Final Risk Model (Logistic Regression)
# ============================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import joblib

print("Environment Ready ✅")

Environment Ready ✅


In [3]:
# ============================================
# LOAD CLEAN DATASET
# ============================================

df = pd.read_csv("../data/processed/clean_model_dataset.csv")

print("Dataset Loaded ✅")
print("Shape:", df.shape)

Dataset Loaded ✅
Shape: (1000, 26)


In [4]:
# ============================================
# DEFINE FEATURES AND TARGET
# ============================================

X = df.drop(columns=["Logistics_Delay"])
y = df["Logistics_Delay"]

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (1000, 25)
Target shape: (1000,)


In [5]:
# ============================================
# TRAIN-TEST SPLIT
# ============================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training size:", X_train.shape)
print("Test size:", X_test.shape)

Training size: (800, 25)
Test size: (200, 25)


In [6]:
# ============================================
# FEATURE SCALING
# ============================================

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling complete ✅")

Feature scaling complete ✅


In [7]:
# ============================================
# TRAIN FINAL LOGISTIC REGRESSION MODEL
# ============================================

model = LogisticRegression(max_iter=2000, random_state=42)

model.fit(X_train_scaled, y_train)

print("Model trained successfully ✅")

Model trained successfully ✅


In [8]:
# ============================================
# EVALUATE MODEL
# ============================================

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

Accuracy: 0.77
Precision: 0.9135802469135802
Recall: 0.6548672566371682
F1 Score: 0.7628865979381443
ROC-AUC: 0.7881192147289187


In [9]:
# ============================================
# IMPORT GRIDSEARCH
# ============================================

from sklearn.model_selection import GridSearchCV

In [10]:
# ============================================
# DEFINE HYPERPARAMETER GRID
# ============================================

param_grid = {
    "C": [0.01, 0.1, 1, 5, 10, 50, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}

In [11]:
# ============================================
# GRID SEARCH FOR LOGISTIC REGRESSION
# ============================================

log_reg = LogisticRegression(max_iter=5000, random_state=42)

grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV ROC-AUC:", grid_search.best_score_)

Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV ROC-AUC: 0.7877382810426289




In [12]:
# ============================================
# USE BEST TUNED MODEL
# APPLY CUSTOM THRESHOLD = 0.6
# ============================================

best_model = grid_search.best_estimator_

#Evaluate on test set
threshold = 0.6

y_prob = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_custom = (y_prob >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred_custom)
precision = precision_score(y_test, y_pred_custom)
recall = recall_score(y_test, y_pred_custom)
f1 = f1_score(y_test, y_pred_custom)
roc_auc = roc_auc_score(y_test, y_prob)

print("Threshold:", threshold)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)



Threshold: 0.6
Accuracy: 0.775
Precision: 0.9857142857142858
Recall: 0.6106194690265486
F1 Score: 0.7540983606557377
ROC-AUC: 0.7987997151866545


In [13]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [14]:
# ============================================
# IMPORT XGBOOST
# ============================================

from xgboost import XGBClassifier

print("XGBoost imported successfully ✅")

XGBoost imported successfully ✅


In [15]:
# ============================================
# TRAIN BASELINE XGBOOST MODEL
# ============================================

xgb_model = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False
)

xgb_model.fit(X_train, y_train)

print("XGBoost model trained successfully ✅")

XGBoost model trained successfully ✅


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [16]:
# ============================================
# EVALUATE XGBOOST
# ============================================

y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_prob_xgb)

print("XGBoost Accuracy:", accuracy_xgb)
print("XGBoost Precision:", precision_xgb)
print("XGBoost Recall:", recall_xgb)
print("XGBoost F1 Score:", f1_xgb)
print("XGBoost ROC-AUC:", roc_auc_xgb)

XGBoost Accuracy: 0.695
XGBoost Precision: 0.7549019607843137
XGBoost Recall: 0.6814159292035398
XGBoost F1 Score: 0.7162790697674418
XGBoost ROC-AUC: 0.7682839995931239


In [17]:
# ============================================
# RETRAIN BEST LOGISTIC MODEL ON FULL DATA
# ============================================

# Define best model
final_model = LogisticRegression(
    C=0.01,
    penalty="l2",
    solver="lbfgs",
    max_iter=5000,
    random_state=42
)

# Scale full dataset
scaler_full = StandardScaler()
X_scaled_full = scaler_full.fit_transform(X)

# Train on entire dataset
final_model.fit(X_scaled_full, y)

print("Final Logistic Model Trained on Full Dataset ✅")

Final Logistic Model Trained on Full Dataset ✅




In [18]:
# ============================================
# GENERATE DELAY PROBABILITY
# ============================================

delay_probability = final_model.predict_proba(X_scaled_full)[:, 1]

df["delay_probability"] = delay_probability

print("Delay probability column added ✅")
df[["Logistics_Delay", "delay_probability"]].head()

Delay probability column added ✅


Unnamed: 0,Logistics_Delay,delay_probability
0,1,0.339254
1,1,0.870765
2,0,0.452376
3,1,0.84201
4,1,0.471364


In [19]:
# ============================================
# SAVE MODEL AND SCALER
# ============================================

import os
os.makedirs("E:/Projects/Smart-Logistics-System/models", exist_ok=True)

joblib.dump(final_model, "E:/Projects/Smart-Logistics-System/models/delay_model.pkl")
joblib.dump(scaler_full, "E:/Projects/Smart-Logistics-System/models/scaler.pkl")

print("Model and scaler saved successfully ✅")

Model and scaler saved successfully ✅


In [21]:
# ============================================
# SAVE DATASET WITH DELAY PROBABILITY
# ============================================

df.to_csv("E:/Projects/Smart-Logistics-System/data/processed/dataset_with_delay_probability.csv", index=False)

print("Dataset with delay probability saved ✅")

Dataset with delay probability saved ✅


In [22]:
# ============================================
# THRESHOLD OPTIMIZATION
# ============================================

import numpy as np

thresholds = np.arange(0.1, 0.9, 0.01)

results = []

for t in thresholds:
    y_pred_t = (y_prob >= t).astype(int)
    
    precision = precision_score(y_test, y_pred_t)
    recall = recall_score(y_test, y_pred_t)
    f1 = f1_score(y_test, y_pred_t)
    
    results.append([t, precision, recall, f1])

threshold_df = pd.DataFrame(results, columns=["Threshold", "Precision", "Recall", "F1"])

threshold_df.sort_values(by="F1", ascending=False).head()

Unnamed: 0,Threshold,Precision,Recall,F1
51,0.61,0.985714,0.610619,0.754098
49,0.59,0.985714,0.610619,0.754098
50,0.6,0.985714,0.610619,0.754098
60,0.7,1.0,0.60177,0.751381
54,0.64,1.0,0.60177,0.751381
