In [2]:
# ============================================
# SMART LOGISTICS DECISION SYSTEM
# Phase 2: Model Training & Risk Scoring
# ============================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print("Environment Ready ✅")

Environment Ready ✅


In [4]:
# ============================================
# LOAD CLEAN DATASET
# ============================================

df = pd.read_csv("../data/processed/clean_model_dataset.csv")

print("Dataset loaded successfully ✅")
print("Shape:", df.shape)

Dataset loaded successfully ✅
Shape: (1000, 26)


In [5]:
# ============================================
# DEFINE FEATURES (X) AND TARGET (y)
# ============================================

X = df.drop(columns=["Logistics_Delay"])
y = df["Logistics_Delay"]

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (1000, 25)
Target shape: (1000,)


In [6]:
# ============================================
# TRAIN-TEST SPLIT (80/20)
# ============================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (800, 25)
Test set size: (200, 25)


In [7]:
# ============================================
# IMPORT MODELS
# ============================================

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

print("Models imported successfully ✅")

Models imported successfully ✅


In [8]:
# ============================================
# DEFINE MODELS
# ============================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier()
}

print("Models defined successfully ✅")

Models defined successfully ✅


In [9]:
# ============================================
# MODEL BENCHMARKING
# ============================================

results = []

for name, model in models.items():
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict class
    y_pred = model.predict(X_test)
    
    # Predict probability
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    
    results.append([name, accuracy, precision, recall, f1, roc_auc])

# Convert to DataFrame
results_df = pd.DataFrame(results, columns=[
    "Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"
])

results_df.sort_values(by="ROC-AUC", ascending=False)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
0,Logistic Regression,0.765,0.902439,0.654867,0.758974,0.794731
1,Random Forest,0.74,0.886076,0.619469,0.729167,0.787661
2,Gradient Boosting,0.72,0.843373,0.619469,0.714286,0.758722
4,KNN,0.555,0.595238,0.663717,0.627615,0.531177
3,SVM,0.565,0.565,1.0,0.722045,0.479097
