In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, confusion_matrix
)
from dask.distributed import Client

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from joblib import parallel_backend

# ðŸŸ¢ Start Dask client for parallelization
client = Client()
print(client)

# ðŸ“‚ Load datasets
train_df = pd.read_csv("Training_Top3Features.csv")
test_df = pd.read_csv("Testing_Top3Features.csv")

X_train = train_df.drop(columns=['went_on_backorder'])
y_train = train_df['went_on_backorder']
X_test = test_df.drop(columns=['went_on_backorder'])
y_test = test_df['went_on_backorder']

print(f"âœ… Training shape: {X_train.shape}")
print(f"âœ… Testing shape: {X_test.shape}")

# ðŸŸ¢ Define models & their hyperparameters
models = {
    "CatBoost": (
        CatBoostClassifier(silent=True, random_state=42),
        {'depth': [4, 6], 'learning_rate': [0.01, 0.1], 'iterations': [50, 100]}
    ),
    "LGBM": (
        LGBMClassifier(random_state=42),
        {'num_leaves': [15, 31], 'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [5, None]}
    ),
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        {'max_depth': [3, 6], 'learning_rate': [0.01, 0.1], 'n_estimators': [50, 100]}
    ),
    "ANN": (
        MLPClassifier(random_state=42, max_iter=500),
        {
            'hidden_layer_sizes': [(14, 14, 10)],
            'activation': ['relu'],
            'learning_rate_init': [0.001, 0.01]
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7]}
    ),
    "SVM": (
        SVC(probability=True, random_state=42),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    ),
    "DecisionTree": (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [5, None], 'criterion': ['gini', 'entropy']}
    ),
}

results = []

# ðŸŸ¢ Train & evaluate each model
for name, (model, params) in models.items():
    print(f"\nðŸ”· Training {name} ...")
    with parallel_backend('dask'):
        clf = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=0)
        clf.fit(X_train, y_train)
        
    best_model = clf.best_estimator_
    preds = best_model.predict(X_test)
    probs = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    roc_auc = roc_auc_score(y_test, probs) if probs is not None else "N/A"
    
    print(f"âœ… {name} Best Params: {clf.best_params_}")
    print(f"âœ… {name} Test Accuracy: {acc:.4f}")
    print(f"âœ… {name} F1 Score: {f1:.4f}")
    print(f"âœ… {name} ROC AUC: {roc_auc}")
    print(f"âœ… {name} Confusion Matrix:\n{cm}")
    
    results.append({
        "Model": name,
        "Best Params": clf.best_params_,
        "Accuracy": acc,
        "F1": f1,
        "ROC AUC": roc_auc,
        "Confusion Matrix": cm.tolist()
    })

client.close()



<Client: 'tcp://127.0.0.1:51180' processes=4 threads=8, memory=8.00 GiB>
âœ… Training shape: (10000, 3)
âœ… Testing shape: (4000, 3)

ðŸ”· Training CatBoost ...
âœ… CatBoost Best Params: {'depth': 6, 'iterations': 100, 'learning_rate': 0.1}
âœ… CatBoost Test Accuracy: 0.8117
âœ… CatBoost F1 Score: 0.8046
âœ… CatBoost ROC AUC: 0.8740206249999999
âœ… CatBoost Confusion Matrix:
[[1697  303]
 [ 450 1550]]

ðŸ”· Training LGBM ...
[LightGBM] [Info] Number of positive: 3333, number of negative: 3333
[LightGBM] [Info] Number of positive: 3333, number of negative: 3333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001654 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 623
[LightGBM] [Info] Number of positive: 3333, number of negative: 3333
[LightG

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

âœ… XGBoost Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
âœ… XGBoost Test Accuracy: 0.8135
âœ… XGBoost F1 Score: 0.8111
âœ… XGBoost ROC AUC: 0.873692125
âœ… XGBoost Confusion Matrix:
[[1652  348]
 [ 398 1602]]

ðŸ”· Training ANN ...
âœ… ANN Best Params: {'activation': 'relu', 'hidden_layer_sizes': (14, 14, 10), 'learning_rate_init': 0.01}
âœ… ANN Test Accuracy: 0.8155
âœ… ANN F1 Score: 0.8112
âœ… ANN ROC AUC: 0.876204
âœ… ANN Confusion Matrix:
[[1677  323]
 [ 415 1585]]

ðŸ”· Training KNN ...
âœ… KNN Best Params: {'n_neighbors': 5}
âœ… KNN Test Accuracy: 0.8007
âœ… KNN F1 Score: 0.7881
âœ… KNN ROC AUC: 0.8428402500000001
âœ… KNN Confusion Matrix:
[[1721  279]
 [ 518 1482]]

ðŸ”· Training SVM ...
âœ… SVM Best Params: {'C': 0.1, 'kernel': 'linear'}
âœ… SVM Test Accuracy: 0.8080
âœ… SVM F1 Score: 0.8050
âœ… SVM ROC AUC: 0.867808
âœ… SVM Confusion Matrix:
[[1647  353]
 [ 415 1585]]

ðŸ”· Training DecisionTree ...
âœ… DecisionTree Best Params: {'criterion': 'gini