In [25]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.metrics import accuracy_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [26]:
%store -r X_train
X_train = X_train

%store -r X_val
X_val = X_val

%store -r y_train
y_train = y_train

%store -r y_val
y_val = y_val

In [27]:
# 2. Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVC": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
}

In [28]:
# 3. Train, predict, evaluate with timing
results = []

for name, model in models.items():
    # Training time
    start_train = time.time()
    model.fit(X_train, y_train)
    end_train = time.time()
    
    # Prediction time
    start_pred = time.time()
    y_pred = model.predict(X_val)
    end_pred = time.time()
    
    # Metrics
    acc = accuracy_score(y_val, y_pred)
    train_time = end_train - start_train
    pred_time = end_pred - start_pred
    
    results.append({
        "Model": name,
        "Test Accuracy": acc,
        "Train Time (s)": train_time,
        "Prediction Time (s)": pred_time
    })
    
    print("="*50)
    print(f"🔹 {name}")
    print(f"Test Accuracy: {acc:.4f}")
    print(f"Training Time: {train_time:.4f} sec")
    print(f"Prediction Time: {pred_time:.6f} sec")
    print("Classification Report:\n", classification_report(y_val, y_pred))

🔹 Logistic Regression
Test Accuracy: 0.7654
Training Time: 0.0405 sec
Prediction Time: 0.000728 sec
Classification Report:
               precision    recall  f1-score   support

       False       0.78      0.74      0.76      1424
        True       0.75      0.79      0.77      1445

    accuracy                           0.77      2869
   macro avg       0.77      0.77      0.77      2869
weighted avg       0.77      0.77      0.77      2869

🔹 KNN
Test Accuracy: 0.7654
Training Time: 0.0020 sec
Prediction Time: 3.611320 sec
Classification Report:
               precision    recall  f1-score   support

       False       0.76      0.78      0.77      1424
        True       0.77      0.75      0.76      1445

    accuracy                           0.77      2869
   macro avg       0.77      0.77      0.77      2869
weighted avg       0.77      0.77      0.77      2869

🔹 SVC
Test Accuracy: 0.7881
Training Time: 9.9314 sec
Prediction Time: 0.890653 sec
Classification Report:
       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


🔹 XGBoost
Test Accuracy: 0.7975
Training Time: 0.3929 sec
Prediction Time: 0.003434 sec
Classification Report:
               precision    recall  f1-score   support

       False       0.81      0.77      0.79      1424
        True       0.78      0.82      0.80      1445

    accuracy                           0.80      2869
   macro avg       0.80      0.80      0.80      2869
weighted avg       0.80      0.80      0.80      2869

[LightGBM] [Info] Number of positive: 2933, number of negative: 2891
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 5824, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503606 -> initscore=0.014423
[LightGBM] [Info] Start training from score 0.014423
🔹 LightGBM
Test Accuracy: 0.7961
Training Time: 0.5373 sec
Prediction Ti

In [29]:
# 4. Compare results in a table
results_df = pd.DataFrame(results).sort_values(by="Test Accuracy", ascending=False)
print("\n📊 Model Comparison:\n", results_df)


📊 Model Comparison:
                  Model  Test Accuracy  Train Time (s)  Prediction Time (s)
7             CatBoost       0.799930       10.249419             0.014925
5              XGBoost       0.797490        0.392875             0.003434
6             LightGBM       0.796096        0.537251             0.008997
2                  SVC       0.788079        9.931385             0.890653
3        Random Forest       0.783548        1.175706             0.082987
1                  KNN       0.765423        0.001997             3.611320
0  Logistic Regression       0.765423        0.040546             0.000728
4          Naive Bayes       0.748344        0.011261             0.002666


In [30]:
# 2. Крос-валідація (StratifiedKFold зберігає баланс класів)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# 3. Список моделей
models = {
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
}

In [31]:
# 4. Оцінка моделей
results = []

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
    results.append({
        "Model": name,
        "Mean Accuracy": scores.mean(),
        "Std Dev": scores.std()
    })
    print("="*50)
    print(f"🔹 {name}")
    print("Accuracy per fold:", scores)
    print(f"Mean Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

[LightGBM] [Info] Number of positive: 2640, number of negative: 2601
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1651
[LightGBM] [Info] Number of data points in the train set: 5241, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503721 -> initscore=0.014883
[LightGBM] [Info] Start training from score 0.014883
[LightGBM] [Info] Number of positive: 2639, number of negative: 2602
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1653
[LightGBM] [Info] Number of data points in the train set: 5241, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503530 -> initscore=0.014120
[LightGBM] [Info] Start training from score 0.014120
[LightGBM] [Info] Numb

In [32]:
# 5. Порівняльна таблиця
results_df = pd.DataFrame(results).sort_values(by="Mean Accuracy", ascending=False)
print("\n📊 Cross-Validation Results:\n", results_df)


📊 Cross-Validation Results:
       Model  Mean Accuracy   Std Dev
1  CatBoost       0.812161  0.012951
0  LightGBM       0.809243  0.010953
