In [1]:
# Step3_model_training_binary_fixed.py

import numpy as np
import pandas as pd
import os
import joblib
import warnings

# ---------------------------
# Suppress warnings
# ---------------------------
# Install a catch-all "ignore" filter before the ML libraries are imported,
# so warnings raised from this point on (including at import time) are not
# displayed.
# NOTE(review): this is a blanket filter; it hides every warning category,
# including ones that may be actionable. Narrow it by category if needed.
warnings.simplefilter("ignore")

# ---------------------------
# ML Imports
# ---------------------------
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ---------------------------
# Base directory (updated)
# ---------------------------
# Project root. Defaults to the original author's local checkout; set the
# CYBERTHREATS_BASE_DIR environment variable to run this script on another
# machine without editing the source. An unset or empty variable falls back
# to the default.
BASE_DIR = os.environ.get("CYBERTHREATS_BASE_DIR") or (
    r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML_Final_Project"
)

# ---------------------------
# Paths
# ---------------------------
# The processed .npy arrays are read from here, and the results CSV is
# written here as well.
PROCESSED_PATH = os.path.join(BASE_DIR, "data", "processed")
# Trained estimators are written here; create the directory up front so the
# joblib.dump calls in the training loop do not fail on a missing folder.
MODEL_PATH = os.path.join(BASE_DIR, "models")
os.makedirs(MODEL_PATH, exist_ok=True)

# ---------------------------
# Load processed datasets
# ---------------------------
# Feature matrices are loaded without pickle support.
X_train = np.load(os.path.join(PROCESSED_PATH, "X_train_binary.npy"))
X_test  = np.load(os.path.join(PROCESSED_PATH, "X_test_binary.npy"))

# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code embedded in the file. Only load label
# files produced by this project's own preprocessing step. If the labels can
# be re-saved with a plain numeric dtype, drop the flag.
y_train = np.load(os.path.join(PROCESSED_PATH, "y_train_binary.npy"), allow_pickle=True)
y_test  = np.load(os.path.join(PROCESSED_PATH, "y_test_binary.npy"), allow_pickle=True)

# Fail fast with a clear message if the saved arrays are out of sync, rather
# than letting the first model.fit / model.predict call raise a less obvious
# error.
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
    )
if len(X_test) != len(y_test):
    raise ValueError(
        f"X_test has {len(X_test)} rows but y_test has {len(y_test)} labels"
    )
if X_train.shape[1:] != X_test.shape[1:]:
    raise ValueError(
        f"Feature layout mismatch: train {X_train.shape[1:]} vs test {X_test.shape[1:]}"
    )

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# ---------------------------
# Define binary classifiers
# ---------------------------
# Class-imbalance handling, kept consistent across all five learners.
# RandomForest, ExtraTrees and LightGBM use class_weight="balanced".
# XGBoost has no such flag, so it gets the negative/positive ratio through
# scale_pos_weight; CatBoost gets auto_class_weights="Balanced". Without
# these, two of the five models would train on unweighted data and the
# comparison would not be like for like.
# Assumes the labels are encoded as 0 (negative) and 1 (positive), as shown
# by the classification report. TODO: confirm against the preprocessing step.
_n_pos = int(np.sum(y_train == 1))
_n_neg = int(np.sum(y_train == 0))
# Fall back to a neutral weight if the training split has no positives.
_scale_pos_weight = (_n_neg / _n_pos) if _n_pos else 1.0

# Candidate binary classifiers keyed by display name. The name is also used
# to build the saved model's filename. All use 500 trees/iterations and
# seed 42.
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=500, random_state=42, class_weight="balanced", n_jobs=-1
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=500, random_state=42, class_weight="balanced", n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=500, max_depth=6, learning_rate=0.03,
        use_label_encoder=False, eval_metric='logloss', random_state=42,
        scale_pos_weight=_scale_pos_weight
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=500, learning_rate=0.03, class_weight="balanced", random_state=42, n_jobs=-1
    ),
    "CatBoost": CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.03, loss_function="Logloss",
        verbose=0, random_seed=42, auto_class_weights="Balanced"
    )
}

# One summary row per model; used after the loop to build the results table.
results = []

# ---------------------------
# Train & evaluate models
# ---------------------------
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, then predict the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics: plain accuracy plus macro-averaged F1 and recall.
    acc = accuracy_score(y_test, y_pred)
    macro_f1, macro_recall = (
        scorer(y_test, y_pred, average="macro")
        for scorer in (f1_score, recall_score)
    )

    summary = f"{name} Results → Accuracy: {acc:.4f}, Macro F1: {macro_f1:.4f}, Macro Recall: {macro_recall:.4f}"
    print(summary)

    report = classification_report(y_test, y_pred, digits=4)
    print("\nClassification Report:\n", report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

    results.append(
        {
            "Model": name,
            "Accuracy": acc,
            "Macro_F1": macro_f1,
            "Macro_Recall": macro_recall,
        }
    )

    # Save the fitted estimator under the models directory.
    model_file = os.path.join(MODEL_PATH, f"{name}_binary_classifier.joblib")
    joblib.dump(model, model_file)

# ---------------------------
# Save results summary
# ---------------------------
# Build the comparison table and rank the models by macro F1, best first.
summary_table = pd.DataFrame(results)
results_df = summary_table.sort_values(by="Macro_F1", ascending=False)

# Write the ranking as CSV (no index column) into the processed-data folder.
results_file = os.path.join(PROCESSED_PATH, "model_results_step3_binary.csv")
results_df.to_csv(results_file, index=False)

# Final status lines: where the results table and the models were written.
print("\n✅ Step 3 (Binary) completed successfully!")
print("Results saved at:", results_file)
print("Models saved at:", MODEL_PATH)


Train shape: (2400, 65)
Test shape : (600, 65)

Training RandomForest...
RandomForest Results → Accuracy: 0.5883, Macro F1: 0.4594, Macro Recall: 0.4870

Classification Report:
               precision    recall  f1-score   support

           0     0.6396    0.8325    0.7234       388
           1     0.3158    0.1415    0.1954       212

    accuracy                         0.5883       600
   macro avg     0.4777    0.4870    0.4594       600
weighted avg     0.5252    0.5883    0.5369       600

Confusion Matrix:
 [[323  65]
 [182  30]]

Training ExtraTrees...
ExtraTrees Results → Accuracy: 0.5467, Macro F1: 0.4763, Macro Recall: 0.4805

Classification Report:
               precision    recall  f1-score   support

           0     0.6343    0.7062    0.6683       388
           1     0.3214    0.2547    0.2842       212

    accuracy                         0.5467       600
   macro avg     0.4778    0.4805    0.4763       600
weighted avg     0.5237    0.5467    0.5326       600
