In [1]:
# Step4_binary_prob_tuned_fixed.py

import os
import warnings
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

# ---------------------------
# Suppress warnings
# ---------------------------
# NOTE(review): this silences every warning for the whole process, not just
# known-noisy ones, so deprecation notices from the libraries are hidden too.
warnings.filterwarnings("ignore")

# ---------------------------
# Base directory (updated)
# ---------------------------
# The project root may be overridden through the CYBERTHREATS_BASE_DIR
# environment variable so the script can run on another machine; when the
# variable is unset or empty the original hard-coded location is used.
BASE_DIR = os.environ.get("CYBERTHREATS_BASE_DIR") or (
    r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML_Final_Project"
)

# Input CSV, and the output folders for fitted models and result tables.
RAW_DATA_PATH = os.path.join(BASE_DIR, "data", "raw", "Global_Cybersecurity_Threats_2015-2024 (1).csv")
MODEL_PATH = os.path.join(BASE_DIR, "models")
PROCESSED_PATH = os.path.join(BASE_DIR, "data", "processed")
# Create the output folders up front; exist_ok makes re-runs harmless.
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(PROCESSED_PATH, exist_ok=True)

# Seed passed to every split and estimator in this script.
RANDOM_STATE = 42

# ---------------------------
# Load raw dataset
# ---------------------------
df = pd.read_csv(RAW_DATA_PATH)
print("Loaded dataset shape:", df.shape)

# ---------------------------
# Target & features
# ---------------------------
target_col = "Financial Loss (in Million $)"

# A row without a target cannot be labelled: a NaN loss would make the
# percentile-based threshold computed later NaN and would compare as
# "not above threshold", silently landing in class 0. Drop such rows and say
# so; nothing is printed or changed when the target column is complete.
n_missing_target = int(df[target_col].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with missing '{target_col}'")
    df = df.dropna(subset=[target_col])

# Features are every remaining column; the target is kept as a NumPy array.
X_raw = df.drop(columns=[target_col])
y_raw = df[target_col].values

# ---------------------------
# Train-test split (no leakage)
# ---------------------------
# 80/20 split on the raw target, done before any feature engineering or
# preprocessing is fitted.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=RANDOM_STATE
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ---------------------------
# Feature engineering
# ---------------------------
# Raw input columns, grouped by how the preprocessing treats them.
categorical_features = ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']
numeric_features = ['Number of Affected Users', 'Incident Resolution Time (in Hours)']

# Add the two derived columns to both splits in place. Each value is computed
# row by row, so no statistic from one split reaches the other.
for df_ in (X_train, X_test):
    affected_users = df_['Number of Affected Users']
    resolution_hours = df_['Incident Resolution Time (in Hours)']
    # The +1 keeps the ratio defined when the resolution time is 0 hours.
    df_['Users_per_Hour'] = affected_users / (resolution_hours + 1)
    df_['Log_Users'] = np.log1p(affected_users)

# Numeric columns fed to the preprocessor: raw ones first, then the derived ones.
numeric_features_ext = [*numeric_features, 'Users_per_Hour', 'Log_Users']

# ---------------------------
# Preprocessing pipeline
# ---------------------------
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Column-wise preprocessing: the numeric branch on the extended numeric
# columns, and a dense one-hot encoding (unknown categories ignored) on the
# categorical columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, numeric_features_ext),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
])

# ---------------------------
# Threshold tuning for binary target
# ---------------------------
# Carve a 20% validation slice out of the training rows; the test split is
# not touched while choosing the loss threshold.
X_tr_sub, X_val_sub, y_tr_sub_raw, y_val_sub_raw = train_test_split(
    X_train, y_train_raw, random_state=RANDOM_STATE, test_size=0.2
)

best_thr = None
best_f1 = -1
for perc in range(40, 75, 5):  # candidate percentiles 40, 45, ..., 70
    # Candidate "High loss" threshold, taken from the fitting subset only and
    # then applied to both the fitting and the validation targets.
    thr = np.percentile(y_tr_sub_raw, perc)
    y_tr_bin = (y_tr_sub_raw > thr).astype(int)
    y_val_bin = (y_val_sub_raw > thr).astype(int)

    # Probe model: preprocessing plus a class-weighted random forest.
    rf_tmp = RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_estimators=200)
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', rf_tmp)])
    preds_val = pipe.fit(X_tr_sub, y_tr_bin).predict(X_val_sub)
    f1 = f1_score(y_val_bin, preds_val, average='macro')

    # Strict comparison: on a tie the earlier (lower) percentile is kept.
    if f1 > best_f1:
        best_thr = thr
        best_f1 = f1

print(f"✅ Selected High threshold (Million $): {best_thr:.2f} with Macro F1 {best_f1:.4f}")

# ---------------------------
# Create binary target
# ---------------------------
# Label 1 when the raw loss is strictly above the selected threshold, else 0;
# the same threshold is applied to the train and the test targets.
y_train = np.greater(y_train_raw, best_thr).astype(int)
y_test = np.greater(y_test_raw, best_thr).astype(int)

# Per-class row counts (index 0 = class 0, index 1 = class 1).
print("Train class distribution:", np.bincount(y_train))
print("Test class distribution:", np.bincount(y_test))

# ---------------------------
# Models
# ---------------------------
# Candidate classifiers, keyed by the name used in logs, result tables and
# saved-model file names. All share RANDOM_STATE.
# NOTE(review): RandomForest, ExtraTrees and LightGBM use
# class_weight="balanced"; XGBoost and CatBoost are configured without any
# class weighting — confirm that difference is intentional.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE, class_weight="balanced"),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=500, random_state=RANDOM_STATE, class_weight="balanced"),
    # `use_label_encoder` was dropped: it is a deprecated XGBoost argument
    # that current releases ignore (and warn about); labels here are already 0/1.
    "XGBoost": XGBClassifier(n_estimators=800, max_depth=6, learning_rate=0.03,
                             eval_metric='logloss',
                             random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(n_estimators=800, learning_rate=0.03, random_state=RANDOM_STATE, class_weight="balanced"),
    "CatBoost": CatBoostClassifier(iterations=900, depth=8, learning_rate=0.05,
                                   loss_function="Logloss", verbose=0, random_seed=RANDOM_STATE)
}

# ---------------------------
# Preprocess full training data
# ---------------------------
# Restrict both splits to the columns the preprocessor is configured for.
model_input_columns = numeric_features_ext + categorical_features
X_train_final = X_train[model_input_columns]
X_test_final = X_test[model_input_columns]

# Fit on the training rows only, then apply the fitted transform to the test rows.
X_train_proc = preprocessor.fit_transform(X_train_final)
X_test_proc = preprocessor.transform(X_test_final)

# Save preprocessor
preprocessor_file = os.path.join(MODEL_PATH, "preprocessor_step4_binary.joblib")
joblib.dump(preprocessor, preprocessor_file)

# ---------------------------
# Probability threshold tuning function
# ---------------------------
def tune_prob_threshold(model, X_tr, y_tr, X_val, y_val, name="model"):
    """Fit ``model`` and pick the score cutoff with the best validation macro F1.

    The model is fitted in place on ``(X_tr, y_tr)``. Validation scores are
    column 1 of ``predict_proba`` when the model has that method; otherwise
    the ``decision_function`` output is min-max scaled using the validation
    scores themselves. Thirteen evenly spaced cutoffs from 0.2 to 0.8 are
    tried, and a row is predicted positive when its score is strictly greater
    than the cutoff.

    Args:
        model: Estimator with ``fit`` and either ``predict_proba`` or
            ``decision_function``.
        X_tr: Features used to fit the model.
        y_tr: Binary labels for ``X_tr``.
        X_val: Features used to score each cutoff.
        y_val: Binary labels for ``X_val``.
        name: Label used only in the printed summary line.

    Returns:
        Tuple ``(best_cut, best_f1_score)``: the chosen cutoff and its macro
        F1 on the validation data. On ties the lowest cutoff wins.
    """
    model.fit(X_tr, y_tr)
    if hasattr(model, "predict_proba"):
        val_probs = model.predict_proba(X_val)[:, 1]
    else:  # fallback for models without predict_proba
        # Score the validation set once (previously four separate calls), then
        # min-max scale; the epsilon avoids 0/0 when all scores are equal.
        val_scores = model.decision_function(X_val)
        lowest_score = val_scores.min()
        val_probs = (val_scores - lowest_score) / (val_scores.max() - lowest_score + 1e-9)

    best_cut, best_f1_score = 0.5, -1
    for cut in np.linspace(0.2, 0.8, 13):
        preds_val = (val_probs > cut).astype(int)
        f1 = f1_score(y_val, preds_val, average='macro')
        # Strict '>' keeps the first (lowest) cutoff among equal scores.
        if f1 > best_f1_score:
            best_f1_score = f1
            best_cut = cut
    print(f"→ {name}: selected prob cutoff {best_cut:.2f} with val Macro F1 {best_f1_score:.4f}")
    return best_cut, best_f1_score

# ---------------------------
# Tune, train, and evaluate models
# ---------------------------
results = {}          # model name -> test metrics and the cutoff used
chosen_cutoffs = {}   # model name -> selected probability cutoff

# Validation slice used only to choose each model's probability cutoff.
# NOTE(review): X_train_proc comes from a preprocessor fitted on all training
# rows, so the validation rows contributed to its imputation/scaling
# statistics — confirm this is acceptable.
X_train_sub_proc, X_val_sub_proc, y_train_sub, y_val_sub = train_test_split(
    X_train_proc, y_train, test_size=0.2, random_state=RANDOM_STATE
)

for name, model in models.items():
    print(f"\nTuning probability threshold for {name}...")
    cut, val_f1 = tune_prob_threshold(model, X_train_sub_proc, y_train_sub, X_val_sub_proc, y_val_sub, name=name)
    chosen_cutoffs[name] = cut

    # Refit the same estimator object on every training row before testing.
    print(f"Training {name} on full training set...")
    model.fit(X_train_proc, y_train)

    if hasattr(model, "predict_proba"):
        test_probs = model.predict_proba(X_test_proc)[:, 1]
    else:
        # Score the test set once (previously four separate calls), then
        # min-max scale; the epsilon avoids 0/0 when all scores are equal.
        test_scores = model.decision_function(X_test_proc)
        lowest_score = test_scores.min()
        test_probs = (test_scores - lowest_score) / (test_scores.max() - lowest_score + 1e-9)

    # Positive when the score is strictly above the tuned cutoff.
    preds = (test_probs > cut).astype(int)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    rec = recall_score(y_test, preds, average='macro')
    cm = confusion_matrix(y_test, preds)

    print(f"{name} → Accuracy: {acc:.4f} | Macro F1: {f1:.4f} | Macro Recall: {rec:.4f} | Cutoff: {cut:.2f}")
    print("Confusion Matrix:\n", cm)

    results[name] = {"Accuracy": acc, "Macro_F1": f1, "Macro_Recall": rec, "Cutoff": cut}
    # Persist the estimator as fitted on the full training set.
    joblib.dump(model, os.path.join(MODEL_PATH, f"{name}_step4_binary_prob_tuned.joblib"))

# ---------------------------
# Save results
# ---------------------------
results_csv = os.path.join(PROCESSED_PATH, "step4_binary_prob_tuned.csv")
cutoffs_csv = os.path.join(PROCESSED_PATH, "step4_prob_cutoffs.csv")

# One row per model: its name in a "Model" column, then the metric columns.
results_df = pd.DataFrame(results).T.rename_axis("Model").reset_index()
results_df.to_csv(results_csv, index=False)

# Two-column table: model name and its chosen probability cutoff.
cutoffs_df = pd.Series(chosen_cutoffs, name="Cutoff").rename_axis("Model").reset_index()
cutoffs_df.to_csv(cutoffs_csv, index=False)

print("\n✅ STEP 4 COMPLETED SUCCESSFULLY")
print("Results saved to:", results_csv)
print("Models saved to:", MODEL_PATH)
print("Chosen probability cutoffs saved to:", cutoffs_csv)


Loaded dataset shape: (3000, 10)
Train shape: (2400, 9) Test shape: (600, 9)
✅ Selected High threshold (Million $): 56.28 with Macro F1 0.4949
Train class distribution: [1345 1055]
Test class distribution: [330 270]

Tuning probability threshold for RandomForest...
→ RandomForest: selected prob cutoff 0.45 with val Macro F1 0.5225
Training RandomForest on full training set...
RandomForest → Accuracy: 0.5233 | Macro F1: 0.5168 | Macro Recall: 0.5168 | Cutoff: 0.45
Confusion Matrix:
 [[192 138]
 [148 122]]

Tuning probability threshold for ExtraTrees...
→ ExtraTrees: selected prob cutoff 0.45 with val Macro F1 0.5437
Training ExtraTrees on full training set...
ExtraTrees → Accuracy: 0.4850 | Macro F1: 0.4800 | Macro Recall: 0.4800 | Cutoff: 0.45
Confusion Matrix:
 [[175 155]
 [154 116]]

Tuning probability threshold for XGBoost...
→ XGBoost: selected prob cutoff 0.40 with val Macro F1 0.5084
Training XGBoost on full training set...
XGBoost → Accuracy: 0.5300 | Macro F1: 0.5296 | Macro Re