In [2]:
# Step4_feature_engineering_and_tuning.ipynb

import pandas as pd
import numpy as np
import os
import warnings

# ---------------------------
# Suppress warnings globally
# ---------------------------
warnings.filterwarnings("ignore")

# Also silence joblib loky backend warnings
import joblib
joblib.parallel_backend('loky', inner_max_num_threads=1)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------
# Paths
# ---------------------------
RAW_DATA_PATH = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\data\raw\Global_Cybersecurity_Threats_2015-2024 (1).csv"
PROCESSED_PATH = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\data\processed"
MODEL_PATH = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\models"
os.makedirs(MODEL_PATH, exist_ok=True)

# ---------------------------
# Load dataset
# ---------------------------
df = pd.read_csv(RAW_DATA_PATH)

# ---------------------------
# Feature engineering
# ---------------------------
numeric_features = ['Number of Affected Users', 'Incident Resolution Time (in Hours)']
categorical_features = ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']
target = 'Financial Loss (in Million $)'

df['AttackType_TargetIndustry'] = df['Attack Type'] + "_" + df['Target Industry']
categorical_features.append('AttackType_TargetIndustry')

df['Loss_per_User'] = df[target] / (df['Number of Affected Users'] + 1)
numeric_features.append('Loss_per_User')

X = df[numeric_features + categorical_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------
# Preprocessing
# ---------------------------
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ---------------------------
# Models + hyperparameter grids (5 models only)
# ---------------------------
models_params = {
    "CatBoost": (
        Pipeline([('preprocessor', preprocessor),
                  ('model', CatBoostRegressor(verbose=0, random_state=42))]),
        {"model__depth": [6, 8, 10],
         "model__learning_rate": [0.05, 0.1],
         "model__iterations": [200, 500]}
    ),
    "LightGBM": (
        Pipeline([('preprocessor', preprocessor),
                  ('model', LGBMRegressor(random_state=42, verbose=-1))]),
        {"model__n_estimators": [100, 200],
         "model__max_depth": [5, 10, -1],
         "model__learning_rate": [0.05, 0.1]}
    ),
    "RandomForest": (
        Pipeline([('preprocessor', preprocessor),
                  ('model', RandomForestRegressor(random_state=42))]),
        {"model__n_estimators": [100, 200],
         "model__max_depth": [10, 20, None]}
    ),
    "GradientBoosting": (
        Pipeline([('preprocessor', preprocessor),
                  ('model', GradientBoostingRegressor(random_state=42))]),
        {"model__n_estimators": [100, 200],
         "model__learning_rate": [0.05, 0.1],
         "model__max_depth": [3, 5]}
    ),
    "XGBoost": (
        Pipeline([('preprocessor', preprocessor),
                  ('model', XGBRegressor(random_state=42, verbosity=0))]),
        {"model__n_estimators": [100, 200],
         "model__max_depth": [5, 7],
         "model__learning_rate": [0.05, 0.1],
         "model__subsample": [0.8, 1.0]}
    )
}

results = {}

# ---------------------------
# Train, tune, evaluate
# ---------------------------
for name, (pipeline, params) in models_params.items():
    print(f"\nTuning {name}...")
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=params,
        n_iter=min(10, len(params)),
        cv=3,
        scoring='r2',
        random_state=42,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Best params:", best_params)
    print(f"{name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

    results[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}
    joblib.dump(best_model, os.path.join(MODEL_PATH, f"{name}_tuned.joblib"))

# ---------------------------
# Save results
# ---------------------------
results_df = pd.DataFrame(results).T
results_file = os.path.join(PROCESSED_PATH, "model_results_step4.csv")
results_df.to_csv(results_file, index=True)

print(f"\nStep 4 (5 models tuned) completed successfully!")
print(f"Results saved at: {results_file}")
print(f"Models saved in: {MODEL_PATH}")


Tuning CatBoost...
Best params: {'model__learning_rate': 0.05, 'model__iterations': 500, 'model__depth': 10}
CatBoost -> RMSE: 4.66, MAE: 1.96, R²: 0.9731

Tuning LightGBM...
Best params: {'model__n_estimators': 200, 'model__max_depth': 10, 'model__learning_rate': 0.1}
LightGBM -> RMSE: 3.54, MAE: 1.64, R²: 0.9845

Tuning RandomForest...
Best params: {'model__n_estimators': 100, 'model__max_depth': 10}
RandomForest -> RMSE: 4.76, MAE: 1.90, R²: 0.9720

Tuning GradientBoosting...
Best params: {'model__n_estimators': 200, 'model__max_depth': 3, 'model__learning_rate': 0.1}
GradientBoosting -> RMSE: 4.30, MAE: 2.15, R²: 0.9772

Tuning XGBoost...
Best params: {'model__subsample': 0.8, 'model__n_estimators': 200, 'model__max_depth': 7, 'model__learning_rate': 0.1}
XGBoost -> RMSE: 4.00, MAE: 1.53, R²: 0.9802

Step 4 (5 models tuned) completed successfully!
Results saved at: C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\data\processed\model_results_step4.csv
Models saved i