In [3]:
# Step3_model_training.ipynb

import numpy as np
import joblib
import os
import pandas as pd
import warnings

# ---------------------------
# Suppress warnings globally
# ---------------------------
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------
# Paths
# ---------------------------
PROCESSED_PATH = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\data\processed"
MODEL_PATH = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\models"
os.makedirs(MODEL_PATH, exist_ok=True)

# ---------------------------
# Load processed datasets
# ---------------------------
X_train = np.load(os.path.join(PROCESSED_PATH, "X_train.npy"))
X_test = np.load(os.path.join(PROCESSED_PATH, "X_test.npy"))
y_train = np.load(os.path.join(PROCESSED_PATH, "y_train.npy"))
y_test = np.load(os.path.join(PROCESSED_PATH, "y_test.npy"))

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# ---------------------------
# Define models (5 models only)
# ---------------------------
models = {
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=200, random_state=42, verbose=-1),  # suppress LightGBM logs
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42, verbosity=0)   # suppress XGBoost logs
}

results = {}

# ---------------------------
# Train and evaluate models
# ---------------------------
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}
    print(f"{name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

    # Save each model
    joblib.dump(model, os.path.join(MODEL_PATH, f"{name}.joblib"))

# ---------------------------
# Save results
# ---------------------------
results_df = pd.DataFrame(results).T
results_file = os.path.join(PROCESSED_PATH, "model_results_step3.csv")
results_df.to_csv(results_file, index=True)

print(f"\nStep 3 completed. Results saved at: {results_file}")
print(f"Models saved in: {MODEL_PATH}")

Train shape: (2400, 65)
Test shape: (600, 65)

Training CatBoost...
CatBoost -> RMSE: 30.40, MAE: 25.63, R²: -0.1432

Training LightGBM...
LightGBM -> RMSE: 30.97, MAE: 25.97, R²: -0.1859

Training RandomForest...
RandomForest -> RMSE: 29.51, MAE: 25.02, R²: -0.0768

Training GradientBoosting...
GradientBoosting -> RMSE: 29.72, MAE: 25.44, R²: -0.0920

Training XGBoost...
XGBoost -> RMSE: 33.83, MAE: 28.05, R²: -0.4153

Step 3 completed. Results saved at: C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\data\processed\model_results_step3.csv
Models saved in: C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\models
