<a href="https://colab.research.google.com/github/Rijan4449/XGB-OOA/blob/main/XGBOO_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [2]:
import pandas as pd
import numpy as np
import time
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# -----------------------
# 1. Load dataset
# -----------------------
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/pinakafinal_final.csv")

# PH waters test set
with open("/content/drive/My Drive/Colab Notebooks/ph_waters.txt", "r", encoding="utf-8") as f:
    ph_waters = {line.strip() for line in f if line.strip()}

ph_test = df[df["waterbody_name"].isin(ph_waters)]
non_ph = df[~df["waterbody_name"].isin(ph_waters)]
test_size = int(0.2 * len(df))
if len(ph_test) < test_size:
    additional_needed = test_size - len(ph_test)
    extra_non_ph = non_ph.sample(n=additional_needed, random_state=42)
    test_set = pd.concat([ph_test, extra_non_ph])
    train_set = df.drop(test_set.index)
else:
    test_set = ph_test
    train_set = non_ph

In [7]:
# -----------------------
# 2. Define columns
# -----------------------
drop_cols = ['fish_id', 'common_name', 'status']
high_cardinality = ['species', 'waterbody_name']
low_cardinality = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'feeding_type']
numeric_cols = [
    'temp_max', 'weight_max', 'length_max', 'temp_pref_min', 'temp_pref_max',
    'fecundity_mean', 'fecundity_min', 'fecundity_max',
    'trophic_level_estimate', 'trophic_level',
    'wb_ph_min', 'wb_ph_max', 'wb_salinity_min', 'wb_salinity_max',
    'wb_do_min', 'wb_do_max', 'wb_bod_min', 'wb_bod_max',
    'wb_turbidity_min', 'wb_turbidity_max', 'wb_temp_min', 'wb_temp_max'
]

In [8]:
# -----------------------
# 3. Preprocessor
# -----------------------
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
low_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
high_card_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder())
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('low_cat', low_card_transformer, low_cardinality),
    ('high_cat', high_card_transformer, high_cardinality)
], remainder='drop')

In [9]:
#-----------------------
# 4. Risk discretization
# -----------------------
def discretize_risk(y):
    bins = [0, 0.33, 0.66, 1.0]
    labels = [0, 1, 2]  # 0=Low, 1=Medium, 2=High
    return pd.cut(y, bins=bins, labels=labels, include_lowest=True).astype(int)

In [10]:
# -----------------------
# 5. Fitness function for OOA
# -----------------------
def fitness_function(params, X, y, preprocessor, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_results = {
        "accuracy": [], "precision": [], "recall": [], "f1": [],
        "roc_auc": [], "logloss": [], "train_accuracy": []
    }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        y_tr_class = discretize_risk(y_tr)
        y_val_class = discretize_risk(y_val)

        # Preprocess
        X_tr_num = preprocessor.fit_transform(X_tr, y_tr_class)
        X_val_num = preprocessor.transform(X_val)

        # SMOTE
        smote = SMOTE(random_state=42)
        X_tr_res, y_tr_res = smote.fit_resample(X_tr_num, y_tr_class)

        # Train
        model = xgb.XGBClassifier(
            n_estimators=int(params["n_estimators"]),
            learning_rate=params["learning_rate"],
            max_depth=int(params["max_depth"]),
            subsample=params["subsample"],
            colsample_bytree=params["colsample_bytree"],
            gamma=params["gamma"],
            reg_lambda=params["reg_lambda"],
            objective="multi:softprob",
            eval_metric="mlogloss",
            use_label_encoder=False,
            random_state=42
        )
        model.fit(X_tr_res, y_tr_res)

        # Predictions
        y_pred_val = model.predict(X_val_num)
        y_pred_tr = model.predict(X_tr_res)
        y_val_probs = model.predict_proba(X_val_num)

        # Metrics
        cv_results["accuracy"].append(accuracy_score(y_val_class, y_pred_val))
        cv_results["precision"].append(precision_score(y_val_class, y_pred_val, average="weighted", zero_division=0))
        cv_results["recall"].append(recall_score(y_val_class, y_pred_val, average="weighted"))
        cv_results["f1"].append(f1_score(y_val_class, y_pred_val, average="weighted"))
        cv_results["roc_auc"].append(roc_auc_score(pd.get_dummies(y_val_class), y_val_probs, multi_class="ovr"))
        cv_results["logloss"].append(log_loss(pd.get_dummies(y_val_class), y_val_probs))
        cv_results["train_accuracy"].append(accuracy_score(y_tr_res, y_pred_tr))

    return np.mean(cv_results["roc_auc"]), cv_results


In [11]:
# -----------------------
# 6. OOA Search
# -----------------------
def run_OOA(X, y, preprocessor, max_iter=10, n_agents=5):
    agents = []
    for _ in range(n_agents):
        agents.append({
            "n_estimators": np.random.randint(100, 800),
            "learning_rate": np.random.uniform(0.01, 0.3),
            "max_depth": np.random.randint(3, 10),
            "subsample": np.random.uniform(0.5, 1.0),
            "colsample_bytree": np.random.uniform(0.5, 1.0),
            "gamma": np.random.uniform(0, 5),
            "reg_lambda": np.random.uniform(0.1, 10)
        })

    best_agent = None
    best_score = -np.inf
    best_cv_results = None

    for it in range(1, max_iter + 1):
        for i, agent in enumerate(agents):
            score, cv_results = fitness_function(agent, X, y, preprocessor)
            if score > best_score:
                best_score = score
                best_agent = agent.copy()
                best_cv_results = cv_results
        print(f"Iteration {it}/{max_iter}, Best ROC-AUC = {best_score:.4f}")

    return best_agent, best_score, best_cv_results


In [12]:
# -----------------------
# 7. Run OOA
# -----------------------
start_time = time.time()
best_params, best_score, best_cv_results = run_OOA(
    train_set.drop(columns=["invasion_risk_score"]),
    train_set["invasion_risk_score"],
    preprocessor,
    max_iter=15,
    n_agents=5
)
end_time = time.time()

print("\nBest OOA parameters:")
print(best_params)

# Fold-wise results
metrics = ["accuracy", "precision", "recall", "f1", "roc_auc", "logloss"]
for metric in metrics:
    print(f"\n{metric.upper()}:")
    for i, score in enumerate(best_cv_results[metric], 1):
        print(f"Fold-{i}: {score:.4f}")
    print(f"Mean {metric.upper()}: {np.mean(best_cv_results[metric]):.4f}")

# Generalization gap
gen_gap = np.array(best_cv_results["train_accuracy"]) - np.array(best_cv_results["accuracy"])
print("\nGENERALIZATION GAP (Train - Validation Accuracy) per fold:")
for i, gap in enumerate(gen_gap, 1):
    print(f"Fold-{i}: {gap:.4f}")
print(f"Mean Generalization Gap: {np.mean(gen_gap):.4f}")

# Timing stats
total_time = end_time - start_time
num_iterations = 15
time_per_iteration = total_time / num_iterations
val_scores = best_cv_results["accuracy"]
variance = np.var(val_scores)

print("\nEXPERIMENT 2 - TUNING STATISTICS")
print(f"Total tuning time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
print(f"Number of iterations/folds: {num_iterations}")
print(f"Time per iteration: {time_per_iteration:.2f} seconds")
print(f"Validation score mean: {np.mean(val_scores):.4f}")
print(f"Validation score variance: {variance:.6f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 1/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 2/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 3/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 4/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 5/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 6/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 7/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 8/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 9/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 10/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 11/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 12/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 13/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 14/15, Best ROC-AUC = 0.9142


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Iteration 15/15, Best ROC-AUC = 0.9142

Best OOA parameters:
{'n_estimators': 692, 'learning_rate': 0.2085099629741114, 'max_depth': 7, 'subsample': 0.7986881109219452, 'colsample_bytree': 0.659218561591741, 'gamma': 3.219911229701522, 'reg_lambda': 6.697812877353043}

ACCURACY:
Fold-1: 0.8720
Fold-2: 0.8533
Fold-3: 0.8773
Fold-4: 0.8747
Fold-5: 0.8850
Mean ACCURACY: 0.8725

PRECISION:
Fold-1: 0.8915
Fold-2: 0.8761
Fold-3: 0.8922
Fold-4: 0.8952
Fold-5: 0.9094
Mean PRECISION: 0.8929

RECALL:
Fold-1: 0.8720
Fold-2: 0.8533
Fold-3: 0.8773
Fold-4: 0.8747
Fold-5: 0.8850
Mean RECALL: 0.8725

F1:
Fold-1: 0.8759
Fold-2: 0.8572
Fold-3: 0.8800
Fold-4: 0.8786
Fold-5: 0.8895
Mean F1: 0.8763

ROC_AUC:
Fold-1: 0.9199
Fold-2: 0.8858
Fold-3: 0.9226
Fold-4: 0.9265
Fold-5: 0.9163
Mean ROC_AUC: 0.9142

LOGLOSS:
Fold-1: 0.6393
Fold-2: 0.7004
Fold-3: 0.6203
Fold-4: 0.6438
Fold-5: 0.6494
Mean LOGLOSS: 0.6506

GENERALIZATION GAP (Train - Validation Accuracy) per fold:
Fold-1: 0.1189
Fold-2: 0.1384
Fold-3: 0.1

In [13]:
# -----------------------
# 8. Final Train on full train_set + Test eval
# -----------------------
X_train_full = train_set.drop(columns=["invasion_risk_score", "status"])  # drop status too
y_train_full = discretize_risk(train_set["invasion_risk_score"])
X_train_full_num = preprocessor.fit_transform(X_train_full, y_train_full)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_full_num, y_train_full)

# Final model with best OOA params
final_model = xgb.XGBClassifier(
    n_estimators=int(best_params["n_estimators"]),
    learning_rate=best_params["learning_rate"],
    max_depth=int(best_params["max_depth"]),
    subsample=best_params["subsample"],
    colsample_bytree=best_params["colsample_bytree"],
    gamma=best_params["gamma"],
    reg_lambda=best_params["reg_lambda"],
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42
)
final_model.fit(X_train_res, y_train_res)

# -----------------------
# Train set evaluation (on resampled training set)
# -----------------------
y_train_pred = final_model.predict(X_train_res)
y_train_probs = final_model.predict_proba(X_train_res)

print("\nFinal Training Performance (with SMOTE):")
print("Accuracy:", accuracy_score(y_train_res, y_train_pred))
print("Precision:", precision_score(y_train_res, y_train_pred, average="weighted", zero_division=0))
print("Recall:", recall_score(y_train_res, y_train_pred, average="weighted"))
print("F1:", f1_score(y_train_res, y_train_pred, average="weighted"))
print("ROC-AUC:", roc_auc_score(pd.get_dummies(y_train_res), y_train_probs, multi_class="ovr"))
print("Logloss:", log_loss(pd.get_dummies(y_train_res), y_train_probs))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Final Training Performance (with SMOTE):
Accuracy: 0.9913294797687862
Precision: 0.9913481750150273
Recall: 0.9913294797687862
F1: 0.991335719728513
ROC-AUC: 0.9997253917604999
Logloss: 0.03813175802940425


In [14]:
# -----------------------
# Test set evaluation (unseen 20%)
# -----------------------
X_test = test_set.drop(columns=["invasion_risk_score", "status"])
y_test = discretize_risk(test_set["invasion_risk_score"])
X_test_num = preprocessor.transform(X_test)

y_pred = final_model.predict(X_test_num)
y_probs = final_model.predict_proba(X_test_num)

print("\nFinal Test Performance (unseen data):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1:", f1_score(y_test, y_pred, average="weighted"))
print("ROC-AUC:", roc_auc_score(pd.get_dummies(y_test), y_probs, multi_class="ovr"))
print("Logloss:", log_loss(pd.get_dummies(y_test), y_probs))



Final Test Performance (unseen data):
Accuracy: 0.7329059829059829
Precision: 0.8643498889561881
Recall: 0.7329059829059829
F1: 0.7447835030466751
ROC-AUC: 0.8518281122472162
Logloss: 1.2477963445856588
