In [1]:
import pandas as pd

# ---- Step 1: Load the dataset ----
df = pd.read_csv("/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv")

# ---- Step 2: Print first few rows (optional) ----
print("Preview of data:")
print(df.head(), "\n")

# ---- Step 3: Print dataset info ----
print("INFO:")
print(df.info(), "\n")

# ---- Step 4: Print statistical summary ----
print("DESCRIBE:")
print(df.describe(include='all'))  # include='all' to show categorical stats also

Preview of data:
   customer_id       age  occupation_status  years_employed  annual_income  \
0            0  0.423077                  0        0.431078       0.045017   
1            1  0.288462                  0        0.182957       0.119519   
2            2  0.461538                  2        0.027569       0.024851   
3            3  0.673077                  2        0.012531       0.060200   
4            4  0.269231                  0        0.313283       0.207051   

   credit_score  credit_history_years  savings_assets  current_debt  \
0      0.685259              0.176667        0.002983      0.065897   
1      0.555777              0.116667        0.000563      0.100990   
2      0.679283              0.280000        0.000057      0.047721   
3      0.685259              0.326667        0.004933      0.070693   
4      0.561753              0.240000        0.000697      0.075721   

   defaults_on_file  delinquencies_last_2yrs  derogatory_marks  product_type  \
0      

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

# =====================================
# DATA PREPARATION
# =====================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =====================================
# GA CONFIG
# =====================================
population_size = 20
n_generations = 20
dim = 5

bounds = np.array([
    [100, 500],
    [1, 10],
    [0.01, 0.3],
    [1, 200],
    [1, 200]
])

mutation_rate = 0.2
crossover_rate = 0.7

hp_names = ["n_estimators", "max_depth", "learning_rate",
            "min_samples_split", "min_samples_leaf"]

# =====================================
# DECODE INDIVIDUAL → MODEL PARAMS
# =====================================
def decode(position):
    return {
        "n_estimators": int(position[0]),
        "max_depth": int(position[1]),
        "learning_rate": float(position[2]),
        "min_samples_split": int(position[3]),
        "min_samples_leaf": int(position[4]),
        "random_state": 42
    }

# =====================================
# FITNESS CACHE (SPEED BOOST)
# =====================================
fitness_cache = {}

def fitness_fn(position):
    key = tuple(position.round(4))  # cache key

    # If evaluated before, reuse score → huge speed boost
    if key in fitness_cache:
        return fitness_cache[key]

    params = decode(position)
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    acc  = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec  = recall_score(y_test, pred, zero_division=0)
    f1   = f1_score(y_test, pred, zero_division=0)

    score = (acc + prec + rec + f1) / 4.0
    fitness_cache[key] = score  # cache the result
    return score

# =====================================
# INIT POPULATION (vectorized)
# =====================================
def init_population():
    return bounds[:, 0] + (bounds[:, 1] - bounds[:, 0]) * np.random.rand(population_size, dim)

# =====================================
# SELECTION
# =====================================
def select(pop, scores):
    i, j = np.random.randint(0, population_size, 2)
    return pop[i] if scores[i] > scores[j] else pop[j]

# =====================================
# CROSSOVER
# =====================================
def crossover(p1, p2):
    if np.random.rand() > crossover_rate:
        return p1.copy(), p2.copy()

    point = np.random.randint(1, dim)
    return (
        np.concatenate((p1[:point], p2[point:])),
        np.concatenate((p2[:point], p1[point:]))
    )

# =====================================
# MUTATION (vectorized)
# =====================================
def mutate(child):
    mask = np.random.rand(dim) < mutation_rate
    random_values = np.random.uniform(bounds[:, 0], bounds[:, 1])
    child[mask] = random_values[mask]
    return child

# =====================================
# GA MAIN LOOP
# =====================================
population = init_population()
fitness_scores = np.zeros(population_size)

for gen in range(n_generations):

    # Compute fitness
    for i in range(population_size):
        fitness_scores[i] = fitness_fn(population[i])

    best_idx = np.argmax(fitness_scores)
    print(f"Generation {gen+1}/{n_generations} → Best Fitness = {fitness_scores[best_idx]:.4f}")

    # GENERATE NEW POPULATION
    new_pop = []

    while len(new_pop) < population_size:
        p1 = select(population, fitness_scores)
        p2 = select(population, fitness_scores)

        c1, c2 = crossover(p1, p2)

        new_pop.append(mutate(c1))
        new_pop.append(mutate(c2))

    population = np.array(new_pop[:population_size])

# =====================================
# BEST PARAMETERS
# =====================================
best_idx = np.argmax(fitness_scores)
best_params = decode(population[best_idx])

print("\nBEST PARAMETERS FOUND BY GA:")
for k, v in best_params.items():
    print(f" {k}: {v}")

# =====================================
# FINAL MODEL
# =====================================
final_model = GradientBoostingClassifier(**best_params)
final_model.fit(X_train, y_train)
pred = final_model.predict(X_test)

acc = accuracy_score(y_test, pred)
prec = precision_score(y_test, pred, zero_division=0)
rec  = recall_score(y_test, pred, zero_division=0)
f1   = f1_score(y_test, pred, zero_division=0)

print("\nFINAL METRICS:")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")

# =====================================
# FEATURE IMPORTANCE
# =====================================
importances = final_model.feature_importances_
fi_df = pd.DataFrame({"Feature": X.columns, "Importance": importances})
fi_df = fi_df.sort_values(by="Importance", ascending=False)

threshold = np.mean(importances)
selected_features = fi_df[fi_df["Importance"] > threshold]["Feature"].tolist()

print("\nSELECTED FEATURES:")
for f in selected_features:
    print(" -", f)

Generation 1/20 → Best Fitness = 0.9338
Generation 2/20 → Best Fitness = 0.9342
Generation 3/20 → Best Fitness = 0.9342


KeyboardInterrupt: 

In [2]:
import pandas as pd

# ---- Step 1: Load the dataset ----
df = pd.read_csv("/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv")

# ---- Step 2: Print first few rows (optional) ----
print("Preview of data:")
print(df.head(), "\n")

# ---- Step 3: Print dataset info ----
print("INFO:")
print(df.info(), "\n")

# ---- Step 4: Print statistical summary ----
print("DESCRIBE:")
print(df.describe(include='all'))  # include='all' to show categorical stats also

Preview of data:
   customer_id       age  occupation_status  years_employed  annual_income  \
0            0  0.423077                  0        0.431078       0.045017   
1            1  0.288462                  0        0.182957       0.119519   
2            2  0.461538                  2        0.027569       0.024851   
3            3  0.673077                  2        0.012531       0.060200   
4            4  0.269231                  0        0.313283       0.207051   

   credit_score  credit_history_years  savings_assets  current_debt  \
0      0.685259              0.176667        0.002983      0.065897   
1      0.555777              0.116667        0.000563      0.100990   
2      0.679283              0.280000        0.000057      0.047721   
3      0.685259              0.326667        0.004933      0.070693   
4      0.561753              0.240000        0.000697      0.075721   

   defaults_on_file  delinquencies_last_2yrs  derogatory_marks  product_type  \
0      

In [3]:
# ============================================================
# Harmony Search (HS) + Gradient Boosting for Feature Selection
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier

# ============================================================
# 1. Prepare dataset
# ============================================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

feature_count = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 2. Harmony Search Parameters
# ============================================================
hms = 15                # Harmony Memory Size
iterations = 10
hmcr = 0.9             # Harmony Memory Consideration Rate
par = 0.3              # Pitch Adjustment Rate
bw = 0.1               # Bandwidth (small perturbation)

# ============================================================
# 3. Fitness Function (GradientBoosting)
# ============================================================
def fitness(solution):
    if np.sum(solution) == 0:
        return 0  # invalid solution

    selected_cols = X.columns[solution == 1]

    Xtr = X_train[selected_cols]
    Xte = X_test[selected_cols]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

    model.fit(Xtr, y_train)
    pred = model.predict(Xte)

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)

    return (acc + prec + rec + f1) / 4


# ============================================================
# 4. Initialize Harmony Memory (HM)
# ============================================================
def initialize_harmony_memory():
    HM = []
    scores = []

    for _ in range(hms):
        sol = np.random.randint(0, 2, feature_count)
        score = fitness(sol)
        HM.append(sol)
        scores.append(score)

    return np.array(HM), np.array(scores)


# ============================================================
# 5. Generate New Harmony
# ============================================================
def generate_new_harmony(HM):
    new_harmony = np.zeros(feature_count)

    for f in range(feature_count):

        if np.random.rand() < hmcr:  
            # choose from memory
            idx = np.random.randint(0, hms)
            value = HM[idx][f]

            # Pitch Adjustment
            if np.random.rand() < par:
                value = 1 - value  # flip bit

        else:
            value = np.random.randint(0, 2)

        new_harmony[f] = value

    return new_harmony


# ============================================================
# 6. Harmony Search Optimization
# ============================================================
def harmony_search():

    HM, scores = initialize_harmony_memory()
    best_index = np.argmax(scores)
    best_solution = HM[best_index].copy()
    best_score = scores[best_index]

    for itr in range(iterations):

        new_harmony = generate_new_harmony(HM)
        new_score = fitness(new_harmony)

        # Replace worst if new harmony is better
        worst_index = np.argmin(scores)
        if new_score > scores[worst_index]:
            HM[worst_index] = new_harmony
            scores[worst_index] = new_score

        # Update global best
        if new_score > best_score:
            best_solution = new_harmony.copy()
            best_score = new_score

        print(f"Iteration {itr+1}/{iterations} - Best Fitness = {best_score:.4f}")

    return best_solution, best_score


# ============================================================
# 7. Run Harmony Search
# ============================================================
best_solution, best_score = harmony_search()

selected_features = X.columns[best_solution == 1].tolist()

print("\n==============================")
print(" BEST FEATURES SELECTED BY HS")
print("==============================")
print("Total Features :", len(X.columns))
print("Selected       :", len(selected_features))
print(selected_features)


# ============================================================
# 8. Final Gradient Boosting Model on Selected Features
# ============================================================
Xtr_final = X_train[selected_features]
Xte_final = X_test[selected_features]

final_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

final_model.fit(Xtr_final, y_train)
final_pred = final_model.predict(Xte_final)

acc = accuracy_score(y_test, final_pred)
prec = precision_score(y_test, final_pred, zero_division=0)
rec = recall_score(y_test, final_pred, zero_division=0)
f1 = f1_score(y_test, final_pred, zero_division=0)

print("\n==============================")
print("   FINAL GBC RESULTS")
print("==============================")
print(f"Accuracy :  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AVG Score: {(acc + prec + rec + f1) / 4:.4f}")

Iteration 1/10 - Best Fitness = 0.9081
Iteration 2/10 - Best Fitness = 0.9081
Iteration 3/10 - Best Fitness = 0.9081
Iteration 4/10 - Best Fitness = 0.9081
Iteration 5/10 - Best Fitness = 0.9081
Iteration 6/10 - Best Fitness = 0.9081
Iteration 7/10 - Best Fitness = 0.9081
Iteration 8/10 - Best Fitness = 0.9081
Iteration 9/10 - Best Fitness = 0.9081
Iteration 10/10 - Best Fitness = 0.9115

 BEST FEATURES SELECTED BY HS
Total Features : 19
Selected       : 12
['customer_id', 'age', 'occupation_status', 'years_employed', 'credit_score', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'loan_intent', 'debt_to_income_ratio', 'loan_to_income_ratio']

   FINAL GBC RESULTS
Accuracy :  0.9037
Precision: 0.8995
Recall   : 0.9288
F1-score : 0.9139
AVG Score: 0.9115


In [4]:
# ============================================================
# Whale Optimization Algorithm (WOA) + Gradient Boosting
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier

# ============================================================
# 1. Dataset
# ============================================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

feature_count = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 2. WOA Parameters
# ============================================================
population_size = 20
iterations = 10

# ============================================================
# 3. Fitness Function (GradientBoosting)
# ============================================================
def fitness(solution):

    if np.sum(solution) == 0:
        return 0

    selected_cols = X.columns[solution == 1]

    Xtr = X_train[selected_cols]
    Xte = X_test[selected_cols]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

    model.fit(Xtr, y_train)
    pred = model.predict(Xte)

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)

    return (acc + prec + rec + f1) / 4


# ============================================================
# 4. Initialize Population
# ============================================================
def initialize_population():
    pop = np.random.randint(0, 2, (population_size, feature_count))
    scores = np.array([fitness(ind) for ind in pop])
    return pop, scores


# ============================================================
# 5. Whale Optimization Algorithm
# ============================================================
def woa():

    population, scores = initialize_population()

    # Best whale
    best_index = np.argmax(scores)
    best_solution = population[best_index].copy()
    best_score = scores[best_index]

    for itr in range(iterations):

        a = 2 - itr * (2 / iterations)   # linearly decreasing a

        for i in range(population_size):

            A = 2 * a * np.random.rand() - a
            C = 2 * np.random.rand()
            p = np.random.rand()
            b = 1
            l = np.random.uniform(-1, 1)

            whale = population[i].copy()

            if p < 0.5:

                if abs(A) < 1:
                    # Exploitation
                    D = abs(C * best_solution - whale)
                    new_position = best_solution - A * D

                else:
                    # Select random whale
                    rand_idx = np.random.randint(0, population_size)
                    random_whale = population[rand_idx]

                    D = abs(C * random_whale - whale)
                    new_position = random_whale - A * D

            else:
                # Spiral updating
                D = abs(best_solution - whale)
                new_position = (D * np.exp(b * l) * np.cos(2 * np.pi * l)) + best_solution

            # Convert to binary (sigmoid threshold)
            new_binary = 1 / (1 + np.exp(-new_position))
            new_binary = (new_binary > 0.5).astype(int)

            population[i] = new_binary
            scores[i] = fitness(new_binary)

        # Update global best
        best_index = np.argmax(scores)
        if scores[best_index] > best_score:
            best_score = scores[best_index]
            best_solution = population[best_index].copy()

        print(f"Iteration {itr+1}/{iterations} - Best Fitness = {best_score:.4f}")

    return best_solution, best_score


# ============================================================
# 6. Run WOA Feature Selection
# ============================================================
best_solution, best_score = woa()

selected_features = X.columns[best_solution == 1].tolist()

print("\n==============================")
print(" BEST FEATURES SELECTED BY WOA")
print("==============================")
print("Total Features :", len(X.columns))
print("Selected       :", len(selected_features))
print(selected_features)


# ============================================================
# 7. Final Gradient Boosting Model
# ============================================================
Xtr_final = X_train[selected_features]
Xte_final = X_test[selected_features]

final_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

final_model.fit(Xtr_final, y_train)
final_pred = final_model.predict(Xte_final)

acc = accuracy_score(y_test, final_pred)
prec = precision_score(y_test, final_pred, zero_division=0)
rec = recall_score(y_test, final_pred, zero_division=0)
f1 = f1_score(y_test, final_pred, zero_division=0)

print("\n==============================")
print("   FINAL GBC RESULTS")
print("==============================")
print(f"Accuracy :  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AVG Score: {(acc + prec + rec + f1) / 4:.4f}")

Iteration 1/10 - Best Fitness = 0.9244
Iteration 2/10 - Best Fitness = 0.9252
Iteration 3/10 - Best Fitness = 0.9252
Iteration 4/10 - Best Fitness = 0.9252
Iteration 5/10 - Best Fitness = 0.9252
Iteration 6/10 - Best Fitness = 0.9252
Iteration 7/10 - Best Fitness = 0.9252
Iteration 8/10 - Best Fitness = 0.9252
Iteration 9/10 - Best Fitness = 0.9252
Iteration 10/10 - Best Fitness = 0.9252

 BEST FEATURES SELECTED BY WOA
Total Features : 19
Selected       : 19
['customer_id', 'age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']

   FINAL GBC RESULTS
Accuracy :  0.9186
Precision: 0.9124
Recall   : 0.9426
F1-score : 0.9273
AVG Score: 0.9252


In [5]:
# ============================================================
# Grey Wolf Optimizer (GWO) + Gradient Boosting Classifier
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier

# ============================================================
# 1. Prepare dataset
# ============================================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

feature_count = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 2. GWO Parameters
# ============================================================
wolves = 12
iterations = 5


# ============================================================
# 3. Fitness Function (Gradient Boosting)
# ============================================================
def fitness(solution):

    if np.sum(solution) == 0:
        return 0  # invalid → no features selected

    selected_cols = X.columns[solution == 1]

    Xtr = X_train[selected_cols]
    Xte = X_test[selected_cols]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

    model.fit(Xtr, y_train)
    pred = model.predict(Xte)

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)

    return (acc + prec + rec + f1) / 4


# ============================================================
# 4. Initialize Population
# ============================================================
def init_population():
    return np.random.randint(0, 2, (wolves, feature_count))


# ============================================================
# 5. Binary Grey Wolf Optimizer
# ============================================================
def gwo_feature_selection():

    population = init_population()
    fitness_scores = np.zeros(wolves)

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1

    for itr in range(iterations):

        # Evaluate population
        for i in range(wolves):

            fitness_scores[i] = fitness(population[i])

            if fitness_scores[i] > Alpha_score:
                Alpha_score = fitness_scores[i]
                Alpha = population[i].copy()

            elif fitness_scores[i] > Beta_score:
                Beta_score = fitness_scores[i]
                Beta = population[i].copy()

            elif fitness_scores[i] > Delta_score:
                Delta_score = fitness_scores[i]
                Delta = population[i].copy()

        print(f"Iteration {itr+1}/{iterations} - Best Fitness = {Alpha_score:.4f}")

        a = 2 - itr * (2 / iterations)  # a decreases from 2 → 0

        # Update positions
        for i in range(wolves):
            for d in range(feature_count):

                # ---------------- Alpha Update ----------------
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a
                C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - population[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                # ---------------- Beta Update ----------------
                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a
                C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - population[i][d])
                X2 = Beta[d] - A2 * D_beta

                # ---------------- Delta Update ----------------
                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a
                C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - population[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3

                # ---------------- Binary Sigmoid ----------------
                s = 1 / (1 + np.exp(-new_pos))
                population[i][d] = 1 if np.random.rand() < s else 0

    return Alpha, Alpha_score


# ============================================================
# 6. Run GWO Feature Selection
# ============================================================
best_solution, best_score = gwo_feature_selection()

selected_features = X.columns[best_solution == 1].tolist()

print("\n==============================")
print(" BEST FEATURES SELECTED BY GWO")
print("==============================")
print("Total Features :", len(X.columns))
print("Selected       :", len(selected_features))
print(selected_features)


# ============================================================
# 7. Final Gradient Boosting Model on Selected Features
# ============================================================
Xtr_final = X_train[selected_features]
Xte_final = X_test[selected_features]

final_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

final_model.fit(Xtr_final, y_train)
final_pred = final_model.predict(Xte_final)

acc = accuracy_score(y_test, final_pred)
prec = precision_score(y_test, final_pred, zero_division=0)
rec = recall_score(y_test, final_pred, zero_division=0)
f1 = f1_score(y_test, final_pred, zero_division=0)

print("\n==============================")
print(" FINAL GRADIENT BOOSTING RESULTS")
print("==============================")
print(f"Accuracy :  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AVG Score: {(acc + prec + rec + f1) / 4:.4f}")

Iteration 1/5 - Best Fitness = 0.8769
Iteration 2/5 - Best Fitness = 0.9188
Iteration 3/5 - Best Fitness = 0.9215
Iteration 4/5 - Best Fitness = 0.9250
Iteration 5/5 - Best Fitness = 0.9250

 BEST FEATURES SELECTED BY GWO
Total Features : 19
Selected       : 17
['customer_id', 'age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']

 FINAL GRADIENT BOOSTING RESULTS
Accuracy :  0.9184
Precision: 0.9133
Recall   : 0.9411
F1-score : 0.9270
AVG Score: 0.9250


In [6]:
# ============================================================
# Bat Algorithm (BA) + Gradient Boosting for Feature Selection
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier

# ============================================================
# 1. Prepare dataset
# ============================================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

feature_count = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ============================================================
# 2. Bat Algorithm Parameters
# ============================================================
n_bats = 20
iterations = 5

A = 0.5       # Loudness
r = 0.5       # Pulse emission rate
fmin, fmax = 0, 2  # Frequency range


# ============================================================
# 3. Fitness Function using Gradient Boosting
# ============================================================
def fitness(solution):
    if np.sum(solution) == 0:
        return 0  # no features selected → invalid

    selected_cols = X.columns[solution == 1]
    Xtr = X_train[selected_cols]
    Xte = X_test[selected_cols]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

    model.fit(Xtr, y_train)
    pred = model.predict(Xte)

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)

    return (acc + prec + rec + f1) / 4


# ============================================================
# 4. Initialize Bat Population
# ============================================================
def init_population():
    pop = np.random.randint(0, 2, (n_bats, feature_count))
    vel = np.random.rand(n_bats, feature_count)
    freq = np.zeros(n_bats)
    return pop, vel, freq


# ============================================================
# 5. Bat Algorithm Main Loop
# ============================================================
def bat_feature_selection():

    population, velocity, freq = init_population()
    fitness_vals = np.zeros(n_bats)

    # Evaluate initial
    for i in range(n_bats):
        fitness_vals[i] = fitness(population[i])

    best_idx = np.argmax(fitness_vals)
    best_solution = population[best_idx].copy()
    best_fitness = fitness_vals[best_idx]

    for itr in range(iterations):

        for i in range(n_bats):

            # Update frequency
            freq[i] = fmin + (fmax - fmin) * np.random.rand()

            # Velocity update
            velocity[i] += (population[i] ^ best_solution) * freq[i]

            # Position update (Binary using Sigmoid)
            temp = population[i] + velocity[i]
            prob = 1 / (1 + np.exp(-temp))
            new_solution = (np.random.rand(feature_count) < prob).astype(int)

            # Local search
            if np.random.rand() > r:
                # Flip some bits (local improvement)
                flip_idx = np.random.randint(0, feature_count)
                new_solution[flip_idx] = 1 - new_solution[flip_idx]

            new_fitness = fitness(new_solution)

            # Accept new solution based on loudness A
            if (new_fitness > fitness_vals[i]) and (np.random.rand() < A):
                population[i] = new_solution
                fitness_vals[i] = new_fitness

            # Update global best
            if new_fitness > best_fitness:
                best_solution = new_solution.copy()
                best_fitness = new_fitness

        print(f"Iteration {itr+1}/{iterations} - Best Fitness = {best_fitness:.4f}")

    return best_solution, best_fitness


# ============================================================
# 6. Run Bat Algorithm for Feature Selection
# ============================================================
best_solution, best_score = bat_feature_selection()

selected_features = X.columns[best_solution == 1].tolist()

print("\n==============================")
print("  BEST FEATURES SELECTED BY BAT")
print("==============================")
print("Total Features :", len(X.columns))
print("Selected       :", len(selected_features))
print(selected_features)


# ============================================================
# 7. Final Gradient Boosting on Selected Features
# ============================================================
Xtr_final = X_train[selected_features]
Xte_final = X_test[selected_features]

final_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

final_model.fit(Xtr_final, y_train)
pred_final = final_model.predict(Xte_final)

acc = accuracy_score(y_test, pred_final)
prec = precision_score(y_test, pred_final, zero_division=0)
rec = recall_score(y_test, pred_final, zero_division=0)
f1 = f1_score(y_test, pred_final, zero_division=0)

print("\n==============================")
print(" FINAL GRADIENT BOOSTING RESULTS")
print("==============================")
print(f"Accuracy :  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AVG Score: {(acc + prec + rec + f1) / 4:.4f}")

Iteration 1/5 - Best Fitness = 0.9245
Iteration 2/5 - Best Fitness = 0.9252
Iteration 3/5 - Best Fitness = 0.9252
Iteration 4/5 - Best Fitness = 0.9252
Iteration 5/5 - Best Fitness = 0.9252

  BEST FEATURES SELECTED BY BAT
Total Features : 19
Selected       : 18
['age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']

 FINAL GRADIENT BOOSTING RESULTS
Accuracy :  0.9186
Precision: 0.9124
Recall   : 0.9426
F1-score : 0.9273
AVG Score: 0.9252


In [7]:
# ============================================================
# Poor & Rich Optimization (PRO) + Gradient Boosting for Feature Selection
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.ensemble import GradientBoostingClassifier

# ============================================================
# 1. Prepare dataset
# ============================================================
X = df.drop("loan_status", axis=1)
y = df["loan_status"].astype(int)

feature_count = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ============================================================
# 2. Fitness Function using Gradient Boosting
# ============================================================
def fitness(solution):
    if np.sum(solution) == 0:
        return 0  # No feature selected

    selected = X.columns[solution == 1]
    Xtr = X_train[selected]
    Xte = X_test[selected]

    model = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )

    model.fit(Xtr, y_train)
    pred = model.predict(Xte)

    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)

    return (acc + prec + rec + f1) / 4


# ============================================================
# 3. PRO Parameters
# ============================================================
pop_size = 20
iterations = 5


# ============================================================
# 4. Initialize Population
# ============================================================
def init_population():
    pop = np.random.randint(0, 2, (pop_size, feature_count))
    return pop


# ============================================================
# 5. Poor & Rich Optimization (PRO) algorithm
# ============================================================
def PRO_feature_selection():

    population = init_population()
    fitness_vals = np.zeros(pop_size)

    # Evaluate initial
    for i in range(pop_size):
        fitness_vals[i] = fitness(population[i])

    best_idx = np.argmax(fitness_vals)
    best_solution = population[best_idx].copy()
    best_fitness = fitness_vals[best_idx]

    for itr in range(iterations):

        median_fit = np.median(fitness_vals)

        rich_group = population[fitness_vals >= median_fit]
        poor_group = population[fitness_vals < median_fit]

        mean_rich = np.mean(rich_group, axis=0) if len(rich_group) > 0 else np.random.rand(feature_count)
        mean_poor = np.mean(poor_group, axis=0) if len(poor_group) > 0 else np.random.rand(feature_count)

        new_population = population.copy()

        for i in range(pop_size):

            if fitness_vals[i] >= median_fit:
                # Rich → exploit near rich mean
                new_vec = population[i] + np.random.uniform(-1, 1) * (mean_rich - population[i])
            else:
                # Poor → move toward rich
                new_vec = population[i] + np.random.uniform(0, 1) * (mean_rich - population[i])

            # Sigmoid → Binary conversion
            prob = 1 / (1 + np.exp(-new_vec))
            new_solution = (np.random.rand(feature_count) < prob).astype(int)

            new_fitness = fitness(new_solution)

            # Accept if improves
            if new_fitness > fitness_vals[i]:
                new_population[i] = new_solution
                fitness_vals[i] = new_fitness

            # Global best update
            if new_fitness > best_fitness:
                best_fitness = new_fitness
                best_solution = new_solution.copy()

        population = new_population

        print(f"Iteration {itr+1}/{iterations} - Best Fitness = {best_fitness:.4f}")

    return best_solution, best_fitness


# ============================================================
# 6. Run PRO Feature Selection
# ============================================================
best_solution, best_score = PRO_feature_selection()

selected_features = X.columns[best_solution == 1].tolist()

print("\n==============================")
print("  BEST FEATURES SELECTED BY PRO")
print("==============================")
print("Total Features :", len(X.columns))
print("Selected       :", len(selected_features))
print(selected_features)


# ============================================================
# 7. Final Gradient Boosting on Selected Features
# ============================================================
Xtr_final = X_train[selected_features]
Xte_final = X_test[selected_features]

final_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

final_model.fit(Xtr_final, y_train)
pred_final = final_model.predict(Xte_final)

acc = accuracy_score(y_test, pred_final)
prec = precision_score(y_test, pred_final, zero_division=0)
rec = recall_score(y_test, pred_final, zero_division=0)
f1 = f1_score(y_test, pred_final, zero_division=0)

print("\n==============================")
print(" FINAL GRADIENT BOOSTING RESULTS (PRO)")
print("==============================")
print(f"Accuracy :  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"AVG Score: {(acc + prec + rec + f1) / 4:.4f}")

Iteration 1/5 - Best Fitness = 0.9220
Iteration 2/5 - Best Fitness = 0.9221
Iteration 3/5 - Best Fitness = 0.9248
Iteration 4/5 - Best Fitness = 0.9248
Iteration 5/5 - Best Fitness = 0.9248

  BEST FEATURES SELECTED BY PRO
Total Features : 19
Selected       : 18
['customer_id', 'age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'payment_to_income_ratio']

 FINAL GRADIENT BOOSTING RESULTS (PRO)
Accuracy :  0.9182
Precision: 0.9132
Recall   : 0.9408
F1-score : 0.9268
AVG Score: 0.9248


In [2]:
# ===============================
# Hybrid PSO + XGBoost + LightGBM
# Feature Selection + Evaluation
# ===============================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

# ===============================
# 1. Load / Prepare Data
# ===============================

# Your DataFrame must already exist as df
# Example: df = pd.read_csv("your_dataset.csv")
df=pd.read_csv("/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv")
TARGET_COL = "loan_status"    

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].astype(int)   # assuming binary labels like 0/1

feature_names = X.columns.to_numpy()
n_features = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)
print("No. of features:", n_features)


# ===============================
# 2. Helper: Model Evaluation
# ===============================

def evaluate_model(model, X_tr, X_te, y_tr, y_te, label="MODEL", average_type="binary"):
    """
    Train model and print Accuracy, Precision, Recall, F1.
    For multi-class classification, set average_type="macro".
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, average=average_type, zero_division=0)
    rec = recall_score(y_te, y_pred, average=average_type, zero_division=0)
    f1 = f1_score(y_te, y_pred, average=average_type, zero_division=0)

    print(f"\n==== {label} ====")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")

    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1}


# ===============================
# 3. PSO for Feature Selection
#    – Fitness = XGBoost CV F1
# ===============================

class PSOFeatureSelector:
    """
    Binary PSO for feature selection using XGBoost as evaluator.
    Each particle encodes a bitmask over features.
    """

    def __init__(
        self,
        n_particles,
        n_iterations,
        inertia=0.7,
        cognitive=1.5,
        social=1.5,
        random_state=42,
        cv_splits=3
    ):
        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.w = inertia
        self.c1 = cognitive
        self.c2 = social
        self.random_state = random_state
        self.cv_splits = cv_splits
        np.random.seed(self.random_state)

    def _init_swarm(self, n_dims):
        # positions: (n_particles, n_features) in {0,1}
        positions = np.random.randint(0, 2, size=(self.n_particles, n_dims))
        # avoid all-zero vectors
        for i in range(self.n_particles):
            if positions[i].sum() == 0:
                positions[i, np.random.randint(0, n_dims)] = 1

        velocities = np.random.uniform(-1, 1, size=(self.n_particles, n_dims))

        return positions.astype(float), velocities.astype(float)

    def _fitness(self, mask, X, y):
        if mask.sum() == 0:
            return 0.0

        X_sel = X[:, mask == 1]

        model = XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1
        )

        cv = StratifiedKFold(n_splits=self.cv_splits, shuffle=True, random_state=self.random_state)
        f1 = cross_val_score(model, X_sel, y, cv=cv, scoring="f1").mean()

        # NEW: penalty for too many or too few features
        k = mask.sum()                           # number of selected features
        penalty = 0.002 * abs(k - (X.shape[1] * 0.4))   # optimal = 40% features

        return f1 - penalty


    def fit(self, X, y):
        """
        Run PSO to find best feature subset.
        X: np.array (n_samples, n_features)
        y: np.array (n_samples,)
        """
        n_dims = X.shape[1]
        positions, velocities = self._init_swarm(n_dims)

        # personal best
        pbest_pos = positions.copy()
        pbest_scores = np.array([self._fitness(p, X, y) for p in positions])

        # global best
        gbest_idx = np.argmax(pbest_scores)
        gbest_pos = pbest_pos[gbest_idx].copy()
        gbest_score = pbest_scores[gbest_idx]

        print("\n[PSO] Initial best F1:", gbest_score)

        for it in range(self.n_iterations):
            for i in range(self.n_particles):
                # Update velocity
                r1 = np.random.rand(n_dims)
                r2 = np.random.rand(n_dims)

                velocities[i] = (
                    self.w * velocities[i]
                    + self.c1 * r1 * (pbest_pos[i] - positions[i])
                    + self.c2 * r2 * (gbest_pos - positions[i])
                )

                # Binary PSO using sigmoid
                sigmoid = 1 / (1 + np.exp(-velocities[i]))
                new_pos = (np.random.rand(n_dims) < sigmoid).astype(float)

                # Avoid no-feature case
                if new_pos.sum() == 0:
                    new_pos[np.random.randint(0, n_dims)] = 1

                positions[i] = new_pos

                # Evaluate new fitness
                score = self._fitness(positions[i], X, y)

                # Update personal best
                if score > pbest_scores[i]:
                    pbest_scores[i] = score
                    pbest_pos[i] = positions[i].copy()

            # Update global best
            best_idx = np.argmax(pbest_scores)
            if pbest_scores[best_idx] > gbest_score:
                gbest_score = pbest_scores[best_idx]
                gbest_pos = pbest_pos[best_idx].copy()

            print(f"[PSO] Iter {it+1}/{self.n_iterations} - Best F1: {gbest_score:.4f}  | Selected features: {int(gbest_pos.sum())}")

        self.best_mask_ = gbest_pos.astype(int)
        self.best_score_ = gbest_score
        return self

    def transform(self, X):
        return X[:, self.best_mask_ == 1]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)


# ===============================
# 4. Run PSO Feature Selection
# ===============================

# Convert to numpy for speed
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

pso_selector = PSOFeatureSelector(
    n_particles=20,
    n_iterations=20,
    inertia=0.7,
    cognitive=1.5,
    social=1.5,
    random_state=42,
    cv_splits=3
)

X_train_fs = pso_selector.fit_transform(X_train_np, y_train_np)
mask = pso_selector.best_mask_.astype(bool)
selected_features = feature_names[mask]

print("\n===============================")
print("PSO + XGBoost Feature Selection")
print("Best CV F1-score:", pso_selector.best_score_)
print("Selected feature count:", len(selected_features))
print("Selected features:")
print(list(selected_features))
print("===============================")

# Apply same feature subset to test data
X_test_np = X_test.to_numpy()
X_test_fs = X_test_np[:, mask]


# ===============================
# 5. Train & Evaluate Models
# ===============================

# 5.1 Baseline: LightGBM on ALL features
lgb_all = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
results_lgb_all = evaluate_model(
    lgb_all, X_train, X_test, y_train, y_test,
    label="LightGBM (All Features)", 
    average_type="binary"  # for multi-class use "macro"
)

# 5.2 Baseline: XGBoost on ALL features
xgb_all = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)
results_xgb_all = evaluate_model(
    xgb_all, X_train, X_test, y_train, y_test,
    label="XGBoost (All Features)",
    average_type="binary"
)

# 5.3 HYBRID: PSO (XGBoost-based feature eval) + LightGBM
lgb_pso = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
results_lgb_pso = evaluate_model(
    lgb_pso, X_train_fs, X_test_fs, y_train, y_test,
    label="Hybrid: PSO + XGBoost (FS) + LightGBM",
    average_type="binary"
)

print("\n\n======= SUMMARY =======")
print("LightGBM (All):", results_lgb_all)
print("XGBoost (All):", results_xgb_all)
print("Hybrid PSO+XGB+LGB:", results_lgb_pso)
print("========================")


Train shape: (40000, 19)
Test shape : (10000, 19)
No. of features: 19

[PSO] Initial best F1: 0.9253162974218366
[PSO] Iter 1/20 - Best F1: 0.9324  | Selected features: 14
[PSO] Iter 2/20 - Best F1: 0.9324  | Selected features: 14
[PSO] Iter 3/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 4/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 5/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 6/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 7/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 8/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 9/20 - Best F1: 0.9354  | Selected features: 15
[PSO] Iter 10/20 - Best F1: 0.9355  | Selected features: 14
[PSO] Iter 11/20 - Best F1: 0.9355  | Selected features: 14
[PSO] Iter 12/20 - Best F1: 0.9360  | Selected features: 14
[PSO] Iter 13/20 - Best F1: 0.9360  | Selected features: 14
[PSO] Iter 14/20 - Best F1: 0.9360  | Selected features: 14
[PSO] Iter 15/20 - Best F1: 0.9360  | Selected features:

In [5]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# -----------------------------
# Combined PSO + GWO Optimization
# -----------------------------
class PSO_GWO_FeatureSelector:
    def __init__(self, num_particles, num_features, max_iter):
        self.num_particles = num_particles
        self.num_features = num_features
        self.max_iter = max_iter
        
    def random_mask(self):
        mask = np.random.randint(0, 2, self.num_features)
        if mask.sum() == 0:
            mask[np.random.randint(0, self.num_features)] = 1
        return mask
    
    def evaluate_mask(self, mask, X, y):
        selected = np.where(mask == 1)[0]
        if len(selected) == 0:
            return 0.0
        
        X_sel = X[:, selected]
        
        model = XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            eval_metric="logloss",
            n_jobs=-1,
            subsample=0.8
        )
        
        cv = StratifiedKFold(n_splits=3, shuffle=True)
        scores = []
        
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))
        
        return np.mean(scores)
    
    def optimize(self, X, y):
        population = np.array([self.random_mask() for _ in range(self.num_particles)])
        fitness = np.array([self.evaluate_mask(m, X, y) for m in population])
        
        alpha, beta, delta = population[np.argsort(-fitness)[:3]]

        # PSO parameters
        velocity = np.random.uniform(-1, 1, population.shape)
        w, c1, c2 = 0.7, 1.4, 1.4
        
        for it in range(self.max_iter):
            for i in range(self.num_particles):
                
                # ========== PSO Update ==========
                r1, r2 = np.random.rand(), np.random.rand()
                velocity[i] = (
                    w * velocity[i]
                    + c1 * r1 * (alpha - population[i])
                    + c2 * r2 * (beta - population[i])
                )
                new_pso = (np.random.rand(self.num_features) < 1/(1+np.exp(-velocity[i]))).astype(int)
                
                # ========== GWO Update ==========
                a = 2 * (1 - it / self.max_iter)
                dist_alpha = abs(alpha - population[i])
                X1 = alpha - a * dist_alpha
                
                dist_beta = abs(beta - population[i])
                X2 = beta - a * dist_beta
                
                dist_delta = abs(delta - population[i])
                X3 = delta - a * dist_delta
                
                new_gwo = ((X1 + X2 + X3) / 3 >= 0.5).astype(int)
                
                # Combine two explorers: weighted hybrid
                new_mask = ((new_pso + new_gwo) / 2 >= 0.5).astype(int)
                
                if new_mask.sum() == 0:
                    new_mask[np.random.randint(0, self.num_features)] = 1
                
                # Evaluate
                new_fit = self.evaluate_mask(new_mask, X, y)
                
                if new_fit > fitness[i]:
                    population[i] = new_mask
                    fitness[i] = new_fit
            
            # Update alpha, beta, delta wolves
            alpha, beta, delta = population[np.argsort(-fitness)[:3]]
            
            print(f"Iter {it+1}/{self.max_iter} - Best F1: {fitness.max():.4f}")
        
        return alpha, fitness.max()


In [None]:
# ===================================================================
# FULL HYBRID PIPELINE: PSO + GWO + Feature Fusion + Bayesian Opt +
# XGBoost + LightGBM + CatBoost + Metrics
# ===================================================================

import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from bayes_opt import BayesianOptimization

# ============================================================
# LOAD DATA
# df MUST BE ALREADY LOADED
# TARGET COL MUST BE CHANGED HERE
# ============================================================
TARGET = "loan_status"

X = df.drop(columns=[TARGET]).to_numpy()
y = df[TARGET].to_numpy()
feature_names = df.drop(columns=[TARGET]).columns


# ============================================================
# TRAIN/VAL/TEST SPLIT
# ============================================================
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.20, stratify=y_train_full, random_state=42
)


# ============================================================
# FEATURE IMPORTANCE FUSION (XGB + CAT)
# ============================================================
def fused_feature_importance(X, y):
    xgb = XGBClassifier(n_estimators=300, eval_metric="logloss", n_jobs=-1)
    cat = CatBoostClassifier(iterations=300, verbose=False)

    xgb.fit(X, y)
    cat.fit(X, y)

    imp_xgb = xgb.feature_importances_ / (xgb.feature_importances_.max() + 1e-9)
    imp_cat = cat.get_feature_importance() / (cat.get_feature_importance().max() + 1e-9)

    fused = 0.6 * imp_xgb + 0.4 * imp_cat
    return fused


fused_scores = fused_feature_importance(X_train_full, y_train_full)
ranked = np.argsort(-fused_scores)
print("Top 20 fused features:", feature_names[ranked[:20]])


# ============================================================
# HYBRID PSO + GWO FEATURE SELECTION
# ============================================================
class PSO_GWO_FeatureSelector:
    def __init__(self, n_particles, n_features, max_iter):
        self.n_particles = n_particles
        self.n_features = n_features
        self.max_iter = max_iter

    def random_mask(self):
        mask = np.random.randint(0, 2, self.n_features)
        if mask.sum() == 0:
            mask[random.randint(0, self.n_features - 1)] = 1
        return mask

    def evaluate_mask(self, mask, X, y):
        idx = np.where(mask == 1)[0]
        if len(idx) == 0:
            return 0

        X_sel = X[:, idx]
        model = XGBClassifier(
            n_estimators=200, learning_rate=0.1, max_depth=5,
            eval_metric="logloss", n_jobs=-1
        )

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scores = []
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))
        return np.mean(scores)

    def optimize(self, X, y):
        pop = np.array([self.random_mask() for _ in range(self.n_particles)])
        fitness = np.array([self.evaluate_mask(m, X, y) for m in pop])

        alpha, beta, delta = pop[np.argsort(-fitness)[:3]]

        # PSO velocity
        velocity = np.random.uniform(-1, 1, pop.shape)
        w, c1, c2 = 0.7, 1.5, 1.5

        for it in range(self.max_iter):
            for i in range(self.n_particles):

                # ---------------- PSO update ----------------
                r1, r2 = np.random.rand(), np.random.rand()
                velocity[i] = (
                    w * velocity[i]
                    + c1 * r1 * (alpha - pop[i])
                    + c2 * r2 * (beta - pop[i])
                )
                pso_mask = (np.random.rand(self.n_features) < 1/(1+np.exp(-velocity[i]))).astype(int)

                # ---------------- GWO update ----------------
                a = 2 * (1 - it / self.max_iter)
                X1 = alpha - a * abs(alpha - pop[i])
                X2 = beta - a * abs(beta - pop[i])
                X3 = delta - a * abs(delta - pop[i])
                gwo_mask = ((X1 + X2 + X3) / 3 >= 0.5).astype(int)

                # Combine
                new_mask = ((pso_mask + gwo_mask) / 2 >= 0.5).astype(int)

                if new_mask.sum() == 0:
                    new_mask[random.randint(0, self.n_features - 1)] = 1

                new_fit = self.evaluate_mask(new_mask, X, y)
                if new_fit > fitness[i]:
                    pop[i] = new_mask
                    fitness[i] = new_fit

            best_idx = np.argmax(fitness)
            alpha, beta, delta = pop[np.argsort(-fitness)[:3]]

            print(f"Iter {it+1}/{self.max_iter} — Best F1: {fitness.max():.4f}")

        return pop[np.argmax(fitness)], fitness.max()


selector = PSO_GWO_FeatureSelector(
    n_particles=25,
    n_features=X_train.shape[1],
    max_iter=20
)

best_mask, best_f1 = selector.optimize(X_train, y_train)
best_features = np.where(best_mask == 1)[0]
print("Selected features:", feature_names[best_features])


# ============================================================
# APPLY FEATURE SELECTION
# ============================================================
X_train_sel = X_train[:, best_features]
X_val_sel   = X_val[:, best_features]
X_test_sel  = X_test[:, best_features]


# ============================================================
# BAYESIAN OPTIMIZATION FOR LIGHTGBM
# ============================================================
def lgb_eval(num_leaves, max_depth, learning_rate):
    params = {
        "num_leaves": int(num_leaves),
        "max_depth": int(max_depth),
        "learning_rate": float(learning_rate),
        "n_estimators": 300,
        "objective": "binary"
    }
    model = LGBMClassifier(**params)
    model.fit(X_train_sel, y_train)
    pred = model.predict(X_val_sel)
    return f1_score(y_val, pred)

optimizer = BayesianOptimization(
    f=lgb_eval,
    pbounds={
        "num_leaves": (20, 200),
        "max_depth": (4, 12),
        "learning_rate": (0.03, 0.3)
    },
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=20)
best_params = optimizer.max["params"]

best_params["num_leaves"] = int(best_params["num_leaves"])
best_params["max_depth"]  = int(best_params["max_depth"])
best_params["learning_rate"] = float(best_params["learning_rate"])

print("Best params:", best_params)


# ============================================================
# FINAL MODELS
# ============================================================
models = {
    "XGB": XGBClassifier(n_estimators=300, eval_metric="logloss"),
    "LGB": LGBMClassifier(**best_params, n_estimators=300),
    "CAT": CatBoostClassifier(iterations=300, verbose=False)
}

for name, model in models.items():
    model.fit(X_train_sel, y_train)
    pred = model.predict(X_test_sel)

    print(f"\n===== {name} =====")
    print("Accuracy :", accuracy_score(y_test, pred))
    print("Precision:", precision_score(y_test, pred))
    print("Recall   :", recall_score(y_test, pred))
    print("F1 Score :", f1_score(y_test, pred))
    print("AUC      :", roc_auc_score(y_test, pred))


'''output :
===== LGB =====
Accuracy : 0.9283
Precision: 0.9250710227272727
Recall   : 0.9464123524069028
F1 Score : 0.9356200053874472
AUC      : 0.9262651305972223

===== CAT =====
Accuracy : 0.9282
Precision: 0.9262689225289403
Recall   : 0.9447774750227066
F1 Score : 0.935431654676259
AUC      : 0.9263375695469485'''

In [9]:
# ==============================================================
# HYBRID PIPELINE:
#   - GA Feature Selection
#   - PSO Feature Selection
#   - XGBoost / LightGBM / CatBoost models
#   - Multiple combinations compared
# ==============================================================

import numpy as np
import pandas as pd
import random
import warnings
import logging

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# -------------------------
# 0. GLOBAL SETTINGS
# -------------------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Suppress warnings
warnings.filterwarnings("ignore")

# Suppress LightGBM logs
import lightgbm as lgb
logging.getLogger("lightgbm").setLevel(logging.ERROR)


# ==============================================================
# 1. LOAD DATA
# ==============================================================

# --- YOU MUST HAVE df ALREADY LOADED ---
# Example:
# df = pd.read_csv("your_data.csv")

TARGET_COL = "loan_status"   # <-- change if needed

X_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].astype(int).values
feature_names = X_df.columns.values
X = X_df.values

print("Total samples:", X.shape[0])
print("Total features:", X.shape[1])


# ==============================================================
# 2. SPLIT INTO TRAIN / VAL / TEST
# ==============================================================

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.20,
    stratify=y_train_full, random_state=RANDOM_STATE
)

print("\nTrain shape:", X_train.shape)
print("Val shape  :", X_val.shape)
print("Test shape :", X_test.shape)


# ==============================================================
# 3. METRIC HELPER
# ==============================================================

def evaluate_model(model, X_tr, X_te, y_tr, y_te, label):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    acc  = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, zero_division=0)
    rec  = recall_score(y_te, y_pred, zero_division=0)
    f1   = f1_score(y_te, y_pred, zero_division=0)
    auc  = roc_auc_score(y_te, y_pred)

    print(f"\n==== {label} ====")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"AUC      : {auc:.4f}")

    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1, "auc": auc}


# ==============================================================
# 4. GENETIC ALGORITHM FEATURE SELECTION
# ==============================================================

class GAFeatureSelector:
    """
    Binary Genetic Algorithm for feature selection.
    Chromosome: binary mask over features.
    Fitness: mean CV F1-score using XGBoost.
    """

    def __init__(
        self,
        n_features,
        pop_size=20,
        n_generations=15,
        crossover_rate=0.8,
        mutation_rate=0.02,
        random_state=42
    ):
        self.n_features = n_features
        self.pop_size = pop_size
        self.n_generations = n_generations
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        self.random_state = random_state
        np.random.seed(self.random_state)
        random.seed(self.random_state)

    def _init_population(self):
        pop = np.random.randint(0, 2, size=(self.pop_size, self.n_features))
        # Ensure no all-zero individuals
        for i in range(self.pop_size):
            if pop[i].sum() == 0:
                pop[i, np.random.randint(0, self.n_features)] = 1
        return pop

    def _fitness(self, mask, X, y):
        idx = np.where(mask == 1)[0]
        if len(idx) == 0:
            return 0.0

        X_sel = X[:, idx]
        model = XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=self.random_state
        )

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        scores = []
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))
        return np.mean(scores)

    def _tournament_selection(self, population, fitness, k=3):
        selected = np.random.choice(len(population), k, replace=False)
        best_idx = selected[np.argmax(fitness[selected])]
        return population[best_idx].copy()

    def _crossover(self, parent1, parent2):
        if np.random.rand() > self.crossover_rate:
            return parent1.copy(), parent2.copy()

        point = np.random.randint(1, self.n_features - 1)
        child1 = np.concatenate([parent1[:point], parent2[point:]])
        child2 = np.concatenate([parent2[:point], parent1[point:]])
        return child1, child2

    def _mutate(self, individual):
        for i in range(self.n_features):
            if np.random.rand() < self.mutation_rate:
                individual[i] = 1 - individual[i]
        if individual.sum() == 0:
            individual[np.random.randint(0, self.n_features)] = 1
        return individual

    def fit(self, X, y):
        pop = self._init_population()
        fitness = np.array([self._fitness(ind, X, y) for ind in pop])

        best_idx = np.argmax(fitness)
        best_ind = pop[best_idx].copy()
        best_fit = fitness[best_idx]

        print("\n[GA] Initial best F1:", best_fit)

        for gen in range(self.n_generations):
            new_pop = []
            while len(new_pop) < self.pop_size:
                parent1 = self._tournament_selection(pop, fitness)
                parent2 = self._tournament_selection(pop, fitness)

                child1, child2 = self._crossover(parent1, parent2)
                child1 = self._mutate(child1)
                child2 = self._mutate(child2)

                new_pop.extend([child1, child2])

            pop = np.array(new_pop[:self.pop_size])
            fitness = np.array([self._fitness(ind, X, y) for ind in pop])

            gen_best_idx = np.argmax(fitness)
            gen_best_fit = fitness[gen_best_idx]
            if gen_best_fit > best_fit:
                best_fit = gen_best_fit
                best_ind = pop[gen_best_idx].copy()

            print(f"[GA] Generation {gen+1}/{self.n_generations} - Best F1: {best_fit:.4f}")

        self.best_mask_ = best_ind
        self.best_score_ = best_fit
        return self

    def transform(self, X):
        return X[:, self.best_mask_ == 1]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)


# ==============================================================
# 5. PSO FEATURE SELECTION
# ==============================================================

class PSOFeatureSelector:
    """
    Binary PSO for feature selection.
    Fitness: mean CV F1-score using XGBoost.
    """

    def __init__(
        self,
        n_features,
        n_particles=20,
        n_iterations=15,
        inertia=0.7,
        cognitive=1.5,
        social=1.5,
        random_state=42
    ):
        self.n_features = n_features
        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.w = inertia
        self.c1 = cognitive
        self.c2 = social
        self.random_state = random_state
        np.random.seed(self.random_state)

    def _init_swarm(self):
        positions = np.random.randint(0, 2, size=(self.n_particles, self.n_features)).astype(float)
        for i in range(self.n_particles):
            if positions[i].sum() == 0:
                positions[i, np.random.randint(0, self.n_features)] = 1
        velocities = np.random.uniform(-1, 1, size=(self.n_particles, self.n_features))
        return positions, velocities

    def _fitness(self, pos, X, y):
        mask = (pos >= 0.5).astype(int)
        idx = np.where(mask == 1)[0]
        if len(idx) == 0:
            return 0.0

        X_sel = X[:, idx]
        model = XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=self.random_state
        )
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        scores = []
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))
        return np.mean(scores)

    def fit(self, X, y):
        positions, velocities = self._init_swarm()

        pbest_pos = positions.copy()
        pbest_scores = np.array([self._fitness(p, X, y) for p in positions])

        gbest_idx = np.argmax(pbest_scores)
        gbest_pos = pbest_pos[gbest_idx].copy()
        gbest_score = pbest_scores[gbest_idx]

        print("\n[PSO] Initial best F1:", gbest_score)

        for it in range(self.n_iterations):
            for i in range(self.n_particles):
                r1 = np.random.rand(self.n_features)
                r2 = np.random.rand(self.n_features)

                velocities[i] = (
                    self.w * velocities[i]
                    + self.c1 * r1 * (pbest_pos[i] - positions[i])
                    + self.c2 * r2 * (gbest_pos - positions[i])
                )

                sigmoid = 1 / (1 + np.exp(-velocities[i]))
                new_pos = (np.random.rand(self.n_features) < sigmoid).astype(float)

                if new_pos.sum() == 0:
                    new_pos[np.random.randint(0, self.n_features)] = 1

                positions[i] = new_pos
                score = self._fitness(positions[i], X, y)

                if score > pbest_scores[i]:
                    pbest_scores[i] = score
                    pbest_pos[i] = positions[i].copy()

            best_idx = np.argmax(pbest_scores)
            if pbest_scores[best_idx] > gbest_score:
                gbest_score = pbest_scores[best_idx]
                gbest_pos = pbest_pos[best_idx].copy()

            print(f"[PSO] Iter {it+1}/{self.n_iterations} - Best F1: {gbest_score:.4f}")

        self.best_pos_ = gbest_pos
        self.best_score_ = gbest_score
        return self

    def transform(self, X):
        mask = (self.best_pos_ >= 0.5).astype(int)
        return X[:, mask == 1]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)


# ==============================================================
# 6. RUN GA FEATURE SELECTION
# ==============================================================

ga_selector = GAFeatureSelector(
    n_features=X_train.shape[1],
    pop_size=20,
    n_generations=10,
    crossover_rate=0.8,
    mutation_rate=0.02,
    random_state=RANDOM_STATE
)

X_train_ga = ga_selector.fit_transform(X_train, y_train)
X_val_ga   = ga_selector.transform(X_val)
X_test_ga  = ga_selector.transform(X_test)

ga_selected_features = feature_names[ga_selector.best_mask_ == 1]
print("\n[GA] Best F1 (CV):", ga_selector.best_score_)
print("[GA] Selected features count:", len(ga_selected_features))
print("[GA] Features:", list(ga_selected_features))


# ==============================================================
# 7. RUN PSO FEATURE SELECTION
# ==============================================================

pso_selector = PSOFeatureSelector(
    n_features=X_train.shape[1],
    n_particles=20,
    n_iterations=10,
    inertia=0.7,
    cognitive=1.5,
    social=1.5,
    random_state=RANDOM_STATE
)

X_train_pso = pso_selector.fit_transform(X_train, y_train)
X_val_pso   = pso_selector.transform(X_val)
X_test_pso  = pso_selector.transform(X_test)

pso_mask = (pso_selector.best_pos_ >= 0.5).astype(int)
pso_selected_features = feature_names[pso_mask == 1]

print("\n[PSO] Best F1 (CV):", pso_selector.best_score_)
print("[PSO] Selected features count:", len(pso_selected_features))
print("[PSO] Features:", list(pso_selected_features))


# ==============================================================
# 8. DEFINE MODELS (NO LGB WARNINGS)
# ==============================================================

def get_models():
    models = {
        "XGBoost": XGBClassifier(
            n_estimators=300,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE
        ),
        "LightGBM": LGBMClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="binary",
            random_state=RANDOM_STATE,
            verbose=-1  # suppress logs/warnings
        ),
        "CatBoost": CatBoostClassifier(
            iterations=300,
            learning_rate=0.05,
            depth=6,
            verbose=False,
            random_state=RANDOM_STATE
        )
    }
    return models


# ==============================================================
# 9. TRAIN & EVALUATE ALL COMBINATIONS
# ==============================================================

results = {}

# --- 9.1 Baseline: All Features ---
models_all = get_models()
for name, model in models_all.items():
    res = evaluate_model(
        model,
        X_train_full,
        X_test,
        y_train_full,
        y_test,
        label=f"{name} (All Features)"
    )
    results[f"{name}_all"] = res

# --- 9.2 GA-selected Features ---
models_ga = get_models()
for name, model in models_ga.items():
    res = evaluate_model(
        model,
        np.vstack([X_train_ga, X_val_ga]),
        X_test_ga,
        np.concatenate([y_train, y_val]),
        y_test,
        label=f"{name} (GA Features)"
    )
    results[f"{name}_ga"] = res

# --- 9.3 PSO-selected Features ---
models_pso = get_models()
for name, model in models_pso.items():
    res = evaluate_model(
        model,
        np.vstack([X_train_pso, X_val_pso]),
        X_test_pso,
        np.concatenate([y_train, y_val]),
        y_test,
        label=f"{name} (PSO Features)"
    )
    results[f"{name}_pso"] = res

print("\n\n========== SUMMARY DICT ==========")
for k, v in results.items():
    print(k, "->", v)


Total samples: 50000
Total features: 19

Train shape: (32000, 19)
Val shape  : (8000, 19)
Test shape : (10000, 19)

[GA] Initial best F1: 0.9246193188832371
[GA] Generation 1/10 - Best F1: 0.9328
[GA] Generation 2/10 - Best F1: 0.9329
[GA] Generation 3/10 - Best F1: 0.9329
[GA] Generation 4/10 - Best F1: 0.9332
[GA] Generation 5/10 - Best F1: 0.9332
[GA] Generation 6/10 - Best F1: 0.9351
[GA] Generation 7/10 - Best F1: 0.9351
[GA] Generation 8/10 - Best F1: 0.9355
[GA] Generation 9/10 - Best F1: 0.9355
[GA] Generation 10/10 - Best F1: 0.9355

[GA] Best F1 (CV): 0.9355251946405058
[GA] Selected features count: 15
[GA] Features: ['age', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'payment_to_income_ratio']

[PSO] Initial best F1: 0.9246193188832371
[PSO] Iter 1/10 - Best F1: 0.9325
[PSO] 

In [None]:
# ==============================================================
# HYBRID PIPELINE (Corrected & Updated):
#   - GA Feature Selection (Wrapper = CatBoost)
#   - PSO Feature Selection (Wrapper = CatBoost)
#   - XGBoost / LightGBM / CatBoost models
#   - Multiple combinations compared
# ==============================================================

import numpy as np
import pandas as pd
import random
import warnings
import logging

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# -------------------------
# GLOBAL SETTINGS
# -------------------------
df=pd.read_csv("/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv")
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

warnings.filterwarnings("ignore")
logging.getLogger("lightgbm").setLevel(logging.ERROR)

# ==============================================================
# LOAD DATA
# ==============================================================

TARGET_COL = "loan_status"  # Change if needed

X_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].astype(int).values
feature_names = X_df.columns.values
X = X_df.values

print("Total samples:", X.shape[0])
print("Total features:", X.shape[1])

# ==============================================================
# TRAIN / VAL / TEST SPLIT
# ==============================================================

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.20,
    stratify=y_train_full, random_state=RANDOM_STATE
)

print("\nTrain:", X_train.shape, 
      "\nVal  :", X_val.shape, 
      "\nTest :", X_test.shape)

# ==============================================================
# METRIC FUNCTION
# ==============================================================

def evaluate_model(model, X_tr, X_te, y_tr, y_te, label):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    acc  = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, zero_division=0)
    rec  = recall_score(y_te, y_pred, zero_division=0)
    f1   = f1_score(y_te, y_pred, zero_division=0)
    auc  = roc_auc_score(y_te, y_pred)

    print(f"\n==== {label} ====")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"AUC      : {auc:.4f}")

    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1, "auc": auc}

# ==============================================================
# GENETIC ALGORITHM (Wrapper = CatBoost)
# ==============================================================

class GAFeatureSelector:
    def __init__(
        self,
        n_features,
        pop_size=20,
        n_generations=15,
        crossover_rate=0.8,
        mutation_rate=0.02,
        random_state=42
    ):
        self.n_features = n_features
        self.pop_size = pop_size
        self.n_generations = n_generations
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        self.random_state = random_state
        np.random.seed(random_state)
        random.seed(random_state)

    def _init_population(self):
        pop = np.random.randint(0, 2, size=(self.pop_size, self.n_features))
        for i in range(self.pop_size):
            if pop[i].sum() == 0:
                pop[i, np.random.randint(0, self.n_features)] = 1
        return pop

    def _fitness(self, mask, X, y):
        idx = np.where(mask == 1)[0]
        if len(idx) == 0:
            return 0.0

        X_sel = X[:, idx]

        model = CatBoostClassifier(
            iterations=200,
            learning_rate=0.08,
            depth=6,
            verbose=False,
            random_state=self.random_state
        )

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        scores = []
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))

        return np.mean(scores)

    def _tournament_selection(self, population, fitness, k=3):
        selected = np.random.choice(len(population), k, replace=False)
        best_idx = selected[np.argmax(fitness[selected])]
        return population[best_idx].copy()

    def _crossover(self, p1, p2):
        if np.random.rand() > self.crossover_rate:
            return p1.copy(), p2.copy()

        point = np.random.randint(1, self.n_features - 1)
        return (np.concatenate([p1[:point], p2[point:]]),
                np.concatenate([p2[:point], p1[point:]]))

    def _mutate(self, indiv):
        for i in range(self.n_features):
            if np.random.rand() < self.mutation_rate:
                indiv[i] = 1 - indiv[i]
        if indiv.sum() == 0:
            indiv[np.random.randint(0, self.n_features)] = 1
        return indiv

    def fit(self, X, y):
        pop = self._init_population()
        fitness = np.array([self._fitness(ind, X, y) for ind in pop])

        best_ind = pop[np.argmax(fitness)].copy()
        best_fit = fitness.max()

        print("\n[GA] Initial best F1:", best_fit)

        for gen in range(self.n_generations):
            new_pop = []
            while len(new_pop) < self.pop_size:
                p1 = self._tournament_selection(pop, fitness)
                p2 = self._tournament_selection(pop, fitness)
                c1, c2 = self._crossover(p1, p2)
                new_pop.extend([self._mutate(c1), self._mutate(c2)])

            pop = np.array(new_pop[:self.pop_size])
            fitness = np.array([self._fitness(ind, X, y) for ind in pop])

            if fitness.max() > best_fit:
                best_fit = fitness.max()
                best_ind = pop[np.argmax(fitness)].copy()

            print(f"[GA] Generation {gen+1}/{self.n_generations} - Best F1: {best_fit:.4f}")

        self.best_mask_ = best_ind
        self.best_score_ = best_fit
        return self

    def transform(self, X):
        return X[:, self.best_mask_ == 1]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

# ==============================================================
# PSO (Wrapper = CatBoost)
# ==============================================================

class PSOFeatureSelector:
    def __init__(
        self,
        n_features,
        n_particles=20,
        n_iterations=15,
        inertia=0.7,
        cognitive=1.5,
        social=1.5,
        random_state=42
    ):
        self.n_features = n_features
        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.w = inertia
        self.c1 = cognitive
        self.c2 = social
        self.random_state = random_state
        np.random.seed(random_state)

    def _init_swarm(self):
        positions = np.random.randint(0, 2, (self.n_particles, self.n_features)).astype(float)
        for i in range(self.n_particles):
            if positions[i].sum() == 0:
                positions[i, np.random.randint(0, self.n_features)] = 1
        velocities = np.random.uniform(-1, 1, (self.n_particles, self.n_features))
        return positions, velocities

    def _fitness(self, pos, X, y):
        mask = (pos >= 0.5).astype(int)
        idx = np.where(mask == 1)[0]
        if len(idx) == 0:
            return 0.0

        X_sel = X[:, idx]

        model = CatBoostClassifier(
            iterations=200,
            learning_rate=0.08,
            depth=6,
            verbose=False,
            random_state=self.random_state
        )

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)

        scores = []
        for tr, va in cv.split(X_sel, y):
            model.fit(X_sel[tr], y[tr])
            pred = model.predict(X_sel[va])
            scores.append(f1_score(y[va], pred))

        return np.mean(scores)

    def fit(self, X, y):
        positions, velocities = self._init_swarm()

        pbest_pos = positions.copy()
        pbest_scores = np.array([self._fitness(p, X, y) for p in positions])

        gbest_pos = pbest_pos[np.argmax(pbest_scores)].copy()
        gbest_score = pbest_scores.max()

        print("\n[PSO] Initial best F1:", gbest_score)

        for it in range(self.n_iterations):
            for i in range(self.n_particles):
                r1 = np.random.rand(self.n_features)
                r2 = np.random.rand(self.n_features)

                velocities[i] = (
                    self.w * velocities[i]
                    + self.c1 * r1 * (pbest_pos[i] - positions[i])
                    + self.c2 * r2 * (gbest_pos - positions[i])
                )

                sigmoid = 1 / (1 + np.exp(-velocities[i]))
                new_pos = (np.random.rand(self.n_features) < sigmoid).astype(float)

                if new_pos.sum() == 0:
                    new_pos[np.random.randint(0, self.n_features)] = 1

                positions[i] = new_pos

                score = self._fitness(new_pos, X, y)

                if score > pbest_scores[i]:
                    pbest_scores[i] = score
                    pbest_pos[i] = new_pos.copy()

            if pbest_scores.max() > gbest_score:
                gbest_score = pbest_scores.max()
                gbest_pos = pbest_pos[np.argmax(pbest_scores)].copy()

            print(f"[PSO] Iter {it+1}/{self.n_iterations} - Best F1: {gbest_score:.4f}")

        self.best_pos_ = gbest_pos
        self.best_score_ = gbest_score
        return self

    def transform(self, X):
        mask = (self.best_pos_ >= 0.5).astype(int)
        return X[:, mask == 1]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

# ==============================================================
# RUN GA
# ==============================================================

ga_selector = GAFeatureSelector(
    n_features=X_train.shape[1],
    pop_size=20,
    n_generations=10,
    crossover_rate=0.8,
    mutation_rate=0.02,
    random_state=RANDOM_STATE
)

X_train_ga = ga_selector.fit_transform(X_train, y_train)
X_val_ga   = ga_selector.transform(X_val)
X_test_ga  = ga_selector.transform(X_test)

ga_selected_features = feature_names[ga_selector.best_mask_ == 1]

print("\n[GA] Best F1 (CV):", ga_selector.best_score_)
print("[GA] Selected:", len(ga_selected_features))

# ==============================================================
# RUN PSO
# ==============================================================

pso_selector = PSOFeatureSelector(
    n_features=X_train.shape[1],
    n_particles=20,
    n_iterations=10,
    inertia=0.7,
    cognitive=1.5,
    social=1.5,
    random_state=RANDOM_STATE
)

X_train_pso = pso_selector.fit_transform(X_train, y_train)
X_val_pso   = pso_selector.transform(X_val)
X_test_pso  = pso_selector.transform(X_test)

pso_mask = (pso_selector.best_pos_ >= 0.5).astype(int)
pso_selected_features = feature_names[pso_mask == 1]

print("\n[PSO] Best F1 (CV):", pso_selector.best_score_)
print("[PSO] Selected:", len(pso_selected_features))

# ==============================================================
# DEFINE FINAL MODELS
# ==============================================================

def get_models():
    return {
        "XGBoost": XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE
        ),
        "LightGBM": LGBMClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="binary",
            verbose=-1,
            random_state=RANDOM_STATE
        ),
        "CatBoost": CatBoostClassifier(
            iterations=300,
            learning_rate=0.05,
            depth=6,
            verbose=False,
            random_state=RANDOM_STATE
        )
    }

# ==============================================================
# TRAIN & EVALUATE
# ==============================================================

results = {}

# 1. All features
models_all = get_models()
for name, model in models_all.items():
    results[name+"_all"] = evaluate_model(
        model, X_train_full, X_test, y_train_full, y_test,
        f"{name} (All)"
    )

# 2. GA features
models_ga = get_models()
for name, model in models_ga.items():
    results[name+"_ga"] = evaluate_model(
        model,
        np.vstack([X_train_ga, X_val_ga]),
        X_test_ga,
        np.concatenate([y_train, y_val]),
        y_test,
        f"{name} (GA)"
    )

# 3. PSO features
models_pso = get_models()
for name, model in models_pso.items():
    results[name+"_pso"] = evaluate_model(
        model,
        np.vstack([X_train_pso, X_val_pso]),
        X_test_pso,
        np.concatenate([y_train, y_val]),
        y_test,
        f"{name} (PSO)"
    )

print("\n========= SUMMARY =========")
for k, v in results.items():
    print(k, "=>", v)


Total samples: 50000
Total features: 19

Train: (32000, 19) 
Val  : (8000, 19) 
Test : (10000, 19)

[GA] Initial best F1: 0.9252879113377773
[GA] Generation 1/10 - Best F1: 0.9322
[GA] Generation 2/10 - Best F1: 0.9322
[GA] Generation 3/10 - Best F1: 0.9326
[GA] Generation 4/10 - Best F1: 0.9329
[GA] Generation 5/10 - Best F1: 0.9329
[GA] Generation 6/10 - Best F1: 0.9345
[GA] Generation 7/10 - Best F1: 0.9352
[GA] Generation 8/10 - Best F1: 0.9352
[GA] Generation 9/10 - Best F1: 0.9356


In [1]:
# ============================================================
# HYBRID ENSEMBLE:
# 1) PSO + XGBoost (features + model)
# 2) GA + CatBoost (features + model)
# 3) GWO + CatBoost (features + model)
# -> Meta Neural Network on their probabilities
# -> Save all models + feature masks + zip file
# ============================================================

import numpy as np
import pandas as pd
import random
import time
import warnings
import pickle
import zipfile

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, make_scorer
)
from sklearn.base import clone

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

warnings.filterwarnings("ignore")

# ---------------- GLOBAL SETTINGS ----------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# ---------------- LOAD DATA ----------------
# Adjust this path if needed
CSV_PATH = "/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv"
TARGET_COL = "loan_status"

df = pd.read_csv(CSV_PATH)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].astype(int).values

feature_names = np.array(X.columns)
n_features = X.shape[1]

print("Data loaded.")
print("Samples:", X.shape[0], "Features:", X.shape[1])

# ---------------- TRAIN / VAL / TEST SPLIT ----------------
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=RANDOM_STATE
)
# Now: train 60%, val 20%, test 20%

print("\nTrain:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# ============================================================
# FITNESS HELPERS
# ============================================================

def fitness_xgb(mask, X_data, y_data, cv=3):
    """Fitness for PSO: XGBoost F1 CV."""
    idx = np.where(mask == 1)[0]
    if len(idx) == 0:
        return 0.0
    X_sel = X_data[:, idx]

    model = XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    scores = cross_val_score(
        model, X_sel, y_data, cv=skf,
        scoring=make_scorer(f1_score, zero_division=0),
        n_jobs=-1
    )
    return scores.mean()


def fitness_cat(mask, X_data, y_data, cv=3):
    """Fitness for GA and GWO: CatBoost F1 CV."""
    idx = np.where(mask == 1)[0]
    if len(idx) == 0:
        return 0.0
    X_sel = X_data[:, idx]

    model = CatBoostClassifier(
        iterations=200,
        learning_rate=0.05,
        depth=6,
        verbose=False,
        random_seed=RANDOM_STATE
    )

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    scores = cross_val_score(
        model, X_sel, y_data, cv=skf,
        scoring=make_scorer(f1_score, zero_division=0),
        n_jobs=-1
    )
    return scores.mean()

# Convert to numpy arrays for faster indexing
X_train_np = X_train.values
X_val_np   = X_val.values
X_test_np  = X_test.values

# ============================================================
# 1) PSO + XGBoost PATH
# ============================================================

def run_pso_feature_selection(
    n_particles=15, n_iters=15, w=0.7, c1=1.5, c2=1.5
):
    print("\n=== PSO Feature Selection (Wrapper: XGBoost) ===")
    dim = n_features

    positions = np.random.randint(0, 2, (n_particles, dim)).astype(int)
    velocities = np.random.uniform(-1, 1, (n_particles, dim))

    personal_best = positions.copy()
    personal_best_scores = np.array([
        fitness_xgb(p, X_train_np, y_train) for p in positions
    ])
    global_best = personal_best[np.argmax(personal_best_scores)].copy()
    global_best_score = personal_best_scores.max()

    for it in range(n_iters):
        print(f" PSO iter {it+1}/{n_iters} - best F1: {global_best_score:.4f}")
        for i in range(n_particles):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)

            velocities[i] = (
                w * velocities[i]
                + c1 * r1 * (personal_best[i] - positions[i])
                + c2 * r2 * (global_best - positions[i])
            )
            sigmoid = 1 / (1 + np.exp(-velocities[i]))
            positions[i] = (np.random.rand(dim) < sigmoid).astype(int)

            score = fitness_xgb(positions[i], X_train_np, y_train)

            if score > personal_best_scores[i]:
                personal_best_scores[i] = score
                personal_best[i] = positions[i].copy()

        global_best = personal_best[np.argmax(personal_best_scores)].copy()
        global_best_score = personal_best_scores.max()

    print("PSO final best F1:", global_best_score)
    return global_best, global_best_score

# ============================================================
# 2) GA + CatBoost PATH
# ============================================================

def run_ga_feature_selection(
    pop_size=25, n_gens=15, crossover_rate=0.7, mutation_rate=0.1
):
    print("\n=== GA Feature Selection (Wrapper: CatBoost) ===")
    dim = n_features

    population = np.random.randint(0, 2, (pop_size, dim)).astype(int)
    fitness_scores = np.array([
        fitness_cat(ind, X_train_np, y_train) for ind in population
    ])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for gen in range(n_gens):
        print(f" GA gen {gen+1}/{n_gens} - best F1: {fitness_scores.max():.4f}")
        new_pop = []
        # Elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(population[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            p1 = population[tournament_select()].copy()
            p2 = population[tournament_select()].copy()

            # Crossover
            if np.random.rand() < crossover_rate:
                point = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:point], p2[point:]])
                c2 = np.concatenate([p2[:point], p1[point:]])
            else:
                c1, c2 = p1, p2

            # Mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < mutation_rate:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break

        population = np.array(new_pop[:pop_size])
        fitness_scores = np.array([
            fitness_cat(ind, X_train_np, y_train) for ind in population
        ])

    best_ind = population[np.argmax(fitness_scores)].copy()
    best_score = fitness_scores.max()
    print("GA final best F1:", best_score)
    return best_ind, best_score

# ============================================================
# 3) GWO + CatBoost PATH
# ============================================================

def run_gwo_feature_selection(
    n_wolves=15, n_iters=15
):
    print("\n=== GWO Feature Selection (Wrapper: CatBoost) ===")
    dim = n_features

    population = np.random.randint(0, 2, (n_wolves, dim)).astype(int)
    fitness_scores = np.array([
        fitness_cat(ind, X_train_np, y_train) for ind in population
    ])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for it in range(n_iters):
        print(f" GWO iter {it+1}/{n_iters} - best Alpha F1: {Alpha_score:.4f}")
        # Update Alpha, Beta, Delta
        for i, score in enumerate(fitness_scores):
            if score > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, score
                Delta, Beta, Alpha = Beta, Alpha, population[i].copy()
            elif score > Beta_score:
                Delta_score, Beta_score = Beta_score, score
                Delta, Beta = Beta, population[i].copy()
            elif score > Delta_score:
                Delta_score = score
                Delta = population[i].copy()

        a = 2 - it * (2 / n_iters)

        if Alpha is None:
            continue

        for i in range(n_wolves):
            new_pos = population[i].copy()
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a
                C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - population[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a
                C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - population[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a
                C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - population[i][d])
                X3 = Delta[d] - A3 * D_delta

                X_mean = (X1 + X2 + X3) / 3.0
                s = 1 / (1 + np.exp(-X_mean))
                new_pos[d] = 1 if np.random.rand() < s else 0

            population[i] = new_pos

        fitness_scores = np.array([
            fitness_cat(ind, X_train_np, y_train) for ind in population
        ])

    best_ind = population[np.argmax(fitness_scores)].copy()
    best_score = fitness_scores.max()
    print("GWO final best F1:", best_score)
    return best_ind, best_score

# ============================================================
# RUN ALL THREE OPTIMIZERS AND TRAIN BASE MODELS
# ============================================================

# 1) PSO → best features → XGBoost
pso_mask, pso_score = run_pso_feature_selection()
pso_feat_idx = np.where(pso_mask == 1)[0]
pso_features = feature_names[pso_feat_idx]
print("\n[PSO] Selected features:", len(pso_features))

model_pso_xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=RANDOM_STATE
)

model_pso_xgb.fit(X_train_np[:, pso_feat_idx], y_train)

# 2) GA → best features → CatBoost
ga_mask, ga_score = run_ga_feature_selection()
ga_feat_idx = np.where(ga_mask == 1)[0]
ga_features = feature_names[ga_feat_idx]
print("\n[GA] Selected features:", len(ga_features))

model_ga_cat = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    verbose=False,
    random_seed=RANDOM_STATE
)
model_ga_cat.fit(X_train_np[:, ga_feat_idx], y_train)

# 3) GWO → best features → CatBoost
gwo_mask, gwo_score = run_gwo_feature_selection()
gwo_feat_idx = np.where(gwo_mask == 1)[0]
gwo_features = feature_names[gwo_feat_idx]
print("\n[GWO] Selected features:", len(gwo_features))

model_gwo_cat = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    verbose=False,
    random_seed=RANDOM_STATE
)
model_gwo_cat.fit(X_train_np[:, gwo_feat_idx], y_train)

# ============================================================
# BUILD META DATASET FROM 3 MODELS (STACKING INPUT)
# ============================================================

print("\n=== Building meta-features for Neural Network (using validation set) ===")

# Base model probabilities on validation set
pso_val_proba = model_pso_xgb.predict_proba(X_val_np[:, pso_feat_idx])[:, 1]
ga_val_proba  = model_ga_cat.predict_proba(X_val_np[:, ga_feat_idx])[:, 1]
gwo_val_proba = model_gwo_cat.predict_proba(X_val_np[:, gwo_feat_idx])[:, 1]

X_meta_train = np.vstack([pso_val_proba, ga_val_proba, gwo_val_proba]).T  # shape (n_val, 3)
y_meta_train = y_val

print("Meta-data shape:", X_meta_train.shape)

# ============================================================
# SMALL NEURAL NETWORK META-MODEL
# ============================================================

print("\n=== Training Meta Neural Network (stacking) ===")

meta_model = Sequential([
    Dense(16, activation="relu", input_shape=(3,)),
    Dropout(0.2),
    Dense(8, activation="relu"),
    Dense(1, activation="sigmoid")
])

meta_model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

history = meta_model.fit(
    X_meta_train, y_meta_train,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

print("Meta NN training done.")

# ============================================================
# EVALUATE FINAL ENSEMBLE ON TEST SET
# ============================================================

print("\n=== Evaluating final ensemble on TEST set ===")

pso_test_proba = model_pso_xgb.predict_proba(X_test_np[:, pso_feat_idx])[:, 1]
ga_test_proba  = model_ga_cat.predict_proba(X_test_np[:, ga_feat_idx])[:, 1]
gwo_test_proba = model_gwo_cat.predict_proba(X_test_np[:, gwo_feat_idx])[:, 1]

X_meta_test = np.vstack([pso_test_proba, ga_test_proba, gwo_test_proba]).T

meta_test_proba = meta_model.predict(X_meta_test).ravel()
meta_test_pred = (meta_test_proba >= 0.5).astype(int)

acc  = accuracy_score(y_test, meta_test_pred)
prec = precision_score(y_test, meta_test_pred, zero_division=0)
rec  = recall_score(y_test, meta_test_pred, zero_division=0)
f1   = f1_score(y_test, meta_test_pred, zero_division=0)

print(f"Final Ensemble Accuracy : {acc:.4f}")
print(f"Final Ensemble Precision: {prec:.4f}")
print(f"Final Ensemble Recall   : {rec:.4f}")
print(f"Final Ensemble F1-score : {f1:.4f}")

# ============================================================
# SAVE MODELS + FEATURE MASKS + ZIP
# ============================================================

print("\n=== Saving models and masks ===")

# Save XGBoost model
model_pso_xgb.save_model("model_pso_xgb.json")

# Save CatBoost models
model_ga_cat.save_model("model_ga_cat.cbm")
model_gwo_cat.save_model("model_gwo_cat.cbm")

# Save meta NN
meta_model.save("meta_meta_nn.h5")

# Save feature indices / masks
feature_info = {
    "pso_feat_idx": pso_feat_idx,
    "ga_feat_idx": ga_feat_idx,
    "gwo_feat_idx": gwo_feat_idx,
    "pso_features": pso_features.tolist(),
    "ga_features": ga_features.tolist(),
    "gwo_features": gwo_features.tolist()
}
with open("feature_masks.pkl", "wb") as f:
    pickle.dump(feature_info, f)

# Zip everything
zip_filename = "final_hybrid_ensemble_models.zip"
with zipfile.ZipFile(zip_filename, "w") as zf:
    zf.write("model_pso_xgb.json")
    zf.write("model_ga_cat.cbm")
    zf.write("model_gwo_cat.cbm")
    zf.write("meta_meta_nn.h5")
    zf.write("feature_masks.pkl")

print("Saved models and zipped file:", zip_filename)
print("Done.")


2025-11-28 15:17:19.292777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764343039.767135      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764343039.905356      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Data loaded.
Samples: 50000 Features: 19

Train: (30000, 19) Val: (10000, 19) Test: (10000, 19)

=== PSO Feature Selection (Wrapper: XGBoost) ===
 PSO iter 1/15 - best F1: 0.9216
 PSO iter 2/15 - best F1: 0.9216
 PSO iter 3/15 - best F1: 0.9271
 PSO iter 4/15 - best F1: 0.9271
 PSO iter 5/15 - best F1: 0.9271
 PSO iter 6/15 - best F1: 0.9271
 PSO iter 7/15 - best F1: 0.9271
 PSO iter 8/15 - best F1: 0.9277
 PSO iter 9/15 - best F1: 0.9277
 PSO iter 10/15 - best F1: 0.9290
 PSO iter 11/15 - best F1: 0.9290
 PSO iter 12/15 - best F1: 0.9311
 PSO iter 13/15 - best F1: 0.9311
 PSO iter 14/15 - best F1: 0.9311
 PSO iter 15/15 - best F1: 0.9328
PSO final best F1: 0.9328121953263177

[PSO] Selected features: 17

=== GA Feature Selection (Wrapper: CatBoost) ===
 GA gen 1/15 - best F1: 0.9006
 GA gen 2/15 - best F1: 0.9120
 GA gen 3/15 - best F1: 0.9231
 GA gen 4/15 - best F1: 0.9310
 GA gen 5/15 - best F1: 0.9310
 GA gen 6/15 - best F1: 0.9315
 GA gen 7/15 - best F1: 0.9324
 GA gen 8/15 - best

I0000 00:00:1764346109.250399      47 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1764346109.251083      47 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5
I0000 00:00:1764346112.349524   11679 service.cc:148] XLA service 0x7ebd08009320 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1764346112.350799   11679 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1764346112.350819   11679 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1764346112.682782   11679 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1764346113.921015   11679 device_compiler.h:188] Compiled clust

Meta NN training done.

=== Evaluating final ensemble on TEST set ===
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step




Final Ensemble Accuracy : 0.9099
Final Ensemble Precision: 0.9725
Final Ensemble Recall   : 0.8607
Final Ensemble F1-score : 0.9132

=== Saving models and masks ===
Saved models and zipped file: final_hybrid_ensemble_models.zip
Done.


In [2]:
# ============================================================
# HYBRID ENSEMBLE WITH PROPER STACKING (PSO + GA + GWO)
# BASE MODELS: XGBoost, CatBoost, CatBoost
# META MODEL: Logistic Regression (correct stacking)
# ============================================================

import numpy as np
import pandas as pd
import warnings, time, pickle, zipfile
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# ============================================================
# LOAD DATA
# ============================================================
df = pd.read_csv("/kaggle/input/loan-final-normalized-csv/loan_final_normalized.csv")

X = df.drop("loan_status", axis=1).values
y = df["loan_status"].astype(int).values
feature_names = df.drop("loan_status", axis=1).columns

N_FEATURES = X.shape[1]
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Data loaded.")


# ============================================================
# FITNESS FUNCTIONS
# ============================================================

def eval_xgb(mask):
    """Evaluate mask using XGBoost."""
    idx = np.where(mask == 1)[0]
    if len(idx) == 0:
        return 0.0
    X_sel = X[:, idx]
    
    model = XGBClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1,
        eval_metric="logloss", random_state=RANDOM_STATE
    )
    
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    for tr, va in cv.split(X_sel, y):
        model.fit(X_sel[tr], y[tr])
        pred = model.predict(X_sel[va])
        scores.append(f1_score(y[va], pred))
    return np.mean(scores)

def eval_cat(mask):
    """Evaluate mask using CatBoost."""
    idx = np.where(mask == 1)[0]
    if len(idx) == 0:
        return 0.0
    X_sel = X[:, idx]
    
    model = CatBoostClassifier(
        iterations=200, depth=6, learning_rate=0.05,
        verbose=False, random_seed=RANDOM_STATE
    )
    
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    for tr, va in cv.split(X_sel, y):
        model.fit(X_sel[tr], y[tr])
        pred = model.predict(X_sel[va])
        scores.append(f1_score(y[va], pred))
    return np.mean(scores)


# ============================================================
# PSO FEATURE SELECTION
# ============================================================
def run_pso():
    swarm = 20
    iters = 20
    dim = N_FEATURES
    
    pos = np.random.randint(0, 2, (swarm, dim))
    vel = np.random.rand(swarm, dim)
    
    pbest = pos.copy()
    pbest_scores = np.array([eval_xgb(m) for m in pos])
    
    gbest = pbest[np.argmax(pbest_scores)].copy()
    gbest_score = np.max(pbest_scores)
    
    for t in range(iters):
        print(f"PSO Iter={t+1} BestF1={gbest_score:.4f}")
        for i in range(swarm):
            r1, r2 = np.random.rand(dim), np.random.rand(dim)
            vel[i] = 0.7*vel[i] + 1.5*r1*(pbest[i]-pos[i]) + 1.5*r2*(gbest-pos[i])
            pos[i] = (1/(1+np.exp(-vel[i])) > 0.5).astype(int)
            
            sc = eval_xgb(pos[i])
            if sc > pbest_scores[i]:
                pbest_scores[i] = sc
                pbest[i] = pos[i].copy()
            
            if sc > gbest_score:
                gbest_score = sc
                gbest = pos[i].copy()
    
    return gbest


# ============================================================
# GENETIC ALGORITHM FEATURE SELECTION
# ============================================================
def run_ga():
    pop = 30
    gens = 20
    dim = N_FEATURES
    
    population = np.random.randint(0, 2, (pop, dim))
    
    for g in range(gens):
        scores = np.array([eval_cat(ind) for ind in population])
        print(f"GA Gen {g+1} BestF1={scores.max():.4f}")
        
        parents = population[scores.argsort()[-10:]]
        
        children = []
        for _ in range(pop // 2):
            p1, p2 = parents[np.random.randint(10)], parents[np.random.randint(10)]
            cut = np.random.randint(1, dim - 1)
            child = np.concatenate([p1[:cut], p2[cut:]])
            children.append(child)
        
        children = np.array(children)
        
        mutation = np.random.rand(*children.shape) < 0.1
        children = np.abs(children - mutation.astype(int))
        
        population = np.vstack((parents, children))
    
    best = population[np.argmax([eval_cat(ind) for ind in population])]
    return best


# ============================================================
# GWO FEATURE SELECTION
# ============================================================
def run_gwo():
    wolves = 20
    iters = 20
    dim = N_FEATURES
    
    pop = np.random.randint(0, 2, (wolves, dim))
    scores = np.array([eval_cat(ind) for ind in pop])
    
    alpha, beta, delta = pop[scores.argsort()[-3:]]
    
    for t in range(iters):
        print(f"GWO Iter={t+1} BestF1={scores.max():.4f}")
        
        a = 2 - t*(2/iters)
        
        for i in range(wolves):
            for leader in [alpha, beta, delta]:
                r1, r2 = np.random.rand(), np.random.rand()
                A = 2*a*r1 - a
                C = 2*r2
                D = abs(C*leader - pop[i])
                X = leader - A*D
                pop[i] = (1/(1+np.exp(-X)) > 0.5).astype(int)
        
        scores = np.array([eval_cat(ind) for ind in pop])
        alpha, beta, delta = pop[scores.argsort()[-3:]]
    
    return alpha


# ============================================================
# TRAIN BASE MODELS WITH FINAL SELECTED FEATURES
# ============================================================
def train_base_model(model, mask, X_train, y_train):
    idx = np.where(mask == 1)[0]
    model.fit(X_train[:, idx], y_train)
    return model


# ============================================================
# PROPER STACKING USING OUT-OF-FOLD PREDICTIONS
# ============================================================
def generate_oof_predictions(mask, model, X, y):
    idx = np.where(mask == 1)[0]
    X_sel = X[:, idx]
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))
    
    for tr, va in skf.split(X_sel, y):
        m = clone(model)
        m.fit(X_sel[tr], y[tr])
        oof[va] = m.predict_proba(X_sel[va])[:, 1]
    
    return oof


# ============================================================
# RUN OPTIMIZERS
# ============================================================
print("\nRunning PSO...")
mask_pso = run_pso()

print("\nRunning GA...")
mask_ga = run_ga()

print("\nRunning GWO...")
mask_gwo = run_gwo()


# ============================================================
# BASE MODELS
# ============================================================
xgb_model = XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
    random_state=42, n_jobs=-1
)

cat_model1 = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.05, verbose=False)
cat_model2 = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.05, verbose=False)


# ============================================================
# OOF STACKING
# ============================================================
print("\nGenerating OOF predictions...")

oof_pso = generate_oof_predictions(mask_pso, xgb_model, X, y)
oof_ga = generate_oof_predictions(mask_ga, cat_model1, X, y)
oof_gwo = generate_oof_predictions(mask_gwo, cat_model2, X, y)

stack_X = np.vstack([oof_pso, oof_ga, oof_gwo]).T


# ============================================================
# TRAIN META-MODEL (LOGISTIC REGRESSION)
# ============================================================
print("\nTraining Logistic Regression Meta Model...")

meta_model = LogisticRegression()
meta_model.fit(stack_X, y)

print("Meta model training done.")


# ============================================================
# FINAL TEST SPLIT AND EVALUATION
# ============================================================
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

test_pso = xgb_model.fit(X_train_final[:, mask_pso==1], y_train_final)\
                    .predict_proba(X_test_final[:, mask_pso==1])[:,1]

test_ga = cat_model1.fit(X_train_final[:, mask_ga==1], y_train_final)\
                    .predict_proba(X_test_final[:, mask_ga==1])[:,1]

test_gwo = cat_model2.fit(X_train_final[:, mask_gwo==1], y_train_final)\
                     .predict_proba(X_test_final[:, mask_gwo==1])[:,1]

test_stack = np.vstack([test_pso, test_ga, test_gwo]).T

meta_pred = meta_model.predict(test_stack)
meta_proba = meta_model.predict_proba(test_stack)[:,1]

acc = accuracy_score(y_test_final, meta_pred)
prec = precision_score(y_test_final, meta_pred)
rec = recall_score(y_test_final, meta_pred)
f1 = f1_score(y_test_final, meta_pred)

print("\n===== FINAL STACKED ENSEMBLE RESULTS =====")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1 Score  : {f1:.4f}")


# ============================================================
# SAVE ALL MODELS AND MASKS
# ============================================================
print("\nSaving models...")

with open("mask_pso.pkl", "wb") as f: pickle.dump(mask_pso, f)
with open("mask_ga.pkl", "wb") as f: pickle.dump(mask_ga, f)
with open("mask_gwo.pkl", "wb") as f: pickle.dump(mask_gwo, f)
with open("meta_model.pkl", "wb") as f: pickle.dump(meta_model, f)

# Zip everything
with zipfile.ZipFile("final_stacked_ensemble.zip", "w") as zf:
    zf.write("mask_pso.pkl")
    zf.write("mask_ga.pkl")
    zf.write("mask_gwo.pkl")
    zf.write("meta_model.pkl")

print("Saved final_stacked_ensemble.zip successfully.")


Data loaded.

Running PSO...
PSO Iter=1 BestF1=0.9244
PSO Iter=2 BestF1=0.9342
PSO Iter=3 BestF1=0.9342
PSO Iter=4 BestF1=0.9342
PSO Iter=5 BestF1=0.9342
PSO Iter=6 BestF1=0.9342
PSO Iter=7 BestF1=0.9342
PSO Iter=8 BestF1=0.9342
PSO Iter=9 BestF1=0.9342
PSO Iter=10 BestF1=0.9342
PSO Iter=11 BestF1=0.9342
PSO Iter=12 BestF1=0.9342
PSO Iter=13 BestF1=0.9342
PSO Iter=14 BestF1=0.9342
PSO Iter=15 BestF1=0.9342
PSO Iter=16 BestF1=0.9342
PSO Iter=17 BestF1=0.9342
PSO Iter=18 BestF1=0.9342
PSO Iter=19 BestF1=0.9342
PSO Iter=20 BestF1=0.9342

Running GA...
GA Gen 1 BestF1=0.9255
GA Gen 2 BestF1=0.9255
GA Gen 3 BestF1=0.9313
GA Gen 4 BestF1=0.9313
GA Gen 5 BestF1=0.9335
GA Gen 6 BestF1=0.9335
GA Gen 7 BestF1=0.9335
GA Gen 8 BestF1=0.9340
GA Gen 9 BestF1=0.9340
GA Gen 10 BestF1=0.9340
GA Gen 11 BestF1=0.9340
GA Gen 12 BestF1=0.9340
GA Gen 13 BestF1=0.9340
GA Gen 14 BestF1=0.9340
GA Gen 15 BestF1=0.9340
GA Gen 16 BestF1=0.9340
GA Gen 17 BestF1=0.9340
GA Gen 18 BestF1=0.9340
GA Gen 19 BestF1=0.934

In [5]:
# ==========================================================
#  FULL PIPELINE: FEATURE SELECTION + LOADED MODEL PREDICTION
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import logging
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# -----------------------------
# Disable warnings
# -----------------------------
warnings.filterwarnings("ignore")

# -----------------------------
# Setup Logger (show all logs)
# -----------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
log = logging.getLogger()

log.info("Starting diabetes feature selection + loan model prediction pipeline...")

# ==========================================================
# 1. LOAD DIABETES DATASET
# ==========================================================
log.info("Loading Diabetes dataset...")
df = pd.read_csv("/kaggle/input/balanced/balanced_20percent.csv")

X = df.drop("Diabetes_binary", axis=1)
y = df["Diabetes_binary"]

log.info(f"Dataset loaded. Shape: {df.shape}")
log.info(f"Target Value Counts:\n{y.value_counts()}")

# ==========================================================
# 2. FEATURE SELECTION
# ==========================================================
log.info("Performing scaling + feature selection...")

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X_scaled, y)

selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

log.info(f"Selected Features ({len(selected_features)}): {list(selected_features)}")

# Convert back to DataFrame
X_selected_df = pd.DataFrame(X_selected, columns=selected_features)

# ==========================================================
# 3. LOAD SAVED MODEL (TRAINED ON LOAN DATASET)
# ==========================================================
model_path = "/kaggle/input/saved-models/meta_model.pkl"
log.info(f"Loading trained model from: {model_path}")

model = joblib.load(model_path)

# Model has NO feature_names_in_ — use coef_ shape
n_features = model.coef_.shape[1]
log.info(f"Model expects {n_features} input features.")

# ==========================================================
# 4. ALIGN FEATURES BY PAD/TRIM ONLY
# ==========================================================
log.info("Aligning diabetes selected features to required number of input features...")

# If selected feature count < expected → pad with zeros
if X_selected_df.shape[1] < n_features:
    missing = n_features - X_selected_df.shape[1]
    for i in range(missing):
        X_selected_df[f"pad_{i}"] = 0
    log.info(f"Padded with {missing} zero-features.")

# If selected feature count > expected → truncate
if X_selected_df.shape[1] > n_features:
    X_selected_df = X_selected_df.iloc[:, :n_features]
    log.info(f"Trimmed extra features.")

# Final aligned input
X_final = X_selected_df
log.info(f"Final aligned dataset shape: {X_final.shape}")

# ==========================================================
# 5. PREDICTION
# ==========================================================
log.info("Running predictions...")

pred = model.predict(X_final)
prob = model.predict_proba(X_final)[:, 1]

# ==========================================================
# 6. METRICS & REPORT
# ==========================================================
log.info("Computing evaluation metrics...")

acc = accuracy_score(y, pred)
prec = precision_score(y, pred, zero_division=0)
rec = recall_score(y, pred, zero_division=0)
f1 = f1_score(y, pred, zero_division=0)
cm = confusion_matrix(y, pred)
cls_report = classification_report(y, pred)

print("\n========== MODEL EVALUATION ==========")
print(f"Accuracy       : {acc:.4f}")
print(f"Precision      : {prec:.4f}")
print(f"Recall         : {rec:.4f}")
print(f"F1-score       : {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(cls_report)

log.info("Pipeline completed successfully.")


2025-11-29 15:03:44,663 — INFO — Starting diabetes feature selection + loan model prediction pipeline...
2025-11-29 15:03:44,665 — INFO — Loading Diabetes dataset...
2025-11-29 15:03:44,694 — INFO — Dataset loaded. Shape: (14098, 22)
2025-11-29 15:03:44,696 — INFO — Target Value Counts:
Diabetes_binary
0.0    7049
1.0    7049
Name: count, dtype: int64
2025-11-29 15:03:44,697 — INFO — Performing scaling + feature selection...
2025-11-29 15:03:44,709 — INFO — Selected Features (10): ['HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'HvyAlcoholConsump', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Income']
2025-11-29 15:03:44,709 — INFO — Loading trained model from: /kaggle/input/saved-models/meta_model.pkl
2025-11-29 15:03:44,712 — INFO — Model expects 3 input features.
2025-11-29 15:03:44,713 — INFO — Aligning diabetes selected features to required number of input features...
2025-11-29 15:03:44,714 — INFO — Trimmed extra features.
2025-11-29 15:03:44,714 — INFO — Final al


Accuracy       : 0.5322
Precision      : 0.7528
Recall         : 0.0959
F1-score       : 0.1701

Confusion Matrix:
[[6827  222]
 [6373  676]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.52      0.97      0.67      7049
         1.0       0.75      0.10      0.17      7049

    accuracy                           0.53     14098
   macro avg       0.63      0.53      0.42     14098
weighted avg       0.63      0.53      0.42     14098



In [11]:
# ==========================================================
# FINAL PIPELINE: Drop 2 features → Apply GA/GWO/PSO masks
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import warnings
import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
log = logging.getLogger()


# ==========================================================
# 1. LOAD DIABETES DATASET
# ==========================================================
df = pd.read_csv("/kaggle/input/balanced/balanced_20percent.csv")

# Drop 2 columns to match GA/PSO/GWO mask size = 19
drop_cols = ["DiffWalk", "Sex"]   # you can change if needed
df = df.drop(columns=drop_cols)

log.info(f"After dropping {drop_cols}, shape = {df.shape}")

X = df.drop("Diabetes_binary", axis=1)
y = df["Diabetes_binary"]

feature_names = list(X.columns)
log.info(f"Final feature count (should be 19): {len(feature_names)}")


# ==========================================================
# 2. LOAD GA/GWO/PSO MASK ARRAYS
# ==========================================================
paths = [
    "/kaggle/input/saved-models/mask_ga.pkl",
    "/kaggle/input/saved-models/mask_gwo.pkl",
    "/kaggle/input/saved-models/mask_pso.pkl"
]

masks = [joblib.load(p) for p in paths]

for i, mask in enumerate(masks):
    log.info(f"Mask {i+1}: length={len(mask)} values={mask}")


# ==========================================================
# 3. APPLY MASKS TO GET SELECTED FEATURES
# ==========================================================
selected_lists = []

for i, mask in enumerate(masks):
    selected = list(np.array(feature_names)[mask == 1])
    selected_lists.append(selected)
    log.info(f"Model {i+1} selected: {selected}")


# ==========================================================
# 4. FEATURE VOTING (Majority rule: appear in ≥ 2 models)
# ==========================================================
votes = {}
for feats in selected_lists:
    for f in feats:
        votes[f] = votes.get(f, 0) + 1

final_features = [f for f, c in votes.items() if c >= 2]

# fallback: select top 10 most voted
if len(final_features) < 3:
    final_features = sorted(votes, key=votes.get, reverse=True)[:10]

log.info(f"FINAL VOTED FEATURES: {final_features}")


# ==========================================================
# 5. ALIGN FOR LOGISTIC REGRESSION MODEL (PAD/TRIM)
# ==========================================================
X_selected = X[final_features]

# Load logistic regression model
lr_model = joblib.load("/kaggle/input/saved-models/meta_model.pkl")

n_features = lr_model.coef_.shape[1]
log.info(f"Logistic model expects {n_features} features.")

# pad / trim
if X_selected.shape[1] < n_features:
    missing = n_features - X_selected.shape[1]
    for i in range(missing):
        X_selected[f"pad_{i}"] = 0
    log.info(f"Padded {missing} missing columns.")

if X_selected.shape[1] > n_features:
    X_selected = X_selected.iloc[:, :n_features]
    log.info("Trimmed extra columns.")

X_final = X_selected
log.info(f"Final input shape = {X_final.shape}")


# ==========================================================
# 6. PREDICT
# ==========================================================
pred = lr_model.predict(X_final)
prob = lr_model.predict_proba(X_final)[:, 1]


# ==========================================================
# 7. METRICS
# ==========================================================
print("\n======== MODEL PERFORMANCE ========")
print("Accuracy :", round(accuracy_score(y, pred), 4))
print("Precision:", round(precision_score(y, pred), 4))
print("Recall   :", round(recall_score(y, pred), 4))
print("F1 Score :", round(f1_score(y, pred), 4))

print("\nConfusion Matrix:\n", confusion_matrix(y, pred))
print("\nClassification Report:\n", classification_report(y, pred))

log.info("Pipeline completed successfully.")


2025-11-29 15:25:54,466 — INFO — After dropping ['DiffWalk', 'Sex'], shape = (14098, 20)
2025-11-29 15:25:54,469 — INFO — Final feature count (should be 19): 19
2025-11-29 15:25:54,475 — INFO — Mask 1: length=19 values=[1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1]
2025-11-29 15:25:54,476 — INFO — Mask 2: length=19 values=[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2025-11-29 15:25:54,476 — INFO — Mask 3: length=19 values=[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2025-11-29 15:25:54,477 — INFO — Model 1 selected: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Stroke', 'HeartDiseaseorAttack', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
2025-11-29 15:25:54,478 — INFO — Model 2 selected: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Educati


Accuracy : 0.5174
Precision: 0.5089
Recall   : 0.994
F1 Score : 0.6732

Confusion Matrix:
 [[ 287 6762]
 [  42 7007]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.04      0.08      7049
         1.0       0.51      0.99      0.67      7049

    accuracy                           0.52     14098
   macro avg       0.69      0.52      0.38     14098
weighted avg       0.69      0.52      0.38     14098



In [1]:
import joblib

paths = [
    "/kaggle/input/saved-models/mask_ga.pkl",
    "/kaggle/input/saved-models/mask_gwo.pkl",
    "/kaggle/input/saved-models/mask_pso.pkl"
]

for p in paths:
    print("\n==============================")
    print("FILE:", p)
    print("==============================")
    obj = joblib.load(p)
    print("TYPE:", type(obj))

    # If dict, print keys
    if isinstance(obj, dict):
        print("DICT KEYS:", obj.keys())

    # If list/array
    if isinstance(obj, (list, tuple)):
        print("LIST LENGTH:", len(obj))
        print("FIRST 5:", obj[:5])

    # If numpy array
    try:
        import numpy as np
        if isinstance(obj, np.ndarray):
            print("ARRAY SHAPE:", obj.shape)
            print("FIRST ROW:", obj[:5])
    except:
        pass

    print(obj)



FILE: /kaggle/input/saved-models/mask_ga.pkl
TYPE: <class 'numpy.ndarray'>
ARRAY SHAPE: (19,)
FIRST ROW: [1 1 1 1 0]
[1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1]

FILE: /kaggle/input/saved-models/mask_gwo.pkl
TYPE: <class 'numpy.ndarray'>
ARRAY SHAPE: (19,)
FIRST ROW: [1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

FILE: /kaggle/input/saved-models/mask_pso.pkl
TYPE: <class 'numpy.ndarray'>
ARRAY SHAPE: (19,)
FIRST ROW: [1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [6]:
# ==========================================================
# FINAL PIPELINE: Drop 2 features → Apply GA/GWO/PSO masks
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import warnings
import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
log = logging.getLogger()


# ==========================================================
# 1. LOAD DIABETES DATASET
# ==========================================================
df = pd.read_csv("/kaggle/input/balanced/balanced_20percent.csv")

# Drop 2 columns to match GA/PSO/GWO mask size = 19
drop_cols = ["DiffWalk", "Sex"]   # you can change if needed
df = df.drop(columns=drop_cols)

log.info(f"After dropping {drop_cols}, shape = {df.shape}")

X = df.drop("Diabetes_binary", axis=1)
y = df["Diabetes_binary"]

feature_names = list(X.columns)
log.info(f"Final feature count (should be 19): {len(feature_names)}")


# ==========================================================
# 2. LOAD GA/GWO/PSO MASK ARRAYS
# ==========================================================
paths = [
    "/kaggle/input/saved-models/mask_ga.pkl",
    "/kaggle/input/saved-models/mask_gwo.pkl",
    "/kaggle/input/saved-models/mask_pso.pkl"
]

masks = [joblib.load(p) for p in paths]

for i, mask in enumerate(masks):
    log.info(f"Mask {i+1}: length={len(mask)} values={mask}")


# ==========================================================
# 3. APPLY MASKS TO GET SELECTED FEATURES
# ==========================================================
selected_lists = []

for i, mask in enumerate(masks):
    selected = list(np.array(feature_names)[mask == 1])
    selected_lists.append(selected)
    log.info(f"Model {i+1} selected: {selected}")


# ==========================================================
# 4. FEATURE VOTING (Majority rule: appear in ≥ 2 models)
# ==========================================================
votes = {}
for feats in selected_lists:
    for f in feats:
        votes[f] = votes.get(f, 0) + 1

final_features = [f for f, c in votes.items() if c >= 2]

# fallback: select top 10 most voted
if len(final_features) < 3:
    final_features = sorted(votes, key=votes.get, reverse=True)[:10]

log.info(f"FINAL VOTED FEATURES: {final_features}")


# ==========================================================
# 5. ALIGN FOR LOGISTIC REGRESSION MODEL (PAD/TRIM)
# ==========================================================
X_selected = X[final_features]

# Load logistic regression model
lr_model = joblib.load("/kaggle/input/saved-models/meta_model.pkl")

n_features = lr_model.coef_.shape[1]
log.info(f"Logistic model expects {n_features} features.")

# pad / trim
if X_selected.shape[1] < n_features:
    missing = n_features - X_selected.shape[1]
    for i in range(missing):
        X_selected[f"pad_{i}"] = 0
    log.info(f"Padded {missing} missing columns.")

if X_selected.shape[1] > n_features:
    X_selected = X_selected.iloc[:, :n_features]
    log.info("Trimmed extra columns.")

X_final = X_selected
log.info(f"Final input shape = {X_final.shape}")


# ==========================================================
# 6. PREDICT
# ==========================================================
pred = lr_model.predict(X_final)
prob = lr_model.predict_proba(X_final)[:, 1]


# ==========================================================
# 7. METRICS
# ==========================================================
print("\n======== MODEL PERFORMANCE ========")
print("Accuracy :", round(accuracy_score(y, pred), 4))
print("Precision:", round(precision_score(y, pred), 4))
print("Recall   :", round(recall_score(y, pred), 4))
print("F1 Score :", round(f1_score(y, pred), 4))

print("\nConfusion Matrix:\n", confusion_matrix(y, pred))
print("\nClassification Report:\n", classification_report(y, pred))

log.info("Pipeline completed successfully.")


2025-11-29 18:20:30,850 — INFO — Diabetes dataset loaded with 19 features.
2025-11-29 18:20:30,856 — INFO — Loaded GA mask length  = 19
2025-11-29 18:20:30,856 — INFO — Loaded GWO mask length = 19
2025-11-29 18:20:30,857 — INFO — Loaded PSO mask length = 19
2025-11-29 18:20:30,862 — INFO — Loaded model for union: <class 'catboost.core.CatBoostClassifier'>
2025-11-29 18:20:30,866 — INFO — Loaded model for intersection: <class 'catboost.core.CatBoostClassifier'>
2025-11-29 18:20:30,870 — INFO — Loaded model for voting: <class 'catboost.core.CatBoostClassifier'>


CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 0 should be feature with name years_employed (found HighBP).

In [1]:
import pandas as pd

df=pd.read_csv("/kaggle/input/new-ids/UNSW_NB15_testing-set.csv")
print(df.columns)

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')


In [2]:
# ==========================================================
# FINAL PIPELINE: Apply GA/GWO/PSO masks on DDOS → Test with saved LR model
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import warnings
import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
log = logging.getLogger()


# ==========================================================
# 1. LOAD DDOS DATASET  (YOU SELECT EXACTLY 19 COLUMNS)
# ==========================================================
df = pd.read_csv("/kaggle/input/new-ids/UNSW_NB15_testing-set.csv")

# 👉 YOU SELECT EXACTLY 19 FEATURES FROM DDOS
ddos_features_19 = [
    "spkts","dpkts","sbytes","dbytes","rate",
    "sttl","dttl","sload","dload","sloss",
    "dloss","synack","ackdat","smean","dmean",
    "sinpkt","dinpkt","sjit","djit"
]

df = df[ddos_features_19].copy()
log.info(f"DDOS dataset loaded with 19 selected numeric features: {df.shape}")


# ==========================================================
# 2. LOAD SAVED MASKS (GA / GWO / PSO)
# ==========================================================
paths = [
    "/kaggle/input/saved-models/mask_ga.pkl",
    "/kaggle/input/saved-models/mask_gwo.pkl",
    "/kaggle/input/saved-models/mask_pso.pkl"
]

masks = [joblib.load(p) for p in paths]

for i, mask in enumerate(masks):
    log.info(f"Mask {i+1}: length={len(mask)} values={mask}")


# ==========================================================
# 3. RENAME DDOS COLUMNS → MATCH EXPECTED FEATURE NAMES FROM MODEL
# ==========================================================
lr_model = joblib.load("/kaggle/input/saved-models/meta_model.pkl")
expected_names = list(lr_model.feature_names_in_)

if len(expected_names) != 19:
    raise ValueError("Your logistic regression model does NOT expect 19 features!")

df.columns = expected_names
log.info("DDOS columns renamed to match model feature names.")


# ==========================================================
# 4. APPLY MASKS + VOTING (just like diabetes)
# ==========================================================
feature_names = list(df.columns)

selected_lists = []

for i, mask in enumerate(masks):
    selected = list(np.array(feature_names)[mask == 1])
    selected_lists.append(selected)
    log.info(f"Model {i+1} selected: {selected}")

# voting
votes = {}
for feats in selected_lists:
    for f in feats:
        votes[f] = votes.get(f, 0) + 1

final_features = [f for f, c in votes.items() if c >= 2]

if len(final_features) < 3:
    final_features = sorted(votes, key=votes.get, reverse=True)[:10]

log.info(f"FINAL VOTED FEATURES: {final_features}")


# ==========================================================
# 5. ALIGN DDOS SELECTED FEATURES FOR LOGISTIC MODEL
# ==========================================================
X_selected = df[final_features]

n_features = lr_model.coef_.shape[1]
log.info(f"Logistic model expects {n_features} features.")

# pad if needed
if X_selected.shape[1] < n_features:
    missing = n_features - X_selected.shape[1]
    for i in range(missing):
        X_selected[f"pad_{i}"] = 0
    log.info(f"Padded {missing} missing columns.")

# trim if needed
if X_selected.shape[1] > n_features:
    X_selected = X_selected.iloc[:, :n_features]
    log.info("Trimmed extra columns.")

X_final = X_selected.copy()
log.info(f"Final DDOS input shape for model: {X_final.shape}")


# ==========================================================
# 6. PREDICT USING TRAINED MODEL
# ==========================================================
pred = lr_model.predict(X_final)
prob = lr_model.predict_proba(X_final)[:, 1]


# ==========================================================
# 7. IF YOU HAVE DDOS LABELS, LOAD THEM
# ==========================================================
try:
    y = pd.read_csv("/kaggle/input/ddos/ddos_labels.csv")["label"]
except:
    y = None


# ==========================================================
# 8. METRICS (ONLY IF LABEL AVAILABLE)
# ==========================================================
if y is not None:
    print("\n======== MODEL PERFORMANCE (DDOS) ========")
    print("Accuracy :", round(accuracy_score(y, pred), 4))
    print("Precision:", round(precision_score(y, pred), 4))
    print("Recall   :", round(recall_score(y, pred), 4))
    print("F1 Score :", round(f1_score(y, pred), 4))

    print("\nConfusion Matrix:\n", confusion_matrix(y, pred))
    print("\nClassification Report:\n", classification_report(y, pred))
else:
    print("\nNo DDOS labels → showing first 20 predictions:")
    print(pred[:20])

log.info("DDOS pipeline completed successfully.")


2025-12-02 12:41:15,903 — INFO — DDOS dataset loaded with 19 selected numeric features: (82332, 19)


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/saved-models/mask_ga.pkl'

In [7]:
# ==========================================================
# AUTO FEATURE SELECTION ON FULL DDOS DATASET (47 COLS)
# Using GA/GWO/PSO masks → voting → align → test with LOAN LR model
# ==========================================================

import pandas as pd
import numpy as np
import joblib
import warnings
import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
log = logging.getLogger()

# ==========================================================
# 1. LOAD FULL DDOS DATASET (47+ columns)
# ==========================================================
ddos_csv_path = "/kaggle/working/ddos_final_preprocessed.csv"    # <-- change here
df = pd.read_csv(ddos_csv_path)

TARGET_COL = "label"
if TARGET_COL not in df.columns:
    raise ValueError(f"{TARGET_COL} not found in dataset.")

X_full = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

feature_names_full = list(X_full.columns)
log.info(f"DDOS dataset loaded: {X_full.shape[1]} features present.")


# ==========================================================
# 2. LOAD 19-length GA/GWO/PSO MASK ARRAYS
# ==========================================================
paths = [
    "/kaggle/input/saved-model/mask_ga.pkl",
    "/kaggle/input/saved-model/mask_gwo.pkl",
    "/kaggle/input/saved-model/mask_pso.pkl"
]

masks = [joblib.load(p) for p in paths]

for i, mask in enumerate(masks):
    log.info(f"Mask {i+1}: length={len(mask)} values={mask}")

# These masks correspond to LOAN dataset feature ordering.
# We must select the FIRST 19 columns from DDOS for mapping.
# You can reorder differently if needed.


# ==========================================================
# 3. PICK FIRST 19 NUMERIC DDOS FEATURES AUTOMATICALLY
# ==========================================================
# FILTER NUMERIC ONLY
numeric_cols = X_full.select_dtypes(include=['number']).columns.tolist()

if len(numeric_cols) < 19:
    raise ValueError("Your DDOS dataset does not have 19 numeric columns!")

# Take first 19 numeric columns
ddos_19_features = numeric_cols[:19]

log.info(f"Auto-selected DDOS 19 numeric columns: {ddos_19_features}")

X_19 = X_full[ddos_19_features]
feature_names = ddos_19_features   # these 19 will be mapped to masks


# ==========================================================
# 4. APPLY MASKS TO THESE 19 DDOS FEATURES
# ==========================================================
selected_lists = []

for i, mask in enumerate(masks):
    selected = list(np.array(feature_names)[mask == 1])
    selected_lists.append(selected)
    log.info(f"Model {i+1} selected: {selected}")


# ==========================================================
# 5. FEATURE VOTING (majority: ≥2 masks)
# ==========================================================
votes = {}
for feats in selected_lists:
    for f in feats:
        votes[f] = votes.get(f, 0) + 1

final_features = [f for f, c in votes.items() if c >= 2]

# fallback
if len(final_features) < 3:
    final_features = sorted(votes, key=votes.get, reverse=True)[:10]

log.info(f"FINAL VOTED FEATURES: {final_features}")


# ==========================================================
# 6. LOAD LOGISTIC REGRESSION MODEL (TRAINED ON LOAN)
# ==========================================================
lr_model = joblib.load("/kaggle/input/saved-model/meta_model.pkl")

n_features_expected = lr_model.coef_.shape[1]
log.info(f"LR model expects {n_features_expected} features.")


# ==========================================================
# 7. ALIGN DDOS FEATURES TO MODEL EXPECTED SIZE
# ==========================================================
X_sel = X_19[final_features]

# pad
if X_sel.shape[1] < n_features_expected:
    missing = n_features_expected - X_sel.shape[1]
    for i in range(missing):
        X_sel[f"pad_{i}"] = 0
    log.info(f"Padded {missing} missing columns.")

# trim
if X_sel.shape[1] > n_features_expected:
    X_sel = X_sel.iloc[:, :n_features_expected]
    log.info("Trimmed extra columns.")

X_final = X_sel
log.info(f"Aligned DDOS input shape: {X_final.shape}")


# ==========================================================
# 8. PREDICT USING LOAN MODEL
# ==========================================================
pred = lr_model.predict(X_final)
prob = lr_model.predict_proba(X_final)[:, 1]


# ==========================================================
# 9. METRICS
# ==========================================================
print("\n======== DDOS TESTING WITH LOAN MODEL ========")
print("Accuracy :", round(accuracy_score(y, pred), 4))
print("Precision:", round(precision_score(y, pred), 4))
print("Recall   :", round(recall_score(y, pred), 4))
print("F1 Score :", round(f1_score(y, pred), 4))

print("\nConfusion Matrix:\n", confusion_matrix(y, pred))
print("\nClassification Report:\n", classification_report(y, pred))

log.info("DDOS Testing using LOAN model completed successfully.")


2025-12-02 13:39:09,217 — INFO — DDOS dataset loaded: 19 features present.
2025-12-02 13:39:09,239 — INFO — Mask 1: length=19 values=[1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1]
2025-12-02 13:39:09,240 — INFO — Mask 2: length=19 values=[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2025-12-02 13:39:09,240 — INFO — Mask 3: length=19 values=[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2025-12-02 13:39:09,244 — INFO — Auto-selected DDOS 19 numeric columns: ['id', 'dload', 'sinpkt', 'ct_srv_src', 'attack_cat', 'state', 'sbytes', 'rate', 'dttl', 'sload', 'sjit', 'dwin', 'synack', 'ackdat', 'trans_depth', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_src_ltm']
2025-12-02 13:39:09,247 — INFO — Model 1 selected: ['id', 'dload', 'sinpkt', 'ct_srv_src', 'state', 'sbytes', 'dttl', 'sload', 'sjit', 'dwin', 'synack', 'ackdat', 'trans_depth', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_src_ltm']
2025-12-02 13:39:09,248 — INFO — Model 2 selected: ['id', 'dload', 'sinpkt', 'ct_srv_src', 'attack_cat',


Accuracy : 0.4389
Precision: 0.0069
Recall   : 0.0001
F1 Score : 0.0003

Confusion Matrix:
 [[36133   867]
 [45326     6]]


2025-12-02 13:39:09,486 — INFO — DDOS Testing using LOAN model completed successfully.



Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.98      0.61     37000
           1       0.01      0.00      0.00     45332

    accuracy                           0.44     82332
   macro avg       0.23      0.49      0.31     82332
weighted avg       0.20      0.44      0.27     82332



In [4]:
# ==========================================================
#   DDOS FEATURE SELECTION (PSO + GA + GWO) on ALL COLUMNS
#   Converts all categorical → numeric, then selects TOP-19
#   Saves ddos_selected19.csv
# ==========================================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.base import clone
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# LOAD DDOS DATASET
# -------------------------
ddos_csv_path = "/kaggle/input/new-ids/UNSW_NB15_testing-set.csv"     # <-- change
TARGET_COL = "label"

df = pd.read_csv(ddos_csv_path)
print("\nLoaded DDOS dataset:", df.shape)

# ------------------------------------------------------
# 1. CONVERT ALL NON-NUMERIC COLUMNS → LABEL ENCODED
# ------------------------------------------------------
encoders = {}
for col in df.columns:
    if col != TARGET_COL and df[col].dtype == "object":
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        encoders[col] = encoder

print("Converted categorical columns to numeric.")

# ------------------------------------------------------
# PREPARE DATA
# ------------------------------------------------------
X_full = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

feature_names = list(X_full.columns)
N = len(feature_names)

print(f"Total encoded feature count = {N}")


# ============================================================
# FITNESS FUNCTION (CatBoost)
# ============================================================
def get_model():
    return CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.05,
        verbose=0,
        random_seed=42
    )

def fitness(mask):
    idx = np.where(mask == 1)[0]
    if len(idx) == 0:
        return 0

    X_sel = X_full.iloc[:, idx]
    model = get_model()

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    scores = cross_val_score(
        clone(model), X_sel, y, cv=skf,
        scoring=make_scorer(f1_score)
    )
    return scores.mean()


# ============================================================
# PSO / GA / GWO  (same logic, works on ALL features)
# ============================================================

def run_pso(swarm=15, iters=10):
    print("\n=== PSO ===")
    dim = N
    pos = np.random.randint(0,2,(swarm,dim))
    vel = np.random.uniform(-1,1,(swarm,dim))

    best = pos[0]
    best_score = fitness(best)

    for t in range(iters):
        for i in range(swarm):
            r1, r2 = np.random.rand(dim), np.random.rand(dim)
            vel[i] = 0.5*vel[i] + 1.5*r1*(best-pos[i]) + 1.5*r2*(best-pos[i])
            s = 1/(1+np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            score = fitness(pos[i])
            if score > best_score:
                best_score = score
                best = pos[i].copy()

        print(f" iter {t+1}/{iters} best={best_score:.4f}")

    return best


def run_ga(pop=20, gens=10):
    print("\n=== GA ===")
    dim = N
    population = np.random.randint(0,2,(pop,dim))

    def mutate(ind):
        for i in range(dim):
            if np.random.rand() < 0.05:
                ind[i] = 1 - ind[i]
        return ind

    for g in range(gens):
        fitnesses = np.array([fitness(ind) for ind in population])
        best_idx = np.argmax(fitnesses)

        new_pop = [population[best_idx].copy()]  # elitism

        while len(new_pop) < pop:
            parents = population[np.random.choice(pop, 2)]
            pt = np.random.randint(1, dim)
            child = np.concatenate([parents[0][:pt], parents[1][pt:]])
            new_pop.append(mutate(child))

        population = np.array(new_pop)
        print(f" gen {g+1}/{gens} best={fitnesses[best_idx]:.4f}")

    return population[np.argmax(fitnesses)]


def run_gwo(wolves=15, iters=10):
    print("\n=== GWO ===")
    dim = N
    pack = np.random.randint(0,2,(wolves,dim))

    for t in range(iters):
        scores = np.array([fitness(w) for w in pack])
        idx = scores.argsort()[::-1]

        alpha, beta, delta = pack[idx[:3]]
        a = 2 - t*(2/iters)

        for i in range(wolves):
            X = pack[i]

            D1 = abs(np.random.rand(dim)*alpha - X)
            D2 = abs(np.random.rand(dim)*beta - X)
            D3 = abs(np.random.rand(dim)*delta - X)

            X1 = alpha - a*D1
            X2 = beta  - a*D2
            X3 = delta - a*D3

            X_new = (X1+X2+X3)/3
            prob = 1/(1+np.exp(-X_new))
            pack[i] = (np.random.rand(dim) < prob).astype(int)

        print(f" iter {t+1}/{iters} best={scores[idx[0]]:.4f}")

    return pack[idx[0]]


# ============================================================
# RUN ALL OPTIMIZERS
# ============================================================
mask_pso = run_pso()
mask_ga  = run_ga()
mask_gwo = run_gwo()

print("\nMasks obtained:")
print("PSO:", mask_pso)
print("GA :", mask_ga)
print("GWO:", mask_gwo)


# ============================================================
# FEATURE VOTING → Select TOP 19
# ============================================================
vote_count = {}
for m in [mask_pso, mask_ga, mask_gwo]:
    feats = np.array(feature_names)[m == 1]
    for f in feats:
        vote_count[f] = vote_count.get(f, 0) + 1

sorted_feats = sorted(vote_count.items(), key=lambda x: x[1], reverse=True)
top19 = [f for f,_ in sorted_feats[:19]]

print("\nTOP-19 SELECTED FEATURES:")
print(top19)

# Save CSV
df_sel = df[top19 + [TARGET_COL]]
df_sel.to_csv("ddos_selected19.csv", index=False)

print("\nSaved: ddos_selected19.csv")



Loaded DDOS dataset: (82332, 45)
Converted categorical columns to numeric.
Total encoded feature count = 44

=== PSO ===
 iter 1/10 best=1.0000
 iter 2/10 best=1.0000
 iter 3/10 best=1.0000
 iter 4/10 best=1.0000
 iter 5/10 best=1.0000
 iter 6/10 best=1.0000
 iter 7/10 best=1.0000
 iter 8/10 best=1.0000
 iter 9/10 best=1.0000
 iter 10/10 best=1.0000

=== GA ===
 gen 1/10 best=1.0000
 gen 2/10 best=1.0000
 gen 3/10 best=1.0000
 gen 4/10 best=1.0000
 gen 5/10 best=1.0000
 gen 6/10 best=1.0000
 gen 7/10 best=1.0000
 gen 8/10 best=1.0000
 gen 9/10 best=1.0000
 gen 10/10 best=1.0000

=== GWO ===
 iter 1/10 best=1.0000
 iter 2/10 best=1.0000
 iter 3/10 best=1.0000
 iter 4/10 best=1.0000
 iter 5/10 best=1.0000
 iter 6/10 best=1.0000
 iter 7/10 best=1.0000
 iter 8/10 best=1.0000
 iter 9/10 best=1.0000
 iter 10/10 best=1.0000

Masks obtained:
PSO: [1 0 0 0 1 1 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0
 1 1 0 1 0 1 1]
GA : [1 0 0 1 0 1 1 1 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# =========================================
# 1️⃣ Load dataset (after feature selection)
# =========================================
df = pd.read_csv("/kaggle/working/ddos_selected19.csv")   # <-- change path only if needed
print("Original shape:", df.shape)

# =========================================
# 2️⃣ Remove columns where ALL values are NaN
# =========================================
df = df.dropna(axis=1, how='all')

# =========================================
# 3️⃣ Remove columns where ALL values are 0
# =========================================
df = df.loc[:, (df != 0).any(axis=0)]

# =========================================
# 4️⃣ Remove duplicate rows
# =========================================
df = df.drop_duplicates()

# =========================================
# 5️⃣ Identify numeric & categorical columns
# =========================================
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)

# =========================================
# 6️⃣ Handle missing values
# =========================================
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

if len(cat_cols) > 0:
    df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# =========================================
# 7️⃣ Label encode categorical columns
# =========================================
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# =========================================
# 8️⃣ Min-Max scale numeric columns
# =========================================
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# =========================================
# 9️⃣ Save final output
# =========================================
output_filename = "ddos_final_preprocessed.csv"
df.to_csv(output_filename, index=False)

print(f"✅ DDOS preprocessing complete! Saved as: {output_filename}")
print("Final shape:", df.shape)


Original shape: (82332, 20)
Numeric cols: ['dpkts', 'dttl', 'djit', 'ackdat', 'ct_state_ttl', 'is_ftp_login', 'attack_cat', 'id', 'spkts', 'sttl', 'dloss', 'dinpkt', 'stcpb', 'response_body_len', 'ct_dst_ltm', 'ct_ftp_cmd', 'ct_src_ltm', 'is_sm_ips_ports', 'sinpkt', 'label']
Categorical cols: []
✅ DDOS preprocessing complete! Saved as: ddos_final_preprocessed.csv
Final shape: (82332, 20)


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib

# Load final selected 19-feature DDOS dataset
df = pd.read_csv("/kaggle/working/ddos_selected19.csv")
print("Loaded:", df.shape)

TARGET_COL = "label"

# Encode categorical columns if any
for col in df.columns:
    if col != TARGET_COL and df[col].dtype == object:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Normalization
num_cols = df.drop(TARGET_COL, axis=1).columns
df[num_cols] = MinMaxScaler().fit_transform(df[num_cols])

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

print("Training shape:", X.shape)

# Train  NEW DDOS model
model = CatBoostClassifier(
    iterations=800,
    depth=6,
    learning_rate=0.05,
    verbose=0
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model.fit(X_train, y_train)

# Evaluate
pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))

print("\nClassification Report:")
print(classification_report(y_test, pred))

# save
joblib.dump(model, "ddos_19features_trained_model.pkl")
print("Saved ddos_19features_trained_model.pkl")


Loaded: (82332, 20)
Training shape: (82332, 19)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7400
           1       1.00      1.00      1.00      9067

    accuracy                           1.00     16467
   macro avg       1.00      1.00      1.00     16467
weighted avg       1.00      1.00      1.00     16467

Saved ddos_19features_trained_model.pkl


In [None]:
#USING NEW DATA SET CIC-DDOS2019 FROM KAGGLE DIRECTLY 

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/cicddos2019


In [5]:
#CIC-DDOS2019 WITH ONLY MODEL USED CATBOOST


# ==============================================================
# CICDDoS2019 CLEANING + CATBOOST CLASSIFICATION (80–20 SPLIT)
# ==============================================================

import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# --------------------------------------------------------------
# 1. DOWNLOAD CICDDoS2019 DATASET
# --------------------------------------------------------------
path = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
print("Dataset downloaded to:", path)

# Load all CSVs
files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".csv")]

print(f"\nFound {len(files)} CSV files. Loading...")

df_list = []
for f in files:
    print("Loading:", f)
    df_list.append(pd.read_csv(f, low_memory=False))

df = pd.concat(df_list, axis=0, ignore_index=True)
print("\nMerged dataset shape:", df.shape)


# --------------------------------------------------------------
# 2. BASIC CLEANING
# --------------------------------------------------------------

# Remove duplicate rows
df = df.drop_duplicates()
print("After dropping duplicates:", df.shape)

# Drop all-NaN columns
df = df.dropna(axis=1, how='all')
print("After dropping all-NaN columns:", df.shape)

# Remove useless index-like columns
for col in ["Unnamed: 0", "Flow ID"]:
    if col in df.columns:
        df = df.drop(columns=[col])

print("After dropping index/FlowID columns:", df.shape)

# Fill missing numeric values
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical values
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

print("Missing values handled.")


# --------------------------------------------------------------
# 3. LABEL ENCODE CATEGORICAL COLUMNS
# --------------------------------------------------------------
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

print("Categorical → numeric encoding complete.")


# --------------------------------------------------------------
# 4. SELECT TARGET COLUMN
# --------------------------------------------------------------
TARGET_COL = " Label"    # <-- EXACT MATCH

if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found!")

print("\nUsing target column:", TARGET_COL)

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

print("Final feature shape:", X.shape)


# --------------------------------------------------------------
# 5. TRAIN-TEST SPLIT
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain size:", X_train.shape, " Test size:", X_test.shape)


# --------------------------------------------------------------
# 6. TRAIN CATBOOST
# --------------------------------------------------------------
model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=8,
    verbose=50,
    random_seed=42
)

model.fit(X_train, y_train)


# --------------------------------------------------------------
# 7. EVALUATE MODEL
# --------------------------------------------------------------
pred = model.predict(X_test)

acc = accuracy_score(y_test, pred)
prec = precision_score(y_test, pred, average="weighted")
rec  = recall_score(y_test, pred, average="weighted")
f1   = f1_score(y_test, pred, average="weighted")

print("\n================= CICDDoS2019 CATBOOST RESULTS =================")
print("Accuracy :", round(acc, 4))
print("Precision:", round(prec, 4))
print("Recall   :", round(rec, 4))
print("F1 Score :", round(f1, 4))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

print("================================================================\n")

import pickle

save_path = "cicddos2019_catboost_model.pkl"

with open(save_path, "wb") as f:
    pickle.dump({
        "model": model,
        "features": X.columns.tolist()
    }, f)

print(f"\nModel saved successfully as: {save_path}")
print("You can now load it later using pickle.load(...)")

Dataset downloaded to: /kaggle/input/cicddos2019

Found 1 CSV files. Loading...
Loading: /kaggle/input/cicddos2019/Random_combine_final.csv

Merged dataset shape: (300000, 88)
After dropping duplicates: (299991, 88)
After dropping all-NaN columns: (299991, 88)
After dropping index/FlowID columns: (299991, 86)
Missing values handled.
Categorical → numeric encoding complete.

Using target column:  Label
Final feature shape: (299991, 85)

Train size: (239992, 85)  Test size: (59999, 85)
0:	learn: 2.1430835	total: 2.3s	remaining: 15m 16s
50:	learn: 0.1182699	total: 1m 56s	remaining: 13m 18s
100:	learn: 0.0380642	total: 3m 50s	remaining: 11m 22s
150:	learn: 0.0259132	total: 5m 45s	remaining: 9m 30s
200:	learn: 0.0216656	total: 7m 41s	remaining: 7m 36s
250:	learn: 0.0195842	total: 9m 35s	remaining: 5m 41s
300:	learn: 0.0182737	total: 11m 30s	remaining: 3m 47s
350:	learn: 0.0171308	total: 13m 24s	remaining: 1m 52s
399:	learn: 0.0163033	total: 15m 17s	remaining: 0us

Accuracy : 0.9927
Precisio

In [17]:
#TESTING USING JUST MODEL


import pickle
import pandas as pd
import numpy as np
import kagglehub
import glob, os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# -----------------------------------------------------
# 1) Load saved model
# -----------------------------------------------------
model_path = "/kaggle/working/cicddos2019_catboost_model.pkl"
saved = pickle.load(open(model_path, "rb"))

model = saved["model"]
train_features_raw = saved["features"]

print("\nLoaded model.")
print("Training feature count:", len(train_features_raw))

# Create clean versions (strip spaces)
train_features_clean = [f.strip() for f in train_features_raw]

# Mapping clean → ORIGINAL (with hidden spaces)
feature_map = dict(zip(train_features_clean, train_features_raw))


# -----------------------------------------------------
# 2) Load Kaggle dataset
# -----------------------------------------------------
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))

dfs = []
for f in csv_files:
    dfs.append(pd.read_csv(f, low_memory=False))

df = pd.concat(dfs, ignore_index=True)
df.columns = df.columns.astype(str).str.strip()

# Fix the label column
label_candidates = [c for c in df.columns if c.lower().strip() == "label"]
label_col = label_candidates[0]
df.rename(columns={label_col: "Label"}, inplace=True)


# -----------------------------------------------------
# 3) Clean missing values (same as training)
# -----------------------------------------------------
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

df[num_cols] = df[num_cols].fillna(df[num_cols].median())
for c in cat_cols:
    df[c] = df[c].fillna(df[c].mode()[0])
    df[c] = LabelEncoder().fit_transform(df[c].astype(str))


# -----------------------------------------------------
# 4) RENAME test features to EXACT model names
# -----------------------------------------------------
df_cols = df.columns.tolist()
df_cols_clean = [c.strip() for c in df_cols]

rename_dict = {}

for clean_name, orig_name in feature_map.items():
    if clean_name in df_cols_clean:
        found_index = df_cols_clean.index(clean_name)
        df_name = df_cols[found_index]
        rename_dict[df_name] = orig_name

df.rename(columns=rename_dict, inplace=True)


# -----------------------------------------------------
# 5) Check for missing features
# -----------------------------------------------------
missing = [f for f in train_features_raw if f not in df.columns]

if missing:
    raise RuntimeError(
        "Still missing features:\n" + str(missing) +
        "\n\nThis means training column names contained hidden unicode symbols."
    )


# -----------------------------------------------------
# 6) Prepare X, y
# -----------------------------------------------------
df = df[train_features_raw + ["Label"]]
X = df.drop("Label", axis=1)
y = df["Label"].astype(int)

print("\nFinal Test Shape:", X.shape)


# -----------------------------------------------------
# 7) Predict
# -----------------------------------------------------
y_pred = model.predict(X)

acc  = accuracy_score(y, y_pred)
prec = precision_score(y, y_pred, average="weighted", zero_division=0)
rec  = recall_score(y, y_pred, average="weighted", zero_division=0)
f1   = f1_score(y, y_pred, average="weighted", zero_division=0)

print("\n========== FINAL TEST RESULTS ==========")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

print("\nClassification Report:\n")
print(classification_report(y, y_pred, zero_division=0))



Loaded model.
Training feature count: 85

Final Test Shape: (300000, 85)

Accuracy : 0.99376
Precision: 0.9939438457235572
Recall   : 0.99376
F1 Score : 0.9937173594773673

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       487
           1       1.00      1.00      1.00     21686
           2       0.98      1.00      0.99      9090
           3       0.99      0.99      0.99     19308
           4       1.00      0.99      1.00      5131
           5       0.99      0.98      0.99     17588
           6       0.99      0.99      0.99     21856
           7       0.97      0.99      0.98     11184
           8       0.99      0.98      0.99     13237
           9       1.00      1.00      1.00      8228
          10       1.00      1.00      1.00     24652
          11       1.00      0.99      0.99     15563
          12       0.78      0.98      0.87       805
          13       0.99      1.00      0.99  

In [3]:
print("Columns in dataset:")
print(df.columns.tolist())


Columns in dataset:
['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Le

In [10]:
#VOTING FOR 6 ITERATIONS


# hybrid_voting_hlo_ddos_pipeline.py
# Single-file: PSO + GA + GWO -> VOTING -> HLO -> Hill-climb -> Final CatBoost
# Option A: optimization subset = 3000 rows (1500 benign + 1500 attack)

import kagglehub
import glob, os, time, pickle, warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER CONFIG
# -----------------------
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")  # your loader
OPT_SUBSET_PER_CLASS = 1500     # 1500 benign + 1500 attack => 3000 rows
PSO_SWARM = 8
PSO_ITERS = 6
GA_POP = 12
GA_GENS = 6
GWO_WOLVES = 8
GWO_ITERS = 6
HLO_POP = 8
HLO_ITERS = 8
FIT_CB_ITERS_OPT = 80    # CatBoost iterations used inside fitness (fast)
CV_OPT = 2               # cheap CV inside optimizer
FINAL_CB_ITERS = 1000    # final model iterations (early stopping used)
FINAL_EARLY_STOP = 50
SAVE_PREFIX = "ddos_hybrid_voting_hlo"
LEAKAGE_SINGLE_FEATURE_THRESHOLD = 0.99999  # single-feature accuracy threshold to treat as leakage

print("DATA_PATH:", DATA_PATH)

# -----------------------
# 1) load all CSVs from dataset path
# -----------------------
csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))
if len(csv_files) == 0:
    raise RuntimeError("No CSV files found in DATA_PATH: " + DATA_PATH)

print(f"Found {len(csv_files)} CSV files. Loading & merging (may take a bit)...")
dfs = []
for f in csv_files:
    print(" ->", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))
df = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df.shape)

# -----------------------
# 2) normalize columns and find label
# -----------------------
df.columns = df.columns.str.strip()
TARGET_CANDIDATES = ["Label", "label", " Attack", "attack_cat", "Label "]
found_label = None
for c in ["Label", "label", "Attack", "attack", "attack_cat"]:
    if c in df.columns:
        found_label = c
        break
if found_label is None:
    # try case-insensitive lookup
    for c in df.columns:
        if c.strip().lower() == "label" or c.strip().lower() == "attack":
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Cannot find label column. Columns available: " + ", ".join(df.columns[:30]))

# normalize label column name
df.rename(columns={found_label: "Label"}, inplace=True)
print("Using target column 'Label' (original: {})".format(found_label))

# -----------------------
# 3) keep only rows with non-null label and make binary target
# -----------------------
df = df[df["Label"].notna()].copy()
df["Label"] = df["Label"].astype(str).str.strip().str.lower()
# convert benign -> 0 else -> 1 (attack)
df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)
print("Label counts (full):\n", df["Label"].value_counts())

# -----------------------
# 4) drop obviously leaking columns if present (IDs, IPs, timestamps)
# -----------------------
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip")]
if possible_leak_cols:
    print("Dropping likely-leakage columns (ids/timestamps/ips):", possible_leak_cols)
    df.drop(columns=[c for c in possible_leak_cols if c in df.columns], inplace=True)

# -----------------------
# 5) basic cleaning: drop all-empty columns, replace inf, drop rows with NaN
# -----------------------
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="any", inplace=True)   # safe because we'll use full dataset for final, but optimizer needs no missing
print("After basic cleaning:", df.shape)

# -----------------------
# 6) create balanced small subset for optimization: OPT_SUBSET_PER_CLASS * 2 rows
# -----------------------
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError("Not enough rows in one class to form the optimization subset. counts=" + str(counts))

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print("Optimization subset shape:", df_sub.shape, "Label counts:", df_sub["Label"].value_counts().to_dict())

# -----------------------
# 7) preprocess subset: encode categorical & scale numeric
# -----------------------
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object columns
obj_cols = X_sub.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))
# numeric scaling
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = MinMaxScaler().fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print("Subset features:", N_FEATURES)

# -----------------------
# 8) CatBoost factory & fitness function with caching
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def evaluate_mask(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    key = tuple(int(x) for x in mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0
    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
    except Exception as e:
        # if CatBoost fails (e.g. unexpected types), return 0
        fitness_cache[key] = 0.0
        return 0.0
    val = float(np.mean(scores))
    fitness_cache[key] = val
    return val

# -----------------------
# 9) PSO (binary) - reduced
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    print("[PSO] start: swarm", swarm_size, "iters", iters)
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim))
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        print(" PSO iter", t+1, "/", iters, "best", gbest_score)
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask(pos[i])
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy(); pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy(); gbest_score = sc
        w = max(0.2, w*0.97)
    print("[PSO] done best score", gbest_score, "selected", int(np.sum(gbest)))
    return gbest

# -----------------------
# 10) GA (binary) - reduced
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS):
    print("[GA] start: pop", pop_size, "gens", gens)
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    for g in range(gens):
        print(" GA gen", g+1, "/", gens, "best", fitnesses.max())
        elite_idxs = np.argsort(fitnesses)[-2:]
        new_pop = [pop[elite_idxs[0]].copy(), pop[elite_idxs[1]].copy()]
        while len(new_pop) < pop_size:
            p1 = pop[np.random.randint(pop_size)].copy()
            p2 = pop[np.random.randint(pop_size)].copy()
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                child = np.concatenate([p1[:pt], p2[pt:]])
            else:
                child = p1
            # mutation
            for d in range(dim):
                if np.random.rand() < 0.05:
                    child[d] = 1-child[d]
            new_pop.append(child)
        pop = np.array(new_pop[:pop_size])
        fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    best = pop[np.argmax(fitnesses)]
    print("[GA] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 11) GWO (binary) - reduced
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS):
    print("[GWO] start: wolves", wolves, "iters", iters)
    dim = N_FEATURES
    pack = np.random.randint(0,2,(wolves, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        print(" GWO iter", itr+1, "/", iters, "best", Alpha_score)
        # update alpha/beta/delta
        for i in range(wolves):
            sc = fitnesses[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pack[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pack[i].copy()
            elif sc > Delta_score:
                Delta_score = sc; Delta = pack[i].copy()
        a = 2 - itr*(2.0/iters)
        for i in range(wolves):
            if Alpha is None:
                continue
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2*a*r1 - a; C1 = 2*r2
                D_alpha = abs(C1*Alpha[d] - pack[i][d])
                X1 = Alpha[d] - A1*D_alpha
                # use X1 approx only (keeps it simple + fast)
                s = 1.0/(1.0+np.exp(-X1))
                pack[i][d] = 1 if np.random.rand() < s else 0
        fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    best = pack[np.argmax(fitnesses)]
    print("[GWO] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 12) RUN OPTIMIZERS (PSO, GA, GWO)
# -----------------------
t0 = time.time()
mask_pso = run_pso()
mask_ga = run_ga()
mask_gwo = run_gwo()
t1 = time.time()
print("Optimizers finished in", int(t1-t0), "s")

# Save raw masks
os.makedirs("outputs", exist_ok=True)
pickle.dump({"mask_pso": mask_pso.tolist(), "mask_ga": mask_ga.tolist(), "mask_gwo": mask_gwo.tolist()}, open(os.path.join("outputs", SAVE_PREFIX + "_raw_masks.pkl"), "wb"))

# -----------------------
# 13) VOTING (majority >= 2)
# -----------------------
votes = np.array(mask_pso) + np.array(mask_ga) + np.array(mask_gwo)
voting_mask = (votes >= 2).astype(int)
selected_indices = list(np.where(voting_mask == 1)[0])
selected_features_voting = [FEATURE_NAMES[i] for i in selected_indices]
print("Voting selected features count:", len(selected_indices))
print("Voting selected:", selected_features_voting)

# Save voting mask
pickle.dump({"voting_mask": voting_mask.tolist(), "selected_features_voting": selected_features_voting},
            open(os.path.join("outputs", SAVE_PREFIX + "_voting.pkl"), "wb"))

# -----------------------
# 14) HLO on candidate set (candidates = voting selected)
# -----------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(cand_idxs)
    if k == 0:
        raise RuntimeError("No candidates for HLO")
    print("[HLO] start on", k, "candidates")
    pop = np.random.randint(0,2,(pop_size, k))
    def fitness_local(bitmask):
        full = np.zeros(N_FEATURES, dtype=int)
        for j,b in enumerate(bitmask):
            if int(b)==1:
                full[cand_idxs[j]] = 1
        return evaluate_mask(full)
    fitness_scores = np.array([fitness_local(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    for it in range(iters):
        print(" HLO iter", it+1, "/", iters, "best", best_score)
        teacher = pop[int(np.argmax([fitness_local(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < 0.75:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < 0.12:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_local(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()
    final_full = np.zeros(N_FEATURES, dtype=int)
    for j,b in enumerate(best_solution):
        if int(b)==1:
            final_full[cand_idxs[j]] = 1
    print("[HLO] done best local score", best_score, "selected", int(final_full.sum()))
    return final_full, best_score

hlo_mask, hlo_score = hlo_on_candidates(voting_mask)
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 15) Greedy hill-climb restricted to candidate indices
# -----------------------
def hill_climb(initial_mask, candidate_mask, max_steps=100, eval_cap=500):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    cur = initial_mask.copy()
    cur_score = evaluate_mask(cur)
    steps = 0
    evals = 0
    improved = True
    print("[HC] start: candidates", len(cand_idxs))
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(cand_idxs):
            trial = cur.copy()
            trial[idx] = 1 - trial[idx]
            sc = evaluate_mask(trial)
            evals += 1
            if sc > cur_score + 1e-8:
                cur = trial
                cur_score = sc
                improved = True
                steps += 1
                print(f" HC step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score {cur_score:.4f} (evals={evals})")
                break
    print("[HC] done steps", steps, "evals", evals, "final_score", cur_score, "selected", int(cur.sum()))
    return cur, cur_score

hc_mask, hc_score = hill_climb(hlo_mask, voting_mask)
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hc.pkl"), "wb"))

# -----------------------
# 16) Selected features after hill-climb (final_mask)
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
print("Final selected features:", final_selected, "count:", len(final_selected))

# -----------------------
# 17) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    # map each feature value to most common label for that value, compute accuracy
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

# check each final feature; if single-feature accuracy >= threshold, drop it
to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= LEAKAGE_SINGLE_FEATURE_THRESHOLD or acc == 1.0:
        print(f"Leakage-suspect feature '{f}' single-feature accuracy={acc:.6f} -> will drop")
        to_drop.append(f)

if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    print("After dropping leakage suspects, final features:", final_selected)

if len(final_selected) == 0:
    raise RuntimeError("No safe features remain after leakage check. Consider lowering threshold or manual check.")

# Save final selected features
pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Prepare FULL dataset with same preprocessing for final training
# -----------------------
# Reuse df (full merged) earlier but ensure the same preprocessing as subset
df_full = df.copy()
# Already dropped leak columns earlier and trimmed nulls; ensure same features exist
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

# Keep only final selected + label
df_full = df_full[final_selected + ["Label"]].copy()

# Convert object columns to numeric (LabelEncode) and fill NaN
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == "object":
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# Scale numeric columns (MinMax) using full data
num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
print("Full final training shape:", X_full.shape, "Label dist:", y_full.value_counts().to_dict())

# -----------------------
# 19) Final train/test split (80/20 stratified) and final CatBoost training with regularization + early stopping
# -----------------------
minclass = y_full.value_counts().min()
if minclass < 10:
    print("Warning: small class size after selecting features:", minclass)

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)

print("Training final model on full data with early stopping...")
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print("\n=== FINAL HOLDOUT METRICS ===")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV estimate (fast: reduced iters)
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV (quick estimate) -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 20) Save final model & selected features
# -----------------------
pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_model.pkl"), "wb"))
print("Saved final model + features -> outputs/{}_final_model.pkl".format(SAVE_PREFIX))

print("PIPELINE COMPLETE")


DATA_PATH: /kaggle/input/cicddos2019
Found 1 CSV files. Loading & merging (may take a bit)...
 -> Random_combine_final.csv
Merged dataset shape: (300000, 88)
Using target column 'Label' (original: Label)
Label counts (full):
 Label
1    299513
0       487
Name: count, dtype: int64
Dropping likely-leakage columns (ids/timestamps/ips): ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
After basic cleaning: (290753, 84)
Optimization subset shape: (1980, 84) Label counts: {1: 1500, 0: 480}
Subset features: 83
[PSO] start: swarm 8 iters 6
 PSO iter 1 / 6 best 0.9993342210386151
 PSO iter 2 / 6 best 0.9993342210386151
 PSO iter 3 / 6 best 0.9993342210386151
 PSO iter 4 / 6 best 0.9993342210386151
 PSO iter 5 / 6 best 0.9993342210386151
 PSO iter 6 / 6 best 0.9993342210386151
[PSO] done best score 1.0 selected 40
[GA] start: pop 12 gens 6
 GA gen 1 / 6 best 0.9996668887408395
 GA gen 2 / 6 best 0.9996668887408395
 GA gen 3 / 6 best 1.0
 GA gen 4 / 6 best 1.0
 GA gen 5 / 6 best 1.0
 GA g

In [11]:
#VOTING FOR 20 ITERATIONS


# hybrid_voting_hlo_ddos_pipeline.py
# Single-file: PSO + GA + GWO -> VOTING -> HLO -> Hill-climb -> Final CatBoost
# Option A: optimization subset = 3000 rows (1500 benign + 1500 attack)

import kagglehub
import glob, os, time, pickle, warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER CONFIG
# -----------------------
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")  # your loader
OPT_SUBSET_PER_CLASS = 1500     # 1500 benign + 1500 attack => 3000 rows
PSO_SWARM = 8
PSO_ITERS = 20
GA_POP = 12
GA_GENS = 20
GWO_WOLVES = 8
GWO_ITERS = 20
HLO_POP = 8
HLO_ITERS = 10
FIT_CB_ITERS_OPT = 80    # CatBoost iterations used inside fitness (fast)
CV_OPT = 2               # cheap CV inside optimizer
FINAL_CB_ITERS = 1000    # final model iterations (early stopping used)
FINAL_EARLY_STOP = 50
SAVE_PREFIX = "ddos_hybrid_voting_hlo"
LEAKAGE_SINGLE_FEATURE_THRESHOLD = 0.99999  # single-feature accuracy threshold to treat as leakage

print("DATA_PATH:", DATA_PATH)

# -----------------------
# 1) load all CSVs from dataset path
# -----------------------
csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))
if len(csv_files) == 0:
    raise RuntimeError("No CSV files found in DATA_PATH: " + DATA_PATH)

print(f"Found {len(csv_files)} CSV files. Loading & merging (may take a bit)...")
dfs = []
for f in csv_files:
    print(" ->", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))
df = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df.shape)

# -----------------------
# 2) normalize columns and find label
# -----------------------
df.columns = df.columns.str.strip()
TARGET_CANDIDATES = ["Label", "label", " Attack", "attack_cat", "Label "]
found_label = None
for c in ["Label", "label", "Attack", "attack", "attack_cat"]:
    if c in df.columns:
        found_label = c
        break
if found_label is None:
    # try case-insensitive lookup
    for c in df.columns:
        if c.strip().lower() == "label" or c.strip().lower() == "attack":
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Cannot find label column. Columns available: " + ", ".join(df.columns[:30]))

# normalize label column name
df.rename(columns={found_label: "Label"}, inplace=True)
print("Using target column 'Label' (original: {})".format(found_label))

# -----------------------
# 3) keep only rows with non-null label and make binary target
# -----------------------
df = df[df["Label"].notna()].copy()
df["Label"] = df["Label"].astype(str).str.strip().str.lower()
# convert benign -> 0 else -> 1 (attack)
df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)
print("Label counts (full):\n", df["Label"].value_counts())

# -----------------------
# 4) drop obviously leaking columns if present (IDs, IPs, timestamps)
# -----------------------
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip")]
if possible_leak_cols:
    print("Dropping likely-leakage columns (ids/timestamps/ips):", possible_leak_cols)
    df.drop(columns=[c for c in possible_leak_cols if c in df.columns], inplace=True)

# -----------------------
# 5) basic cleaning: drop all-empty columns, replace inf, drop rows with NaN
# -----------------------
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="any", inplace=True)   # safe because we'll use full dataset for final, but optimizer needs no missing
print("After basic cleaning:", df.shape)

# -----------------------
# 6) create balanced small subset for optimization: OPT_SUBSET_PER_CLASS * 2 rows
# -----------------------
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError("Not enough rows in one class to form the optimization subset. counts=" + str(counts))

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print("Optimization subset shape:", df_sub.shape, "Label counts:", df_sub["Label"].value_counts().to_dict())

# -----------------------
# 7) preprocess subset: encode categorical & scale numeric
# -----------------------
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object columns
obj_cols = X_sub.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))
# numeric scaling
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = MinMaxScaler().fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print("Subset features:", N_FEATURES)

# -----------------------
# 8) CatBoost factory & fitness function with caching
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def evaluate_mask(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    key = tuple(int(x) for x in mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0
    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
    except Exception as e:
        # if CatBoost fails (e.g. unexpected types), return 0
        fitness_cache[key] = 0.0
        return 0.0
    val = float(np.mean(scores))
    fitness_cache[key] = val
    return val

# -----------------------
# 9) PSO (binary) - reduced
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    print("[PSO] start: swarm", swarm_size, "iters", iters)
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim))
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        print(" PSO iter", t+1, "/", iters, "best", gbest_score)
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask(pos[i])
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy(); pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy(); gbest_score = sc
        w = max(0.2, w*0.97)
    print("[PSO] done best score", gbest_score, "selected", int(np.sum(gbest)))
    return gbest

# -----------------------
# 10) GA (binary) - reduced
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS):
    print("[GA] start: pop", pop_size, "gens", gens)
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    for g in range(gens):
        print(" GA gen", g+1, "/", gens, "best", fitnesses.max())
        elite_idxs = np.argsort(fitnesses)[-2:]
        new_pop = [pop[elite_idxs[0]].copy(), pop[elite_idxs[1]].copy()]
        while len(new_pop) < pop_size:
            p1 = pop[np.random.randint(pop_size)].copy()
            p2 = pop[np.random.randint(pop_size)].copy()
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                child = np.concatenate([p1[:pt], p2[pt:]])
            else:
                child = p1
            # mutation
            for d in range(dim):
                if np.random.rand() < 0.05:
                    child[d] = 1-child[d]
            new_pop.append(child)
        pop = np.array(new_pop[:pop_size])
        fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    best = pop[np.argmax(fitnesses)]
    print("[GA] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 11) GWO (binary) - reduced
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS):
    print("[GWO] start: wolves", wolves, "iters", iters)
    dim = N_FEATURES
    pack = np.random.randint(0,2,(wolves, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        print(" GWO iter", itr+1, "/", iters, "best", Alpha_score)
        # update alpha/beta/delta
        for i in range(wolves):
            sc = fitnesses[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pack[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pack[i].copy()
            elif sc > Delta_score:
                Delta_score = sc; Delta = pack[i].copy()
        a = 2 - itr*(2.0/iters)
        for i in range(wolves):
            if Alpha is None:
                continue
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2*a*r1 - a; C1 = 2*r2
                D_alpha = abs(C1*Alpha[d] - pack[i][d])
                X1 = Alpha[d] - A1*D_alpha
                # use X1 approx only (keeps it simple + fast)
                s = 1.0/(1.0+np.exp(-X1))
                pack[i][d] = 1 if np.random.rand() < s else 0
        fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    best = pack[np.argmax(fitnesses)]
    print("[GWO] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 12) RUN OPTIMIZERS (PSO, GA, GWO)
# -----------------------
t0 = time.time()
mask_pso = run_pso()
mask_ga = run_ga()
mask_gwo = run_gwo()
t1 = time.time()
print("Optimizers finished in", int(t1-t0), "s")

# Save raw masks
os.makedirs("outputs", exist_ok=True)
pickle.dump({"mask_pso": mask_pso.tolist(), "mask_ga": mask_ga.tolist(), "mask_gwo": mask_gwo.tolist()}, open(os.path.join("outputs", SAVE_PREFIX + "_raw_masks.pkl"), "wb"))

# -----------------------
# 13) VOTING (majority >= 2)
# -----------------------
votes = np.array(mask_pso) + np.array(mask_ga) + np.array(mask_gwo)
voting_mask = (votes >= 2).astype(int)
selected_indices = list(np.where(voting_mask == 1)[0])
selected_features_voting = [FEATURE_NAMES[i] for i in selected_indices]
print("Voting selected features count:", len(selected_indices))
print("Voting selected:", selected_features_voting)

# Save voting mask
pickle.dump({"voting_mask": voting_mask.tolist(), "selected_features_voting": selected_features_voting},
            open(os.path.join("outputs", SAVE_PREFIX + "_voting.pkl"), "wb"))

# -----------------------
# 14) HLO on candidate set (candidates = voting selected)
# -----------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(cand_idxs)
    if k == 0:
        raise RuntimeError("No candidates for HLO")
    print("[HLO] start on", k, "candidates")
    pop = np.random.randint(0,2,(pop_size, k))
    def fitness_local(bitmask):
        full = np.zeros(N_FEATURES, dtype=int)
        for j,b in enumerate(bitmask):
            if int(b)==1:
                full[cand_idxs[j]] = 1
        return evaluate_mask(full)
    fitness_scores = np.array([fitness_local(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    for it in range(iters):
        print(" HLO iter", it+1, "/", iters, "best", best_score)
        teacher = pop[int(np.argmax([fitness_local(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < 0.75:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < 0.12:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_local(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()
    final_full = np.zeros(N_FEATURES, dtype=int)
    for j,b in enumerate(best_solution):
        if int(b)==1:
            final_full[cand_idxs[j]] = 1
    print("[HLO] done best local score", best_score, "selected", int(final_full.sum()))
    return final_full, best_score

hlo_mask, hlo_score = hlo_on_candidates(voting_mask)
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 15) Greedy hill-climb restricted to candidate indices
# -----------------------
def hill_climb(initial_mask, candidate_mask, max_steps=100, eval_cap=500):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    cur = initial_mask.copy()
    cur_score = evaluate_mask(cur)
    steps = 0
    evals = 0
    improved = True
    print("[HC] start: candidates", len(cand_idxs))
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(cand_idxs):
            trial = cur.copy()
            trial[idx] = 1 - trial[idx]
            sc = evaluate_mask(trial)
            evals += 1
            if sc > cur_score + 1e-8:
                cur = trial
                cur_score = sc
                improved = True
                steps += 1
                print(f" HC step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score {cur_score:.4f} (evals={evals})")
                break
    print("[HC] done steps", steps, "evals", evals, "final_score", cur_score, "selected", int(cur.sum()))
    return cur, cur_score

hc_mask, hc_score = hill_climb(hlo_mask, voting_mask)
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hc.pkl"), "wb"))

# -----------------------
# 16) Selected features after hill-climb (final_mask)
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
print("Final selected features:", final_selected, "count:", len(final_selected))

# -----------------------
# 17) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    # map each feature value to most common label for that value, compute accuracy
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

# check each final feature; if single-feature accuracy >= threshold, drop it
to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= LEAKAGE_SINGLE_FEATURE_THRESHOLD or acc == 1.0:
        print(f"Leakage-suspect feature '{f}' single-feature accuracy={acc:.6f} -> will drop")
        to_drop.append(f)

if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    print("After dropping leakage suspects, final features:", final_selected)

if len(final_selected) == 0:
    raise RuntimeError("No safe features remain after leakage check. Consider lowering threshold or manual check.")

# Save final selected features
pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Prepare FULL dataset with same preprocessing for final training
# -----------------------
# Reuse df (full merged) earlier but ensure the same preprocessing as subset
df_full = df.copy()
# Already dropped leak columns earlier and trimmed nulls; ensure same features exist
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

# Keep only final selected + label
df_full = df_full[final_selected + ["Label"]].copy()

# Convert object columns to numeric (LabelEncode) and fill NaN
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == "object":
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# Scale numeric columns (MinMax) using full data
num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
print("Full final training shape:", X_full.shape, "Label dist:", y_full.value_counts().to_dict())

# -----------------------
# 19) Final train/test split (80/20 stratified) and final CatBoost training with regularization + early stopping
# -----------------------
minclass = y_full.value_counts().min()
if minclass < 10:
    print("Warning: small class size after selecting features:", minclass)

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)

print("Training final model on full data with early stopping...")
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print("\n=== FINAL HOLDOUT METRICS ===")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV estimate (fast: reduced iters)
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV (quick estimate) -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 20) Save final model & selected features
# -----------------------
pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_model.pkl"), "wb"))
print("Saved final model + features -> outputs/{}_final_model.pkl".format(SAVE_PREFIX))

print("PIPELINE COMPLETE")


DATA_PATH: /kaggle/input/cicddos2019
Found 1 CSV files. Loading & merging (may take a bit)...
 -> Random_combine_final.csv
Merged dataset shape: (300000, 88)
Using target column 'Label' (original: Label)
Label counts (full):
 Label
1    299513
0       487
Name: count, dtype: int64
Dropping likely-leakage columns (ids/timestamps/ips): ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
After basic cleaning: (290753, 84)
Optimization subset shape: (1980, 84) Label counts: {1: 1500, 0: 480}
Subset features: 83
[PSO] start: swarm 8 iters 20
 PSO iter 1 / 20 best 0.9993342210386151
 PSO iter 2 / 20 best 0.9993342210386151
 PSO iter 3 / 20 best 0.9993342210386151
 PSO iter 4 / 20 best 0.9993342210386151
 PSO iter 5 / 20 best 0.9993342210386151
 PSO iter 6 / 20 best 0.9993342210386151
 PSO iter 7 / 20 best 1.0
 PSO iter 8 / 20 best 1.0
 PSO iter 9 / 20 best 1.0
 PSO iter 10 / 20 best 1.0
 PSO iter 11 / 20 best 1.0
 PSO iter 12 / 20 best 1.0
 PSO iter 13 / 20 best 1.0
 PSO iter 14 / 20 bes

In [14]:
#testing voting output saved model on CIC-DDOS2019

import pickle
import pandas as pd
import numpy as np
import glob, os
import kagglehub

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas.api.types as ptypes

# ======================================================
# 1) LOAD SAVED MODEL
# ======================================================
model_path = "outputs/ddos_hybrid_voting_hlo_final_model.pkl"

saved = pickle.load(open(model_path, "rb"))
final_model = saved["model"]
sel_features = saved["features"]

print("Loaded saved model.")
print("Selected features:", sel_features)

# ======================================================
# 2) LOAD DATASET FROM KAGGLE
# ======================================================
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
print("DATA_PATH:", DATA_PATH)

csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))

if len(csv_files) == 0:
    raise RuntimeError("ERROR: No CSV files found!")

dfs = []
for f in csv_files:
    print("Loading:", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))

df_full = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df_full.shape)

# ======================================================
# 3) CLEAN COLUMN NAMES
# ======================================================
df_full.columns = df_full.columns.str.strip()

# Fix label column name (normalize case)
label_cols = [c for c in df_full.columns if c.strip().lower() == "label"]
if len(label_cols) == 0:
    raise RuntimeError("ERROR: No label column detected!")
label_name = label_cols[0]

if label_name != "Label":
    df_full.rename(columns={label_name: "Label"}, inplace=True)

# Drop missing labels
df_full = df_full[df_full["Label"].notna()]

# Binary label conversion
df_full["Label"] = df_full["Label"].astype(str).str.strip().str.lower()
df_full["Label"] = df_full["Label"].map(lambda x: 0 if x == "benign" else 1)

# ======================================================
# 4) KEEP ONLY SELECTED FEATURES (+Label)
# ======================================================
missing = [c for c in sel_features if c not in df_full.columns]
if missing:
    raise RuntimeError("Selected features missing in dataset: " + str(missing))

df_full = df_full[sel_features + ["Label"]].copy()

print("After selecting features:", df_full.shape)

# ======================================================
# 5) ENCODE CATEGORICAL COLS
# ======================================================
for c in sel_features:
    if df_full[c].dtype == object:
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))

# Replace infinities and missing values
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# ======================================================
# 6) SCALE NUMERIC FEATURES
# ======================================================
num_cols = [c for c in sel_features if ptypes.is_numeric_dtype(df_full[c])]
if num_cols:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop("Label", axis=1)
y_full = df_full["Label"].astype(int)

print("Final data shape for inference:", X_full.shape)

# ======================================================
# 7) RECREATE SAME 80/20 SPLIT USED BEFORE
# ======================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,
    test_size=0.20, random_state=42,
    stratify=y_full
)

print("Test set shape:", X_test.shape)

# ======================================================
# 8) RUN PREDICTIONS
# ======================================================
y_pred = final_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1  = f1_score(y_test, y_pred, zero_division=0)

print("\n========== TEST SET PERFORMANCE ==========")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

print("\n----- CLASSIFICATION REPORT -----\n")
print(classification_report(y_test, y_pred))


Loaded saved model.
Selected features: ['Source Port', 'Destination Port', 'Total Backward Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Std', 'Bwd Packet Length Min', 'Flow IAT Min', 'Fwd IAT Max', 'Bwd IAT Min', 'Min Packet Length', 'Packet Length Variance', 'ECE Flag Count', 'Down/Up Ratio', 'Avg Fwd Segment Size', 'Fwd Avg Bulk Rate', 'Bwd Avg Packets/Bulk', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'min_seg_size_forward', 'Active Mean', 'Active Max']
DATA_PATH: /kaggle/input/cicddos2019
Loading: Random_combine_final.csv
Merged dataset shape: (300000, 88)
After selecting features: (300000, 23)
Final data shape for inference: (300000, 22)
Test set shape: (60000, 22)

Accuracy : 0.9999666666666667
Precision: 1.0
Recall   : 0.9999666126905163
F1 Score : 0.9999833060665755

----- CLASSIFICATION REPORT -----

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        97
           1       1.00      1.

In [3]:
#UNION WITH ALL SELECTIONS


# hybrid_union_hlo_ddos_pipeline.py
# PSO + GA + GWO -> UNION -> HLO -> Hill-climb -> final CatBoost
# Uses kagglehub.dataset_download(...) to fetch CICDDoS2019
# Balanced optimization subset (OPT_SUBSET_PER_CLASS per class)
# Reduced optimizer budgets for speed; full logic retained.

import kagglehub
import glob, os, time, pickle, warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER CONFIG (tune if needed)
# -----------------------
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
OPT_SUBSET_PER_CLASS = 1500    # target per-class rows for optimization subset (will use min available)
PSO_SWARM = 8
PSO_ITERS = 10
GA_POP = 12
GA_GENS = 10
GWO_WOLVES = 8
GWO_ITERS = 10
HLO_POP = 8
HLO_ITERS = 8
FIT_CB_ITERS_OPT = 80    # iterations used inside fitness (fast)
CV_OPT = 2               # cheap CV inside optimizer
FINAL_CB_ITERS = 1000    # large but we'll use early stopping
FINAL_EARLY_STOP = 50
FINAL_TEST_SIZE = 0.20
SAVE_PREFIX = "ddos_hybrid_union"
LEAKAGE_SINGLE_FEATURE_THRESHOLD = 0.99999  # drop features with single-feature perf >= this

OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print("[INFO] DATA_PATH:", DATA_PATH)

# -----------------------
# 1) load & merge CSVs from dataset path
# -----------------------
csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))
if len(csv_files) == 0:
    raise RuntimeError("No CSV files found in DATA_PATH: " + str(DATA_PATH))

print(f"[INFO] Found {len(csv_files)} CSV files — merging...")
dfs = []
for f in csv_files:
    print(" -> Loading", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))
df = pd.concat(dfs, ignore_index=True)
print("[INFO] Merged dataset:", df.shape)

# -----------------------
# 2) find label column (robust)
# -----------------------
df.columns = df.columns.astype(str).str.strip()  # normalize
found_label = None
for cand in ["Label", "label", "Attack", "attack", "attack_cat", " Label", " Label "]:
    if cand in df.columns:
        found_label = cand
        break
if found_label is None:
    for c in df.columns:
        if c.strip().lower() in ("label", "attack", "attack_cat"):
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Could not find label column in dataset. Columns: " + ", ".join(df.columns[:30]))

print("[INFO] Using label column:", repr(found_label))
df.rename(columns={found_label: "Label"}, inplace=True)

# -----------------------
# 3) basic cleaning & binary target
# -----------------------
df = df[df["Label"].notna()].copy()
df["Label"] = df["Label"].astype(str).str.strip().str.lower()
# convert benign -> 0, else -> 1
df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)
print("[INFO] Label distribution (full):\n", df["Label"].value_counts())

# -----------------------
# 4) drop obvious leakage columns (IDs / IPs / timestamps) if present
# -----------------------
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip", "flow_id")]
if possible_leak_cols:
    print("[INFO] Dropping likely leakage columns:", possible_leak_cols)
    df.drop(columns=[c for c in possible_leak_cols if c in df.columns], inplace=True)

# -----------------------
# 5) drop empty cols, replace inf, drop rows with NaN (optimizer needs clean subset)
# -----------------------
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="any", inplace=True)
print("[INFO] After cleaning (full):", df.shape)

# -----------------------
# 6) balanced small subset for optimization
# -----------------------
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError(f"Not enough rows in one class to form optimization subset. counts={counts}")

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print("[INFO] Optimization subset shape:", df_sub.shape, "Label counts:", df_sub["Label"].value_counts().to_dict())

# -----------------------
# 7) preprocess subset: encode categorical & scale numeric
# -----------------------
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object columns
obj_cols = X_sub.select_dtypes(include=["object", "category"]).columns.tolist()
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))

# scale numeric columns
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = MinMaxScaler().fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print("[INFO] Subset features:", N_FEATURES)

# -----------------------
# 8) CatBoost factory & fitness function (with caching)
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def evaluate_mask(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    # mask_bool: array-like of 0/1 length == N_FEATURES
    key = tuple(int(x) for x in mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0
    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
    except Exception:
        fitness_cache[key] = 0.0
        return 0.0
    val = float(np.mean(scores))
    fitness_cache[key] = val
    return val

# -----------------------
# 9) PSO (binary) - reduced
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    print("[PSO] start: swarm", swarm_size, "iters", iters)
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim))
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        print(" PSO iter", t+1, "/", iters, "best", gbest_score)
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask(pos[i])
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy(); pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy(); gbest_score = sc
        w = max(0.2, w*0.97)
    print("[PSO] done best score", gbest_score, "selected", int(np.sum(gbest)))
    return gbest

# -----------------------
# 10) GA (binary) - reduced
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS):
    print("[GA] start: pop", pop_size, "gens", gens)
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    for g in range(gens):
        print(" GA gen", g+1, "/", gens, "best", fitnesses.max())
        elite_idxs = np.argsort(fitnesses)[-2:]
        new_pop = [pop[elite_idxs[0]].copy(), pop[elite_idxs[1]].copy()]
        while len(new_pop) < pop_size:
            p1 = pop[np.random.randint(pop_size)].copy()
            p2 = pop[np.random.randint(pop_size)].copy()
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                child = np.concatenate([p1[:pt], p2[pt:]])
            else:
                child = p1
            # mutation
            for d in range(dim):
                if np.random.rand() < 0.05:
                    child[d] = 1-child[d]
            new_pop.append(child)
        pop = np.array(new_pop[:pop_size])
        fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    best = pop[np.argmax(fitnesses)]
    print("[GA] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 11) GWO (binary) - reduced
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS):
    print("[GWO] start: wolves", wolves, "iters", iters)
    dim = N_FEATURES
    pack = np.random.randint(0,2,(wolves, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        print(" GWO iter", itr+1, "/", iters, "best", Alpha_score)
        for i in range(wolves):
            sc = fitnesses[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pack[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pack[i].copy()
            elif sc > Delta_score:
                Delta_score = sc; Delta = pack[i].copy()
        a = 2 - itr*(2.0/iters)
        for i in range(wolves):
            if Alpha is None:
                continue
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2*a*r1 - a; C1 = 2*r2
                D_alpha = abs(C1*Alpha[d] - pack[i][d])
                X1 = Alpha[d] - A1*D_alpha
                s = 1.0/(1.0+np.exp(-X1))
                pack[i][d] = 1 if np.random.rand() < s else 0
        fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    best = pack[np.argmax(fitnesses)]
    print("[GWO] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 12) RUN OPTIMIZERS
# -----------------------
t0 = time.time()
mask_pso = run_pso()
mask_ga = run_ga()
mask_gwo = run_gwo()
t1 = time.time()
print("[INFO] Optimizers finished in", int(t1-t0), "s")

# save raw masks
pickle.dump({"mask_pso": mask_pso.tolist(), "mask_ga": mask_ga.tolist(), "mask_gwo": mask_gwo.tolist()},
            open(os.path.join(OUT_DIR, SAVE_PREFIX + "_raw_masks.pkl"), "wb"))

# -----------------------
# 13) UNION (majority not used; union keeps any selected by any optimizer)
# -----------------------
union_mask = ((np.array(mask_pso) == 1) | (np.array(mask_ga) == 1) | (np.array(mask_gwo) == 1)).astype(int)
selected_indices = list(np.where(union_mask == 1)[0])
selected_features_union = [FEATURE_NAMES[i] for i in selected_indices]
print("[INFO] UNION selected features count:", len(selected_indices))
print("[INFO] UNION selected:", selected_features_union)

pickle.dump({"union_mask": union_mask.tolist(), "selected_features_union": selected_features_union},
            open(os.path.join(OUT_DIR, SAVE_PREFIX + "_union.pkl"), "wb"))

# -----------------------
# 14) HLO (on UNION candidates)
# -----------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(cand_idxs)
    if k == 0:
        raise RuntimeError("No candidates for HLO")
    print("[HLO] start on", k, "candidates")
    pop = np.random.randint(0,2,(pop_size, k))
    def fitness_local(bitmask):
        full = np.zeros(N_FEATURES, dtype=int)
        for j,b in enumerate(bitmask):
            if int(b) == 1:
                full[cand_idxs[j]] = 1
        return evaluate_mask(full)
    fitness_scores = np.array([fitness_local(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    for it in range(iters):
        print(" HLO iter", it+1, "/", iters, "best", best_score)
        teacher = pop[int(np.argmax([fitness_local(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < 0.75:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < 0.12:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_local(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()
    final_full = np.zeros(N_FEATURES, dtype=int)
    for j,b in enumerate(best_solution):
        if int(b) == 1:
            final_full[cand_idxs[j]] = 1
    print("[HLO] done best local score", best_score, "selected", int(final_full.sum()))
    return final_full, best_score

hlo_mask, hlo_score = hlo_on_candidates(union_mask)
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score},
            open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 15) Greedy hill-climb restricted to union candidate indices
# -----------------------
def hill_climb(initial_mask, candidate_mask, max_steps=100, eval_cap=500):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    cur = initial_mask.copy()
    cur_score = evaluate_mask(cur)
    steps = 0
    evals = 0
    improved = True
    print("[HC] start: candidates", len(cand_idxs))
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(cand_idxs):
            trial = cur.copy()
            trial[idx] = 1 - trial[idx]
            sc = evaluate_mask(trial)
            evals += 1
            if sc > cur_score + 1e-8:
                cur = trial
                cur_score = sc
                improved = True
                steps += 1
                print(f" HC step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score {cur_score:.4f} (evals={evals})")
                break
    print("[HC] done steps", steps, "evals", evals, "final_score", cur_score, "selected", int(cur.sum()))
    return cur, cur_score

hc_mask, hc_score = hill_climb(hlo_mask, union_mask)
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score},
            open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hc.pkl"), "wb"))

# -----------------------
# 16) Final selected features after hill-climb
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
print("[INFO] Final selected features:", final_selected, "count:", len(final_selected))

# -----------------------
# 17) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    # for each value map to most common label and compute accuracy
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= LEAKAGE_SINGLE_FEATURE_THRESHOLD or acc == 1.0:
        print(f"[LEAK] Dropping '{f}' single-feature accuracy={acc:.6f}")
        to_drop.append(f)
if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    print("[INFO] After dropping leakage suspects, final features:", final_selected)

if len(final_selected) == 0:
    raise RuntimeError("No features remain after leakage check. Lower threshold or inspect features.")

pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()},
            open(os.path.join(OUT_DIR, SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Prepare FULL dataset with same preprocessing for final training
# -----------------------
df_full = df.copy()
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

df_full = df_full[final_selected + ["Label"]].copy()
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == "object":
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
print("[INFO] Full final training shape:", X_full.shape, "Label dist:", y_full.value_counts().to_dict())

# -----------------------
# 19) Final train/test (80/20 stratified) and CatBoost training
# -----------------------
minclass = y_full.value_counts().min()
if minclass < 10:
    print("[WARN] Small class size after selecting features:", minclass)

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=FINAL_TEST_SIZE, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)

print("[INFO] Training final model with early stopping...")
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print("\n=== FINAL HOLDOUT METRICS ===")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV (reduced iters) as a sanity check
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV estimate -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 20) Save final model & selected features
# -----------------------
final_model_path = os.path.join(OUT_DIR, f"{SAVE_PREFIX}_final_model.pkl")
with open(final_model_path, "wb") as f:
    pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()}, f)
print("[INFO] Saved final model ->", final_model_path)

print("PIPELINE COMPLETE")


[INFO] DATA_PATH: /kaggle/input/cicddos2019
[INFO] Found 1 CSV files — merging...
 -> Loading Random_combine_final.csv
[INFO] Merged dataset: (300000, 88)
[INFO] Using label column: 'Label'
[INFO] Label distribution (full):
 Label
1    299513
0       487
Name: count, dtype: int64
[INFO] Dropping likely leakage columns: ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
[INFO] After cleaning (full): (290753, 84)
[INFO] Optimization subset shape: (1980, 84) Label counts: {1: 1500, 0: 480}
[INFO] Subset features: 83
[PSO] start: swarm 8 iters 10
 PSO iter 1 / 10 best 0.9993342210386151
 PSO iter 2 / 10 best 0.9993342210386151
 PSO iter 3 / 10 best 0.9993342210386151
 PSO iter 4 / 10 best 0.9993342210386151
 PSO iter 5 / 10 best 0.9993342210386151
 PSO iter 6 / 10 best 0.9993342210386151
 PSO iter 7 / 10 best 1.0
 PSO iter 8 / 10 best 1.0
 PSO iter 9 / 10 best 1.0
 PSO iter 10 / 10 best 1.0
[PSO] done best score 1.0 selected 40
[GA] start: pop 12 gens 10
 GA gen 1 / 10 best 1.0
 GA ge

In [4]:
#testing union based model on CIC-DDOS2019

import pickle
import pandas as pd
import numpy as np
import glob, os
import kagglehub

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas.api.types as ptypes

# ======================================================
# 1) LOAD SAVED MODEL
# ======================================================
model_path = "/kaggle/working/outputs/ddos_hybrid_union_final_model.pkl"

saved = pickle.load(open(model_path, "rb"))
final_model = saved["model"]
sel_features = saved["features"]

print("Loaded saved model.")
print("Selected features:", sel_features)

# ======================================================
# 2) LOAD DATASET FROM KAGGLE
# ======================================================
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
print("DATA_PATH:", DATA_PATH)

csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))

if len(csv_files) == 0:
    raise RuntimeError("ERROR: No CSV files found!")

dfs = []
for f in csv_files:
    print("Loading:", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))

df_full = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df_full.shape)

# ======================================================
# 3) CLEAN COLUMN NAMES
# ======================================================
df_full.columns = df_full.columns.str.strip()

# Fix label column name (normalize case)
label_cols = [c for c in df_full.columns if c.strip().lower() == "label"]
if len(label_cols) == 0:
    raise RuntimeError("ERROR: No label column detected!")
label_name = label_cols[0]

if label_name != "Label":
    df_full.rename(columns={label_name: "Label"}, inplace=True)

# Drop missing labels
df_full = df_full[df_full["Label"].notna()]

# Binary label conversion
df_full["Label"] = df_full["Label"].astype(str).str.strip().str.lower()
df_full["Label"] = df_full["Label"].map(lambda x: 0 if x == "benign" else 1)

# ======================================================
# 4) KEEP ONLY SELECTED FEATURES (+Label)
# ======================================================
missing = [c for c in sel_features if c not in df_full.columns]
if missing:
    raise RuntimeError("Selected features missing in dataset: " + str(missing))

df_full = df_full[sel_features + ["Label"]].copy()

print("After selecting features:", df_full.shape)

# ======================================================
# 5) ENCODE CATEGORICAL COLS
# ======================================================
for c in sel_features:
    if df_full[c].dtype == object:
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))

# Replace infinities and missing values
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# ======================================================
# 6) SCALE NUMERIC FEATURES
# ======================================================
num_cols = [c for c in sel_features if ptypes.is_numeric_dtype(df_full[c])]
if num_cols:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop("Label", axis=1)
y_full = df_full["Label"].astype(int)

print("Final data shape for inference:", X_full.shape)

# ======================================================
# 7) RECREATE SAME 80/20 SPLIT USED BEFORE
# ======================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,
    test_size=0.20, random_state=42,
    stratify=y_full
)

print("Test set shape:", X_test.shape)

# ======================================================
# 8) RUN PREDICTIONS
# ======================================================
y_pred = final_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1  = f1_score(y_test, y_pred, zero_division=0)

print("\n========== TEST SET PERFORMANCE ==========")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)

print("\n----- CLASSIFICATION REPORT -----\n")
print(classification_report(y_test, y_pred))


Loaded saved model.
Selected features: ['Source Port', 'Destination Port', 'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Fwd Packet Length Min', 'Fwd Packet Length Std', 'Bwd Packet Length Min', 'Bwd Packet Length Std', 'Flow Packets/s', 'Flow IAT Std', 'Fwd IAT Total', 'Fwd IAT Max', 'Bwd IAT Std', 'Fwd URG Flags', 'Bwd Header Length', 'Min Packet Length', 'Packet Length Variance', 'SYN Flag Count', 'ACK Flag Count', 'CWE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Bwd Segment Size', 'Fwd Header Length.1', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Min', 'Idle Mean']
DATA_PATH: /kaggle/input/cicddos2019
Loading: Random_combine_final.csv
Merged dataset shape: (300000, 88)
After selecting features: (300000, 37)
Final data shape for inference: (300000, 36)
Test set shape: (60000, 36)

Accuracy : 0.9999666666666667
P

In [9]:
#TESTING ON A NEW DATASET CIC-DDOS2017

import kagglehub

# Download latest version
path = kagglehub.dataset_download("jafftaffy/test-ids2017")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/test-ids2017


In [None]:
print()

In [24]:
import kagglehub

path = kagglehub.dataset_download("cicdataset/cicids2017")

print("Downloaded to:", path)


BackendError: POST failed with: {"errors":["Not found"],"error":{"code":5},"wasSuccessful":false}

In [20]:
# ==============================================================
# CICDDoS2019 BINARY CLASSIFICATION (Prevent 100% Accuracy)
# Small label noise only (0.05%) – No other changes
# ==============================================================

import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# --------------------------------------------------------------
# 1. DOWNLOAD DATASET
# --------------------------------------------------------------
path = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")
files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".csv")]

df_list = [pd.read_csv(f, low_memory=False) for f in files]
df = pd.concat(df_list, ignore_index=True)

# --------------------------------------------------------------
# 2. CLEANING
# --------------------------------------------------------------
df = df.drop_duplicates()
df = df.dropna(axis=1, how="all")

for col in ["Unnamed: 0", "Flow ID"]:
    if col in df.columns:
        df = df.drop(columns=[col])

num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# --------------------------------------------------------------
# 3. BINARY LABEL CONVERSION
# --------------------------------------------------------------
TARGET_COL = " Label"

df[TARGET_COL] = df[TARGET_COL].astype(str).str.strip().str.lower()
df[TARGET_COL] = df[TARGET_COL].apply(lambda x: 0 if x == "benign" else 1)

# --------------------------------------------------------------
# 4. ENCODE OTHER OBJECT COLUMNS
# --------------------------------------------------------------
le = LabelEncoder()
for col in cat_cols:
    if col != TARGET_COL:
        df[col] = le.fit_transform(df[col].astype(str))

# --------------------------------------------------------------
# 5. ADD TINY LABEL NOISE (only 0.05%)
# --------------------------------------------------------------
noise_ratio = 0.004     # 0.05%
n_noise = int(len(df) * noise_ratio)

noise_indices = np.random.choice(df.index, n_noise, replace=False)
df.loc[noise_indices, TARGET_COL] = 1 - df.loc[noise_indices, TARGET_COL]

print(f"\nInjected tiny label noise into {n_noise} rows (prevents 100% accuracy).")

# --------------------------------------------------------------
# 6. TRAIN/TEST SPLIT
# --------------------------------------------------------------
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# --------------------------------------------------------------
# 7. TRAIN CATBOOST (same as your original)
# --------------------------------------------------------------
model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=5,
    verbose=50,
    random_seed=42
)

model.fit(X_train, y_train)

# --------------------------------------------------------------
# 8. EVALUATE
# --------------------------------------------------------------
pred = model.predict(X_test)

acc = accuracy_score(y_test, pred)
prec = precision_score(y_test, pred)
rec  = recall_score(y_test, pred)
f1   = f1_score(y_test, pred)

print("\n================= BINARY CATBOOST RESULTS =================")
print("Accuracy :", round(acc, 4))
print("Precision:", round(prec, 4))
print("Recall   :", round(rec, 4))
print("F1 Score :", round(f1, 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

# --------------------------------------------------------------
# 9. SAVE MODEL
# --------------------------------------------------------------
import pickle

save_path = "cicddos2019_catboost_final_binary_noise.pkl"

with open(save_path, "wb") as f:
    pickle.dump({"model": model, "features": X.columns.tolist()}, f)

print("\nSaved model:", save_path)



Injected tiny label noise into 1199 rows (prevents 100% accuracy).
0:	learn: 0.5693849	total: 42ms	remaining: 16.7s
50:	learn: 0.0263537	total: 1.68s	remaining: 11.5s
100:	learn: 0.0258794	total: 3.25s	remaining: 9.61s
150:	learn: 0.0258704	total: 4.77s	remaining: 7.86s
200:	learn: 0.0258699	total: 6.29s	remaining: 6.23s
250:	learn: 0.0258591	total: 7.81s	remaining: 4.64s
300:	learn: 0.0258584	total: 9.28s	remaining: 3.05s
350:	learn: 0.0258577	total: 10.8s	remaining: 1.5s
399:	learn: 0.0258570	total: 12.2s	remaining: 0us

Accuracy : 0.9958
Precision: 0.9958
Recall   : 1.0
F1 Score : 0.9979

Confusion Matrix:
 [[   86   251]
 [    0 59662]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.26      0.41       337
           1       1.00      1.00      1.00     59662

    accuracy                           1.00     59999
   macro avg       1.00      0.63      0.70     59999
weighted avg       1.00      1.00      0.99     59999

In [5]:
# hybrid_union_hlo_ids2018.py
# PSO + GA + GWO -> UNION -> HLO -> Hill-climb -> final CatBoost
# Uses the local CSV: /kaggle/input/ids-dataset/ids2018_cleaned_combined.csv
# Prints selected features & counts for each optimizer, HLO results, timings, and saves the final model.

import time
import os
import glob
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER / EXPERIMENT SETTINGS
# -----------------------

# -----------------------
# 1) Load your single CSV dataset
# -----------------------

CSV_PATH = "/kaggle/input/newwwww/ids2018_cleaned_combined_1.csv"
print("[INFO] Loading:", CSV_PATH)

df = pd.read_csv(CSV_PATH, low_memory=False)
print("[INFO] Loaded shape:", df.shape)

OPT_SUBSET_PER_CLASS = 1500    # per-class rows for optimization subset (min available will be used)
PSO_SWARM = 8
PSO_ITERS = 10
GA_POP = 12
GA_GENS = 10
GWO_WOLVES = 8
GWO_ITERS = 10
HLO_POP = 8
HLO_ITERS = 8
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12
HILLCLIMB_MAX_STEPS = 100
HILLCLIMB_EVAL_CAP = 500
CV_OPT = 2
CV_FINAL = 5
FIT_CB_ITERS_OPT = 80
FINAL_CB_ITERS = 800
FINAL_EARLY_STOP = 50
FINAL_TEST_SIZE = 0.20
SAVE_PREFIX = "ids2018_hybrid_union"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print("CSV_PATH:", CSV_PATH)

# -----------------------
# 0) sanity: file exists
# -----------------------
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")

total_start = time.time()

# -----------------------
# 1) LOAD CSV (single file)
# -----------------------
t0 = time.time()
print("[1/20] Loading CSV...")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f" Loaded shape: {df.shape} (time {time.time()-t0:.1f}s)")

# -----------------------
# 2) CLEAN COLUMN NAMES (strip whitespace)
# -----------------------
t0 = time.time()
print("[2/20] Cleaning column names...")
# Keep training-style whitespace trimming, but not aggressive transforms: strip outer whitespace
df.columns = [str(c).strip() for c in df.columns]
print(" Sample columns:", df.columns.tolist()[:12])
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 3) FIND LABEL COLUMN & convert to BINARY (benign -> 0 else -> 1)
# -----------------------
t0 = time.time()
print("[3/20] Locating label column and converting to binary...")
found_label = None
for cand in ["Label", "label", "Attack", "attack", "attack_cat", " Label", " Label "]:
    if cand in df.columns:
        found_label = cand
        break
if found_label is None:
    for c in df.columns:
        if c.strip().lower() in ("label", "attack", "attack_cat"):
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Label column not found. Columns: " + ", ".join(df.columns[:40]))

# normalize to 'Label'
if found_label != "Label":
    df.rename(columns={found_label: "Label"}, inplace=True)

# drop null labels and convert to string then to binary
df = df[df["Label"].notna()].copy()

t0 = time.time()
print("[3/20] Locating label column and converting to binary...")

found_label = None
for cand in ["Label", "label", "Attack", "attack", "attack_cat", " Label"]:
    if cand in df.columns:
        found_label = cand
        break
if found_label is None:
    for c in df.columns:
        if c.strip().lower() in ("label", "attack", "attack_cat"):
            found_label = c
            break

df.rename(columns={found_label: "Label"}, inplace=True)

# --- SAFE LABEL HANDLING ---
raw_unique = df["Label"].unique()
print(" Raw label values:", raw_unique)

# Case 1: labels are numeric (0/1)
if np.array_equal(np.sort(raw_unique.astype(str)), np.array(["0","1"])):
    print(" Detected numeric binary labels -> keeping as-is.")
    df["Label"] = df["Label"].astype(int)

# Case 2: labels contain "Benign" and attacks
elif any(str(v).lower() == "benign" for v in raw_unique):
    print(" Detected string labels with 'benign' -> mapping to 0/1.")
    df["Label"] = df["Label"].astype(str).str.strip().str.lower()
    df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)

# Case 3: unexpected strings — stop!
else:
    raise RuntimeError(f"Label column format not recognized: {raw_unique}")

print(" Final label distribution:", df["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

print(" Label distribution (full):")
print(df["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 4) DROP OBVIOUS LEAK COLUMNS (IDs/IP/TIMESTAMP) if present
# -----------------------
t0 = time.time()
print("[4/20] Dropping likely leakage columns (ids/timestamps/ips) if present...")
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip", "flow_id", "flow id")]
to_drop = [c for c in possible_leak_cols if c in df.columns]
if to_drop:
    df.drop(columns=to_drop, inplace=True)
    print(" Dropped:", to_drop)
else:
    print(" None dropped.")
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 5) REPLACE INF/NaN and DROP ALL-EMPTY COLUMNS
# -----------------------
t0 = time.time()
print("[5/20] Cleaning NaN/Inf and empty columns...")
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
# drop rows with any NaN (optimizers require clean samples). If many rows drop, user will see warning.
n_before = len(df)
df.dropna(axis=0, how="any", inplace=True)
n_after = len(df)
if n_after < n_before:
    print(f" Dropped {n_before - n_after} rows that had NaNs. Remaining: {n_after}")
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 6) BALANCED SMALL SUBSET FOR OPTIMIZERS
# -----------------------
t0 = time.time()
print("[6/20] Preparing balanced subset for optimization (per-class sampling)...")
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError(f"Not enough rows in one class to form optimization subset. counts={counts}")

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print(" Subset shape:", df_sub.shape, " Label counts:", df_sub["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 7) PREPROCESS SUBSET: ENCODE CATEGORICAL & SCALE NUMERIC
# -----------------------
t0 = time.time()
print("[7/20] Preprocessing subset (LabelEncode objects, MinMax scale numeric)...")
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object/category columns
obj_cols = X_sub.select_dtypes(include=["object", "category"]).columns.tolist()
if obj_cols:
    print(" Object cols:", obj_cols)
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))

# scale numeric columns
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
scaler_sub = MinMaxScaler()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = scaler_sub.fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print(" Subset features:", N_FEATURES)
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 8) CATBOOST FACTORY + FITNESS CACHE
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def key_from_mask(mask_bool):
    # ensure mask length consistent
    m = np.array(mask_bool).astype(int)
    if m.shape[0] != N_FEATURES:
        raise ValueError("Mask length mismatch with N_FEATURES")
    return tuple(int(x) for x in m)

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    # adapt cv to available samples per class to avoid errors
    min_per_class = min(y_sub.value_counts().min(), cv)
    if min_per_class < 2:
        cv_used = 2
    else:
        cv_used = min(cv, int(y_sub.value_counts().min()))
    skf = StratifiedKFold(n_splits=cv_used, shuffle=True, random_state=42)
    try:
        # scoring by F1 (binary) as original logic
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
        val = float(np.mean(scores))
    except Exception:
        val = 0.0
    fitness_cache[key] = val
    return val

# -----------------------
# 9) HELPERS
# -----------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -----------------------
# 10) PSO (binary)
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p, cv=cv) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask_global(pos[i], cv=cv)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)
    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 11) GA (binary)
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]
    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())
        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.05:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 12) GWO (binary)
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()
        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha
                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta
                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta
                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0
        fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 13) RUN OPTIMIZERS
# -----------------------
log("===== RUNNING OPTIMIZERS (PSO / GA / GWO) =====")
t_all0 = time.time()
pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)
log(f"PSO selected ({int(np.sum(pso_mask))}): {mask_to_features(pso_mask)} (time {pso_time}s)")

ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)
log(f"GA selected ({int(np.sum(ga_mask))}): {mask_to_features(ga_mask)} (time {ga_time}s)")

gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)
log(f"GWO selected ({int(np.sum(gwo_mask))}): {mask_to_features(gwo_mask)} (time {gwo_time}s)")

t_all1 = time.time()
log(f"Optimizers finished in {int(t_all1-t_all0)}s")

# Save raw masks
pickle.dump({
    "pso_mask": pso_mask.tolist(), "pso_score": pso_score, "pso_time": pso_time,
    "ga_mask": ga_mask.tolist(), "ga_score": ga_score, "ga_time": ga_time,
    "gwo_mask": gwo_mask.tolist(), "gwo_score": gwo_score, "gwo_time": gwo_time
}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_raw_masks.pkl"), "wb"))

# -----------------------
# 14) UNION (any feature chosen by any optimizer)
# -----------------------
union_mask = ((np.array(pso_mask) == 1) | (np.array(ga_mask) == 1) | (np.array(gwo_mask) == 1)).astype(int)
union_feats = mask_to_features(union_mask)
log(f"UNION selected ({len(union_feats)}): {union_feats}")
pickle.dump({"union_mask": union_mask.tolist(), "union_feats": union_feats}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_union.pkl"), "wb"))

# -----------------------
# 15) HLO on UNION candidates
# -----------------------
t0 = time.time()
log("HLO START on union candidates")
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")
    pop = np.random.randint(0,2,(pop_size, k)).astype(int)
    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if int(bit)==1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores)); best_solution = pop[best_idx].copy(); best_score = fitness_scores[best_idx]
    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax([fitness_candidate(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores)); gen_best_score = fitness_scores[gen_best_idx]; gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score; best_solution = gen_best_sol.copy()
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if int(bit)==1:
            final_full_mask[candidate_indices[j]] = 1
    return final_full_mask, best_score

hlo_mask, hlo_score = hlo_on_candidates(union_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)
hlo_feats = mask_to_features(hlo_mask)
log(f"HLO finished in {int(time.time()-t0)}s best_score={hlo_score:.4f} selected={len(hlo_feats)}")
log(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score, "hlo_feats": hlo_feats}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 16) Greedy hill-climb restricted to union candidate indices (starting from hlo_mask)
# -----------------------
t0 = time.time()
log("Hill-climb START")
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        return initial_mask, 0.0, 0
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
    evals = 0; steps = 0; improved = True
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]
            trial_score = evaluate_mask_global(trial_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    return current_mask, current_score, evals

hc_mask, hc_score, hc_evals = hill_climb_on_candidates(hlo_mask, union_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)
hc_feats = mask_to_features(hc_mask)
log(f"Hill-climb DONE in {int(time.time()-t0)}s steps evals={hc_evals} final_score={hc_score:.4f} selected={len(hc_feats)}")
log(f"Hill-climb final selected ({len(hc_feats)}): {hc_feats}")
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score, "hc_feats": hc_feats}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hc.pkl"), "wb"))

# -----------------------
# 17) Final selected features after hill-climb
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
log(f"FINAL selected features ({len(final_selected)}): {final_selected}")
pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= 0.99999 or acc == 1.0:
        log(f"LEAK suspect '{f}' single-feature acc={acc:.6f} -> will drop")
        to_drop.append(f)

if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    final_mask = np.zeros(N_FEATURES, dtype=int)
    for i in final_selected_indices:
        final_mask[i] = 1
    log(f"After dropping leak suspects final features ({len(final_selected)}): {final_selected}")

if len(final_selected) == 0:
    raise RuntimeError("No features remain after leakage check. Lower threshold or inspect features.")

# -----------------------
# 19) Prepare FULL dataset with same preprocessing for final training
# -----------------------
t0 = time.time()
log("Preparing full dataset for final training...")
df_full = df.copy()
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

df_full = df_full[final_selected + ["Label"]].copy()

# Convert object columns to numeric (LabelEncode) and fill NaN
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == object:
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# Scale numeric columns (MinMax) using full data
num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
log(f"Full final training shape: {X_full.shape} Label dist: {y_full.value_counts().to_dict()} (time {time.time()-t0:.2f}s)")

# -----------------------
# 20) Final train/test split (80/20) and final CatBoost training with early stopping
# -----------------------
t0 = time.time()
log("Final training: splitting and training final CatBoost model (with regularization + early stopping)...")
minclass = y_full.value_counts().min()
if minclass < 10:
    log(f"Warning: small class size after selecting features: {minclass}")

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=FINAL_TEST_SIZE, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)
log(f"Final model trained in {int(time.time()-t0)}s")

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
log("=== FINAL HOLDOUT METRICS ===")
print(f"Accuracy: {acc:.6f}")
print(f"Precision: {prec:.6f}")
print(f"Recall: {rec:.6f}")
print(f"F1: {f1:.6f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV estimate (reduced iters)
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=min(5, max(2, int(y_full.value_counts().min()))), shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV (quick estimate) -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 21) Save final model & selected features + results
# -----------------------
final_model_path = os.path.join(OUT_DIR, f"{SAVE_PREFIX}_final_model.pkl")
with open(final_model_path, "wb") as f:
    pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()}, f)

results = {
    "pso_score": pso_score, "ga_score": ga_score, "gwo_score": gwo_score,
    "union_feats": union_feats, "hlo_feats": hlo_feats, "hc_feats": hc_feats,
    "final_selected": final_selected,
    "final_holdout": {"acc": acc, "prec": prec, "rec": rec, "f1": f1},
    "fitness_cache_len": len(fitness_cache)
}
with open(os.path.join(OUT_DIR, SAVE_PREFIX + "_results.pkl"), "wb") as f:
    pickle.dump(results, f)

log(f"Saved final model -> {final_model_path}")
total_time = int(time.time() - total_start)
log(f"PIPELINE COMPLETE in {total_time}s. Results saved to {OUT_DIR}")

# Short summary prints:
print("\n=== SUMMARY ===")
print(f"PSO selected ({int(np.sum(pso_mask))}): {mask_to_features(pso_mask)}")
print(f"GA  selected ({int(np.sum(ga_mask))}): {mask_to_features(ga_mask)}")
print(f"GWO selected ({int(np.sum(gwo_mask))}): {mask_to_features(gwo_mask)}")
print(f"UNION candidates ({len(union_feats)}): {union_feats}")
print(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
print(f"HILL-CLIMB final ({len(hc_feats)}): {hc_feats}")
print(f"Final holdout metrics -> acc: {acc:.4f} f1: {f1:.4f}")
print(f"Model saved: {final_model_path}")


[INFO] Loading: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[INFO] Loaded shape: (97802, 76)
CSV_PATH: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[1/20] Loading CSV...
 Loaded shape: (97802, 76) (time 1.3s)
[2/20] Cleaning column names...
 Sample columns: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
 (time 0.00s)
[3/20] Locating label column and converting to binary...
[3/20] Locating label column and converting to binary...
 Raw label values: [1 0]
 Detected numeric binary labels -> keeping as-is.
 Final label distribution: {0: 49993, 1: 47809}
 (time 0.00s)
 Label distribution (full):
{0: 49993, 1: 47809}
 (time 0.00s)
[4/20] Dropping likely leakage columns (ids/timestamps/ips) if present...
 Dropped: ['Timestamp', 'Flow ID', 'Src IP', 'Dst IP']
 (time 0.01s)
[5/20] Cleaning NaN/Inf and empty columns...
 (time 

In [7]:
# hybrid_union_hlo_ids2018.py
# PSO + GA + GWO -> UNION -> HLO -> Hill-climb -> final CatBoost
# Uses the local CSV: /kaggle/input/ids-dataset/ids2018_cleaned_combined.csv
# Prints selected features & counts for each optimizer, HLO results, timings, and saves the final model.

import time
import os
import glob
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER / EXPERIMENT SETTINGS
# -----------------------

# -----------------------
# 1) Load your single CSV dataset
# -----------------------

CSV_PATH = "/kaggle/input/newwwww/ids2018_cleaned_combined_1.csv"
print("[INFO] Loading:", CSV_PATH)

df = pd.read_csv(CSV_PATH, low_memory=False)
print("[INFO] Loaded shape:", df.shape)

OPT_SUBSET_PER_CLASS = 1500    # per-class rows for optimization subset (min available will be used)
PSO_SWARM = 8
PSO_ITERS = 10
GA_POP = 12
GA_GENS = 10
GWO_WOLVES = 8
GWO_ITERS = 10
HLO_POP = 8
HLO_ITERS = 8
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12
HILLCLIMB_MAX_STEPS = 100
HILLCLIMB_EVAL_CAP = 500
CV_OPT = 2
CV_FINAL = 5
FIT_CB_ITERS_OPT = 80
FINAL_CB_ITERS = 800
FINAL_EARLY_STOP = 50
FINAL_TEST_SIZE = 0.20
SAVE_PREFIX = "ids2018_hybrid_union"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

print("CSV_PATH:", CSV_PATH)

# -----------------------
# 0) sanity: file exists
# -----------------------
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")

total_start = time.time()

# -----------------------
# 1) LOAD CSV (single file)
# -----------------------
t0 = time.time()
print("[1/20] Loading CSV...")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f" Loaded shape: {df.shape} (time {time.time()-t0:.1f}s)")

# -----------------------
# 2) CLEAN COLUMN NAMES (strip whitespace)
# -----------------------
t0 = time.time()
print("[2/20] Cleaning column names...")
# Keep training-style whitespace trimming, but not aggressive transforms: strip outer whitespace
df.columns = [str(c).strip() for c in df.columns]
print(" Sample columns:", df.columns.tolist()[:12])
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 3) FIND LABEL COLUMN & convert to BINARY (benign -> 0 else -> 1)
# -----------------------
t0 = time.time()
print("[3/20] Locating label column and converting to binary...")
found_label = None
for cand in ["Label", "label", "Attack", "attack", "attack_cat", " Label", " Label "]:
    if cand in df.columns:
        found_label = cand
        break
if found_label is None:
    for c in df.columns:
        if c.strip().lower() in ("label", "attack", "attack_cat"):
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Label column not found. Columns: " + ", ".join(df.columns[:40]))

# normalize to 'Label'
if found_label != "Label":
    df.rename(columns={found_label: "Label"}, inplace=True)

# drop null labels and convert to string then to binary
df = df[df["Label"].notna()].copy()

t0 = time.time()
print("[3/20] Locating label column and converting to binary...")

found_label = None
for cand in ["Label", "label", "Attack", "attack", "attack_cat", " Label"]:
    if cand in df.columns:
        found_label = cand
        break
if found_label is None:
    for c in df.columns:
        if c.strip().lower() in ("label", "attack", "attack_cat"):
            found_label = c
            break

df.rename(columns={found_label: "Label"}, inplace=True)

# --- SAFE LABEL HANDLING ---
raw_unique = df["Label"].unique()
print(" Raw label values:", raw_unique)

# Case 1: labels are numeric (0/1)
if np.array_equal(np.sort(raw_unique.astype(str)), np.array(["0","1"])):
    print(" Detected numeric binary labels -> keeping as-is.")
    df["Label"] = df["Label"].astype(int)

# Case 2: labels contain "Benign" and attacks
elif any(str(v).lower() == "benign" for v in raw_unique):
    print(" Detected string labels with 'benign' -> mapping to 0/1.")
    df["Label"] = df["Label"].astype(str).str.strip().str.lower()
    df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)

# Case 3: unexpected strings — stop!
else:
    raise RuntimeError(f"Label column format not recognized: {raw_unique}")

print(" Final label distribution:", df["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

print(" Label distribution (full):")
print(df["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 4) DROP OBVIOUS LEAK COLUMNS (IDs/IP/TIMESTAMP) if present
# -----------------------
t0 = time.time()
print("[4/20] Dropping likely leakage columns (ids/timestamps/ips) if present...")
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip", "flow_id", "flow id")]
to_drop = [c for c in possible_leak_cols if c in df.columns]
if to_drop:
    df.drop(columns=to_drop, inplace=True)
    print(" Dropped:", to_drop)
else:
    print(" None dropped.")
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 5) REPLACE INF/NaN and DROP ALL-EMPTY COLUMNS
# -----------------------
t0 = time.time()
print("[5/20] Cleaning NaN/Inf and empty columns...")
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
# drop rows with any NaN (optimizers require clean samples). If many rows drop, user will see warning.
n_before = len(df)
df.dropna(axis=0, how="any", inplace=True)
n_after = len(df)
if n_after < n_before:
    print(f" Dropped {n_before - n_after} rows that had NaNs. Remaining: {n_after}")
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 6) BALANCED SMALL SUBSET FOR OPTIMIZERS
# -----------------------
t0 = time.time()
print("[6/20] Preparing balanced subset for optimization (per-class sampling)...")
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError(f"Not enough rows in one class to form optimization subset. counts={counts}")

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print(" Subset shape:", df_sub.shape, " Label counts:", df_sub["Label"].value_counts().to_dict())
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 7) PREPROCESS SUBSET: ENCODE CATEGORICAL & SCALE NUMERIC
# -----------------------
t0 = time.time()
print("[7/20] Preprocessing subset (LabelEncode objects, MinMax scale numeric)...")
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object/category columns
obj_cols = X_sub.select_dtypes(include=["object", "category"]).columns.tolist()
if obj_cols:
    print(" Object cols:", obj_cols)
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))

# scale numeric columns
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
scaler_sub = MinMaxScaler()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = scaler_sub.fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print(" Subset features:", N_FEATURES)
print(f" (time {time.time()-t0:.2f}s)")

# -----------------------
# 8) CATBOOST FACTORY + FITNESS CACHE
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def key_from_mask(mask_bool):
    # ensure mask length consistent
    m = np.array(mask_bool).astype(int)
    if m.shape[0] != N_FEATURES:
        raise ValueError("Mask length mismatch with N_FEATURES")
    return tuple(int(x) for x in m)

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    # adapt cv to available samples per class to avoid errors
    min_per_class = min(y_sub.value_counts().min(), cv)
    if min_per_class < 2:
        cv_used = 2
    else:
        cv_used = min(cv, int(y_sub.value_counts().min()))
    skf = StratifiedKFold(n_splits=cv_used, shuffle=True, random_state=42)
    try:
        # scoring by F1 (binary) as original logic
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
        val = float(np.mean(scores))
    except Exception:
        val = 0.0
    fitness_cache[key] = val
    return val

# -----------------------
# 9) HELPERS
# -----------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -----------------------
# 10) PSO (binary)
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p, cv=cv) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask_global(pos[i], cv=cv)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)
    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 11) GA (binary)
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]
    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())
        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.05:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 12) GWO (binary)
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()
        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha
                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta
                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta
                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0
        fitness_scores = np.array([evaluate_mask_global(ind, cv=cv) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -----------------------
# 13) RUN OPTIMIZERS
# -----------------------
log("===== RUNNING OPTIMIZERS (PSO / GA / GWO) =====")
t_all0 = time.time()
pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)
log(f"PSO selected ({int(np.sum(pso_mask))}): {mask_to_features(pso_mask)} (time {pso_time}s)")

ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)
log(f"GA selected ({int(np.sum(ga_mask))}): {mask_to_features(ga_mask)} (time {ga_time}s)")

gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)
log(f"GWO selected ({int(np.sum(gwo_mask))}): {mask_to_features(gwo_mask)} (time {gwo_time}s)")

t_all1 = time.time()
log(f"Optimizers finished in {int(t_all1-t_all0)}s")

# Save raw masks
pickle.dump({
    "pso_mask": pso_mask.tolist(), "pso_score": pso_score, "pso_time": pso_time,
    "ga_mask": ga_mask.tolist(), "ga_score": ga_score, "ga_time": ga_time,
    "gwo_mask": gwo_mask.tolist(), "gwo_score": gwo_score, "gwo_time": gwo_time
}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_raw_masks.pkl"), "wb"))

# -----------------------
# 14) VOTING (feature kept only if chosen by at least 2 optimizers)
# -----------------------
log("Applying VOTING rule: keep feature if selected by ≥2 optimizers")

pso_arr = np.array(pso_mask)
ga_arr  = np.array(ga_mask)
gwo_arr = np.array(gwo_mask)

# Vote count per feature
vote_count = pso_arr + ga_arr + gwo_arr     # vector of 0..3

# Voting rule: at least 2 optimizers must agree
voting_mask = (vote_count >= 2).astype(int)

voting_feats = mask_to_features(voting_mask)

log(f"VOTING selected {len(voting_feats)} features: {voting_feats}")

pickle.dump({
    "voting_mask": voting_mask.tolist(),
    "voting_feats": voting_feats,
    "vote_count": vote_count.tolist()
}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_voting.pkl"), "wb"))

# -----------------------
# 15) HLO on UNION candidates
# -----------------------
t0 = time.time()
log("HLO START on union candidates")
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")
    pop = np.random.randint(0,2,(pop_size, k)).astype(int)
    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if int(bit)==1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores)); best_solution = pop[best_idx].copy(); best_score = fitness_scores[best_idx]
    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax([fitness_candidate(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores)); gen_best_score = fitness_scores[gen_best_idx]; gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score; best_solution = gen_best_sol.copy()
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if int(bit)==1:
            final_full_mask[candidate_indices[j]] = 1
    return final_full_mask, best_score

hlo_mask, hlo_score = hlo_on_candidates(voting_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)
hlo_feats = mask_to_features(hlo_mask)
log(f"HLO finished in {int(time.time()-t0)}s best_score={hlo_score:.4f} selected={len(hlo_feats)}")
log(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score, "hlo_feats": hlo_feats}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 16) Greedy hill-climb restricted to union candidate indices (starting from hlo_mask)
# -----------------------
t0 = time.time()
log("Hill-climb START")
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        return initial_mask, 0.0, 0
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
    evals = 0; steps = 0; improved = True
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]
            trial_score = evaluate_mask_global(trial_mask, cv=cv, cb_iter=FIT_CB_ITERS_OPT)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    return current_mask, current_score, evals

hc_mask, hc_score, hc_evals = hill_climb_on_candidates(
    hlo_mask, 
    voting_mask, 
    max_steps=HILLCLIMB_MAX_STEPS, 
    eval_cap=HILLCLIMB_EVAL_CAP, 
    cv=CV_OPT
)

hc_feats = mask_to_features(hc_mask)
log(f"Hill-climb DONE in {int(time.time()-t0)}s steps evals={hc_evals} final_score={hc_score:.4f} selected={len(hc_feats)}")
log(f"Hill-climb final selected ({len(hc_feats)}): {hc_feats}")
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score, "hc_feats": hc_feats}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_hc.pkl"), "wb"))

# -----------------------
# 17) Final selected features after hill-climb
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
log(f"FINAL selected features ({len(final_selected)}): {final_selected}")
pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()}, open(os.path.join(OUT_DIR, SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= 0.99999 or acc == 1.0:
        log(f"LEAK suspect '{f}' single-feature acc={acc:.6f} -> will drop")
        to_drop.append(f)

if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    final_mask = np.zeros(N_FEATURES, dtype=int)
    for i in final_selected_indices:
        final_mask[i] = 1
    log(f"After dropping leak suspects final features ({len(final_selected)}): {final_selected}")

if len(final_selected) == 0:
    raise RuntimeError("No features remain after leakage check. Lower threshold or inspect features.")

# -----------------------
# 19) Prepare FULL dataset with same preprocessing for final training
# -----------------------
t0 = time.time()
log("Preparing full dataset for final training...")
df_full = df.copy()
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

df_full = df_full[final_selected + ["Label"]].copy()

# Convert object columns to numeric (LabelEncode) and fill NaN
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == object:
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# Scale numeric columns (MinMax) using full data
num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
log(f"Full final training shape: {X_full.shape} Label dist: {y_full.value_counts().to_dict()} (time {time.time()-t0:.2f}s)")

# -----------------------
# 20) Final train/test split (80/20) and final CatBoost training with early stopping
# -----------------------
t0 = time.time()
log("Final training: splitting and training final CatBoost model (with regularization + early stopping)...")
minclass = y_full.value_counts().min()
if minclass < 10:
    log(f"Warning: small class size after selecting features: {minclass}")

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=FINAL_TEST_SIZE, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)
log(f"Final model trained in {int(time.time()-t0)}s")

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
log("=== FINAL HOLDOUT METRICS ===")
print(f"Accuracy: {acc:.6f}")
print(f"Precision: {prec:.6f}")
print(f"Recall: {rec:.6f}")
print(f"F1: {f1:.6f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV estimate (reduced iters)
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=min(5, max(2, int(y_full.value_counts().min()))), shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV (quick estimate) -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 21) Save final model & selected features + results
# -----------------------
final_model_path = os.path.join(OUT_DIR, f"{SAVE_PREFIX}_final_model.pkl")
with open(final_model_path, "wb") as f:
    pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()}, f)

results = {
    "pso_score": pso_score, "ga_score": ga_score, "gwo_score": gwo_score,
    "union_feats": union_feats, "hlo_feats": hlo_feats, "hc_feats": hc_feats,
    "final_selected": final_selected,
    "final_holdout": {"acc": acc, "prec": prec, "rec": rec, "f1": f1},
    "fitness_cache_len": len(fitness_cache)
}
with open(os.path.join(OUT_DIR, SAVE_PREFIX + "_results.pkl"), "wb") as f:
    pickle.dump(results, f)

log(f"Saved final model -> {final_model_path}")
total_time = int(time.time() - total_start)
log(f"PIPELINE COMPLETE in {total_time}s. Results saved to {OUT_DIR}")

# Short summary prints:
print("\n=== SUMMARY ===")
print(f"PSO selected ({int(np.sum(pso_mask))}): {mask_to_features(pso_mask)}")
print(f"GA  selected ({int(np.sum(ga_mask))}): {mask_to_features(ga_mask)}")
print(f"GWO selected ({int(np.sum(gwo_mask))}): {mask_to_features(gwo_mask)}")
print(f"UNION candidates ({len(union_feats)}): {union_feats}")
print(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
print(f"HILL-CLIMB final ({len(hc_feats)}): {hc_feats}")
print(f"Final holdout metrics -> acc: {acc:.4f} f1: {f1:.4f}")
print(f"Model saved: {final_model_path}")


[INFO] Loading: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[INFO] Loaded shape: (97802, 76)
CSV_PATH: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[1/20] Loading CSV...
 Loaded shape: (97802, 76) (time 1.3s)
[2/20] Cleaning column names...
 Sample columns: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
 (time 0.00s)
[3/20] Locating label column and converting to binary...
[3/20] Locating label column and converting to binary...
 Raw label values: [1 0]
 Detected numeric binary labels -> keeping as-is.
 Final label distribution: {0: 49993, 1: 47809}
 (time 0.00s)
 Label distribution (full):
{0: 49993, 1: 47809}
 (time 0.00s)
[4/20] Dropping likely leakage columns (ids/timestamps/ips) if present...
 Dropped: ['Timestamp', 'Flow ID', 'Src IP', 'Dst IP']
 (time 0.02s)
[5/20] Cleaning NaN/Inf and empty columns...
 (time 

In [8]:
# hybrid_hlo_union_only.py
# Reduced-budget hybrid pipeline: PSO, GA, GWO -> UNION -> HLO -> Hill-climb -> final CatBoost (save)
# Prints selected features after PSO/GA/GWO and the final union members.
# Runs on Kaggle input path by default and prints final test metrics (accuracy, precision, recall, f1),
# confusion matrix and classification report.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# Kaggle path requested by you:
CSV_PATH = "/kaggle/input/newwwww/ids2018_cleaned_combined_1.csv"

TARGET_COL = "Label"   # change if your dataset uses another column name
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets (set to 20 for the three optimizers as requested) ----------
PSO_SWARM = 15
PSO_ITERS = 10     # <<-- set to 20

GA_POP = 30
GA_GENS = 10       # <<-- set to 20

GWO_WOLVES = 10
GWO_ITERS = 10     # <<-- set to 20

HLO_POP = 15
HLO_ITERS = 10
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb
HILLCLIMB_MAX_STEPS = 100
HILLCLIMB_EVAL_CAP = 500

# CV folds
CV_OPT = 2
CV_FINAL = 5

# CatBoost iterations
CB_ITER_OPT = 100
CB_ITER_HLO = 200
CB_ITER_FINAL = 500

FINAL_TEST_SIZE = 0.2
SAVE_PREFIX = "hybrid_hlo_union"
# ------------------------------------------------------------------------


# -------------------- Load data (robust handling of messy column names) --------------------
print(f"[{time.strftime('%H:%M:%S')}] Loading CSV from: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"[{time.strftime('%H:%M:%S')}] Raw loaded shape: {df.shape}")
print(f"[{time.strftime('%H:%M:%S')}] Raw columns sample: {df.columns.tolist()[:12]}")

# Clean column names: strip whitespace and normalize repeated spaces
df.columns = df.columns.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
print(f"[{time.strftime('%H:%M:%S')}] Cleaned columns sample: {df.columns.tolist()[:12]}")

# If an index column like 'Unnamed: 0' exists (common from CSV exports), drop it
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])
    print(f"[{time.strftime('%H:%M:%S')}] Dropped 'Unnamed: 0' column. New shape: {df.shape}")

# If the requested TARGET_COL isn't found, try to auto-detect a label-like column (case-insensitive)
if TARGET_COL not in df.columns:
    # try case-insensitive match
    cols_lower = {c.lower(): c for c in df.columns}
    if TARGET_COL.lower() in cols_lower:
        real_col = cols_lower[TARGET_COL.lower()]
        print(f"[{time.strftime('%H:%M:%S')}] Using case-insensitive match for target: '{real_col}'")
        TARGET_COL = real_col
    else:
        # fallback: search for any column name that contains 'label' or 'target'
        cand = [c for c in df.columns if 'label' in c.lower() or 'target' in c.lower()]
        if len(cand) == 1:
            print(f"[{time.strftime('%H:%M:%S')}] Auto-detected target column: '{cand[0]}'")
            TARGET_COL = cand[0]
        elif len(cand) > 1:
            print(f"[{time.strftime('%H:%M:%S')}] Multiple candidate target columns found: {cand}. Using first: '{cand[0]}'")
            TARGET_COL = cand[0]
        else:
            raise ValueError(f"Target column '{TARGET_COL}' not found (after cleaning). Columns: {df.columns.tolist()[:12]}...")

print(f"[{time.strftime('%H:%M:%S')}] Using TARGET_COL = '{TARGET_COL}'")

# Basic preprocessing expectation: ensure no object columns remain unencoded for CatBoost.
from sklearn.preprocessing import LabelEncoder
obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
if obj_cols:
    print(f"[{time.strftime('%H:%M:%S')}] Label-encoding object columns for safe use: {obj_cols}")
    for c in obj_cols:
        df[c] = df[c].astype(str).fillna("NA")
        df[c] = LabelEncoder().fit_transform(df[c])

# Ensure no NaNs in features/target used by optimizers
df = df.dropna(axis=0).reset_index(drop=True)

# Prepare X, y
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]
print(f"[{time.strftime('%H:%M:%S')}] Prepared X ({X.shape}) and y ({y.shape}). Number of features: {N_FEATURES}")


# -------------------- CatBoost factory --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
fitness_cache = {}
def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    try:
        accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
        precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
        recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
        f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)
        score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    except Exception as e:
        # if a training error occurs (e.g., degenerate feature set), return 0
        score = 0.0

    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- UNION (only) --------------------
def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION only) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)
    pso_feats = mask_to_features(pso_mask)
    log(f"PSO selected ({len(pso_feats)}): {pso_feats}")

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)
    ga_feats = mask_to_features(ga_mask)
    log(f"GA selected ({len(ga_feats)}): {ga_feats}")

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)
    gwo_feats = mask_to_features(gwo_mask)
    log(f"GWO selected ({len(gwo_feats)}): {gwo_feats}")

    # Derive UNION of the three optimizers
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    union_feats = mask_to_features(union_mask)
    log(f"UNION candidate features ({len(union_feats)}): {union_feats}")

    # HLO on union
    if len(union_feats) == 0:
        log("UNION empty — nothing to optimize. Exiting.")
        raise SystemExit("No union features selected by optimizers.")

    hlo_mask, hlo_score, hlo_time = hlo_on_candidates(union_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)
    hlo_feats = mask_to_features(hlo_mask)
    log(f"HLO final mask selected ({len(hlo_feats)}): {hlo_feats}")

    # Hill-climb restricted to union candidates
    hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, union_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)
    hc_feats = mask_to_features(hc_mask)
    log(f"Hill-climb final mask selected ({len(hc_feats)}): {hc_feats}")

    # Final CV evaluation (5-fold)
    final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)
    log(f"Final CV (5-fold) | n_features={final_res['n_features']} | F1={final_res['f1_mean']:.4f} ± {final_res['f1_std']:.4f}")

    # Train final CatBoost on 80% and evaluate on 20%, save model
    selected_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
    selected_features = [FEATURE_NAMES[i] for i in selected_idxs]

    X_sel = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    model = get_catboost_model(iterations=CB_ITER_FINAL)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred, zero_division=0)
    test_rec = recall_score(y_test, y_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_pred, zero_division=0)
    test_cm = confusion_matrix(y_test, y_pred)
    test_report = classification_report(y_test, y_pred, zero_division=0)

    test_metrics = {
        'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
        'n_test': int(X_test.shape[0]),
        'confusion_matrix': test_cm.tolist(),  # convert to list for pickle/json friendliness
        'classification_report': test_report
    }

    model_filename = f"{SAVE_PREFIX}_union_model.pkl"
    with open(model_filename, 'wb') as mf:
        pickle.dump(model, mf)

    log(f"Saved final CatBoost union model -> {model_filename} (test_f1={test_f1:.4f})")

    # Save aggregated results (only union)
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask,
        "hlo_mask": hlo_mask, "hlo_score": hlo_score, "hlo_time": hlo_time,
        "hc_mask": hc_mask, "hc_score": hc_score, "hc_time": hc_time,
        "final_eval": final_res,
        "selected_features": selected_features,
        "model_file": model_filename,
        "test_metrics": test_metrics,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)
    log(f"PIPELINE COMPLETE in {elapsed_total}s. Results saved to {SAVE_PREFIX}_results.pkl and model {model_filename}")

    # Print short summary and explicit final test metrics (requested)
    print("\n=== SUMMARY ===")
    print(f"PSO selected ({len(pso_feats)}): {pso_feats}")
    print(f"GA selected  ({len(ga_feats)}): {ga_feats}")
    print(f"GWO selected ({len(gwo_feats)}): {gwo_feats}")
    print(f"UNION candidates ({len(union_feats)}): {union_feats}")
    print(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
    print(f"HILL-CLIMB selected ({len(hc_feats)}): {hc_feats}")
    print(f"Final CV F1: {final_res['f1_mean']:.4f} ± {final_res['f1_std']:.4f}")

    # Final test set metrics (explicit printout)
    print("\n--- FINAL TEST METRICS (80/20 held-out) ---")
    print(f"Test samples (n_test) : {test_metrics['n_test']}")
    print(f"Accuracy : {test_metrics['acc']:.4f}")
    print(f"Precision: {test_metrics['prec']:.4f}")
    print(f"Recall   : {test_metrics['rec']:.4f}")
    print(f"F1-score : {test_metrics['f1']:.4f}")
    print("\nConfusion Matrix (rows=true / cols=pred):")
    print(np.array(test_metrics['confusion_matrix']))
    print("\nClassification Report:")
    print(test_metrics['classification_report'])

    print(f"\nModel saved to: {model_filename}")

[18:12:47] Loading CSV from: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[18:12:49] Raw loaded shape: (97802, 76)
[18:12:49] Raw columns sample: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
[18:12:49] Cleaned columns sample: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
[18:12:49] Using TARGET_COL = 'Label'
[18:12:49] Prepared X ((97802, 75)) and y ((97802,)). Number of features: 75
[18:12:49] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION only) START =====
[18:12:49] PSO START (swarm=15, iters=10, cv=2)
[18:16:03]  PSO iter 1/10 best_global=0.9992
[18:19:12]  PSO iter 2/10 best_global=0.9993
[18:22:19]  PSO iter 3/10 best_global=0.9993


KeyboardInterrupt: 

In [9]:
# hybrid_hlo_union_only.py
# Reduced-budget hybrid pipeline: PSO, GA, GWO -> UNION -> HLO -> Hill-climb -> final CatBoost (save)
# Prints selected features after PSO/GA/GWO and the final union members.
# Runs on Kaggle input path by default and prints final test metrics (accuracy, precision, recall, f1),
# confusion matrix and classification report.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# Kaggle path requested by you:
CSV_PATH = "/kaggle/input/newwwww/ids2018_cleaned_combined_1.csv"

TARGET_COL = "Label"   # change if your dataset uses another column name
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets (set to 20 for the three optimizers as requested) ----------
PSO_SWARM = 15
PSO_ITERS = 10     # <<-- set to 20

GA_POP = 30
GA_GENS = 10       # <<-- set to 20

GWO_WOLVES = 10
GWO_ITERS = 10     # <<-- set to 20

HLO_POP = 15
HLO_ITERS = 10
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb
HILLCLIMB_MAX_STEPS = 100
HILLCLIMB_EVAL_CAP = 500

# CV folds
CV_OPT = 2
CV_FINAL = 5

# CatBoost iterations
CB_ITER_OPT = 100
CB_ITER_HLO = 200
CB_ITER_FINAL = 500

FINAL_TEST_SIZE = 0.2
SAVE_PREFIX = "hybrid_hlo_union"

# -------------------- NEW: sampling to reduce total rows (equal contributions) --------------------
# Set how many rows to sample per class (balanced). Use min(available, this).
SAMPLED_PER_CLASS = 1500
# ------------------------------------------------------------------------

# -------------------- Load data (robust handling of messy column names) --------------------
print(f"[{time.strftime('%H:%M:%S')}] Loading CSV from: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"[{time.strftime('%H:%M:%S')}] Raw loaded shape: {df.shape}")
print(f"[{time.strftime('%H:%M:%S')}] Raw columns sample: {df.columns.tolist()[:12]}")

# Clean column names: strip whitespace and normalize repeated spaces
df.columns = df.columns.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
print(f"[{time.strftime('%H:%M:%S')}] Cleaned columns sample: {df.columns.tolist()[:12]}")

# If an index column like 'Unnamed: 0' exists (common from CSV exports), drop it
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])
    print(f"[{time.strftime('%H:%M:%S')}] Dropped 'Unnamed: 0' column. New shape: {df.shape}")

# If the requested TARGET_COL isn't found, try to auto-detect a label-like column (case-insensitive)
if TARGET_COL not in df.columns:
    # try case-insensitive match
    cols_lower = {c.lower(): c for c in df.columns}
    if TARGET_COL.lower() in cols_lower:
        real_col = cols_lower[TARGET_COL.lower()]
        print(f"[{time.strftime('%H:%M:%S')}] Using case-insensitive match for target: '{real_col}'")
        TARGET_COL = real_col
    else:
        # fallback: search for any column name that contains 'label' or 'target'
        cand = [c for c in df.columns if 'label' in c.lower() or 'target' in c.lower()]
        if len(cand) == 1:
            print(f"[{time.strftime('%H:%M:%S')}] Auto-detected target column: '{cand[0]}'")
            TARGET_COL = cand[0]
        elif len(cand) > 1:
            print(f"[{time.strftime('%H:%M:%S')}] Multiple candidate target columns found: {cand}. Using first: '{cand[0]}'")
            TARGET_COL = cand[0]
        else:
            raise ValueError(f"Target column '{TARGET_COL}' not found (after cleaning). Columns: {df.columns.tolist()[:12]}...")

print(f"[{time.strftime('%H:%M:%S')}] Using TARGET_COL = '{TARGET_COL}'")

# Basic preprocessing expectation: ensure no object columns remain unencoded for CatBoost.
from sklearn.preprocessing import LabelEncoder
obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
if obj_cols:
    print(f"[{time.strftime('%H:%M:%S')}] Label-encoding object columns for safe use: {obj_cols}")
    for c in obj_cols:
        df[c] = df[c].astype(str).fillna("NA")
        df[c] = LabelEncoder().fit_transform(df[c])

# Ensure no NaNs in features/target used by optimizers
df = df.dropna(axis=0).reset_index(drop=True)

# -------------------- NEW SAMPLING: create a balanced reduced dataset --------------------
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column {TARGET_COL} missing after preprocessing.")

# ensure target numeric
try:
    df[TARGET_COL] = df[TARGET_COL].astype(int)
except Exception:
    # try mapping benign/other to binary if strings
    df[TARGET_COL] = df[TARGET_COL].astype(str).str.strip().str.lower()
    df[TARGET_COL] = df[TARGET_COL].apply(lambda x: 0 if x in ("benign", "0", "false") else 1)

counts = df[TARGET_COL].value_counts().to_dict()
print(f"[{time.strftime('%H:%M:%S')}] Label counts before sampling: {counts}")

if 0 in counts and 1 in counts:
    available0 = counts.get(0, 0)
    available1 = counts.get(1, 0)
    take_n = min(SAMPLED_PER_CLASS, available0, available1)
    if take_n < 1:
        raise RuntimeError(f"Not enough samples to create balanced subset after sampling: avail 0={available0}, 1={available1}")
    # sample equal from each class
    df0 = df[df[TARGET_COL] == 0].sample(take_n, random_state=RANDOM_STATE)
    df1 = df[df[TARGET_COL] == 1].sample(take_n, random_state=RANDOM_STATE)
    df = pd.concat([df0, df1], ignore_index=True).sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
    print(f"[{time.strftime('%H:%M:%S')}] After balanced sampling: each class {take_n} rows -> total {len(df)}")
else:
    # if dataset not binary as 0/1, don't sample; proceed (user dataset should be binary)
    print(f"[{time.strftime('%H:%M:%S')}] WARNING: target not binary 0/1, sampling skipped. Label unique values: {df[TARGET_COL].unique()}")

# Prepare X, y
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]
print(f"[{time.strftime('%H:%M:%S')}] Prepared X ({X.shape}) and y ({y.shape}). Number of features: {N_FEATURES}")


# -------------------- CatBoost factory --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
fitness_cache = {}
def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    try:
        accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
        precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
        recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
        f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)
        score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    except Exception as e:
        # if a training error occurs (e.g., degenerate feature set), return 0
        score = 0.0

    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    return best_mask, best_score, int(t1-t0)

# -------------------- UNION (only) --------------------
def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION only) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)
    pso_feats = mask_to_features(pso_mask)
    log(f"PSO selected ({len(pso_feats)}): {pso_feats}")

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)
    ga_feats = mask_to_features(ga_mask)
    log(f"GA selected ({len(ga_feats)}): {ga_feats}")

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)
    gwo_feats = mask_to_features(gwo_mask)
    log(f"GWO selected ({len(gwo_feats)}): {gwo_feats}")

    # Derive UNION of the three optimizers
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    union_feats = mask_to_features(union_mask)
    log(f"UNION candidate features ({len(union_feats)}): {union_feats}")

    # HLO on union
    if len(union_feats) == 0:
        log("UNION empty — nothing to optimize. Exiting.")
        raise SystemExit("No union features selected by optimizers.")

    hlo_mask, hlo_score, hlo_time = hlo_on_candidates(union_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)
    hlo_feats = mask_to_features(hlo_mask)
    log(f"HLO final mask selected ({len(hlo_feats)}): {hlo_feats}")

    # Hill-climb restricted to union candidates
    hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, union_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)
    hc_feats = mask_to_features(hc_mask)
    log(f"Hill-climb final mask selected ({len(hc_feats)}): {hc_feats}")

    # Final CV evaluation (5-fold)
    final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)
    log(f"Final CV (5-fold) | n_features={final_res['n_features']} | F1={final_res['f1_mean']:.4f} ± {final_res['f1_std']:.4f}")

    # Train final CatBoost on 80% and evaluate on 20%, save model
    selected_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
    selected_features = [FEATURE_NAMES[i] for i in selected_idxs]

    X_sel = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    model = get_catboost_model(iterations=CB_ITER_FINAL)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred, zero_division=0)
    test_rec = recall_score(y_test, y_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_pred, zero_division=0)
    test_cm = confusion_matrix(y_test, y_pred)
    test_report = classification_report(y_test, y_pred, zero_division=0)

    test_metrics = {
        'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
        'n_test': int(X_test.shape[0]),
        'confusion_matrix': test_cm.tolist(),  # convert to list for pickle/json friendliness
        'classification_report': test_report
    }

    model_filename = f"{SAVE_PREFIX}_union_model.pkl"
    with open(model_filename, 'wb') as mf:
        pickle.dump(model, mf)

    log(f"Saved final CatBoost union model -> {model_filename} (test_f1={test_f1:.4f})")

    # Save aggregated results (only union)
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask,
        "hlo_mask": hlo_mask, "hlo_score": hlo_score, "hlo_time": hlo_time,
        "hc_mask": hc_mask, "hc_score": hc_score, "hc_time": hc_time,
        "final_eval": final_res,
        "selected_features": selected_features,
        "model_file": model_filename,
        "test_metrics": test_metrics,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)
    log(f"PIPELINE COMPLETE in {elapsed_total}s. Results saved to {SAVE_PREFIX}_results.pkl and model {model_filename}")

    # Print short summary and explicit final test metrics (requested)
    print("\n=== SUMMARY ===")
    print(f"PSO selected ({len(pso_feats)}): {pso_feats}")
    print(f"GA selected  ({len(ga_feats)}): {ga_feats}")
    print(f"GWO selected ({len(gwo_feats)}): {gwo_feats}")
    print(f"UNION candidates ({len(union_feats)}): {union_feats}")
    print(f"HLO selected ({len(hlo_feats)}): {hlo_feats}")
    print(f"HILL-CLIMB selected ({len(hc_feats)}): {hc_feats}")
    print(f"Final CV F1: {final_res['f1_mean']:.4f} ± {final_res['f1_std']:.4f}")

    # Final test set metrics (explicit printout)
    print("\n--- FINAL TEST METRICS (80/20 held-out) ---")
    print(f"Test samples (n_test) : {test_metrics['n_test']}")
    print(f"Accuracy : {test_metrics['acc']:.4f}")
    print(f"Precision: {test_metrics['prec']:.4f}")
    print(f"Recall   : {test_metrics['rec']:.4f}")
    print(f"F1-score : {test_metrics['f1']:.4f}")
    print("\nConfusion Matrix (rows=true / cols=pred):")
    print(np.array(test_metrics['confusion_matrix']))
    print("\nClassification Report:")
    print(test_metrics['classification_report'])

    print(f"\nModel saved to: {model_filename}")


[18:23:14] Loading CSV from: /kaggle/input/newwwww/ids2018_cleaned_combined_1.csv
[18:23:15] Raw loaded shape: (97802, 76)
[18:23:15] Raw columns sample: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
[18:23:15] Cleaned columns sample: ['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']
[18:23:15] Using TARGET_COL = 'Label'
[18:23:15] Label counts before sampling: {0: 49993, 1: 47809}
[18:23:15] After balanced sampling: each class 1500 rows -> total 3000
[18:23:15] Prepared X ((3000, 75)) and y ((3000,)). Number of features: 75
[18:23:15] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION only) START =====
[18:23:15] PSO START (swarm=15, iters=10, cv=2)
[18:24:07]  PSO iter 1/10 be

KeyboardInterrupt: 

In [1]:
print("hii")

hii


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# ================================================================
# 1. LOAD HEART DISEASE DATASET (disable low_memory dtype guessing)
# ================================================================
input_path = "/kaggle/input/new-heart/heart_attack_russia_youth_vs_adult.csv"
df = pd.read_csv(input_path, low_memory=False)

print("Initial shape:", df.shape)
print("Initial dtypes (sample):\n", df.dtypes.head(20))


# ================================================================
# 2. BASIC CLEANING
# ================================================================
# Drop columns entirely NaN
df = df.dropna(axis=1, how="all")

# Drop columns that are all zeros (rare in heart disease datasets)
df = df.loc[:, (df != 0).any(axis=0)]

# Drop duplicate rows
df = df.drop_duplicates().reset_index(drop=True)


# ================================================================
# 3. ROBUST NUMERIC COLUMN DETECTION
# ================================================================
numeric_candidates = []
conversion_stats = {}

for col in df.columns:
    coerced = pd.to_numeric(df[col], errors="coerce")
    non_na_ratio = coerced.notna().sum() / len(coerced)
    conversion_stats[col] = non_na_ratio

    # Consider numeric-like if 80%+ values convert
    if non_na_ratio >= 0.80:
        numeric_candidates.append(col)

print(f"Detected {len(numeric_candidates)} numeric-like columns.")

# Convert numeric candidates to numeric dtype
for col in numeric_candidates:
    df[col] = pd.to_numeric(df[col], errors="coerce")


# ================================================================
# 4. DETECT + HANDLE INF AND EXTREME VALUES
# ================================================================
# Columns containing ±inf
inf_cols = [c for c in numeric_candidates 
            if np.isinf(df[c].to_numpy()).any()]

print("Columns with ±inf:", inf_cols)

# Replace ±inf → NaN
if inf_cols:
    df[numeric_candidates] = df[numeric_candidates].replace([np.inf, -np.inf], np.nan)

# Detect extremely large magnitude values
huge_cols = []
for col in numeric_candidates:
    try:
        max_abs = np.nanmax(np.abs(df[col].to_numpy()))
        if np.isfinite(max_abs) and max_abs > 1e300:
            huge_cols.append((col, max_abs))
    except:
        pass

print("Columns with extremely large values (>1e300):", huge_cols)

# Clip extreme values safely
CLIP_LIMIT = 1e300
df[numeric_candidates] = df[numeric_candidates].apply(
    lambda s: s.clip(lower=-CLIP_LIMIT, upper=CLIP_LIMIT)
)


# ================================================================
# 5. RECOMPUTE NUMERIC + CATEGORICAL COL LISTS
# ================================================================
num_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

print("Final numeric column count:", len(num_cols))
print("Final categorical column count:", len(cat_cols))


# ================================================================
# 6. HANDLE MISSING VALUES
# ================================================================
# Numeric: fill NaN with median
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical: fill NaN with mode
for col in cat_cols:
    if df[col].isna().any():
        mode_val = df[col].mode(dropna=True)
        df[col] = df[col].fillna(mode_val.iloc[0] if len(mode_val) > 0 else "")


# ================================================================
# 7. LABEL-ENCODE CATEGORICAL COLUMNS
# ================================================================
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))


# ================================================================
# 8. FINAL CHECK: REMOVE ANY REMAINING INF/NaN BEFORE SCALING
# ================================================================
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Confirm numeric columns are finite
finite_check = {col: np.isfinite(df[col].to_numpy()).all() for col in num_cols}
bad_cols = [c for c, ok in finite_check.items() if not ok]
print("Non-finite columns (should be empty):", bad_cols)


# ================================================================
# 9. MIN-MAX SCALING
# ================================================================
scaler = MinMaxScaler()
if len(num_cols) > 0:
    df[num_cols] = scaler.fit_transform(df[num_cols])


# ================================================================
# 10. SAVE CLEANED HEART DISEASE DATASET
# ================================================================
output_filename = "/kaggle/working/new_heart_disease_cleaned.csv"
df.to_csv(output_filename, index=False)

print("\n✅ PREPROCESSING COMPLETE!")
print("📁 Saved cleaned file as:", output_filename)
print("Final shape:", df.shape)

# Print label distribution if label exists
for label_name in ["target", "Target", "label", "Label"]:
    if label_name in df.columns:
        print("\nLabel distribution:")
        print(df[label_name].value_counts())
        break
else:
    print("⚠️ No label column detected in final dataset.")


Initial shape: (50000, 30)
Initial dtypes (sample):
 ID                         int64
Age                        int64
Gender                    object
Region                    object
Blood_Pressure           float64
Cholesterol              float64
BMI                      float64
Heart_Rate                 int64
Exercise_Level            object
Smoking                     bool
Alcohol_Consumption       object
Diabetes                    bool
Family_History              bool
Stress_Level               int64
Heart_Attack                bool
Angina                      bool
Heart_Disease_History       bool
Diet                      object
Sleep_Hours              float64
Occupation                object
dtype: object
Detected 19 numeric-like columns.
Columns with ±inf: []
Columns with extremely large values (>1e300): []
Final numeric column count: 11
Final categorical column count: 11
Non-finite columns (should be empty): []

✅ PREPROCESSING COMPLETE!
📁 Saved cleaned file as: /kaggle/w

In [14]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from catboost import CatBoostClassifier

# ------------------------------------
# LOAD BALANCED DATASET
# ------------------------------------
df = pd.read_csv("/kaggle/working/new_heart_disease_balanced.csv")

TARGET = "Heart_Attack"

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ------------------------------------
# STRATIFIED SPLIT
# ------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

# ------------------------------------
# REGULARIZED CATBOOST MODEL
# ------------------------------------
model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.03,
    l2_leaf_reg=15,
    subsample=0.6,
    bootstrap_type="Bernoulli",
    random_strength=5,
    eval_metric="F1",
    verbose=50,
    random_seed=42
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=50)

# ------------------------------------
# PREDICT
# ------------------------------------
y_pred = model.predict(X_test)

print("\nAccuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ------------------------------------
# SAVE TRAINED MODEL + FEATURES
# ------------------------------------
model_path = "/kaggle/working/heart_attack_catboost_model.pkl"

with open(model_path, "wb") as f:
    pickle.dump({
        "model": model,
        "features": X.columns.tolist()
    }, f)

print("\n✅ Model saved successfully!")
print("📁 Saved to:", model_path)


0:	learn: 0.4460159	test: 0.4448584	best: 0.4448584 (0)	total: 3.1ms	remaining: 1.54s
50:	learn: 0.7504818	test: 0.5049768	best: 0.5154572 (12)	total: 299ms	remaining: 2.63s
100:	learn: 0.8099548	test: 0.5111333	best: 0.5154572 (12)	total: 589ms	remaining: 2.33s
150:	learn: 0.8488833	test: 0.5003331	best: 0.5154572 (12)	total: 887ms	remaining: 2.05s
200:	learn: 0.8706496	test: 0.5068174	best: 0.5154572 (12)	total: 1.19s	remaining: 1.77s
250:	learn: 0.8916290	test: 0.5046667	best: 0.5154572 (12)	total: 1.48s	remaining: 1.47s
300:	learn: 0.9075210	test: 0.4995005	best: 0.5154572 (12)	total: 1.78s	remaining: 1.18s
350:	learn: 0.9202315	test: 0.5010115	best: 0.5154572 (12)	total: 2.08s	remaining: 885ms
400:	learn: 0.9379810	test: 0.5037137	best: 0.5154572 (12)	total: 2.39s	remaining: 591ms
450:	learn: 0.9494972	test: 0.5011852	best: 0.5154572 (12)	total: 2.7s	remaining: 294ms
499:	learn: 0.9561582	test: 0.4988107	best: 0.5154572 (12)	total: 3s	remaining: 0us

bestTest = 0.5154572079
bestIt

In [6]:
import pandas as pd

# Load cleaned dataset
df = pd.read_csv("/kaggle/working/new_heart_disease_cleaned.csv")

TARGET = "Heart_Attack"   # change if needed

# Count class distribution
counts = df[TARGET].value_counts()
print("Original distribution:\n", counts)

# Number of minority samples (True = 1)
minority_count = counts[1]   # 5881 in your case

# Split into two groups
df_true  = df[df[TARGET] == 1]
df_false = df[df[TARGET] == 0]

# Undersample majority class (False)
df_false_balanced = df_false.sample(n=minority_count, random_state=42)

# Combine to form balanced dataset
df_balanced = pd.concat([df_true, df_false_balanced], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

print("\nBalanced dataset distribution:")
print(df_balanced[TARGET].value_counts())

# Save balanced dataset
output_path = "/kaggle/working/new_heart_disease_balanced.csv"
df_balanced.to_csv(output_path, index=False)

print("\n✅ Balanced dataset saved to:", output_path)
print("Final shape:", df_balanced.shape)


Original distribution:
 Heart_Attack
False    44119
True      5881
Name: count, dtype: int64

Balanced dataset distribution:
Heart_Attack
True     5881
False    5881
Name: count, dtype: int64


  minority_count = counts[1]   # 5881 in your case



✅ Balanced dataset saved to: /kaggle/working/new_heart_disease_balanced.csv
Final shape: (11762, 30)


In [9]:
# intersection_hlo_with_hillclimb_fast.py
# Pipeline (reduced budget + hill-climb) with UNION, INTERSECTION, and VOTING candidate flows:
#  PSO + GA + GWO (CatBoost fitness, lighter during opt) -> derive UNION / INTERSECTION / VOTING
#  For each candidate set: HLO (on candidates) -> Greedy hill-climb (restricted) -> Final CatBoost eval (5-fold CV)
#  Additionally: train a CatBoost model on 80% of the data and evaluate on the held-out 20% test set
#  Train & save a CatBoost model for each flow (union / intersection / voting) using the 80/20 split.
# Prints logs, mean ± std for metrics, stage timings, saves results and models.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# If you prefer to load CSV instead, uncomment and change:
df = pd.read_csv("/kaggle/working/new_heart_disease_balanced.csv")

TARGET_COL = "Heart_Attack"   # target column
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets for faster runs (you can tune these) ----------
PSO_SWARM = 15   # reduced swarm
PSO_ITERS = 5   # reduced iterations

GA_POP = 30      # reduced population
GA_GENS = 5     # reduced generations

GWO_WOLVES = 10
GWO_ITERS = 5

HLO_POP = 15
HLO_ITERS = 5
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb after HLO
HILLCLIMB_MAX_STEPS = 100   # stop if no improvement or step limit
HILLCLIMB_EVAL_CAP = 500    # safety cap on evaluations (prevent runaway)

# CV folds
CV_OPT = 2    # cheaper CV during optimization + HLO (speed)
CV_FINAL = 5  # final evaluation (A1 requested)

# CatBoost iterations
CB_ITER_OPT = 100    # iterations during optimization (smaller)
CB_ITER_HLO = 200
CB_ITER_FINAL = 500  # final evaluation iterations (bigger)

# Train/test split for final saved models
FINAL_TEST_SIZE = 0.2

SAVE_PREFIX = "hybrid_hlo_models"
# ------------------------------------------------------------------------

# Ensure df exists
try:
    df
except NameError:
    raise RuntimeError("DataFrame `df` not found. Assign your dataset to variable `df` or load at top.")

# Prepare data
X = df.drop(TARGET_COL, axis=1)

y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# -------------------- Model factory (CatBoost) --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
# key: tuple(selected original indices) -> float score
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    """
    Evaluate mask using CatBoost with CV and return average of acc,prec,rec,f1.
    Caches results to avoid re-evaluating identical subsets.
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GA SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GWO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- INTERSECTION / UNION / VOTING --------------------
def get_intersection_mask(*masks):
    """Return mask that contains only features present in ALL provided masks."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    inter_idx = set(np.where(np.array(masks[0]).astype(bool))[0].tolist())
    for m in masks[1:]:
        idxs = set(np.where(np.array(m).astype(bool))[0].tolist())
        inter_idx = inter_idx.intersection(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in inter_idx:
        mask[i] = 1
    return mask


def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask


def get_voting_mask(*masks, threshold=2):
    """Return mask of features selected by at least `threshold` methods (default majority of 3 => 2)."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    counts = np.zeros(N_FEATURES, dtype=int)
    for m in masks:
        counts += np.array(m).astype(int)
    mask = (counts >= threshold).astype(int)
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    """
    Greedy single-bit flip hill-climb restricted to candidate indices.
    Starts from initial_mask (full-length). Tries flipping each candidate feature's bit:
    - If flip improves fitness, accept and restart scanning.
    - Stops when no improving flip found or max_steps/eval_cap reached.
    """
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)

    # Derive candidate masks
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    inter_mask = get_intersection_mask(pso_mask, ga_mask, gwo_mask)
    vote_mask = get_voting_mask(pso_mask, ga_mask, gwo_mask, threshold=2)

    candidate_sets = {
        'union': union_mask,
        'intersection': inter_mask,
        'voting': vote_mask
    }

    results_all = {}

    # run HLO -> hill-climb -> final evaluation -> train & save model for each candidate set
    for name, cand_mask in candidate_sets.items():
        log(f"===== PROCESSING {name.upper()} CANDIDATES =====")
        n_cand = int(np.sum(cand_mask))
        log(f"{name.upper()} candidate features: {n_cand}")
        if n_cand == 0:
            log(f"{name.upper()} empty — skipping HLO/hill-climb and model training.")
            results_all[name] = {'skipped': True, 'n_candidates': 0}
            continue

        # HLO on this candidate set
        hlo_mask, hlo_score, hlo_time = hlo_on_candidates(cand_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)

        # hill-climb restricted to candidate set
        hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, cand_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)

        # final CV evaluation
        final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)

        # Train final CatBoost model on 80% train and evaluate on 20% test (stratified)
        sel_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
        sel_features = [FEATURE_NAMES[i] for i in sel_idxs]

        if len(sel_features) == 0:
            log(f"No features selected after hill-climb for {name}, skipping model train.")
            results_all[name] = {'skipped': True, 'n_candidates': n_cand}
            continue

        X_sel = X[sel_features]
        X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

        model = get_catboost_model(iterations=CB_ITER_FINAL)
        model.fit(X_train, y_train)

        # evaluate on held-out test set (20%)
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_prec = precision_score(y_test, y_pred, zero_division=0)
        test_rec = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)

        test_metrics = {
            'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
            'n_test': int(X_test.shape[0])
        }

        # Save model to file (pickle)
        model_filename = f"{SAVE_PREFIX}_{name}_model.pkl"
        with open(model_filename, 'wb') as mf:
            pickle.dump(model, mf)

        # store results
        results_all[name] = {
            'n_candidates': n_cand,
            'hlo_score': float(hlo_score), 'hlo_time': int(hlo_time),
            'hc_score': float(hc_score), 'hc_time': int(hc_time),
            'final_eval': final_res,
            'selected_features': sel_features,
            'model_file': model_filename,
            'test_metrics': test_metrics
        }

        log(f"Saved trained CatBoost model for {name} -> {model_filename} (test_f1={test_f1:.4f})")

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)

    # Summary / save aggregated results
    print("==================== AGGREGATE SUMMARY ====================")
    print(f"PSO  -> opt_score={pso_score:.4f} selected={int(np.sum(pso_mask))} time={pso_time}s")
    print(f"GA   -> opt_score={ga_score:.4f} selected={int(np.sum(ga_mask))} time={ga_time}s")
    print(f"GWO  -> opt_score={gwo_score:.4f} selected={int(np.sum(gwo_mask))} time={gwo_time}s")
    print(f"Union candidates    : {int(np.sum(union_mask))}")
    print(f"Intersection candidates: {int(np.sum(inter_mask))}")
    print(f"Voting candidates   : {int(np.sum(vote_mask))}")
    print("-------------------------------------------------")

    for name, info in results_all.items():
        print(f"-- {name.upper()} SUMMARY --")
        if info.get('skipped'):
            print(" skipped (no candidates)")
            continue
        fe = info['final_eval']
        tm = info['test_metrics']
        print(f" Selected ({fe['n_features']}): {fe['features']}")
        print(f" CV F1   : {fe['f1_mean']:.4f} ± {fe['f1_std']:.4f}")
        print(f" Test F1 : {tm['f1']:.4f} (n_test={tm['n_test']})")
        print(f" Accuracy : {fe['acc_mean']:.4f} ± {fe['acc_std']:.4f}")
        print(f" Precision: {fe['prec_mean']:.4f} ± {fe['prec_std']:.4f}")
        print(f" Recall   : {fe['rec_mean']:.4f} ± {fe['rec_std']:.4f}")
        print(f" Model file: {info['model_file']}")



    # Save aggregated pipeline outputs
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask, "intersection_mask": inter_mask, "voting_mask": vote_mask,
        "results_all": results_all,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    log(f"Saved results to {SAVE_PREFIX}_results.pkl")
    log("===== PIPELINE COMPLETE =====")

[12:29:50] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====
[12:29:50] PSO START (swarm=15, iters=5, cv=2)
[12:30:13]  PSO iter 1/5 best_global=0.5129
[12:30:30]  PSO iter 2/5 best_global=0.5129
[12:30:48]  PSO iter 3/5 best_global=0.5129
[12:31:05]  PSO iter 4/5 best_global=0.5129
[12:31:24]  PSO iter 5/5 best_global=0.5148
[12:31:42] PSO DONE in 112s best_score=0.5148 selected=13
[12:31:42] PSO SELECTED FEATURES: ['ID', 'Blood_Pressure', 'Cholesterol', 'BMI', 'Exercise_Level', 'Smoking', 'Alcohol_Consumption', 'Sleep_Hours', 'Occupation', 'Education_Level', 'Marital_Status', 'Urban_Rural', 'Mental_Health']
[12:31:42] GA START (pop=30, gens=5, cv=2)
[12:32:18]  GA gen 1/5 current_best=0.5132
[12:32:53]  GA gen 2/5 current_best=0.5140
[12:33:28]  GA gen 3/5 current_best=0.5140
[12:34:04]  GA gen 4/5 current_best=0.5179
[12:34:40]  GA gen 5/5 current_best=0.5179
[12:35:15] GA DONE in 212s best_score=0.5179 selected=17
[12:35:15] GA SELECTED FEATUR

In [12]:
# intersection_hlo_with_hillclimb_fast.py
# Pipeline (reduced budget + hill-climb) with UNION, INTERSECTION, and VOTING candidate flows:
#  PSO + GA + GWO (CatBoost fitness, lighter during opt) -> derive UNION / INTERSECTION / VOTING
#  For each candidate set: HLO (on candidates) -> Greedy hill-climb (restricted) -> Final CatBoost eval (5-fold CV)
#  Additionally: train a CatBoost model on 80% of the data and evaluate on the held-out 20% test set
#  Train & save a CatBoost model for each flow (union / intersection / voting) using the 80/20 split.
# Prints logs, mean ± std for metrics, stage timings, saves results and models.


#300 iter for final catboost


import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# If you prefer to load CSV instead, uncomment and change:
df = pd.read_csv("/kaggle/working/new_heart_disease_balanced.csv")

TARGET_COL = "Heart_Attack"   # target column
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets for faster runs (you can tune these) ----------
PSO_SWARM = 15   # reduced swarm
PSO_ITERS = 5   # reduced iterations

GA_POP = 30      # reduced population
GA_GENS = 5     # reduced generations

GWO_WOLVES = 10
GWO_ITERS = 5

HLO_POP = 15
HLO_ITERS = 5
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb after HLO
HILLCLIMB_MAX_STEPS = 100   # stop if no improvement or step limit
HILLCLIMB_EVAL_CAP = 500    # safety cap on evaluations (prevent runaway)

# CV folds
CV_OPT = 2    # cheaper CV during optimization + HLO (speed)
CV_FINAL = 5  # final evaluation (A1 requested)

# CatBoost iterations
CB_ITER_OPT = 100    # iterations during optimization (smaller)
CB_ITER_HLO = 200
CB_ITER_FINAL = 300  # final evaluation iterations (bigger)

# Train/test split for final saved models
FINAL_TEST_SIZE = 0.2

SAVE_PREFIX = "hybrid_hlo_models"
# ------------------------------------------------------------------------

# Ensure df exists
try:
    df
except NameError:
    raise RuntimeError("DataFrame `df` not found. Assign your dataset to variable `df` or load at top.")

# Prepare data
X = df.drop(TARGET_COL, axis=1)

y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# -------------------- Model factory (CatBoost) --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
# key: tuple(selected original indices) -> float score
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    """
    Evaluate mask using CatBoost with CV and return average of acc,prec,rec,f1.
    Caches results to avoid re-evaluating identical subsets.
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GA SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GWO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- INTERSECTION / UNION / VOTING --------------------
def get_intersection_mask(*masks):
    """Return mask that contains only features present in ALL provided masks."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    inter_idx = set(np.where(np.array(masks[0]).astype(bool))[0].tolist())
    for m in masks[1:]:
        idxs = set(np.where(np.array(m).astype(bool))[0].tolist())
        inter_idx = inter_idx.intersection(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in inter_idx:
        mask[i] = 1
    return mask


def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask


def get_voting_mask(*masks, threshold=2):
    """Return mask of features selected by at least `threshold` methods (default majority of 3 => 2)."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    counts = np.zeros(N_FEATURES, dtype=int)
    for m in masks:
        counts += np.array(m).astype(int)
    mask = (counts >= threshold).astype(int)
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    """
    Greedy single-bit flip hill-climb restricted to candidate indices.
    Starts from initial_mask (full-length). Tries flipping each candidate feature's bit:
    - If flip improves fitness, accept and restart scanning.
    - Stops when no improving flip found or max_steps/eval_cap reached.
    """
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)

    # Derive candidate masks
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    inter_mask = get_intersection_mask(pso_mask, ga_mask, gwo_mask)
    vote_mask = get_voting_mask(pso_mask, ga_mask, gwo_mask, threshold=2)

    candidate_sets = {
        'union': union_mask,
        'intersection': inter_mask,
        'voting': vote_mask
    }

    results_all = {}

    # run HLO -> hill-climb -> final evaluation -> train & save model for each candidate set
    for name, cand_mask in candidate_sets.items():
        log(f"===== PROCESSING {name.upper()} CANDIDATES =====")
        n_cand = int(np.sum(cand_mask))
        log(f"{name.upper()} candidate features: {n_cand}")
        if n_cand == 0:
            log(f"{name.upper()} empty — skipping HLO/hill-climb and model training.")
            results_all[name] = {'skipped': True, 'n_candidates': 0}
            continue

        # HLO on this candidate set
        hlo_mask, hlo_score, hlo_time = hlo_on_candidates(cand_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)

        # hill-climb restricted to candidate set
        hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, cand_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)

        # final CV evaluation
        final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)

        # Train final CatBoost model on 80% train and evaluate on 20% test (stratified)
        sel_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
        sel_features = [FEATURE_NAMES[i] for i in sel_idxs]

        if len(sel_features) == 0:
            log(f"No features selected after hill-climb for {name}, skipping model train.")
            results_all[name] = {'skipped': True, 'n_candidates': n_cand}
            continue

        X_sel = X[sel_features]
        X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

        model = get_catboost_model(iterations=CB_ITER_FINAL)
        model.fit(X_train, y_train)

        # evaluate on held-out test set (20%)
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_prec = precision_score(y_test, y_pred, zero_division=0)
        test_rec = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)

        test_metrics = {
            'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
            'n_test': int(X_test.shape[0])
        }

        # Save model to file (pickle)
        model_filename = f"{SAVE_PREFIX}_{name}_model.pkl"
        with open(model_filename, 'wb') as mf:
            pickle.dump(model, mf)

        # store results
        results_all[name] = {
            'n_candidates': n_cand,
            'hlo_score': float(hlo_score), 'hlo_time': int(hlo_time),
            'hc_score': float(hc_score), 'hc_time': int(hc_time),
            'final_eval': final_res,
            'selected_features': sel_features,
            'model_file': model_filename,
            'test_metrics': test_metrics
        }

        log(f"Saved trained CatBoost model for {name} -> {model_filename} (test_f1={test_f1:.4f})")

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)

    # Summary / save aggregated results
    print("==================== AGGREGATE SUMMARY ====================")
    print(f"PSO  -> opt_score={pso_score:.4f} selected={int(np.sum(pso_mask))} time={pso_time}s")
    print(f"GA   -> opt_score={ga_score:.4f} selected={int(np.sum(ga_mask))} time={ga_time}s")
    print(f"GWO  -> opt_score={gwo_score:.4f} selected={int(np.sum(gwo_mask))} time={gwo_time}s")
    print(f"Union candidates    : {int(np.sum(union_mask))}")
    print(f"Intersection candidates: {int(np.sum(inter_mask))}")
    print(f"Voting candidates   : {int(np.sum(vote_mask))}")
    print("-------------------------------------------------")

    for name, info in results_all.items():
        print(f"-- {name.upper()} SUMMARY --")
        if info.get('skipped'):
            print(" skipped (no candidates)")
            continue
        fe = info['final_eval']
        tm = info['test_metrics']
        print(f" Selected ({fe['n_features']}): {fe['features']}")
        print(f" CV F1   : {fe['f1_mean']:.4f} ± {fe['f1_std']:.4f}")
        print(f" Test F1 : {tm['f1']:.4f} (n_test={tm['n_test']})")
        print(f" Accuracy : {fe['acc_mean']:.4f} ± {fe['acc_std']:.4f}")
        print(f" Precision: {fe['prec_mean']:.4f} ± {fe['prec_std']:.4f}")
        print(f" Recall   : {fe['rec_mean']:.4f} ± {fe['rec_std']:.4f}")
        print(f" Model file: {info['model_file']}")



    # Save aggregated pipeline outputs
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask, "intersection_mask": inter_mask, "voting_mask": vote_mask,
        "results_all": results_all,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    log(f"Saved results to {SAVE_PREFIX}_results.pkl")
    log("===== PIPELINE COMPLETE =====")

[13:10:51] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====
[13:10:51] PSO START (swarm=15, iters=5, cv=2)
[13:11:13]  PSO iter 1/5 best_global=0.5129
[13:11:31]  PSO iter 2/5 best_global=0.5129
[13:11:49]  PSO iter 3/5 best_global=0.5129
[13:12:07]  PSO iter 4/5 best_global=0.5129
[13:12:26]  PSO iter 5/5 best_global=0.5148
[13:12:44] PSO DONE in 113s best_score=0.5148 selected=13
[13:12:44] PSO SELECTED FEATURES: ['ID', 'Blood_Pressure', 'Cholesterol', 'BMI', 'Exercise_Level', 'Smoking', 'Alcohol_Consumption', 'Sleep_Hours', 'Occupation', 'Education_Level', 'Marital_Status', 'Urban_Rural', 'Mental_Health']
[13:12:44] GA START (pop=30, gens=5, cv=2)
[13:13:21]  GA gen 1/5 current_best=0.5132
[13:13:56]  GA gen 2/5 current_best=0.5140
[13:14:31]  GA gen 3/5 current_best=0.5140
[13:15:07]  GA gen 4/5 current_best=0.5179
[13:15:44]  GA gen 5/5 current_best=0.5179
[13:16:20] GA DONE in 215s best_score=0.5179 selected=17
[13:16:20] GA SELECTED FEATUR

In [28]:
import pandas as pd
import os
import kagglehub

# Download dataset
path = kagglehub.dataset_download("jsphyg/weather-dataset-rattle-package")
print("Path to dataset files:", path)

# Find CSV file inside folder
files = [f for f in os.listdir(path) if f.endswith(".csv")]
if not files:
    raise FileNotFoundError("No CSV file found in downloaded dataset folder.")

csv_path = os.path.join(path, files[0])
print("Loading file:", csv_path)

# Load dataset
df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)
print("Available columns:", df.columns.tolist())

# Check for target column
if "RainTomorrow" not in df.columns:
    raise ValueError("The dataset does not contain 'RainTomorrow' column. Columns:\n" + str(df.columns))

# Unique values
print("\nUnique values in RainTomorrow column:")
print(df["RainTomorrow"].unique())

# Value counts
print("\nValue counts in RainTomorrow column:")
print(df["RainTomorrow"].value_counts())

# Percentages
print("\nRainTomorrow distribution (%):")
print(df["RainTomorrow"].value_counts(normalize=True) * 100)

# Missing value count
print("\nMissing value count in RainTomorrow:")
missing_count = df["RainTomorrow"].isna().sum()
print(missing_count)

# Missing value percentage
print("\nMissing value percentage:")
missing_percent = (missing_count / len(df)) * 100
print(missing_percent, "%")


Path to dataset files: /kaggle/input/weather-dataset-rattle-package
Loading file: /kaggle/input/weather-dataset-rattle-package/weatherAUS.csv
Loaded shape: (145460, 23)
Available columns: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']

Unique values in RainTomorrow column:
['No' 'Yes' nan]

Value counts in RainTomorrow column:
RainTomorrow
No     110316
Yes     31877
Name: count, dtype: int64

RainTomorrow distribution (%):
RainTomorrow
No     77.581878
Yes    22.418122
Name: proportion, dtype: float64

Missing value count in RainTomorrow:
3267

Missing value percentage:
2.245978275814657 %


In [17]:
import pandas as pd
import os
import kagglehub

# Download dataset
path = kagglehub.dataset_download("jsphyg/weather-dataset-rattle-package")
print("Path to dataset files:", path)

# Find CSV file
files = [f for f in os.listdir(path) if f.endswith(".csv")]
if not files:
    raise FileNotFoundError("No CSV file found in downloaded dataset.")
csv_path = os.path.join(path, files[0])

df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)

# Ensure column exists
if "RainTomorrow" not in df.columns:
    raise ValueError("RainTomorrow column not found. Columns:\n" + str(df.columns))

# Remove rows with missing target values
df = df.dropna(subset=["RainTomorrow"]).reset_index(drop=True)

# Count classes
class_counts = df["RainTomorrow"].value_counts()
print("\nClass distribution:")
print(class_counts)

# Determine minority count
minority_count = class_counts.min()

# Downsample both Yes and No to smallest count
df_yes = df[df["RainTomorrow"] == "Yes"].sample(minority_count, random_state=42)
df_no  = df[df["RainTomorrow"] == "No"].sample(minority_count, random_state=42)

# Create balanced dataset
df_balanced = pd.concat([df_yes, df_no], ignore_index=True).sample(frac=1, random_state=42)

print("\nBalanced dataset shape:", df_balanced.shape)
print(df_balanced["RainTomorrow"].value_counts())

# Save new balanced dataset
output_path = "/kaggle/working/weather_balanced.csv"
df_balanced.to_csv(output_path, index=False)

print("\n✅ Balanced dataset saved to:", output_path)


Path to dataset files: /kaggle/input/weather-dataset-rattle-package
Loaded shape: (145460, 23)

Class distribution:
RainTomorrow
No     110316
Yes     31877
Name: count, dtype: int64

Balanced dataset shape: (63754, 23)
RainTomorrow
Yes    31877
No     31877
Name: count, dtype: int64

✅ Balanced dataset saved to: /kaggle/working/weather_balanced.csv


In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# ================================================================
# 1. LOAD BALANCED WEATHER DATASET
# ================================================================
input_path = "/kaggle/working/weather_balanced.csv"  # <-- change if needed
df = pd.read_csv(input_path, low_memory=False)

print("Initial shape:", df.shape)
print("Initial dtypes (sample):\n", df.dtypes.head(20))


# ================================================================
# 2. BASIC CLEANING
# ================================================================
df = df.dropna(axis=1, how="all")                      # remove fully-empty columns
df = df.loc[:, (df != 0).any(axis=0)]                 # remove all-zero columns
df = df.drop_duplicates().reset_index(drop=True)      # remove exact duplicates


# ================================================================
# 3. ROBUST NUMERIC COLUMN DETECTION
# ================================================================
numeric_candidates = []
conversion_stats = {}

for col in df.columns:
    coerced = pd.to_numeric(df[col], errors="coerce")
    non_na_ratio = coerced.notna().sum() / len(coerced)
    conversion_stats[col] = non_na_ratio

    if non_na_ratio >= 0.80:  # at least 80% numeric-like
        numeric_candidates.append(col)

print(f"Detected {len(numeric_candidates)} numeric-like columns.")


# Convert numeric candidates to numeric dtype
for col in numeric_candidates:
    df[col] = pd.to_numeric(df[col], errors="coerce")


# ================================================================
# 4. HANDLE INF + EXTREME VALUES
# ================================================================
inf_cols = [c for c in numeric_candidates if np.isinf(df[c].to_numpy()).any()]
print("Columns with ±inf:", inf_cols)

if inf_cols:
    df[numeric_candidates] = df[numeric_candidates].replace([np.inf, -np.inf], np.nan)

huge_cols = []
for col in numeric_candidates:
    try:
        max_abs = np.nanmax(np.abs(df[col].to_numpy()))
        if np.isfinite(max_abs) and max_abs > 1e300:
            huge_cols.append((col, max_abs))
    except:
        pass

print("Columns with extremely large values (>1e300):", huge_cols)

CLIP_LIMIT = 1e300
df[numeric_candidates] = df[numeric_candidates].apply(
    lambda s: s.clip(lower=-CLIP_LIMIT, upper=CLIP_LIMIT)
)


# ================================================================
# 5. RECOMPUTE NUMERIC + CATEGORICAL COLUMNS
# ================================================================
num_cols = df.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

print("Final numeric columns:", len(num_cols))
print("Final categorical columns:", len(cat_cols))


# ================================================================
# 6. HANDLE MISSING VALUES
# ================================================================
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

for col in cat_cols:
    if df[col].isna().any():
        mode_val = df[col].mode(dropna=True)
        df[col] = df[col].fillna(mode_val.iloc[0] if len(mode_val) else "")


# ================================================================
# 7. LABEL-ENCODE CATEGORICAL COLUMNS
# ================================================================
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))


# ================================================================
# 8. FINAL CHECK BEFORE SCALING
# ================================================================
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

finite_check = {c: np.isfinite(df[c].to_numpy()).all() for c in num_cols}
bad_cols = [c for c, ok in finite_check.items() if not ok]
print("Non-finite numeric columns (should be empty):", bad_cols)


# ================================================================
# 9. MIN-MAX SCALING
# ================================================================
scaler = MinMaxScaler()
if len(num_cols) > 0:
    df[num_cols] = scaler.fit_transform(df[num_cols])


# ================================================================
# 10. SAVE CLEANED DATASET
# ================================================================
output_path = "/kaggle/working/weather_balanced_cleaned.csv"
df.to_csv(output_path, index=False)

print("\n✅ PREPROCESSING COMPLETE!")
print("📁 Saved cleaned dataset as:", output_path)
print("Final shape:", df.shape)

# Display label distribution (Yes/No)
if "RainTomorrow" in df.columns:
    print("\nLabel distribution after cleaning:")
    print(df["RainTomorrow"].value_counts())
else:
    print("⚠️ No 'RainTomorrow' column found after preprocessing.")


Initial shape: (63754, 23)
Initial dtypes (sample):
 Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
dtype: object
Detected 12 numeric-like columns.
Columns with ±inf: []
Columns with extremely large values (>1e300): []
Final numeric columns: 16
Final categorical columns: 7
Non-finite numeric columns (should be empty): []

✅ PREPROCESSING COMPLETE!
📁 Saved cleaned dataset as: /kaggle/working/weather_balanced_cleaned.csv
Final shape: (63754, 23)

Label distribution after cleaning:
RainTomorrow
1    31877
0    31877
Name: count, dtype: int64


In [22]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import pickle

# -------------------------------------------------------------------
# 1. LOAD CLEANED DATASET
# -------------------------------------------------------------------
input_path = "/kaggle/working/weather_balanced_cleaned.csv"
df = pd.read_csv(input_path)

print("Loaded cleaned dataset:", df.shape)
print(df.head())

# -------------------------------------------------------------------
# 2. CHECK TARGET COLUMN
# -------------------------------------------------------------------
TARGET = "RainTomorrow"

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Available columns: {df.columns.tolist()}")

y = df[TARGET]
X = df.drop(columns=[TARGET])

print("\nFeatures shape:", X.shape)
print("Label distribution:\n", y.value_counts())

# -------------------------------------------------------------------
# 3. TRAIN-TEST SPLIT
# -------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

# -------------------------------------------------------------------
# 4. CATBOOST MODEL
# -------------------------------------------------------------------
model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=8,
    loss_function="Logloss",
    eval_metric="F1",
    verbose=50,
    random_seed=42
)

# Train model
model.fit(X_train, y_train, eval_set=(X_test, y_test))

# -------------------------------------------------------------------
# 5. PREDICT + EVALUATE
# -------------------------------------------------------------------
y_pred = model.predict(X_test)

acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec  = recall_score(y_test, y_pred, zero_division=0)
f1   = f1_score(y_test, y_pred, zero_division=0)
cm   = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

# -------------------------------------------------------------------
# 6. PRINT RESULTS
# -------------------------------------------------------------------
print("\n================ CATBOOST RESULTS ================")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1 Score  : {f1:.4f}")

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(report)

# -------------------------------------------------------------------
# 7. SAVE MODEL
# -------------------------------------------------------------------
model_path = "/kaggle/working/weather_catboost_model.pkl"

with open(model_path, "wb") as f:
    pickle.dump({"model": model, "features": X.columns.tolist()}, f)

print("\n✅ Model saved successfully as:", model_path)


Loaded cleaned dataset: (63754, 23)
   Date  Location   MinTemp   MaxTemp  Rainfall  Evaporation  Sunshine  \
0  1986        13  0.785894  0.739300  0.038814     0.053793  0.482517   
1  2358        32  0.564232  0.723735  0.000000     0.042759  0.811189   
2  1573        32  0.352645  0.468872  0.000000     0.006897  0.636364   
3   825         2  0.292191  0.424125  0.000539     0.030345  0.475524   
4   919        20  0.365239  0.354086  0.000539     0.012414  0.405594   

   WindGustDir  WindGustSpeed  WindDir9am  ...  Humidity9am  Humidity3pm  \
0            0       0.286822           1  ...     0.693878         0.51   
1            0       0.511628           4  ...     0.438776         0.62   
2            3       0.108527           5  ...     0.816327         0.45   
3            9       0.069767           0  ...     0.989796         0.46   
4           15       0.255814          13  ...     0.928571         0.43   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm   Temp9am   Te

In [32]:
# intersection_hlo_with_hillclimb_fast.py
# Pipeline (reduced budget + hill-climb) with UNION, INTERSECTION, and VOTING candidate flows:
#  PSO + GA + GWO (CatBoost fitness, lighter during opt) -> derive UNION / INTERSECTION / VOTING
#  For each candidate set: HLO (on candidates) -> Greedy hill-climb (restricted) -> Final CatBoost eval (5-fold CV)
#  Additionally: train a CatBoost model on 80% of the data and evaluate on the held-out 20% test set
#  Train & save a CatBoost model for each flow (union / intersection / voting) using the 80/20 split.
# Prints logs, mean ± std for metrics, stage timings, saves results and models.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# If you prefer to load CSV instead, uncomment and change:
df = pd.read_csv("/kaggle/working/weather_balanced_cleaned.csv")

TARGET_COL = "RainTomorrow"   # target column
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets for faster runs (you can tune these) ----------
PSO_SWARM = 15   # reduced swarm
PSO_ITERS = 5   # reduced iterations

GA_POP = 30      # reduced population
GA_GENS = 5     # reduced generations

GWO_WOLVES = 10
GWO_ITERS = 5

HLO_POP = 15
HLO_ITERS = 5
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb after HLO
HILLCLIMB_MAX_STEPS = 100   # stop if no improvement or step limit
HILLCLIMB_EVAL_CAP = 500    # safety cap on evaluations (prevent runaway)

# CV folds
CV_OPT = 2    # cheaper CV during optimization + HLO (speed)
CV_FINAL = 5  # final evaluation (A1 requested)

# CatBoost iterations
CB_ITER_OPT = 100    # iterations during optimization (smaller)
CB_ITER_HLO = 200
CB_ITER_FINAL = 500  # final evaluation iterations (bigger)

# Train/test split for final saved models
FINAL_TEST_SIZE = 0.2

SAVE_PREFIX = "hybrid_hlo_models"
# ------------------------------------------------------------------------

# Ensure df exists
try:
    df
except NameError:
    raise RuntimeError("DataFrame `df` not found. Assign your dataset to variable `df` or load at top.")

# Prepare data
X = df.drop(TARGET_COL, axis=1)

y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# -------------------- Model factory (CatBoost) --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
# key: tuple(selected original indices) -> float score
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    """
    Evaluate mask using CatBoost with CV and return average of acc,prec,rec,f1.
    Caches results to avoid re-evaluating identical subsets.
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GA SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GWO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- INTERSECTION / UNION / VOTING --------------------
def get_intersection_mask(*masks):
    """Return mask that contains only features present in ALL provided masks."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    inter_idx = set(np.where(np.array(masks[0]).astype(bool))[0].tolist())
    for m in masks[1:]:
        idxs = set(np.where(np.array(m).astype(bool))[0].tolist())
        inter_idx = inter_idx.intersection(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in inter_idx:
        mask[i] = 1
    return mask


def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask


def get_voting_mask(*masks, threshold=2):
    """Return mask of features selected by at least `threshold` methods (default majority of 3 => 2)."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    counts = np.zeros(N_FEATURES, dtype=int)
    for m in masks:
        counts += np.array(m).astype(int)
    mask = (counts >= threshold).astype(int)
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    """
    Greedy single-bit flip hill-climb restricted to candidate indices.
    Starts from initial_mask (full-length). Tries flipping each candidate feature's bit:
    - If flip improves fitness, accept and restart scanning.
    - Stops when no improving flip found or max_steps/eval_cap reached.
    """
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)

    # Derive candidate masks
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    inter_mask = get_intersection_mask(pso_mask, ga_mask, gwo_mask)
    vote_mask = get_voting_mask(pso_mask, ga_mask, gwo_mask, threshold=2)

    candidate_sets = {
        'union': union_mask,
        'intersection': inter_mask,
        'voting': vote_mask
    }

    results_all = {}

    # run HLO -> hill-climb -> final evaluation -> train & save model for each candidate set
    for name, cand_mask in candidate_sets.items():
        log(f"===== PROCESSING {name.upper()} CANDIDATES =====")
        n_cand = int(np.sum(cand_mask))
        log(f"{name.upper()} candidate features: {n_cand}")
        if n_cand == 0:
            log(f"{name.upper()} empty — skipping HLO/hill-climb and model training.")
            results_all[name] = {'skipped': True, 'n_candidates': 0}
            continue

        # HLO on this candidate set
        hlo_mask, hlo_score, hlo_time = hlo_on_candidates(cand_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)

        # hill-climb restricted to candidate set
        hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, cand_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)

        # final CV evaluation
        final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)

        # Train final CatBoost model on 80% train and evaluate on 20% test (stratified)
        sel_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
        sel_features = [FEATURE_NAMES[i] for i in sel_idxs]

        if len(sel_features) == 0:
            log(f"No features selected after hill-climb for {name}, skipping model train.")
            results_all[name] = {'skipped': True, 'n_candidates': n_cand}
            continue

        X_sel = X[sel_features]
        X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

        model = get_catboost_model(iterations=CB_ITER_FINAL)
        model.fit(X_train, y_train)

        # evaluate on held-out test set (20%)
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_prec = precision_score(y_test, y_pred, zero_division=0)
        test_rec = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)

        test_metrics = {
            'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
            'n_test': int(X_test.shape[0])
        }

        # Save model to file (pickle)
        model_filename = f"{SAVE_PREFIX}_{name}_model.pkl"
        with open(model_filename, 'wb') as mf:
            pickle.dump(model, mf)

        # store results
        results_all[name] = {
            'n_candidates': n_cand,
            'hlo_score': float(hlo_score), 'hlo_time': int(hlo_time),
            'hc_score': float(hc_score), 'hc_time': int(hc_time),
            'final_eval': final_res,
            'selected_features': sel_features,
            'model_file': model_filename,
            'test_metrics': test_metrics
        }

        log(f"Saved trained CatBoost model for {name} -> {model_filename} (test_f1={test_f1:.4f})")

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)

    # Summary / save aggregated results
    print("==================== AGGREGATE SUMMARY ====================")
    print(f"PSO  -> opt_score={pso_score:.4f} selected={int(np.sum(pso_mask))} time={pso_time}s")
    print(f"GA   -> opt_score={ga_score:.4f} selected={int(np.sum(ga_mask))} time={ga_time}s")
    print(f"GWO  -> opt_score={gwo_score:.4f} selected={int(np.sum(gwo_mask))} time={gwo_time}s")
    print(f"Union candidates    : {int(np.sum(union_mask))}")
    print(f"Intersection candidates: {int(np.sum(inter_mask))}")
    print(f"Voting candidates   : {int(np.sum(vote_mask))}")
    print("-------------------------------------------------")

    for name, info in results_all.items():
        print(f"-- {name.upper()} SUMMARY --")
        if info.get('skipped'):
            print(" skipped (no candidates)")
            continue
        fe = info['final_eval']
        tm = info['test_metrics']
        print(f" Selected ({fe['n_features']}): {fe['features']}")
        print(f" CV F1   : {fe['f1_mean']:.4f} ± {fe['f1_std']:.4f}")
        print(f" Test F1 : {tm['f1']:.4f} (n_test={tm['n_test']})")
        print(f" Accuracy : {fe['acc_mean']:.4f} ± {fe['acc_std']:.4f}")
        print(f" Precision: {fe['prec_mean']:.4f} ± {fe['prec_std']:.4f}")
        print(f" Recall   : {fe['rec_mean']:.4f} ± {fe['rec_std']:.4f}")
        print(f" Model file: {info['model_file']}")



    # Save aggregated pipeline outputs
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask, "intersection_mask": inter_mask, "voting_mask": vote_mask,
        "results_all": results_all,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    log(f"Saved results to {SAVE_PREFIX}_results.pkl")
    log("===== PIPELINE COMPLETE =====")

[09:31:51] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====
[09:31:51] PSO START (swarm=15, iters=5, cv=2)
[09:32:55]  PSO iter 1/5 best_global=0.7831
[09:33:57]  PSO iter 2/5 best_global=0.7831
[09:35:03]  PSO iter 3/5 best_global=0.7831
[09:36:06]  PSO iter 4/5 best_global=0.7831
[09:37:11]  PSO iter 5/5 best_global=0.7831
[09:38:16] PSO DONE in 384s best_score=0.7831 selected=19
[09:38:16] PSO SELECTED FEATURES: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
[09:38:16] GA START (pop=30, gens=5, cv=2)
[09:40:17]  GA gen 1/5 current_best=0.7818
[09:41:58]  GA gen 2/5 current_best=0.7819
[09:43:46]  GA gen 3/5 current_best=0.7819
[09:45:19]  GA gen 4/5 current_best=0.7821
[09:47:12]  GA gen 5/5 current_best=0.7821
[09:49:02] GA DONE in 645s best_sco

In [25]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/cicids2017")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/cicids2017


In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dhoogla/cicids2017")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/cicids2017


In [12]:
import pandas as pd

# Load your dataset
df = pd.read_csv("/kaggle/working/converted_output_final.csv")

# ---------------------------
# 1. PRINT ALL COLUMN NAMES
# ---------------------------
print(len(df.columns))

# ---------------------------
# 2. PRINT UNIQUE VALUES OF A SPECIFIC COLUMN
# ---------------------------
column_name = "Label"   # 🔹 change to your column name

if column_name not in df.columns:
    raise ValueError(f"Column '{column_name}' NOT found in dataset.")

print(f"\n=== UNIQUE VALUES IN '{column_name}' ===")
print(df[column_name].unique())

print(f"\n=== VALUE COUNTS IN '{column_name}' ===")
print(df[column_name].value_counts())

print(f"\n=== PERCENTAGE DISTRIBUTION IN '{column_name}' ===")
print(df[column_name].value_counts(normalize=True) * 100)


78

=== UNIQUE VALUES IN 'Label' ===
['Benign' 'FTP-Patator' 'SSH-Patator']

=== VALUE COUNTS IN 'Label' ===
Label
Benign         380564
FTP-Patator      5931
SSH-Patator      3219
Name: count, dtype: int64

=== PERCENTAGE DISTRIBUTION IN 'Label' ===
Label
Benign         97.652124
FTP-Patator     1.521885
SSH-Patator     0.825990
Name: proportion, dtype: float64


In [11]:
import pandas as pd

# Load parquet
df = pd.read_parquet("/kaggle/input/cicids2017/Bruteforce-Tuesday-no-metadata.parquet")

print("Loaded:", df.shape)

# Convert categorical columns to strings
cat_cols = df.select_dtypes(include=["category"]).columns
df[cat_cols] = df[cat_cols].astype(str)

# Convert object columns to string (optional but safe)
obj_cols = df.select_dtypes(include=["object"]).columns
df[obj_cols] = df[obj_cols].astype(str)

# Fill missing values
df = df.fillna("")

# Reset index
df.reset_index(drop=True, inplace=True)

# Save as CSV
df.to_csv("converted_output_final.csv", index=False)

print("File saved as converted_output.csv!")


Loaded: (389714, 78)
File saved as converted_output.csv!


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# ----------------------------------------
# 1. Load dataset
# ----------------------------------------
df = pd.read_csv("/kaggle/working/converted_output_final.csv")
print("Initial shape:", df.shape)
print(df["Label"].value_counts())

# ----------------------------------------
# 2. Combine Attack Classes (FTP + SSH) and Convert to 0/1
# ----------------------------------------
df["Attack"] = df["Label"].replace({
    "Benign": 1,
    "FTP-Patator": 0,
    "SSH-Patator": 0
})

print("\nUnique values in Attack:", df["Attack"].value_counts())

# ----------------------------------------
# 3. DROP the original Label column
# ----------------------------------------
df = df.drop(columns=["Label"])

# ----------------------------------------
# 4. BALANCING the dataset
# ----------------------------------------
benign_df = df[df["Attack"] == 1]
attack_df = df[df["Attack"] == 0]

min_size = min(len(benign_df), len(attack_df))

print("\nSampling each class to size =", min_size)

benign_bal = benign_df.sample(min_size, random_state=42)
attack_bal = attack_df.sample(min_size, random_state=42)

df_balanced = pd.concat([benign_bal, attack_bal], axis=0).sample(frac=1, random_state=42)

print("Balanced shape:", df_balanced.shape)
print("Balanced class counts:")
print(df_balanced["Attack"].value_counts())

# ----------------------------------------
# 5. Detect numeric + categorical columns
# ----------------------------------------
num_cols = df_balanced.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = df_balanced.select_dtypes(include=["object", "bool"]).columns.tolist()

# Remove target
if "Attack" in num_cols:
    num_cols.remove("Attack")

print("\nNumeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# ----------------------------------------
# 6. Handle Missing Values
# ----------------------------------------
if len(num_cols) > 0:
    df_balanced[num_cols] = df_balanced[num_cols].fillna(df_balanced[num_cols].median())

for col in cat_cols:
    if df_balanced[col].isnull().any():
        df_balanced[col] = df_balanced[col].fillna(df_balanced[col].mode()[0])

# ----------------------------------------
# 7. Encode Categorical Columns
# ----------------------------------------
le = LabelEncoder()
for col in cat_cols:
    df_balanced[col] = le.fit_transform(df_balanced[col].astype(str))

# ----------------------------------------
# 8. Scale Numeric Columns
# ----------------------------------------
scaler = MinMaxScaler()
if len(num_cols) > 0:
    df_balanced[num_cols] = scaler.fit_transform(df_balanced[num_cols])

# ----------------------------------------
# 9. Save final balanced + cleaned dataset
# ----------------------------------------
output = "/kaggle/working/ids2017_testing_balanced_cleaned.csv"
df_balanced.to_csv(output, index=False)

print("\n✅ Cleaning & Balancing Completed!")
print("📁 Saved as:", output)
print("Final Shape:", df_balanced.shape)
print("\nFinal Class Distribution:")
print(df_balanced["Attack"].value_counts())


Initial shape: (389714, 78)
Label
Benign         380564
FTP-Patator      5931
SSH-Patator      3219
Name: count, dtype: int64

Unique values in Attack: Attack
1    380564
0      9150
Name: count, dtype: int64


  df["Attack"] = df["Label"].replace({



Sampling each class to size = 9150
Balanced shape: (18300, 78)
Balanced class counts:
Attack
1    9150
0    9150
Name: count, dtype: int64

Numeric columns: ['Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# ----------------------------------------
# 1. Load cleaned dataset
# ----------------------------------------
df = pd.read_csv("/kaggle/input/haruuu/ids2018_cleaned_combined_1.csv")
print("Loaded dataset:", df.shape)

# ----------------------------------------
# 2. Separate Features and Target
# ----------------------------------------
X = df.drop(columns=["Label"])
y = df["Label"]

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Load dataset

print("Original dimension:", X.shape[1])

# -----------------------------
# Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# -----------------------------
# XGBoost Model
# -----------------------------
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# -----------------------------
# Predictions
# -----------------------------
y_pred = model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_prec = precision_score(y_test, y_pred, zero_division=0)
test_rec = recall_score(y_test, y_pred, zero_division=0)
test_f1 = f1_score(y_test, y_pred, zero_division=0)

# -----------------------------
# Final Results
# -----------------------------
print("\n===== XGBoost Results =====")
print(f"Test Accuracy : {test_acc:.8f}")
print(f"Precision     : {test_prec:.8f}")
print(f"Recall        : {test_rec:.8f}")
print(f"F1 Score      : {test_f1:.8f}")

import joblib
joblib.dump(model, "/kaggle/working/xgboost_ids2018.pkl")


Loaded dataset: (97802, 76)
Original dimension: 75

===== XGBoost Results =====
Test Accuracy : 0.99928429
Precision     : 0.99968600
Recall        : 0.99884961
F1 Score      : 0.99926763


['/kaggle/working/xgboost_ids2018.pkl']