In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
print("hi")

hi


In [3]:
# intersection_hlo_with_hillclimb_fast.py
# Pipeline (reduced budget + hill-climb) with UNION, INTERSECTION, and VOTING candidate flows:
#  PSO + GA + GWO (CatBoost fitness, lighter during opt) -> derive UNION / INTERSECTION / VOTING
#  For each candidate set: HLO (on candidates) -> Greedy hill-climb (restricted) -> Final CatBoost eval (5-fold CV)
#  Additionally: train a CatBoost model on 80% of the data and evaluate on the held-out 20% test set
#  Train & save a CatBoost model for each flow (union / intersection / voting) using the 80/20 split.
# Prints logs, mean ± std for metrics, stage timings, saves results and models.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# If you prefer to load CSV instead, uncomment and change:
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

TARGET_COL = "Label"   # target column
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets for faster runs (you can tune these) ----------
PSO_SWARM = 15   # reduced swarm
PSO_ITERS = 3   # reduced iterations

GA_POP = 30      # reduced population
GA_GENS = 3     # reduced generations

GWO_WOLVES = 10
GWO_ITERS = 3

HLO_POP = 15
HLO_ITERS = 5
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb after HLO
HILLCLIMB_MAX_STEPS = 100   # stop if no improvement or step limit
HILLCLIMB_EVAL_CAP = 500    # safety cap on evaluations (prevent runaway)

# CV folds
CV_OPT = 2    # cheaper CV during optimization + HLO (speed)
CV_FINAL = 5  # final evaluation (A1 requested)

# CatBoost iterations
CB_ITER_OPT = 100    # iterations during optimization (smaller)
CB_ITER_HLO = 200
CB_ITER_FINAL = 500  # final evaluation iterations (bigger)

# Train/test split for final saved models
FINAL_TEST_SIZE = 0.2

SAVE_PREFIX = "hybrid_hlo_models"
# ------------------------------------------------------------------------

# Ensure df exists
try:
    df
except NameError:
    raise RuntimeError("DataFrame `df` not found. Assign your dataset to variable `df` or load at top.")

# Prepare data
X = df.drop(TARGET_COL, axis=1)

y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# -------------------- Model factory (CatBoost) --------------------
def get_catboost_model(iterations=100):
    try:
        from catboost import CatBoostClassifier
    except Exception as e:
        raise ImportError("catboost not installed. Install with: pip install catboost") from e
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=MODEL_VERBOSE, random_seed=RANDOM_STATE, thread_count=-1)

# -------------------- Fitness cache --------------------
# key: tuple(selected original indices) -> float score
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    """
    Evaluate mask using CatBoost with CV and return average of acc,prec,rec,f1.
    Caches results to avoid re-evaluating identical subsets.
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.4f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.4f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GA SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.4f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.4f} selected={int(np.sum(best_mask))}")
    log(f"GWO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- INTERSECTION / UNION / VOTING --------------------
def get_intersection_mask(*masks):
    """Return mask that contains only features present in ALL provided masks."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    inter_idx = set(np.where(np.array(masks[0]).astype(bool))[0].tolist())
    for m in masks[1:]:
        idxs = set(np.where(np.array(m).astype(bool))[0].tolist())
        inter_idx = inter_idx.intersection(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in inter_idx:
        mask[i] = 1
    return mask


def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask


def get_voting_mask(*masks, threshold=2):
    """Return mask of features selected by at least `threshold` methods (default majority of 3 => 2)."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    counts = np.zeros(N_FEATURES, dtype=int)
    for m in masks:
        counts += np.array(m).astype(int)
    mask = (counts >= threshold).astype(int)
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.4f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.4f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    """
    Greedy single-bit flip hill-climb restricted to candidate indices.
    Starts from initial_mask (full-length). Tries flipping each candidate feature's bit:
    - If flip improves fitness, accept and restart scanning.
    - Stops when no improving flip found or max_steps/eval_cap reached.
    """
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.4f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)

    # Derive candidate masks
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    inter_mask = get_intersection_mask(pso_mask, ga_mask, gwo_mask)
    vote_mask = get_voting_mask(pso_mask, ga_mask, gwo_mask, threshold=2)

    candidate_sets = {
        'union': union_mask,
        'intersection': inter_mask,
        'voting': vote_mask
    }

    results_all = {}

    # run HLO -> hill-climb -> final evaluation -> train & save model for each candidate set
    for name, cand_mask in candidate_sets.items():
        log(f"===== PROCESSING {name.upper()} CANDIDATES =====")
        n_cand = int(np.sum(cand_mask))
        log(f"{name.upper()} candidate features: {n_cand}")
        if n_cand == 0:
            log(f"{name.upper()} empty — skipping HLO/hill-climb and model training.")
            results_all[name] = {'skipped': True, 'n_candidates': 0}
            continue

        # HLO on this candidate set
        hlo_mask, hlo_score, hlo_time = hlo_on_candidates(cand_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)

        # hill-climb restricted to candidate set
        hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, cand_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)

        # final CV evaluation
        final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)

        # Train final CatBoost model on 80% train and evaluate on 20% test (stratified)
        sel_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
        sel_features = [FEATURE_NAMES[i] for i in sel_idxs]

        if len(sel_features) == 0:
            log(f"No features selected after hill-climb for {name}, skipping model train.")
            results_all[name] = {'skipped': True, 'n_candidates': n_cand}
            continue

        X_sel = X[sel_features]
        X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

        model = get_catboost_model(iterations=CB_ITER_FINAL)
        model.fit(X_train, y_train)

        # evaluate on held-out test set (20%)
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_prec = precision_score(y_test, y_pred, zero_division=0)
        test_rec = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)

        test_metrics = {
            'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
            'n_test': int(X_test.shape[0])
        }

        # Save model to file (pickle)
        model_filename = f"{SAVE_PREFIX}_{name}_model.pkl"
        with open(model_filename, 'wb') as mf:
            pickle.dump(model, mf)

        # store results
        results_all[name] = {
            'n_candidates': n_cand,
            'hlo_score': float(hlo_score), 'hlo_time': int(hlo_time),
            'hc_score': float(hc_score), 'hc_time': int(hc_time),
            'final_eval': final_res,
            'selected_features': sel_features,
            'model_file': model_filename,
            'test_metrics': test_metrics
        }

        log(f"Saved trained CatBoost model for {name} -> {model_filename} (test_f1={test_f1:.4f})")

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)

    # Summary / save aggregated results
    print("==================== AGGREGATE SUMMARY ====================")
    print(f"PSO  -> opt_score={pso_score:.4f} selected={int(np.sum(pso_mask))} time={pso_time}s")
    print(f"GA   -> opt_score={ga_score:.4f} selected={int(np.sum(ga_mask))} time={ga_time}s")
    print(f"GWO  -> opt_score={gwo_score:.4f} selected={int(np.sum(gwo_mask))} time={gwo_time}s")
    print(f"Union candidates    : {int(np.sum(union_mask))}")
    print(f"Intersection candidates: {int(np.sum(inter_mask))}")
    print(f"Voting candidates   : {int(np.sum(vote_mask))}")
    print("-------------------------------------------------")

    for name, info in results_all.items():
        print(f"-- {name.upper()} SUMMARY --")
        if info.get('skipped'):
            print(" skipped (no candidates)")
            continue
        fe = info['final_eval']
        tm = info['test_metrics']
        print(f" Selected ({fe['n_features']}): {fe['features']}")
        print(f" CV F1   : {fe['f1_mean']:.4f} ± {fe['f1_std']:.4f}")
        print(f" Test F1 : {tm['f1']:.4f} (n_test={tm['n_test']})")
        print(f" Accuracy : {fe['acc_mean']:.4f} ± {fe['acc_std']:.4f}")
        print(f" Precision: {fe['prec_mean']:.4f} ± {fe['prec_std']:.4f}")
        print(f" Recall   : {fe['rec_mean']:.4f} ± {fe['rec_std']:.4f}")
        print(f" Model file: {info['model_file']}")



    # Save aggregated pipeline outputs
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask, "intersection_mask": inter_mask, "voting_mask": vote_mask,
        "results_all": results_all,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    log(f"Saved results to {SAVE_PREFIX}_results.pkl")
    log("===== PIPELINE COMPLETE =====")

[12:05:29] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====
[12:05:29] PSO START (swarm=15, iters=3, cv=2)
[12:08:35]  PSO iter 1/3 best_global=0.9992
[12:11:35]  PSO iter 2/3 best_global=0.9993
[12:14:35]  PSO iter 3/3 best_global=0.9993
[12:17:30] PSO DONE in 721s best_score=0.9993 selected=34
[12:17:30] PSO SELECTED FEATURES: ['Dst Port', 'Timestamp', 'Tot Fwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Pkts/s', 'Fwd IAT Mean', 'Fwd IAT Max', 'Bwd IAT Tot', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Len', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Mean', 'FIN Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Seg Size Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Src IP', 'Src Port']
[12:17:30] GA START (pop=30, gens=3, cv=2)
[12:23:29]  GA gen 1/3 current_best=

pca

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

X = df.drop("Label",axis=1)
y = df["Label"].astype(int)

# 1) Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2) PCA to retain 95% variance
pca = PCA(n_components=0.95)  
X_pca = pca.fit_transform(X_scaled)

print("Original dimension:", X.shape[1])
print("PCA dimension:", X_pca.shape[1])

# 3) Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.20, stratify=y, random_state=42
)

# 4) CatBoost
model = CatBoostClassifier(iterations=500,
                           learning_rate=0.05,
                           depth=6,
                           verbose=0)

# Fit
model.fit(X_train, y_train)

# Test prediction
y_pred = model.predict(X_test)

# Metrics
test_acc = accuracy_score(y_test, y_pred)
test_prec = precision_score(y_test, y_pred, zero_division=0)
test_rec = recall_score(y_test, y_pred, zero_division=0)
test_f1 = f1_score(y_test, y_pred, zero_division=0)

print("\n=== PCA MODEL RESULTS ===")
print("Test Accuracy :", test_acc)
print("Precision      :", test_prec)
print("Recall         :", test_rec)
print("F1 Score       :", test_f1)


Original dimension: 75
PCA dimension: 24

=== PCA MODEL RESULTS ===
Test Accuracy : 0.9941209549614027
Precision      : 0.9963223704949039
Recall         : 0.9916335494666387
F1 Score       : 0.9939724304208817


Chi sqaure 

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from catboost import CatBoostClassifier

# -------------------------
# 1. Prepare data
# -------------------------
TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

print("Original dimension:", X.shape[1])

# Train-test split (same style as your hybrid pipeline)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

# -------------------------
# 2. Scale to non-negative for chi-square
# -------------------------
scaler = MinMaxScaler()   # maps features to [0, 1]
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# 3. Try different numbers of selected features
# -------------------------
k_values = [5, 10, 15, 20, 30]

results_chi2 = {}

for k in k_values:
    print("\n" + "="*60)
    print(f"CHI-SQUARE + CatBoost with top-{k} features")
    print("="*60)

    # 3.1 Chi-Square feature selection
    selector = SelectKBest(score_func=chi2, k=k)
    X_train_k = selector.fit_transform(X_train_scaled, y_train)
    X_test_k = selector.transform(X_test_scaled)

    # Get selected feature names (from original X)
    selected_mask = selector.get_support()
    selected_features = X.columns[selected_mask].tolist()
    print(f"Selected {k} features:")
    print(selected_features)

    # 3.2 Train CatBoost on selected features
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )

    model.fit(X_train_k, y_train)

    # 3.3 Evaluate on test set
    y_pred = model.predict(X_test_k)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Test Accuracy : {acc:.6f}")
    print(f"Precision     : {prec:.6f}")
    print(f"Recall        : {rec:.6f}")
    print(f"F1 Score      : {f1:.6f}")

    # store results for later comparison
    results_chi2[k] = {
        "features": selected_features,
        "acc": acc,
        "prec": prec,
        "rec": rec,
        "f1": f1
    }

print("\n=========== SUMMARY: Chi-Square + CatBoost ===========")
for k, info in results_chi2.items():
    print(f"Top-{k} features -> F1={info['f1']:.6f}, Acc={info['acc']:.6f}")


Original dimension: 75

CHI-SQUARE + CatBoost with top-5 features
Selected 5 features:
['Dst Port', 'Timestamp', 'Bwd Pkts/s', 'Init Bwd Win Byts', 'Fwd Seg Size Min']
Test Accuracy : 0.999131
Precision     : 0.999686
Recall        : 0.998536
F1 Score      : 0.999111

CHI-SQUARE + CatBoost with top-10 features
Selected 10 features:
['Dst Port', 'Timestamp', 'Bwd Pkt Len Min', 'Flow Pkts/s', 'Bwd IAT Min', 'Bwd Pkts/s', 'ACK Flag Cnt', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min']
Test Accuracy : 0.999080
Precision     : 0.999581
Recall        : 0.998536
F1 Score      : 0.999058

CHI-SQUARE + CatBoost with top-15 features
Selected 15 features:
['Dst Port', 'Timestamp', 'Fwd Pkt Len Max', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Flow Pkts/s', 'Bwd IAT Mean', 'Bwd IAT Min', 'Bwd Pkts/s', 'ACK Flag Cnt', 'Bwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min']
Test Accuracy : 0.999233
Precision     : 0.999791
Recall        : 0.9986

Mutual information

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from catboost import CatBoostClassifier

# -------------------------
# 1. Prepare data
# -------------------------
TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

print("Original dimension:", X.shape[1])

# Train-test split (keep same style as other experiments)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

# -------------------------
# 2. Try different numbers of selected features
# -------------------------
k_values = [5, 10, 15, 20, 30]

results_mi = {}

for k in k_values:
    print("\n" + "="*60)
    print(f"MUTUAL INFORMATION + CatBoost with top-{k} features")
    print("="*60)

    # 2.1 Mutual Information feature selection
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_train_k = selector.fit_transform(X_train, y_train)
    X_test_k = selector.transform(X_test)

    # Get selected feature names (from original X)
    selected_mask = selector.get_support()
    selected_features = X.columns[selected_mask].tolist()
    print(f"Selected {k} features:")
    print(selected_features)

    # 2.2 Train CatBoost on selected features
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )

    model.fit(X_train_k, y_train)

    # 2.3 Evaluate on test set
    y_pred = model.predict(X_test_k)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Test Accuracy : {acc:.6f}")
    print(f"Precision     : {prec:.6f}")
    print(f"Recall        : {rec:.6f}")
    print(f"F1 Score      : {f1:.6f}")

    # store results for later comparison
    results_mi[k] = {
        "features": selected_features,
        "acc": acc,
        "prec": prec,
        "rec": rec,
        "f1": f1
    }

print("\n=========== SUMMARY: Mutual Information + CatBoost ===========")
for k, info in results_mi.items():
    print(f"Top-{k} features -> F1={info['f1']:.6f}, Acc={info['acc']:.6f}")


Original dimension: 75

MUTUAL INFORMATION + CatBoost with top-5 features
Selected 5 features:
['Dst Port', 'Timestamp', 'Fwd Header Len', 'Pkt Len Max', 'Init Fwd Win Byts']
Test Accuracy : 0.999284
Precision     : 0.999895
Recall        : 0.998640
F1 Score      : 0.999267

MUTUAL INFORMATION + CatBoost with top-10 features
Selected 10 features:
['Dst Port', 'Timestamp', 'TotLen Fwd Pkts', 'Fwd Header Len', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'Subflow Fwd Byts', 'Init Fwd Win Byts']
Test Accuracy : 0.999131
Precision     : 0.999895
Recall        : 0.998327
F1 Score      : 0.999110

MUTUAL INFORMATION + CatBoost with top-15 features
Selected 15 features:
['Dst Port', 'Timestamp', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Flow IAT Max', 'Fwd Header Len', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Subflow Fwd Byts', 'Subflow Bwd Byts', 'Init Fwd Win Byts']
Test Accuracy : 0.999029
Precision     : 0.999686
Recall

Recursive feature elimination

In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from catboost import CatBoostClassifier

# -------------------------
# 1. Prepare data
# -------------------------
TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

print("Original dimension:", X.shape[1])

# Train-test split (same style as other baselines)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

# -------------------------
# 2. Try different numbers of selected features with RFE
# -------------------------
k_values = [5, 10, 15, 20, 30]

results_rfe = {}

for k in k_values:
    print("\n" + "="*60)
    print(f"RFE (CatBoost) with top-{k} features")
    print("="*60)

    # Base estimator for RFE (reduced iterations to save time)
    base_model = CatBoostClassifier(
        iterations=200,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )

    # 2.1 RFE setup
    selector = RFE(
        estimator=base_model,
        n_features_to_select=k,
        step=1
    )

    # Fit RFE on training data
    selector.fit(X_train, y_train)

    # Transform train and test sets
    X_train_k = selector.transform(X_train)
    X_test_k = selector.transform(X_test)

    # Get selected feature names
    selected_mask = selector.get_support()
    selected_features = X.columns[selected_mask].tolist()
    print(f"Selected {k} features:")
    print(selected_features)

    # 2.2 Train a fresh CatBoost on the selected features (for fair comparison)
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )

    model.fit(X_train_k, y_train)

    # 2.3 Evaluate on test set
    y_pred = model.predict(X_test_k)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Test Accuracy : {acc:.6f}")
    print(f"Precision     : {prec:.6f}")
    print(f"Recall        : {rec:.6f}")
    print(f"F1 Score      : {f1:.6f}")

    # store results for later comparison
    results_rfe[k] = {
        "features": selected_features,
        "acc": acc,
        "prec": prec,
        "rec": rec,
        "f1": f1
    }

print("\n=========== SUMMARY: RFE (CatBoost) ===========")
for k, info in results_rfe.items():
    print(f"Top-{k} features -> F1={info['f1']:.6f}, Acc={info['acc']:.6f}")


Original dimension: 75

RFE (CatBoost) with top-5 features
Selected 5 features:
['Dst Port', 'Timestamp', 'Flow IAT Mean', 'Fwd IAT Mean', 'Init Bwd Win Byts']
Test Accuracy : 0.999335
Precision     : 0.999686
Recall        : 0.998954
F1 Score      : 0.999320

RFE (CatBoost) with top-10 features
Selected 10 features:
['Dst Port', 'Timestamp', 'Bwd Pkt Len Min', 'Flow Pkts/s', 'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min']
Test Accuracy : 0.999029
Precision     : 0.999477
Recall        : 0.998536
F1 Score      : 0.999006

RFE (CatBoost) with top-15 features
Selected 15 features:
['Dst Port', 'Timestamp', 'Flow Duration', 'Bwd Pkt Len Min', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd Pkts/s', 'ECE Flag Cnt', 'Bwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min']
Test Accuracy : 0.999131
Precision     : 0.999477
Recall        : 0.998745
F1 Score      : 0.9

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd

# Load your dataset
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

X = df.drop("Label",axis=1)
y = df["Label"].astype(int)

feature_names = X.columns.tolist()
num_features = X.shape[1]

# Train RF to get feature importance (100 trees is enough)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Importance list (sort descending)
importances = rf.feature_importances_
rank_idx = np.argsort(importances)[::-1]  # descending order
sorted_features = [feature_names[i] for i in rank_idx]

print("Original dimension:", num_features)

# Evaluate CatBoost using varying top-k selected features
K_values = [5, 10, 15, 20, 30]
results = {}

print("\n============================================================")
print("RANDOM FOREST IMPORTANCE + CATBOOST BASELINE")
print("============================================================")

for k in K_values:
    top_k_features = sorted_features[:k]
    X_sel = X[top_k_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y, test_size=0.20, stratify=y, random_state=42
    )

    model = CatBoostClassifier(iterations=500, learning_rate=0.05, depth=6, verbose=0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred, zero_division=0)
    test_rec = recall_score(y_test, y_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n============================================================")
    print(f"RF Importance + CatBoost with top-{k} features")
    print("============================================================")
    print("Selected", k, "features:")
    print(top_k_features)
    print(f"Test Accuracy : {test_acc:.8f}")
    print(f"Precision     : {test_prec:.8f}")
    print(f"Recall        : {test_rec:.8f}")
    print(f"F1 Score      : {test_f1:.8f}")

    results[k] = test_f1

print("\n=========== SUMMARY: Random Forest Importance + CatBoost ===========")
for k in K_values:
    print(f"Top-{k} features -> F1={results[k]:.8f}")


Original dimension: 75

RANDOM FOREST IMPORTANCE + CATBOOST BASELINE

RF Importance + CatBoost with top-5 features
Selected 5 features:
['Timestamp', 'Fwd Seg Size Min', 'Dst Port', 'Init Fwd Win Byts', 'Init Bwd Win Byts']
Test Accuracy : 0.99913092
Precision     : 0.99958128
Recall        : 0.99864045
F1 Score      : 0.99911065

RF Importance + CatBoost with top-10 features
Selected 10 features:
['Timestamp', 'Fwd Seg Size Min', 'Dst Port', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Pkt Len Max', 'TotLen Fwd Pkts', 'Fwd Header Len', 'Bwd Pkt Len Mean', 'Fwd Pkt Len Mean']
Test Accuracy : 0.99918205
Precision     : 0.99989527
Recall        : 0.99843129
F1 Score      : 0.99916274

RF Importance + CatBoost with top-15 features
Selected 15 features:
['Timestamp', 'Fwd Seg Size Min', 'Dst Port', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Pkt Len Max', 'TotLen Fwd Pkts', 'Fwd Header Len', 'Bwd Pkt Len Mean', 'Fwd Pkt Len Mean', 'Fwd Pkts/s', 'Pkt Len Max', 'Bwd Seg Size Avg', 'Flow

L1 Lasso Feature Selection + CatBoost

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd

# Load dataset
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

X = df.drop("Label", axis=1)
y = df["Label"].astype(int)
feature_names = X.columns.tolist()

print("Original dimension:", X.shape[1])

# Train Logistic regression with L1 penalty to select features
log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=0.3, max_iter=2000)
log_reg.fit(X, y)

# Get non-zero coefficients
coef = log_reg.coef_[0]
selected_idx = np.where(coef != 0)[0]
selected_features = [feature_names[i] for i in selected_idx]

print("\nSelected features using L1 (Lasso):")
print(selected_features)
print("Total selected:", len(selected_features))

# ----- CatBoost on selected features -----

X_sel = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_sel, y, test_size=0.20, stratify=y, random_state=42
)

model = CatBoostClassifier(iterations=500, learning_rate=0.05, depth=6, verbose=0)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_prec = precision_score(y_test, y_pred, zero_division=0)
test_rec = recall_score(y_test, y_pred, zero_division=0)
test_f1 = f1_score(y_test, y_pred, zero_division=0)

print("\n===== L1 (Lasso) + CatBoost Results =====")
print(f"Test Accuracy : {test_acc:.8f}")
print(f"Precision     : {test_prec:.8f}")
print(f"Recall        : {test_rec:.8f}")
print(f"F1 Score      : {test_f1:.8f}")


Original dimension: 75

Selected features using L1 (Lasso):
['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Flow Byts/s', 'Flow IAT Mean', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Bwd IAT Tot', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'ECE Flag Cnt', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Subflow Fwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min', 'Active Max', 'Idle Max', 'Flow ID', 'Src IP', 'Src Port']
Total selected: 44

===== L1 (Lasso) + CatBoost Results =====
Test Accuracy : 0.99928429
Precision     : 0.99989529
Recall        : 0.99864045
F1 Score      : 0.99926748


In [9]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from catboost import CatBoostClassifier

# -------------------------
# 1. Load data
# -------------------------
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

feature_names = X.columns.tolist()
print("Original dimension:", X.shape[1])

# -------------------------
# 2. Train CatBoost on ALL features to get importance
# -------------------------
base_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    verbose=0,
    random_seed=42
)

base_model.fit(X, y)

# Get feature importances from CatBoost
importances = base_model.get_feature_importance()
# Sort indices by importance (descending)
sorted_idx = np.argsort(importances)[::-1]
sorted_features = [feature_names[i] for i in sorted_idx]

print("\nTop 20 features by CatBoost importance:")
for i in range(20):
    print(f"{i+1:2d}. {sorted_features[i]}  (importance={importances[sorted_idx[i]]:.6f})")

# -------------------------
# 3. Evaluate CatBoost using top-k important features
# -------------------------
K_values = [5, 10, 15, 20, 30]
results_cb_imp = {}

print("\n============================================================")
print("CATBOOST FEATURE IMPORTANCE + CATBOOST BASELINE")
print("============================================================")

for k in K_values:
    top_k_features = sorted_features[:k]
    X_sel = X[top_k_features]

    # Same train-test strategy as other baselines
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y,
        test_size=0.20,
        stratify=y,
        random_state=42
    )

    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n============================================================")
    print(f"CatBoost-Importance + CatBoost with top-{k} features")
    print("============================================================")
    print(f"Selected {k} features:")
    print(top_k_features)
    print(f"Test Accuracy : {acc:.8f}")
    print(f"Precision     : {prec:.8f}")
    print(f"Recall        : {rec:.8f}")
    print(f"F1 Score      : {f1:.8f}")

    results_cb_imp[k] = {
        "features": top_k_features,
        "acc": acc,
        "prec": prec,
        "rec": rec,
        "f1": f1
    }

print("\n=========== SUMMARY: CatBoost Importance + CatBoost ===========")
for k, info in results_cb_imp.items():
    print(f"Top-{k} features -> F1={info['f1']:.8f}, Acc={info['acc']:.8f}")


Original dimension: 75

Top 20 features by CatBoost importance:
 1. Timestamp  (importance=71.969810)
 2. Dst Port  (importance=16.404052)
 3. Flow IAT Min  (importance=1.128789)
 4. Init Fwd Win Byts  (importance=0.973730)
 5. Bwd Pkt Len Min  (importance=0.850694)
 6. Fwd Seg Size Min  (importance=0.802357)
 7. Fwd Pkts/s  (importance=0.709243)
 8. Fwd IAT Min  (importance=0.548268)
 9. RST Flag Cnt  (importance=0.462437)
10. Flow Pkts/s  (importance=0.458003)
11. Flow IAT Mean  (importance=0.453412)
12. Init Bwd Win Byts  (importance=0.393335)
13. Fwd IAT Mean  (importance=0.331477)
14. Fwd IAT Max  (importance=0.300318)
15. Bwd Pkt Len Mean  (importance=0.293593)
16. Bwd Pkt Len Max  (importance=0.284212)
17. Bwd Seg Size Avg  (importance=0.273176)
18. Fwd IAT Tot  (importance=0.258784)
19. ECE Flag Cnt  (importance=0.247212)
20. Flow IAT Max  (importance=0.243708)

CATBOOST FEATURE IMPORTANCE + CATBOOST BASELINE

CatBoost-Importance + CatBoost with top-5 features
Selected 5 featur

In [4]:
#VOTING FOR 20 ITERATIONS


# hybrid_voting_hlo_ddos_pipeline.py
# Single-file: PSO + GA + GWO -> VOTING -> HLO -> Hill-climb -> Final CatBoost
# Option A: optimization subset = 3000 rows (1500 benign + 1500 attack)

import kagglehub
import glob, os, time, pickle, warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.base import clone
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# -----------------------
# USER CONFIG
# -----------------------
DATA_PATH = kagglehub.dataset_download("sizlingdhairya1/cicddos2019")  # your loader
OPT_SUBSET_PER_CLASS = 1500     # 1500 benign + 1500 attack => 3000 rows
PSO_SWARM = 8
PSO_ITERS = 20
GA_POP = 12
GA_GENS = 20
GWO_WOLVES = 8
GWO_ITERS = 20
HLO_POP = 8
HLO_ITERS = 10
FIT_CB_ITERS_OPT = 80    # CatBoost iterations used inside fitness (fast)
CV_OPT = 2               # cheap CV inside optimizer
FINAL_CB_ITERS = 1000    # final model iterations (early stopping used)
FINAL_EARLY_STOP = 50
SAVE_PREFIX = "ddos_hybrid_voting_hlo"
LEAKAGE_SINGLE_FEATURE_THRESHOLD = 0.99999  # single-feature accuracy threshold to treat as leakage

print("DATA_PATH:", DATA_PATH)

# -----------------------
# 1) load all CSVs from dataset path
# -----------------------
csv_files = glob.glob(os.path.join(DATA_PATH, "*.csv"))
if len(csv_files) == 0:
    raise RuntimeError("No CSV files found in DATA_PATH: " + DATA_PATH)

print(f"Found {len(csv_files)} CSV files. Loading & merging (may take a bit)...")
dfs = []
for f in csv_files:
    print(" ->", os.path.basename(f))
    dfs.append(pd.read_csv(f, low_memory=False))
df = pd.concat(dfs, ignore_index=True)
print("Merged dataset shape:", df.shape)

# -----------------------
# 2) normalize columns and find label
# -----------------------
df.columns = df.columns.str.strip()
TARGET_CANDIDATES = ["Label", "label", " Attack", "attack_cat", "Label "]
found_label = None
for c in ["Label", "label", "Attack", "attack", "attack_cat"]:
    if c in df.columns:
        found_label = c
        break
if found_label is None:
    # try case-insensitive lookup
    for c in df.columns:
        if c.strip().lower() == "label" or c.strip().lower() == "attack":
            found_label = c
            break
if found_label is None:
    raise RuntimeError("Cannot find label column. Columns available: " + ", ".join(df.columns[:30]))

# normalize label column name
df.rename(columns={found_label: "Label"}, inplace=True)
print("Using target column 'Label' (original: {})".format(found_label))

# -----------------------
# 3) keep only rows with non-null label and make binary target
# -----------------------
df = df[df["Label"].notna()].copy()
df["Label"] = df["Label"].astype(str).str.strip().str.lower()
# convert benign -> 0 else -> 1 (attack)
df["Label"] = df["Label"].apply(lambda x: 0 if x == "benign" else 1)
print("Label counts (full):\n", df["Label"].value_counts())

# -----------------------
# 4) drop obviously leaking columns if present (IDs, IPs, timestamps)
# -----------------------
possible_leak_cols = [c for c in df.columns if c.strip().lower() in (
    "id", "flow id", "flowid", "timestamp", "ts", "source ip", "destination ip",
    "src ip", "dst ip", "sourceip", "destinationip", "srcip", "dstip")]
if possible_leak_cols:
    print("Dropping likely-leakage columns (ids/timestamps/ips):", possible_leak_cols)
    df.drop(columns=[c for c in possible_leak_cols if c in df.columns], inplace=True)

# -----------------------
# 5) basic cleaning: drop all-empty columns, replace inf, drop rows with NaN
# -----------------------
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="any", inplace=True)   # safe because we'll use full dataset for final, but optimizer needs no missing
print("After basic cleaning:", df.shape)

# -----------------------
# 6) create balanced small subset for optimization: OPT_SUBSET_PER_CLASS * 2 rows
# -----------------------
counts = df["Label"].value_counts().to_dict()
n_attack = counts.get(1, 0)
n_benign = counts.get(0, 0)
take_attack = min(OPT_SUBSET_PER_CLASS, n_attack)
take_benign = min(OPT_SUBSET_PER_CLASS, n_benign)
if take_attack < 10 or take_benign < 10:
    raise RuntimeError("Not enough rows in one class to form the optimization subset. counts=" + str(counts))

df_attack = df[df["Label"] == 1].sample(take_attack, random_state=42)
df_benign = df[df["Label"] == 0].sample(take_benign, random_state=42)
df_sub = pd.concat([df_attack, df_benign], ignore_index=True).sample(frac=1.0, random_state=42).reset_index(drop=True)
print("Optimization subset shape:", df_sub.shape, "Label counts:", df_sub["Label"].value_counts().to_dict())

# -----------------------
# 7) preprocess subset: encode categorical & scale numeric
# -----------------------
TARGET_COL = "Label"
X_sub = df_sub.drop(columns=[TARGET_COL]).copy()
y_sub = df_sub[TARGET_COL].astype(int).copy()

# encode object columns
obj_cols = X_sub.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    X_sub[c] = LabelEncoder().fit_transform(X_sub[c].astype(str))
# numeric scaling
num_cols_sub = X_sub.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols_sub) > 0:
    X_sub[num_cols_sub] = MinMaxScaler().fit_transform(X_sub[num_cols_sub])

FEATURE_NAMES = X_sub.columns.tolist()
N_FEATURES = len(FEATURE_NAMES)
print("Subset features:", N_FEATURES)

# -----------------------
# 8) CatBoost factory & fitness function with caching
# -----------------------
def get_catboost_model(iterations=FIT_CB_ITERS_OPT):
    return CatBoostClassifier(iterations=iterations, learning_rate=0.05, depth=6,
                              verbose=0, random_seed=42)

fitness_cache = {}
def evaluate_mask(mask_bool, cv=CV_OPT, cb_iter=FIT_CB_ITERS_OPT):
    key = tuple(int(x) for x in mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    idxs = [i for i,b in enumerate(key) if b==1]
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0
    Xsel = X_sub.iloc[:, idxs]
    model = get_catboost_model(iterations=cb_iter)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(clone(model), Xsel, y_sub, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
    except Exception as e:
        # if CatBoost fails (e.g. unexpected types), return 0
        fitness_cache[key] = 0.0
        return 0.0
    val = float(np.mean(scores))
    fitness_cache[key] = val
    return val

# -----------------------
# 9) PSO (binary) - reduced
# -----------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    print("[PSO] start: swarm", swarm_size, "iters", iters)
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim))
    vel = np.random.uniform(-1,1,(swarm_size,dim))
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p) for p in pbest])
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]
    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        print(" PSO iter", t+1, "/", iters, "best", gbest_score)
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)
            sc = evaluate_mask(pos[i])
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy(); pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy(); gbest_score = sc
        w = max(0.2, w*0.97)
    print("[PSO] done best score", gbest_score, "selected", int(np.sum(gbest)))
    return gbest

# -----------------------
# 10) GA (binary) - reduced
# -----------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS):
    print("[GA] start: pop", pop_size, "gens", gens)
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    for g in range(gens):
        print(" GA gen", g+1, "/", gens, "best", fitnesses.max())
        elite_idxs = np.argsort(fitnesses)[-2:]
        new_pop = [pop[elite_idxs[0]].copy(), pop[elite_idxs[1]].copy()]
        while len(new_pop) < pop_size:
            p1 = pop[np.random.randint(pop_size)].copy()
            p2 = pop[np.random.randint(pop_size)].copy()
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                child = np.concatenate([p1[:pt], p2[pt:]])
            else:
                child = p1
            # mutation
            for d in range(dim):
                if np.random.rand() < 0.05:
                    child[d] = 1-child[d]
            new_pop.append(child)
        pop = np.array(new_pop[:pop_size])
        fitnesses = np.array([evaluate_mask(ind) for ind in pop])
    best = pop[np.argmax(fitnesses)]
    print("[GA] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 11) GWO (binary) - reduced
# -----------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS):
    print("[GWO] start: wolves", wolves, "iters", iters)
    dim = N_FEATURES
    pack = np.random.randint(0,2,(wolves, dim))
    fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0
    for itr in range(iters):
        print(" GWO iter", itr+1, "/", iters, "best", Alpha_score)
        # update alpha/beta/delta
        for i in range(wolves):
            sc = fitnesses[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pack[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pack[i].copy()
            elif sc > Delta_score:
                Delta_score = sc; Delta = pack[i].copy()
        a = 2 - itr*(2.0/iters)
        for i in range(wolves):
            if Alpha is None:
                continue
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2*a*r1 - a; C1 = 2*r2
                D_alpha = abs(C1*Alpha[d] - pack[i][d])
                X1 = Alpha[d] - A1*D_alpha
                # use X1 approx only (keeps it simple + fast)
                s = 1.0/(1.0+np.exp(-X1))
                pack[i][d] = 1 if np.random.rand() < s else 0
        fitnesses = np.array([evaluate_mask(ind) for ind in pack])
    best = pack[np.argmax(fitnesses)]
    print("[GWO] done best score", fitnesses.max(), "selected", int(np.sum(best)))
    return best

# -----------------------
# 12) RUN OPTIMIZERS (PSO, GA, GWO)
# -----------------------
t0 = time.time()
mask_pso = run_pso()
mask_ga = run_ga()
mask_gwo = run_gwo()
t1 = time.time()
print("Optimizers finished in", int(t1-t0), "s")

# Save raw masks
os.makedirs("outputs", exist_ok=True)
pickle.dump({"mask_pso": mask_pso.tolist(), "mask_ga": mask_ga.tolist(), "mask_gwo": mask_gwo.tolist()}, open(os.path.join("outputs", SAVE_PREFIX + "_raw_masks.pkl"), "wb"))


# -----------------------
# 13) INTERSECTION of PSO, GA, GWO (strict intersection — NO fallback)
# -----------------------
mask_pso_arr = np.array(mask_pso).astype(int)
mask_ga_arr  = np.array(mask_ga).astype(int)
mask_gwo_arr = np.array(mask_gwo).astype(int)

# strict intersection: feature must be chosen by ALL three optimizers
intersection_mask = (mask_pso_arr & mask_ga_arr & mask_gwo_arr).astype(int)
selected_indices = list(np.where(intersection_mask == 1)[0])
selected_features_intersection = [FEATURE_NAMES[i] for i in selected_indices]

print("Intersection selected features count:", len(selected_indices))
print("Intersection selected features:", selected_features_intersection)

# Save intersection mask
pickle.dump(
    {"intersection_mask": intersection_mask.tolist(), "selected_features_intersection": selected_features_intersection},
    open(os.path.join("outputs", SAVE_PREFIX + "_intersection.pkl"), "wb")
)

# -----------------------
# 14) HLO on candidate set (candidates = intersection selected)
# -----------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(cand_idxs)
    if k == 0:
        raise RuntimeError("No candidates for HLO (intersection is empty)")
    print("[HLO] start on", k, "candidates")
    pop = np.random.randint(0,2,(pop_size, k))
    def fitness_local(bitmask):
        full = np.zeros(N_FEATURES, dtype=int)
        for j,b in enumerate(bitmask):
            if int(b)==1:
                full[cand_idxs[j]] = 1
        return evaluate_mask(full)
    fitness_scores = np.array([fitness_local(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    for it in range(iters):
        print(" HLO iter", it+1, "/", iters, "best", best_score)
        teacher = pop[int(np.argmax([fitness_local(x) for x in pop]))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching
            for d in range(k):
                if np.random.rand() < 0.75:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < 0.12:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_local(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()
    final_full = np.zeros(N_FEATURES, dtype=int)
    for j,b in enumerate(best_solution):
        if int(b)==1:
            final_full[cand_idxs[j]] = 1
    print("[HLO] done best local score", best_score, "selected", int(final_full.sum()))
    return final_full, best_score

# call HLO using the strict intersection mask
hlo_mask, hlo_score = hlo_on_candidates(intersection_mask)
pickle.dump({"hlo_mask": hlo_mask.tolist(), "hlo_score": hlo_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hlo.pkl"), "wb"))

# -----------------------
# 15) Greedy hill-climb restricted to candidate indices (use intersection_mask)
# -----------------------
def hill_climb(initial_mask, candidate_mask, max_steps=100, eval_cap=500):
    cand_idxs = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    cur = initial_mask.copy()
    cur_score = evaluate_mask(cur)
    steps = 0
    evals = 0
    improved = True
    print("[HC] start: candidates", len(cand_idxs))
    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(cand_idxs):
            trial = cur.copy()
            trial[idx] = 1 - trial[idx]
            sc = evaluate_mask(trial)
            evals += 1
            if sc > cur_score + 1e-8:
                cur = trial
                cur_score = sc
                improved = True
                steps += 1
                print(f" HC step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score {cur_score:.4f} (evals={evals})")
                break
    print("[HC] done steps", steps, "evals", evals, "final_score", cur_score, "selected", int(cur.sum()))
    return cur, cur_score

hc_mask, hc_score = hill_climb(hlo_mask, intersection_mask)
pickle.dump({"hc_mask": hc_mask.tolist(), "hc_score": hc_score},
            open(os.path.join("outputs", SAVE_PREFIX + "_hc.pkl"), "wb"))



# -----------------------
# 16) Selected features after hill-climb (final_mask)
# -----------------------
final_mask = hc_mask
final_selected_indices = np.where(np.array(final_mask).astype(bool))[0].tolist()
final_selected = [FEATURE_NAMES[i] for i in final_selected_indices]
print("Final selected features:", final_selected, "count:", len(final_selected))

# -----------------------
# 17) Leakage check: drop single-feature perfect predictors
# -----------------------
def single_feature_predictive_accuracy(feature_series, labels):
    # map each feature value to most common label for that value, compute accuracy
    mapping = feature_series.groupby(feature_series).apply(lambda s: labels[s.index].mode().iloc[0])
    preds = feature_series.map(mapping)
    return (preds.values == labels.values).mean()

# check each final feature; if single-feature accuracy >= threshold, drop it
to_drop = []
for f in final_selected:
    acc = single_feature_predictive_accuracy(X_sub[f], y_sub)
    if acc >= LEAKAGE_SINGLE_FEATURE_THRESHOLD or acc == 1.0:
        print(f"Leakage-suspect feature '{f}' single-feature accuracy={acc:.6f} -> will drop")
        to_drop.append(f)

if to_drop:
    final_selected = [f for f in final_selected if f not in to_drop]
    final_selected_indices = [FEATURE_NAMES.index(f) for f in final_selected]
    print("After dropping leakage suspects, final features:", final_selected)

if len(final_selected) == 0:
    raise RuntimeError("No safe features remain after leakage check. Consider lowering threshold or manual check.")

# Save final selected features
pickle.dump({"final_selected": final_selected, "final_mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_selected.pkl"), "wb"))

# -----------------------
# 18) Prepare FULL dataset with same preprocessing for final training
# -----------------------
# Reuse df (full merged) earlier but ensure the same preprocessing as subset
df_full = df.copy()
# Already dropped leak columns earlier and trimmed nulls; ensure same features exist
missing_in_full = [f for f in final_selected if f not in df_full.columns]
if missing_in_full:
    raise RuntimeError("Selected features missing from full dataset: " + str(missing_in_full))

# Keep only final selected + label
df_full = df_full[final_selected + ["Label"]].copy()

# Convert object columns to numeric (LabelEncode) and fill NaN
for c in df_full.columns:
    if c != "Label" and df_full[c].dtype == "object":
        df_full[c] = LabelEncoder().fit_transform(df_full[c].astype(str))
df_full.replace([np.inf, -np.inf], np.nan, inplace=True)
df_full.fillna(0, inplace=True)

# Scale numeric columns (MinMax) using full data
num_cols = [c for c in final_selected if pd.api.types.is_numeric_dtype(df_full[c])]
if len(num_cols) > 0:
    df_full[num_cols] = MinMaxScaler().fit_transform(df_full[num_cols])

X_full = df_full.drop(columns=["Label"])
y_full = df_full["Label"].astype(int)
print("Full final training shape:", X_full.shape, "Label dist:", y_full.value_counts().to_dict())

# -----------------------
# 19) Final train/test split (80/20 stratified) and final CatBoost training with regularization + early stopping
# -----------------------
minclass = y_full.value_counts().min()
if minclass < 10:
    print("Warning: small class size after selecting features:", minclass)

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

final_params = {
    "iterations": FINAL_CB_ITERS,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 7.0,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_strength": 1.0,
    "verbose": 50,
    "random_seed": 42
}
final_model = CatBoostClassifier(**final_params)

print("Training final model on full data with early stopping...")
final_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=FINAL_EARLY_STOP, use_best_model=True)

# Evaluate on hold-out test
y_pred = final_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print("\n=== FINAL HOLDOUT METRICS ===")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Quick 5-fold CV estimate (fast: reduced iters)
cv_model = CatBoostClassifier(iterations=200, learning_rate=0.03, depth=6, l2_leaf_reg=7.0,
                              bootstrap_type="Bernoulli", subsample=0.8, random_seed=42, verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring="accuracy", n_jobs=-1)
f1s = cross_val_score(cv_model, X_full, y_full, cv=skf, scoring=make_scorer(f1_score), n_jobs=-1)
print("\n5-fold CV (quick estimate) -> Accuracy: %.4f ± %.4f ; F1: %.4f ± %.4f" % (accs.mean(), accs.std(), f1s.mean(), f1s.std()))

# -----------------------
# 20) Save final model & selected features
# -----------------------
pickle.dump({"model": final_model, "features": final_selected, "mask": final_mask.tolist()},
            open(os.path.join("outputs", SAVE_PREFIX + "_final_model.pkl"), "wb"))
print("Saved final model + features -> outputs/{}_final_model.pkl".format(SAVE_PREFIX))

print("PIPELINE COMPLETE")

DATA_PATH: /kaggle/input/cicddos2019
Found 1 CSV files. Loading & merging (may take a bit)...
 -> Random_combine_final.csv
Merged dataset shape: (300000, 88)
Using target column 'Label' (original: Label)
Label counts (full):
 Label
1    299513
0       487
Name: count, dtype: int64
Dropping likely-leakage columns (ids/timestamps/ips): ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
After basic cleaning: (290753, 84)
Optimization subset shape: (1980, 84) Label counts: {1: 1500, 0: 480}
Subset features: 83
[PSO] start: swarm 8 iters 20
 PSO iter 1 / 20 best 0.9993342210386151
 PSO iter 2 / 20 best 0.9993342210386151
 PSO iter 3 / 20 best 0.9993342210386151
 PSO iter 4 / 20 best 0.9993342210386151
 PSO iter 5 / 20 best 0.9993342210386151
 PSO iter 6 / 20 best 0.9993342210386151
 PSO iter 7 / 20 best 1.0
 PSO iter 8 / 20 best 1.0
 PSO iter 9 / 20 best 1.0
 PSO iter 10 / 20 best 1.0
 PSO iter 11 / 20 best 1.0
 PSO iter 12 / 20 best 1.0
 PSO iter 13 / 20 best 1.0
 PSO iter 14 / 20 bes

ALL MODELS RUN CHECK

1. PSO + XGBOOST

In [10]:
import time
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# ================== USER SETTINGS ==================
DATA_PATH = "/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv"
TARGET_COL = "Label"

PSO_SWARM = 20       # swarm size
PSO_ITERS = 10       # *** you asked for 10 iterations ***
CV_FOLDS = 3         # CV folds for fitness (can increase if you want)
TEST_SIZE = 0.2      # final 80/20 split for evaluation
RANDOM_STATE = 42
# ===================================================

# ---------- Load data ----------
df = pd.read_csv(DATA_PATH)
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

print(f"Loaded data: {df.shape}, features={N_FEATURES}")

# ---------- XGBoost model factory ----------
def get_xgb_model():
    return XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

# ---------- Fitness cache ----------
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask(mask_bool):
    """
    Evaluate a feature mask using XGBoost with CV.
    Fitness = mean of (accuracy, precision, recall, f1).
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]

    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, idxs]

    model = get_xgb_model()
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(model, X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(model, X_sel, y, cv=skf,
                            scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(model, X_sel, y, cv=skf,
                           scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(model, X_sel, y, cv=skf,
                          scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# =============== PSO (binary) for feature selection ===============
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={CV_FOLDS})")
    t0 = time.time()
    dim = N_FEATURES

    # Initialize positions (0/1) and velocities
    pos = np.random.randint(0, 2, (swarm_size, dim)).astype(int)
    vel = np.random.uniform(-1, 1, (swarm_size, dim))

    # Personal bests
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p.astype(bool)) for p in pos])

    # Global best
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6
    c1 = 1.5
    c2 = 1.5

    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.8f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)

            vel[i] = (
                w * vel[i]
                + c1 * r1 * (pbest[i] - pos[i])
                + c2 * r2 * (gbest - pos[i])
            )
            # Sigmoid + sampling to get binary position
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask(pos[i].astype(bool))
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc

        # inertia decay
        w = max(0.2, w * 0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()

    log(f"PSO DONE in {int(t1 - t0)}s best_score={best_score:.8f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES ({int(np.sum(best_mask))}): {mask_to_features(best_mask)}")
    return best_mask, best_score, int(t1 - t0)

# =============== MAIN: PSO + XGBoost Final Model ===============
if __name__ == "__main__":
    total_t0 = time.time()

    # 1) Run PSO for 10 iterations to select features
    best_mask, best_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS)

    selected_indices = np.where(best_mask.astype(bool))[0].tolist()
    selected_features = [FEATURE_NAMES[i] for i in selected_indices]

    print("\n============= FINAL PSO SELECTION =============")
    print(f"Number of selected features: {len(selected_features)}")
    print("Selected features:")
    print(selected_features)
    print(f"Best PSO fitness score (CV-based): {best_score:.8f}")

    # 2) Train/Test split on selected features
    X_sel = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y,
        test_size=TEST_SIZE,
        stratify=y,
        random_state=RANDOM_STATE
    )

    # 3) Final XGBoost on selected features
    final_model = get_xgb_model()
    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred, zero_division=0)
    test_rec = recall_score(y_test, y_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_pred, zero_division=0)

    print("\n============= FINAL XGBOOST RESULTS (PSO-Selected Features) =============")
    print(f"Test Accuracy : {test_acc:.8f}")
    print(f"Precision     : {test_prec:.8f}")
    print(f"Recall        : {test_rec:.8f}")
    print(f"F1 Score      : {test_f1:.8f}")

    total_t1 = time.time()
    print(f"\nTotal pipeline time: {int(total_t1 - total_t0)} seconds")


Loaded data: (97802, 76), features=75
[13:29:56] PSO START (swarm=20, iters=10, cv=3)
[13:35:35]  PSO iter 1/10 best_global=0.99929282
[13:41:14]  PSO iter 2/10 best_global=0.99934475
[13:46:47]  PSO iter 3/10 best_global=0.99934477
[13:52:22]  PSO iter 4/10 best_global=0.99936559
[13:57:41]  PSO iter 5/10 best_global=0.99936559
[14:03:00]  PSO iter 6/10 best_global=0.99936559
[14:08:31]  PSO iter 7/10 best_global=0.99937597
[14:14:06]  PSO iter 8/10 best_global=0.99937597
[14:19:32]  PSO iter 9/10 best_global=0.99937597
[14:25:01]  PSO iter 10/10 best_global=0.99937597
[14:30:39] PSO DONE in 3642s best_score=0.99937597 selected=39
[14:30:39] PSO SELECTED FEATURES (39): ['Dst Port', 'Protocol', 'Timestamp', 'Tot Fwd Pkts', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Flow Pkts/s', 'Fwd IAT Tot', 'Fwd IAT Std', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Fwd Header Len', 'Fwd Pkts/s', 'Pkt Len Mean', 'Pkt Len Std', 'FIN Flag Cnt', 'SYN Flag Cnt', 'R

In [11]:
import time
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# ================== USER SETTINGS ==================
DATA_PATH = "/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv"
TARGET_COL = "Label"

PSO_SWARM = 20       # swarm size
PSO_ITERS = 10       # PSO iterations
CV_FOLDS = 3         # CV folds for fitness
TEST_SIZE = 0.2      # final 80/20 split
RANDOM_STATE = 42
# ===================================================

# ---------- Load data ----------
df = pd.read_csv(DATA_PATH)
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)

FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

print(f"Loaded data: {df.shape}, features={N_FEATURES}")

# ---------- CatBoost model factory ----------
def get_cat_model(iterations=300):
    return CatBoostClassifier(
        iterations=iterations,
        learning_rate=0.05,
        depth=6,
        verbose=0,
        random_seed=RANDOM_STATE,
        thread_count=-1
    )

# ---------- Fitness cache ----------
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask(mask_bool):
    """
    Evaluate a feature mask using CatBoost with CV.
    Fitness = mean of (accuracy, precision, recall, f1).
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]

    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, idxs]

    model = get_cat_model(iterations=200)  # slightly smaller for CV
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(model, X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(model, X_sel, y, cv=skf,
                            scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(model, X_sel, y, cv=skf,
                           scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(model, X_sel, y, cv=skf,
                          scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# =============== PSO (binary) for feature selection ===============
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={CV_FOLDS})")
    t0 = time.time()
    dim = N_FEATURES

    # Initialize positions (0/1) and velocities
    pos = np.random.randint(0, 2, (swarm_size, dim)).astype(int)
    vel = np.random.uniform(-1, 1, (swarm_size, dim))

    # Personal bests
    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask(p.astype(bool)) for p in pos])

    # Global best
    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6
    c1 = 1.5
    c2 = 1.5

    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.8f}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim)
            r2 = np.random.rand(dim)

            vel[i] = (
                w * vel[i]
                + c1 * r1 * (pbest[i] - pos[i])
                + c2 * r2 * (gbest - pos[i])
            )
            # Sigmoid + sampling to get binary position
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask(pos[i].astype(bool))
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc

        # inertia decay
        w = max(0.2, w * 0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()

    log(f"PSO DONE in {int(t1 - t0)}s best_score={best_score:.8f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES ({int(np.sum(best_mask))}): {mask_to_features(best_mask)}")
    return best_mask, best_score, int(t1 - t0)

# =============== MAIN: PSO + CatBoost Final Model ===============
if __name__ == "__main__":
    total_t0 = time.time()

    # 1) Run PSO for 10 iterations to select features
    best_mask, best_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS)

    selected_indices = np.where(best_mask.astype(bool))[0].tolist()
    selected_features = [FEATURE_NAMES[i] for i in selected_indices]

    print("\n============= FINAL PSO SELECTION =============")
    print(f"Number of selected features: {len(selected_features)}")
    print("Selected features:")
    print(selected_features)
    print(f"Best PSO fitness score (CV-based): {best_score:.8f}")

    # 2) Train/Test split on selected features
    X_sel = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y,
        test_size=TEST_SIZE,
        stratify=y,
        random_state=RANDOM_STATE
    )

    # 3) Final CatBoost on selected features
    final_model = get_cat_model(iterations=500)
    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    test_prec = precision_score(y_test, y_pred, zero_division=0)
    test_rec = recall_score(y_test, y_pred, zero_division=0)
    test_f1 = f1_score(y_test, y_pred, zero_division=0)

    print("\n============= FINAL CATBOOST RESULTS (PSO-Selected Features) =============")
    print(f"Test Accuracy : {test_acc:.8f}")
    print(f"Precision     : {test_prec:.8f}")
    print(f"Recall        : {test_rec:.8f}")
    print(f"F1 Score      : {test_f1:.8f}")

    total_t1 = time.time()
    print(f"\nTotal pipeline time: {int(total_t1 - total_t0)} seconds")


Loaded data: (97802, 76), features=75
[14:33:36] PSO START (swarm=20, iters=10, cv=3)
[14:48:35]  PSO iter 1/10 best_global=0.99938638
[15:03:31]  PSO iter 2/10 best_global=0.99938638
[15:18:26]  PSO iter 3/10 best_global=0.99939676
[15:33:16]  PSO iter 4/10 best_global=0.99944877
[15:47:20]  PSO iter 5/10 best_global=0.99944877
[16:01:31]  PSO iter 6/10 best_global=0.99944877
[16:16:01]  PSO iter 7/10 best_global=0.99944877
[16:30:34]  PSO iter 8/10 best_global=0.99944877
[16:45:01]  PSO iter 9/10 best_global=0.99944877
[16:59:26]  PSO iter 10/10 best_global=0.99944877
[17:14:02] PSO DONE in 9625s best_score=0.99944877 selected=33
[17:14:02] PSO SELECTED FEATURES (33): ['Dst Port', 'Protocol', 'Timestamp', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Flow Pkts/s', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Tot', 'Fwd IAT Max', 'Bwd IAT Tot', 'Bwd IAT Min', 'Fwd PSH Flags', 'Pkt Len

In [3]:
# intersection_hlo_with_hillclimb_fast.py
# Pipeline (reduced budget + hill-climb) with UNION, INTERSECTION, and VOTING candidate flows:
#  PSO + GA + GWO (CatBoost fitness, lighter during opt) -> derive UNION / INTERSECTION / VOTING
#  For each candidate set: HLO (on candidates) -> Greedy hill-climb (restricted) -> Final CatBoost eval (5-fold CV)
#  Additionally: train a CatBoost model on 80% of the data and evaluate on the held-out 20% test set
#  Train & save a CatBoost model for each flow (union / intersection / voting) using the 80/20 split.
# Prints logs, mean ± std for metrics, stage timings, saves results and models.

import time
import pickle
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone
from xgboost import XGBClassifier


warnings.filterwarnings("ignore")
np.random.seed(42)

# -------------------- USER / EXPERIMENT SETTINGS --------------------
# If you prefer to load CSV instead, uncomment and change:
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")

TARGET_COL = "Label"   # target column
MODEL_VERBOSE = 0            # CatBoost verbosity: 0 = silent
RANDOM_STATE = 42

# ---------- Reduced budgets for faster runs (you can tune these) ----------
PSO_SWARM = 15   # reduced swarm
PSO_ITERS = 10   # reduced iterations

GA_POP = 30      # reduced population
GA_GENS = 10     # reduced generations

GWO_WOLVES = 10
GWO_ITERS = 10

HLO_POP = 15
HLO_ITERS = 10
HLO_TEACHER_FACTOR = 0.75
HLO_MUTATION = 0.12

# Greedy hill-climb after HLO
HILLCLIMB_MAX_STEPS = 100   # stop if no improvement or step limit
HILLCLIMB_EVAL_CAP = 500    # safety cap on evaluations (prevent runaway)

# CV folds
CV_OPT = 2    # cheaper CV during optimization + HLO (speed)
CV_FINAL = 5  # final evaluation (A1 requested)

# CatBoost iterations
CB_ITER_OPT = 100    # iterations during optimization (smaller)
CB_ITER_HLO = 200
CB_ITER_FINAL = 500  # final evaluation iterations (bigger)

# Train/test split for final saved models
FINAL_TEST_SIZE = 0.2

SAVE_PREFIX = "hybrid_hlo_models"
# ------------------------------------------------------------------------

# Ensure df exists
try:
    df
except NameError:
    raise RuntimeError("DataFrame `df` not found. Assign your dataset to variable `df` or load at top.")

# Prepare data
X = df.drop(TARGET_COL, axis=1)

y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# -------------------- Model factory (CatBoost) --------------------

def get_xgb_model(iterations=100):
    return XGBClassifier(
        n_estimators=iterations,
        learning_rate=0.05,
        max_depth=6,
        subsample=1.0,
        colsample_bytree=1.0,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        eval_metric="logloss",
       
    )


# -------------------- Fitness cache --------------------
# key: tuple(selected original indices) -> float score
fitness_cache = {}

def key_from_mask(mask_bool):
    return tuple(sorted(np.where(np.array(mask_bool).astype(bool))[0].tolist()))

def evaluate_mask_global(mask_bool, cv=CV_OPT, cb_iter=CB_ITER_OPT):
    """
    Evaluate mask using CatBoost with CV and return average of acc,prec,rec,f1.
    Caches results to avoid re-evaluating identical subsets.
    """
    key = key_from_mask(mask_bool)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        fitness_cache[key] = 0.0
        return 0.0

    X_sel = X.iloc[:, list(key)]
    
    model = get_xgb_model(iterations=cb_iter)

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)

    accs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring="accuracy", n_jobs=-1)
    precs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(precision_score, zero_division=0), n_jobs=-1)
    recs = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(recall_score, zero_division=0), n_jobs=-1)
    f1s = cross_val_score(clone(model), X_sel, y, cv=skf, scoring=make_scorer(f1_score, zero_division=0), n_jobs=-1)

    score = float((np.mean(accs) + np.mean(precs) + np.mean(recs) + np.mean(f1s)) / 4.0)
    fitness_cache[key] = score
    return score

# -------------------- Helpers --------------------
def mask_to_features(mask):
    idxs = np.where(np.array(mask).astype(bool))[0].tolist()
    return [FEATURE_NAMES[i] for i in idxs]

def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)

# -------------------- PSO (binary) --------------------
def run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT):
    log(f"PSO START (swarm={swarm_size}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm_size,dim)).astype(int)
    vel = np.random.uniform(-1,1,(swarm_size,dim))

    pbest = pos.copy()
    pbest_scores = np.array([evaluate_mask_global(p.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for p in pos])

    gbest_idx = int(np.argmax(pbest_scores))
    gbest = pbest[gbest_idx].copy()
    gbest_score = pbest_scores[gbest_idx]

    w = 0.6; c1 = c2 = 1.5
    for t in range(iters):
        log(f" PSO iter {t+1}/{iters} best_global={gbest_score:.8}")
        for i in range(swarm_size):
            r1 = np.random.rand(dim); r2 = np.random.rand(dim)
            vel[i] = w*vel[i] + c1*r1*(pbest[i] - pos[i]) + c2*r2*(gbest - pos[i])
            s = 1.0 / (1.0 + np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim) < s).astype(int)

            sc = evaluate_mask_global(pos[i].astype(bool), cv=cv, cb_iter=CB_ITER_OPT)
            if sc > pbest_scores[i]:
                pbest[i] = pos[i].copy()
                pbest_scores[i] = sc
            if sc > gbest_score:
                gbest = pos[i].copy()
                gbest_score = sc
        w = max(0.2, w*0.97)

    best_idx = int(np.argmax(pbest_scores))
    best_mask = pbest[best_idx].copy()
    best_score = pbest_scores[best_idx]
    t1 = time.time()
    log(f"PSO DONE in {int(t1-t0)}s best_score={best_score:.8f} selected={int(np.sum(best_mask))}")
    log(f"PSO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GA (binary) --------------------
def run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT):
    log(f"GA START (pop={pop_size}, gens={gens}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(pop_size, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    def tournament_select(k=3):
        idxs = np.random.randint(0, pop_size, k)
        return idxs[np.argmax(fitness_scores[idxs])]

    for g in range(gens):
        log(f" GA gen {g+1}/{gens} current_best={np.max(fitness_scores):.8f}")
        new_pop = []
        # elitism
        elite_idxs = np.argsort(fitness_scores)[-2:]
        new_pop.extend(pop[elite_idxs].tolist())

        while len(new_pop) < pop_size:
            i1 = tournament_select(); i2 = tournament_select()
            p1 = pop[i1].copy(); p2 = pop[i2].copy()
            # crossover
            if np.random.rand() < 0.7:
                pt = np.random.randint(1, dim)
                c1 = np.concatenate([p1[:pt], p2[pt:]])
                c2 = np.concatenate([p2[:pt], p1[pt:]])
            else:
                c1, c2 = p1, p2
            # mutation
            for child in (c1, c2):
                for d in range(dim):
                    if np.random.rand() < 0.1:
                        child[d] = 1 - child[d]
                new_pop.append(child)
                if len(new_pop) >= pop_size:
                    break
        pop = np.array(new_pop[:pop_size])
        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GA DONE in {int(t1-t0)}s best_score={best_score:.8f} selected={int(np.sum(best_mask))}")
    log(f"GA SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- GWO (binary) --------------------
def run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT):
    log(f"GWO START (wolves={wolves}, iters={iters}, cv={cv})")
    t0 = time.time()
    dim = N_FEATURES
    pop = np.random.randint(0,2,(wolves, dim)).astype(int)
    fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    Alpha = Beta = Delta = None
    Alpha_score = Beta_score = Delta_score = -1.0

    for itr in range(iters):
        log(f" GWO iter {itr+1}/{iters} best_alpha={Alpha_score:.8f}")
        for i in range(wolves):
            sc = fitness_scores[i]
            if sc > Alpha_score:
                Delta_score, Beta_score, Alpha_score = Beta_score, Alpha_score, sc
                Delta, Beta, Alpha = Beta, Alpha, pop[i].copy()
            elif sc > Beta_score:
                Delta_score, Beta_score = Beta_score, sc
                Delta, Beta = Beta, pop[i].copy()
            elif sc > Delta_score:
                Delta_score = sc
                Delta = pop[i].copy()

        a = 2 - itr * (2.0 / iters)
        for i in range(wolves):
            for d in range(dim):
                if Alpha is None:
                    continue
                r1, r2 = np.random.rand(), np.random.rand()
                A1 = 2 * a * r1 - a; C1 = 2 * r2
                D_alpha = abs(C1 * Alpha[d] - pop[i][d])
                X1 = Alpha[d] - A1 * D_alpha

                r1, r2 = np.random.rand(), np.random.rand()
                A2 = 2 * a * r1 - a; C2 = 2 * r2
                D_beta = abs(C2 * Beta[d] - pop[i][d])
                X2 = Beta[d] - A2 * D_beta

                r1, r2 = np.random.rand(), np.random.rand()
                A3 = 2 * a * r1 - a; C3 = 2 * r2
                D_delta = abs(C3 * Delta[d] - pop[i][d])
                X3 = Delta[d] - A3 * D_delta

                new_pos = (X1 + X2 + X3) / 3.0
                s = 1.0 / (1.0 + np.exp(-new_pos))
                pop[i][d] = 1 if np.random.rand() < s else 0

        fitness_scores = np.array([evaluate_mask_global(ind.astype(bool), cv=cv, cb_iter=CB_ITER_OPT) for ind in pop])

    best_idx = int(np.argmax(fitness_scores))
    best_mask = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]
    t1 = time.time()
    log(f"GWO DONE in {int(t1-t0)}s best_score={best_score:.8f} selected={int(np.sum(best_mask))}")
    log(f"GWO SELECTED FEATURES: {mask_to_features(best_mask)}")

    return best_mask, best_score, int(t1-t0)

# -------------------- INTERSECTION / UNION / VOTING --------------------
def get_intersection_mask(*masks):
    """Return mask that contains only features present in ALL provided masks."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    inter_idx = set(np.where(np.array(masks[0]).astype(bool))[0].tolist())
    for m in masks[1:]:
        idxs = set(np.where(np.array(m).astype(bool))[0].tolist())
        inter_idx = inter_idx.intersection(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in inter_idx:
        mask[i] = 1
    return mask


def get_union_mask(*masks):
    union_idx = set()
    for m in masks:
        idxs = np.where(np.array(m).astype(bool))[0].tolist()
        union_idx.update(idxs)
    mask = np.zeros(N_FEATURES, dtype=int)
    for i in union_idx:
        mask[i] = 1
    return mask


def get_voting_mask(*masks, threshold=2):
    """Return mask of features selected by at least `threshold` methods (default majority of 3 => 2)."""
    if len(masks) == 0:
        return np.zeros(N_FEATURES, dtype=int)
    counts = np.zeros(N_FEATURES, dtype=int)
    for m in masks:
        counts += np.array(m).astype(int)
    mask = (counts >= threshold).astype(int)
    return mask

# -------------------- HLO on candidates --------------------
def hlo_on_candidates(candidate_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT):
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    k = len(candidate_indices)
    if k == 0:
        raise ValueError("Candidate set is empty.")

    log(f"HLO START on {k} candidate features (pop={pop_size}, iters={iters})")
    t0 = time.time()

    pop = np.random.randint(0,2,(pop_size, k)).astype(int)

    def fitness_candidate(bitmask):
        full_mask = np.zeros(N_FEATURES, dtype=int)
        for j,bit in enumerate(bitmask):
            if bit == 1:
                full_mask[candidate_indices[j]] = 1
        return evaluate_mask_global(full_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)

    fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
    best_idx = int(np.argmax(fitness_scores))
    best_solution = pop[best_idx].copy()
    best_score = fitness_scores[best_idx]

    for it in range(iters):
        log(f" HLO iter {it+1}/{iters} current_best={best_score:.8f}")
        teacher = pop[int(np.argmax(fitness_scores))].copy()
        new_pop = []
        for i in range(pop_size):
            learner = pop[i].copy()
            # teaching phase
            for d in range(k):
                if np.random.rand() < HLO_TEACHER_FACTOR:
                    learner[d] = teacher[d]
            # peer learning
            partner = pop[np.random.randint(pop_size)].copy()
            for d in range(k):
                if learner[d] != partner[d] and np.random.rand() < 0.5:
                    learner[d] = partner[d]
            # mutation
            for d in range(k):
                if np.random.rand() < HLO_MUTATION:
                    learner[d] = 1 - learner[d]
            new_pop.append(learner)
        pop = np.array(new_pop)
        fitness_scores = np.array([fitness_candidate(ind) for ind in pop])
        gen_best_idx = int(np.argmax(fitness_scores))
        gen_best_score = fitness_scores[gen_best_idx]
        gen_best_sol = pop[gen_best_idx].copy()
        if gen_best_score > best_score:
            best_score = gen_best_score
            best_solution = gen_best_sol.copy()

    # map back to full mask
    final_full_mask = np.zeros(N_FEATURES, dtype=int)
    for j,bit in enumerate(best_solution):
        if bit == 1:
            final_full_mask[candidate_indices[j]] = 1

    t1 = time.time()
    log(f"HLO DONE in {int(t1-t0)}s best_score={best_score:.8f} final_selected={int(np.sum(final_full_mask))}")
    return final_full_mask, best_score, int(t1-t0)

# -------------------- Greedy Hill-Climb (local search) --------------------
def hill_climb_on_candidates(initial_mask, candidate_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT):
    """
    Greedy single-bit flip hill-climb restricted to candidate indices.
    Starts from initial_mask (full-length). Tries flipping each candidate feature's bit:
    - If flip improves fitness, accept and restart scanning.
    - Stops when no improving flip found or max_steps/eval_cap reached.
    """
    candidate_indices = np.where(np.array(candidate_mask).astype(bool))[0].tolist()
    if len(candidate_indices) == 0:
        log("Hill-climb: candidate set empty, skipping.")
        return initial_mask, 0.0, 0

    log(f"Hill-climb START over {len(candidate_indices)} candidates (max_steps={max_steps}, eval_cap={eval_cap})")
    t0 = time.time()
    current_mask = initial_mask.copy()
    current_score = evaluate_mask_global(current_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
    evals = 0
    steps = 0
    improved = True

    while improved and steps < max_steps and evals < eval_cap:
        improved = False
        for idx in np.random.permutation(candidate_indices):
            trial_mask = current_mask.copy()
            trial_mask[idx] = 1 - trial_mask[idx]  # flip
            trial_score = evaluate_mask_global(trial_mask.astype(bool), cv=cv, cb_iter=CB_ITER_HLO)
            evals += 1
            if trial_score > current_score + 1e-8:
                current_mask = trial_mask
                current_score = trial_score
                improved = True
                steps += 1
                log(f" Hill-climb step {steps}: flipped {FEATURE_NAMES[idx]} -> new_score={current_score:.4f} (evals={evals})")
                break
            if evals >= eval_cap or steps >= max_steps:
                break
    t1 = time.time()
    log(f"Hill-climb DONE in {int(t1-t0)}s steps={steps} evals={evals} final_score={current_score:.8f} selected={int(np.sum(current_mask))}")
    return current_mask, current_score, int(t1-t0)

# -------------------- Final evaluation (5-fold CV) --------------------
def final_evaluation(mask_bool, cv=CV_FINAL, cb_iter=CB_ITER_FINAL):
    idxs = np.where(np.array(mask_bool).astype(bool))[0].tolist()
    if len(idxs) == 0:
        raise ValueError("Final mask selects zero features.")
    X_sel = X.iloc[:, idxs]
    model = get_xgb_model(iterations=cb_iter)

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    accs = []; precs = []; recs = []; f1s = []
    t0 = time.time()
    for tr,te in skf.split(X_sel, y):
        m = clone(model); m.fit(X_sel.iloc[tr], y.iloc[tr])
        pred = m.predict(X_sel.iloc[te])
        accs.append(accuracy_score(y.iloc[te], pred))
        precs.append(precision_score(y.iloc[te], pred, zero_division=0))
        recs.append(recall_score(y.iloc[te], pred, zero_division=0))
        f1s.append(f1_score(y.iloc[te], pred, zero_division=0))
    t1 = time.time()
    results = {
        "n_features": len(idxs),
        "features": [FEATURE_NAMES[i] for i in idxs],
        "acc_mean": float(np.mean(accs)), "acc_std": float(np.std(accs)),
        "prec_mean": float(np.mean(precs)), "prec_std": float(np.std(precs)),
        "rec_mean": float(np.mean(recs)), "rec_std": float(np.std(recs)),
        "f1_mean": float(np.mean(f1s)), "f1_std": float(np.std(f1s)),
        "eval_time_s": int(t1 - t0)
    }
    return results

# -------------------- MAIN PIPELINE --------------------
if __name__ == "__main__":
    total_t0 = time.time()
    log("===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====")

    # PSO
    pso_mask, pso_score, pso_time = run_pso(swarm_size=PSO_SWARM, iters=PSO_ITERS, cv=CV_OPT)

    # GA
    ga_mask, ga_score, ga_time = run_ga(pop_size=GA_POP, gens=GA_GENS, cv=CV_OPT)

    # GWO
    gwo_mask, gwo_score, gwo_time = run_gwo(wolves=GWO_WOLVES, iters=GWO_ITERS, cv=CV_OPT)

    # Derive candidate masks
    union_mask = get_union_mask(pso_mask, ga_mask, gwo_mask)
    inter_mask = get_intersection_mask(pso_mask, ga_mask, gwo_mask)
    vote_mask = get_voting_mask(pso_mask, ga_mask, gwo_mask, threshold=2)

    candidate_sets = {
        'union': union_mask,
        'intersection': inter_mask,
        'voting': vote_mask
    }

    results_all = {}

    # run HLO -> hill-climb -> final evaluation -> train & save model for each candidate set
    for name, cand_mask in candidate_sets.items():
        log(f"===== PROCESSING {name.upper()} CANDIDATES =====")
        n_cand = int(np.sum(cand_mask))
        log(f"{name.upper()} candidate features: {n_cand}")
        if n_cand == 0:
            log(f"{name.upper()} empty — skipping HLO/hill-climb and model training.")
            results_all[name] = {'skipped': True, 'n_candidates': 0}
            continue

        # HLO on this candidate set
        hlo_mask, hlo_score, hlo_time = hlo_on_candidates(cand_mask, pop_size=HLO_POP, iters=HLO_ITERS, cv=CV_OPT)

        # hill-climb restricted to candidate set
        hc_mask, hc_score, hc_time = hill_climb_on_candidates(hlo_mask, cand_mask, max_steps=HILLCLIMB_MAX_STEPS, eval_cap=HILLCLIMB_EVAL_CAP, cv=CV_OPT)

        # final CV evaluation
        final_res = final_evaluation(hc_mask, cv=CV_FINAL, cb_iter=CB_ITER_FINAL)

        # Train final CatBoost model on 80% train and evaluate on 20% test (stratified)
        sel_idxs = np.where(np.array(hc_mask).astype(bool))[0].tolist()
        sel_features = [FEATURE_NAMES[i] for i in sel_idxs]

        if len(sel_features) == 0:
            log(f"No features selected after hill-climb for {name}, skipping model train.")
            results_all[name] = {'skipped': True, 'n_candidates': n_cand}
            continue

        X_sel = X[sel_features]
        X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=FINAL_TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

        
        model = get_xgb_model(iterations=CB_ITER_FINAL)
        model.fit(X_train, y_train)

        # evaluate on held-out test set (20%)
        y_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_prec = precision_score(y_test, y_pred, zero_division=0)
        test_rec = recall_score(y_test, y_pred, zero_division=0)
        test_f1 = f1_score(y_test, y_pred, zero_division=0)

        test_metrics = {
            'acc': float(test_acc), 'prec': float(test_prec), 'rec': float(test_rec), 'f1': float(test_f1),
            'n_test': int(X_test.shape[0])
        }

        # Save model to file (pickle)
        model_filename = f"{SAVE_PREFIX}_{name}_model.pkl"
        with open(model_filename, 'wb') as mf:
            pickle.dump(model, mf)

        # store results
        results_all[name] = {
            'n_candidates': n_cand,
            'hlo_score': float(hlo_score), 'hlo_time': int(hlo_time),
            'hc_score': float(hc_score), 'hc_time': int(hc_time),
            'final_eval': final_res,
            'selected_features': sel_features,
            'model_file': model_filename,
            'test_metrics': test_metrics
        }

        log(f"Saved trained XGboost model for {name} -> {model_filename} (test_f1={test_f1:.8f})")

    total_t1 = time.time()
    elapsed_total = int(total_t1 - total_t0)

    # Summary / save aggregated results
    print("==================== AGGREGATE SUMMARY ====================")
    print(f"PSO  -> opt_score={pso_score:.8f} selected={int(np.sum(pso_mask))} time={pso_time}s")
    print(f"GA   -> opt_score={ga_score:.8f} selected={int(np.sum(ga_mask))} time={ga_time}s")
    print(f"GWO  -> opt_score={gwo_score:.8f} selected={int(np.sum(gwo_mask))} time={gwo_time}s")
    print(f"Union candidates    : {int(np.sum(union_mask))}")
    print(f"Intersection candidates: {int(np.sum(inter_mask))}")
    print(f"Voting candidates   : {int(np.sum(vote_mask))}")
    print("-------------------------------------------------")

    for name, info in results_all.items():
        print(f"-- {name.upper()} SUMMARY --")
        if info.get('skipped'):
            print(" skipped (no candidates)")
            continue
        fe = info['final_eval']
        tm = info['test_metrics']
        print(f" Selected ({fe['n_features']}): {fe['features']}")
        print(f" CV F1   : {fe['f1_mean']:.8f} ± {fe['f1_std']:.8f}")
        print(f" Test F1 : {tm['f1']:.8f} (n_test={tm['n_test']})")
        print(f" Accuracy : {fe['acc_mean']:.8f} ± {fe['acc_std']:.8f}")
        print(f" Precision: {fe['prec_mean']:.8f} ± {fe['prec_std']:.8f}")
        print(f" Recall   : {fe['rec_mean']:.8f} ± {fe['rec_std']:.8f}")
        print(f" Model file: {info['model_file']}")



    # Save aggregated pipeline outputs
    out = {
        "pso_mask": pso_mask, "pso_score": pso_score, "pso_time": pso_time,
        "ga_mask": ga_mask, "ga_score": ga_score, "ga_time": ga_time,
        "gwo_mask": gwo_mask, "gwo_score": gwo_score, "gwo_time": gwo_time,
        "union_mask": union_mask, "intersection_mask": inter_mask, "voting_mask": vote_mask,
        "results_all": results_all,
        "fitness_cache_len": len(fitness_cache)
    }
    with open(f"{SAVE_PREFIX}_results.pkl", "wb") as f:
        pickle.dump(out, f)

    log(f"Saved results to {SAVE_PREFIX}_results.pkl")
    log("===== PIPELINE COMPLETE =====")

[09:26:45] ===== HYBRID (reduced budget) + HLO + HILL-CLIMB (UNION/INTERSECTION/VOTING) START =====
[09:26:45] PSO START (swarm=15, iters=10, cv=2)
[09:27:57]  PSO iter 1/10 best_global=0.9992303
[09:29:07]  PSO iter 2/10 best_global=0.99924071
[09:30:18]  PSO iter 3/10 best_global=0.99924071
[09:31:23]  PSO iter 4/10 best_global=0.99924071
[09:32:32]  PSO iter 5/10 best_global=0.99924071
[09:33:43]  PSO iter 6/10 best_global=0.99927191
[09:34:56]  PSO iter 7/10 best_global=0.99927191
[09:36:06]  PSO iter 8/10 best_global=0.99927191
[09:37:12]  PSO iter 9/10 best_global=0.9993031
[09:38:18]  PSO iter 10/10 best_global=0.9993031
[09:39:28] PSO DONE in 762s best_score=0.99930310 selected=34
[09:39:28] PSO SELECTED FEATURES: ['Dst Port', 'Protocol', 'Timestamp', 'Tot Fwd Pkts', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Flow Pkts/s', 'Flow IAT Std', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd 

In [1]:
print("hi")

hi


In [2]:
# ==========================================================
# ABLATION-1 : OPTIMIZER CONTRIBUTION STUDY
# PSO vs GA vs GWO vs Hybrid (Voting)
# ==========================================================

import numpy as np
import pandas as pd
import time
import pickle
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.base import clone
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# ------------------ DATA ------------------
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")
TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)
FEATURE_NAMES = X.columns.tolist()
N_FEATURES = X.shape[1]

# ------------------ MODEL ------------------
def get_xgb_model(iterations=200):
    return XGBClassifier(
        n_estimators=iterations,
        learning_rate=0.05,
        max_depth=6,
        subsample=1.0,
        colsample_bytree=1.0,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss"
    )

# ------------------ FITNESS ------------------
fitness_cache = {}

def key_from_mask(mask):
    return tuple(sorted(np.where(mask)[0].tolist()))

def evaluate_mask(mask, cv=3, iters=100):
    key = key_from_mask(mask)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key) == 0:
        return 0.0

    X_sel = X.iloc[:, list(key)]
    model = get_xgb_model(iters)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    acc = cross_val_score(model, X_sel, y, cv=skf, scoring="accuracy").mean()
    f1  = cross_val_score(model, X_sel, y, cv=skf, scoring=make_scorer(f1_score)).mean()

    score = (acc + f1) / 2
    fitness_cache[key] = score
    return score

# ------------------ PSO ------------------
def run_pso(swarm=15, iters=10):
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm,dim))
    vel = np.random.uniform(-1,1,(swarm,dim))

    pbest = pos.copy()
    pbest_score = np.array([evaluate_mask(p.astype(bool)) for p in pos])

    gbest = pbest[np.argmax(pbest_score)]
    gbest_score = np.max(pbest_score)

    for _ in range(iters):
        for i in range(swarm):
            vel[i] = 0.5*vel[i] + 1.5*np.random.rand(dim)*(pbest[i]-pos[i]) + 1.5*np.random.rand(dim)*(gbest-pos[i])
            s = 1/(1+np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim)<s).astype(int)
            sc = evaluate_mask(pos[i].astype(bool))
            if sc > pbest_score[i]:
                pbest[i] = pos[i].copy()
                pbest_score[i] = sc
        gbest = pbest[np.argmax(pbest_score)]
        gbest_score = np.max(pbest_score)

    return gbest.astype(bool)

# ------------------ GA ------------------
def run_ga(pop=30, gens=10):
    dim = N_FEATURES
    P = np.random.randint(0,2,(pop,dim))
    fitness = np.array([evaluate_mask(p.astype(bool)) for p in P])

    for _ in range(gens):
        newP = []
        elite = P[np.argmax(fitness)]
        newP.append(elite)
        while len(newP)<pop:
            i,j = np.random.randint(0,pop,2)
            p1,p2 = P[i],P[j]
            pt = np.random.randint(dim)
            c = np.concatenate([p1[:pt],p2[pt:]])
            if np.random.rand()<0.1:
                idx = np.random.randint(dim)
                c[idx]=1-c[idx]
            newP.append(c)
        P = np.array(newP)
        fitness = np.array([evaluate_mask(p.astype(bool)) for p in P])
    return P[np.argmax(fitness)].astype(bool)

# ------------------ GWO ------------------
def run_gwo(wolves=10, iters=10):
    dim = N_FEATURES
    W = np.random.randint(0,2,(wolves,dim))
    fitness = np.array([evaluate_mask(w.astype(bool)) for w in W])

    for t in range(iters):
        idx = np.argsort(fitness)[::-1]
        A,B,D = W[idx[0]],W[idx[1]],W[idx[2]]
        a = 2 - 2*(t/iters)
        for i in range(wolves):
            for d in range(dim):
                r1,r2 = np.random.rand(),np.random.rand()
                X1 = A[d] - (2*a*r1-a)*abs(2*r2*A[d]-W[i][d])
                r1,r2 = np.random.rand(),np.random.rand()
                X2 = B[d] - (2*a*r1-a)*abs(2*r2*B[d]-W[i][d])
                r1,r2 = np.random.rand(),np.random.rand()
                X3 = D[d] - (2*a*r1-a)*abs(2*r2*D[d]-W[i][d])
                s = 1/(1+np.exp(-(X1+X2+X3)/3))
                W[i][d] = 1 if np.random.rand()<s else 0
        fitness = np.array([evaluate_mask(w.astype(bool)) for w in W])
    return W[np.argmax(fitness)].astype(bool)

# ------------------ VOTING ------------------
def voting(pso,ga,gwo):
    return ((pso.astype(int)+ga.astype(int)+gwo.astype(int))>=2)

# ------------------ FINAL EVAL ------------------
def final_eval(mask):
    idx = np.where(mask)[0]
    Xs = X.iloc[:,idx]
    model = get_xgb_model(500)
    skf = StratifiedKFold(5,shuffle=True,random_state=42)
    f1=[]
    for tr,te in skf.split(Xs,y):
        model.fit(Xs.iloc[tr],y.iloc[tr])
        p=model.predict(Xs.iloc[te])
        f1.append(f1_score(y.iloc[te],p))
    return len(idx),np.mean(f1)

# ================= RUN ABLATION =================
print("\nRunning PSO...")
pso = run_pso()
print("Running GA...")
ga = run_ga()
print("Running GWO...")
gwo = run_gwo()
hyb = voting(pso,ga,gwo)

print("\nFinal 5-Fold CV")
print("Method     | Features | F1")
print("--------------------------------")
for name,mask in zip(["PSO","GA","GWO","Hybrid"],[pso,ga,gwo,hyb]):
    n,f1 = final_eval(mask)
    print(f"{name:10s} | {n:8d} | {f1:.6f}")



Running PSO...


KeyboardInterrupt: 

In [3]:
# ==========================================================
# ABLATION-1 : OPTIMIZER CONTRIBUTION STUDY
# PSO vs GA vs GWO vs Hybrid (Voting)
# ==========================================================

import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# ---------------- DATA ----------------
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")
TARGET_COL = "Label"

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)
N_FEATURES = X.shape[1]

# ---------------- MODEL ----------------
def get_xgb_model(iters=150):
    return XGBClassifier(
        n_estimators=iters,
        learning_rate=0.05,
        max_depth=6,
        subsample=1.0,
        colsample_bytree=1.0,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss"
    )

# ---------------- FITNESS ----------------
fitness_cache = {}

def key_from_mask(mask):
    return tuple(sorted(np.where(mask)[0].tolist()))

def evaluate_mask(mask, cv=3):
    key = key_from_mask(mask)
    if key in fitness_cache:
        return fitness_cache[key]
    if len(key)==0:
        return 0.0

    Xs = X.iloc[:, list(key)]
    model = get_xgb_model(100)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    f1 = cross_val_score(model, Xs, y, cv=skf, scoring=make_scorer(f1_score)).mean()
    fitness_cache[key] = f1
    return f1

# ---------------- PSO ----------------
def run_pso(swarm=15, iters=5):
    print("\n[PSO] Start")
    dim = N_FEATURES
    pos = np.random.randint(0,2,(swarm,dim))
    vel = np.random.uniform(-1,1,(swarm,dim))
    pbest = pos.copy()
    pbest_score = np.array([evaluate_mask(p.astype(bool)) for p in pos])
    gbest = pbest[np.argmax(pbest_score)]
    gbest_score = np.max(pbest_score)

    print(f"[PSO] Initial best = {gbest_score:.6f}, features = {np.sum(gbest)}")

    for t in range(iters):
        for i in range(swarm):
            vel[i] = 0.5*vel[i] + 1.5*np.random.rand(dim)*(pbest[i]-pos[i]) + 1.5*np.random.rand(dim)*(gbest-pos[i])
            s = 1/(1+np.exp(-vel[i]))
            pos[i] = (np.random.rand(dim)<s).astype(int)
            sc = evaluate_mask(pos[i].astype(bool))
            if sc > pbest_score[i]:
                pbest[i] = pos[i].copy()
                pbest_score[i] = sc
        gbest = pbest[np.argmax(pbest_score)]
        gbest_score = np.max(pbest_score)
        print(f"[PSO] Iter {t+1}/{iters} best = {gbest_score:.6f}, features = {np.sum(gbest)}")
    return gbest.astype(bool)

# ---------------- GA ----------------
def run_ga(pop=30, gens=5):
    print("\n[GA] Start")
    dim = N_FEATURES
    P = np.random.randint(0,2,(pop,dim))
    fitness = np.array([evaluate_mask(p.astype(bool)) for p in P])
    print(f"[GA] Initial best = {np.max(fitness):.6f}")

    for g in range(gens):
        newP=[P[np.argmax(fitness)]]
        while len(newP)<pop:
            i,j = np.random.randint(0,pop,2)
            p1,p2=P[i],P[j]
            pt=np.random.randint(dim)
            c=np.concatenate([p1[:pt],p2[pt:]])
            if np.random.rand()<0.1:
                idx=np.random.randint(dim); c[idx]=1-c[idx]
            newP.append(c)
        P=np.array(newP)
        fitness=np.array([evaluate_mask(p.astype(bool)) for p in P])
        best=P[np.argmax(fitness)]
        print(f"[GA] Gen {g+1}/{gens} best = {np.max(fitness):.6f}, features = {np.sum(best)}")
    return P[np.argmax(fitness)].astype(bool)

# ---------------- GWO ----------------
def run_gwo(wolves=10, iters=5):
    print("\n[GWO] Start")
    dim = N_FEATURES
    W = np.random.randint(0,2,(wolves,dim))
    fitness = np.array([evaluate_mask(w.astype(bool)) for w in W])
    print(f"[GWO] Initial best = {np.max(fitness):.6f}")

    for t in range(iters):
        idx=np.argsort(fitness)[::-1]
        A,B,D=W[idx[0]],W[idx[1]],W[idx[2]]
        a=2-2*(t/iters)
        for i in range(wolves):
            for d in range(dim):
                r1,r2=np.random.rand(),np.random.rand()
                X1=A[d]-(2*a*r1-a)*abs(2*r2*A[d]-W[i][d])
                r1,r2=np.random.rand(),np.random.rand()
                X2=B[d]-(2*a*r1-a)*abs(2*r2*B[d]-W[i][d])
                r1,r2=np.random.rand(),np.random.rand()
                X3=D[d]-(2*a*r1-a)*abs(2*r2*D[d]-W[i][d])
                s=1/(1+np.exp(-(X1+X2+X3)/3))
                W[i][d]=1 if np.random.rand()<s else 0
        fitness=np.array([evaluate_mask(w.astype(bool)) for w in W])
        best=W[np.argmax(fitness)]
        print(f"[GWO] Iter {t+1}/{iters} best = {np.max(fitness):.6f}, features = {np.sum(best)}")
    return W[np.argmax(fitness)].astype(bool)

# ---------------- Voting ----------------
def voting(pso,ga,gwo):
    return ((pso.astype(int)+ga.astype(int)+gwo.astype(int))>=2)

# ---------------- Final Evaluation ----------------
def final_eval(mask):
    idx=np.where(mask)[0]
    Xs=X.iloc[:,idx]
    model=get_xgb_model(400)
    skf=StratifiedKFold(5,shuffle=True,random_state=42)
    f1=[]
    for tr,te in skf.split(Xs,y):
        model.fit(Xs.iloc[tr],y.iloc[tr])
        p=model.predict(Xs.iloc[te])
        f1.append(f1_score(y.iloc[te],p))
    return len(idx),np.mean(f1)

# ================= RUN =================
pso=run_pso()
ga=run_ga()
gwo=run_gwo()
hyb=voting(pso,ga,gwo)

print("\n=========== ABLATION RESULTS ===========")
print("Method    | Features | F1")
print("--------------------------------")
for name,mask in zip(["PSO","GA","GWO","Hybrid"],[pso,ga,gwo,hyb]):
    n,f1=final_eval(mask)
    print(f"{name:9s} | {n:8d} | {f1:.6f}")



[PSO] Start
[PSO] Initial best = 0.999310, features = 40
[PSO] Iter 1/5 best = 0.999331, features = 36
[PSO] Iter 2/5 best = 0.999331, features = 36
[PSO] Iter 3/5 best = 0.999331, features = 36
[PSO] Iter 4/5 best = 0.999331, features = 36
[PSO] Iter 5/5 best = 0.999331, features = 39

[GA] Start
[GA] Initial best = 0.999310
[GA] Gen 1/5 best = 0.999310, features = 42
[GA] Gen 2/5 best = 0.999372, features = 39
[GA] Gen 3/5 best = 0.999372, features = 39
[GA] Gen 4/5 best = 0.999393, features = 33
[GA] Gen 5/5 best = 0.999393, features = 33

[GWO] Start
[GWO] Initial best = 0.999289
[GWO] Iter 1/5 best = 0.999310, features = 44
[GWO] Iter 2/5 best = 0.999289, features = 55
[GWO] Iter 3/5 best = 0.999310, features = 46
[GWO] Iter 4/5 best = 0.999320, features = 49
[GWO] Iter 5/5 best = 0.999299, features = 52

Method    | Features | F1
--------------------------------
PSO       |       39 | 0.999404
GA        |       33 | 0.999425
GWO       |       52 | 0.999299
Hybrid    |       43 |

In [5]:
# ==========================================================
# ABLATION-3 : HLO CONTRIBUTION STUDY (WITH LOGS)
# Voting + Hill-Climb  vs  Voting + HLO + Hill-Climb
# ==========================================================

import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

# ---------------- DATA ----------------
df = pd.read_csv("/kaggle/input/ids-cleaned/ids2018_cleaned_combined_1.csv")
TARGET_COL = "Label"
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL].astype(int)
N_FEATURES = X.shape[1]

# ---------------- MODEL ----------------
def get_model(iters=200):
    return XGBClassifier(
        n_estimators=iters,
        learning_rate=0.05,
        max_depth=6,
        subsample=1.0,
        colsample_bytree=1.0,
        random_state=42,
        n_jobs=-1,
        eval_metric="logloss"
    )

# ---------------- FITNESS ----------------
def fitness(mask):
    idx = np.where(mask)[0]
    if len(idx) == 0:
        return 0.0
    Xs = X.iloc[:, idx]
    model = get_model(100)
    skf = StratifiedKFold(3, shuffle=True, random_state=42)
    return np.mean([
        f1_score(y.iloc[te],
                 model.fit(Xs.iloc[tr], y.iloc[tr]).predict(Xs.iloc[te]))
        for tr, te in skf.split(Xs, y)
    ])

# ---------------- PSO ----------------
def run_pso():
    pop = np.random.randint(0,2,(10,N_FEATURES))
    print("\nPSO started")
    for it in range(5):
        scores = np.array([fitness(p.astype(bool)) for p in pop])
        best = pop[np.argmax(scores)]
        print(f" PSO iter {it+1}/5 best_f1={scores.max():.6f}")
        for i in range(10):
            flip = np.random.randint(N_FEATURES)
            pop[i] = best.copy()
            pop[i,flip] = 1 - pop[i,flip]
    return best.astype(bool)

# ---------------- GA ----------------
def run_ga():
    pop = np.random.randint(0,2,(15,N_FEATURES))
    print("\nGA started")
    for g in range(5):
        scores = np.array([fitness(p.astype(bool)) for p in pop])
        print(f" GA gen {g+1}/5 best_f1={scores.max():.6f}")
        elite = pop[np.argmax(scores)]
        new = [elite]
        while len(new) < 15:
            i,j = np.random.randint(15,size=2)
            pt = np.random.randint(N_FEATURES)
            child = np.concatenate([pop[i][:pt], pop[j][pt:]])
            if np.random.rand() < 0.1:
                m = np.random.randint(N_FEATURES)
                child[m] = 1-child[m]
            new.append(child)
        pop = np.array(new)
    scores = np.array([fitness(p.astype(bool)) for p in pop])
    return pop[np.argmax(scores)].astype(bool)

# ---------------- Binary GWO ----------------
def run_gwo():
    wolves = 10
    pop = np.random.randint(0,2,(wolves,N_FEATURES))
    print("\nGWO started")
    for it in range(5):
        scores = np.array([fitness(p.astype(bool)) for p in pop])
        idx = np.argsort(scores)[::-1]
        print(f" GWO iter {it+1}/5 best_f1={scores.max():.6f}")
        alpha, beta, delta = pop[idx[0]], pop[idx[1]], pop[idx[2]]

        for i in range(wolves):
            for d in range(N_FEATURES):
                s = (alpha[d] + beta[d] + delta[d]) / 3
                pop[i,d] = 1 if np.random.rand() < s else 0
    scores = np.array([fitness(p.astype(bool)) for p in pop])
    return pop[np.argmax(scores)].astype(bool)

# ---------------- Voting ----------------
def voting(a,b,c):
    return ((a.astype(int)+b.astype(int)+c.astype(int))>=2)

# ---------------- HLO ----------------
def HLO(mask):
    idx = np.where(mask)[0]
    if len(idx)==0:
        return mask
    pop = np.random.randint(0,2,(10,len(idx)))
    best = pop[0]
    print("\nHLO started")

    for it in range(5):
        scores=[]
        for i in range(10):
            if np.random.rand() < 0.7:
                pop[i] = best.copy()
            m = np.random.randint(len(idx))
            pop[i,m] = 1-pop[i,m]

            full = np.zeros(N_FEATURES)
            full[idx[pop[i].astype(bool)]] = 1
            scores.append(fitness(full.astype(bool)))

        best = pop[np.argmax(scores)]
        print(f" HLO iter {it+1}/5 best_f1={max(scores):.6f}")

    final = np.zeros(N_FEATURES)
    final[idx[best.astype(bool)]] = 1
    return final.astype(bool)

# ---------------- Hill Climb ----------------
def hill_climb(mask):
    best = mask.copy()
    best_score = fitness(best)
    print("\nHill-Climb started")
    for step in range(50):
        i = np.random.randint(N_FEATURES)
        trial = best.copy()
        trial[i] = 1-trial[i]
        s = fitness(trial)
        if s > best_score:
            best, best_score = trial, s
            print(f" HC step {step+1} improved -> f1={best_score:.6f}")
    return best

# ---------------- Final Eval ----------------
def final_eval(mask):
    idx = np.where(mask)[0]
    Xs = X.iloc[:,idx]
    model = get_model(400)
    skf = StratifiedKFold(5, shuffle=True, random_state=42)
    f1=[]
    for tr,te in skf.split(Xs,y):
        model.fit(Xs.iloc[tr],y.iloc[tr])
        f1.append(f1_score(y.iloc[te],model.predict(Xs.iloc[te])))
    return len(idx), np.mean(f1)

# ================= RUN =================
print("\nRunning PSO, GA, GWO...")
pso = run_pso()
ga  = run_ga()
gwo = run_gwo()

vote = voting(pso,ga,gwo)

print("\nRunning Voting + Hill-Climb (no HLO)")
no_hlo = hill_climb(vote)

print("\nRunning Voting + HLO + Hill-Climb")
with_hlo = hill_climb(HLO(vote))

# ================= RESULTS =================
print("\n=========== HLO ABLATION ===========")
print("Variant                | Features | F1")
print("------------------------------------------")

n1,f1 = final_eval(no_hlo)
n2,f2 = final_eval(with_hlo)

print(f"Voting + HillClimb     | {n1:8d} | {f1:.6f}")
print(f"Voting + HLO + HC      | {n2:8d} | {f2:.6f}")



Running PSO, GA, GWO...

PSO started
 PSO iter 1/5 best_f1=0.999257
 PSO iter 2/5 best_f1=0.999268
 PSO iter 3/5 best_f1=0.999331
 PSO iter 4/5 best_f1=0.999341
 PSO iter 5/5 best_f1=0.999341

GA started
 GA gen 1/5 best_f1=0.999278
 GA gen 2/5 best_f1=0.999278
 GA gen 3/5 best_f1=0.999299
 GA gen 4/5 best_f1=0.999299
 GA gen 5/5 best_f1=0.999299

GWO started
 GWO iter 1/5 best_f1=0.999268
 GWO iter 2/5 best_f1=0.999320
 GWO iter 3/5 best_f1=0.999320
 GWO iter 4/5 best_f1=0.999372
 GWO iter 5/5 best_f1=0.999372

Running Voting + Hill-Climb (no HLO)

Hill-Climb started
 HC step 2 improved -> f1=0.999320
 HC step 9 improved -> f1=0.999320
 HC step 12 improved -> f1=0.999320
 HC step 17 improved -> f1=0.999351
 HC step 18 improved -> f1=0.999362
 HC step 41 improved -> f1=0.999372
 HC step 48 improved -> f1=0.999383

Running Voting + HLO + Hill-Climb

HLO started
 HLO iter 1/5 best_f1=0.998537
 HLO iter 2/5 best_f1=0.999257
 HLO iter 3/5 best_f1=0.999299
 HLO iter 4/5 best_f1=0.999352
 H