In [4]:
# Install required packages (run once)
!pip install -q imbalanced-learn xgboost lightgbm catboost ctgan==0.6.0

import os
import time
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_auc_score,
                             mean_absolute_error, mean_squared_error, cohen_kappa_score,
                             matthews_corrcoef)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

# Try to import imblearn's KMeansSMOTE if available
try:
    from imblearn.over_sampling import KMeansSMOTE as imblearn_KMeansSMOTE
    HAVE_IMBLEARN_KMS = True
except Exception:
    HAVE_IMBLEARN_KMS = False

# CTGAN (optional)
try:
    from ctgan import CTGANSynthesizer
    HAVE_CTGAN = True
except Exception:
    HAVE_CTGAN = False

# -------------------------
# Utility & Preprocessing
# -------------------------
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def safe_preprocess(df, target_col=None, verbose=False):
    """
    Convert features to numeric: numeric columns keep numeric,
    non-numeric columns label-encoded. Return X (np.array), y (np.array), label_map (dict)
    """
    df = df.dropna().reset_index(drop=True).copy()
    if target_col is None:
        candidates = ['Attack_type','attack_type','label','Class','class','target','type']
        target_col = next((c for c in candidates if c in df.columns), df.columns[-1])
    if verbose:
        print("Using target column:", target_col)

    y_raw = df[target_col].astype(str).copy()
    X_df = df.drop(columns=[target_col]).copy()

    # For each column: try numeric coercion; else label-encode
    numeric_cols = []
    labelers = {}
    for c in X_df.columns:
        coerced = pd.to_numeric(X_df[c], errors='coerce')
        # if mostly numeric, keep numeric (fill na with median)
        if coerced.notna().sum() / len(coerced) >= 0.95:
            X_df[c] = coerced.fillna(coerced.median())
            numeric_cols.append(c)
        else:
            le = LabelEncoder()
            X_df[c] = le.fit_transform(X_df[c].astype(str))
            labelers[c] = le

    # Final X array
    X = X_df.astype(float).values

    # Map known WSN labels (paper mapping). If not present, label-encode remaining.
    mapping = {
        'normal':0,'Normal':0,'NORMAL':0,'0':0,
        'grayhole':1,'Grayhole':1,'GRAYHOLE':1,'1':1,
        'blackhole':2,'Blackhole':2,'BLACKHOLE':2,'2':2,
        'tdma':3,'TDMA':3,'3':3,
        'flooding':4,'Flooding':4,'FLOODING':4,'4':4
    }
    y_mapped = y_raw.map(mapping)
    if y_mapped.isna().any():
        # encode remaining unseen values but ensure no collision with mapping keys
        unmapped_mask = y_mapped.isna()
        rem_vals = y_raw[unmapped_mask].unique()
        le_rem = LabelEncoder()
        le_rem.fit(rem_vals)
        start_idx = max(mapping.values()) + 1
        rem_map = {v: i+start_idx for i,v in enumerate(le_rem.classes_)}
        y_mapped.loc[unmapped_mask] = y_raw[unmapped_mask].map(rem_map)

    y_int = y_mapped.astype(int).values

    # Ensure labels are 0..C-1 contiguous (remap)
    uniq = np.unique(y_int)
    label_map = {old: new for new, old in enumerate(sorted(uniq))}
    y_final = np.array([label_map[v] for v in y_int])

    if verbose:
        print("Final label distribution:", Counter(y_final))

    return X, y_final, label_map

# -------------------------
# Safe SMOTE helper
# -------------------------
def make_safe_smote_or_ros(y, default_k=5):
    """
    Return an oversampler object:
     - If any class has <= 1 sample -> RandomOverSampler
     - Else use SMOTE with k_neighbors = min(default_k, min_count-1)
    """
    counts = Counter(y)
    min_count = min(counts.values())
    if min_count <= 1:
        # cannot SMOTE a class with <=1 sample
        return RandomOverSampler(random_state=RANDOM_STATE)
    k = max(1, min(default_k, min_count - 1))
    # note: do NOT pass n_jobs to SMOTE (compatibility across imblearn versions)
    return SMOTE(random_state=RANDOM_STATE, k_neighbors=k)

# -------------------------
# Balancing Methods
# -------------------------
def fast_stl(X, y, max_majority_ratio=3):
    """
    Faster SMOTE-Tomek (STL) variant:
    - undersample extreme majorities, then apply SMOTETomek with a safe SMOTE
    """
    # compute class counts
    counts = Counter(y)
    classes = np.array(sorted(counts.keys()))
    counts_arr = np.array([counts[c] for c in classes])
    median_minority = int(np.median(counts_arr))
    # target maximum for any majority class
    max_allowed = max( max(counts_arr.min(), 1), int(max_majority_ratio * median_minority) )

    # create sampling map for undersampling
    sampling_strategy = {}
    for c in classes:
        if counts[c] > max_allowed:
            sampling_strategy[c] = max_allowed

    if sampling_strategy:
        rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=RANDOM_STATE)
        X_red, y_red = rus.fit_resample(X, y)
    else:
        X_red, y_red = X, y

    # now apply SMOTE + Tomek with a safe SMOTE
    safe_smote = make_safe_smote_or_ros(y_red, default_k=5)
    smt = SMOTETomek(smote=safe_smote, random_state=RANDOM_STATE)
    X_bal, y_bal = smt.fit_resample(X_red, y_red)
    return X_bal, y_bal

def kms_resample_imblearn(X, y, clusters=8):
    """
    Use imblearn's KMeansSMOTE if available, else raise.
    When available we build a KMeans estimator with n_clusters=clusters.
    """
    if not HAVE_IMBLEARN_KMS:
        raise RuntimeError("imblearn KMeansSMOTE not available in this environment.")
    # build a KMeans estimator
    km = KMeans(n_clusters=max(2, int(clusters)), random_state=RANDOM_STATE)
    # KMeansSMOTE expects a kmeans_estimator object in recent versions
    kms = imblearn_KMeansSMOTE(kmeans_estimator=km, random_state=RANDOM_STATE)
    Xb, yb = kms.fit_resample(X, y)
    return Xb, yb

def gan_resample_ctgan_fallback(X, y, target_samples_per_class=None, max_samples_train_gan=20000):
    """
    Try CTGAN to generate synthetic minority samples.
    If CTGAN not installed or dataset too large/small, fallback to SMOTE + small noise or RandomOverSampler.
    """
    n = X.shape[0]
    # If CTGAN not present or dataset too large or too small, fallback
    if (not HAVE_CTGAN) or (n > 120000) or (n < 10):
        # fallback: SMOTE (safely) then add gaussian noise
        sampler = make_safe_smote_or_ros(y, default_k=5)
        Xs, ys = sampler.fit_resample(X, y)
        # small Gaussian noise
        noise = np.random.normal(0, 0.001*np.std(Xs, axis=0, ddof=1), size=Xs.shape)
        Xs = Xs + noise
        return Xs, ys, "smote+noise (fallback)"
    # else try CTGAN
    from ctgan import CTGANSynthesizer
    df = pd.DataFrame(X)
    df['label'] = y
    discrete_columns = ['label']
    ctgan = CTGANSynthesizer(epochs=300, generator_dim=(128,128), discriminator_dim=(128,128), verbose=False)
    train_df = df.copy()
    if len(df) > max_samples_train_gan:
        train_df = df.sample(n=max_samples_train_gan, random_state=RANDOM_STATE).reset_index(drop=True)
    ctgan.fit(train_df, discrete_columns)
    counts = Counter(y)
    max_count = max(counts.values()) if target_samples_per_class is None else target_samples_per_class
    synth_list = []
    for cls in sorted(counts.keys()):
        current = counts[cls]
        need = max_count - current
        if need <= 0:
            continue
        synth = ctgan.sample(need)
        synth_x = synth.drop(columns=['label']).values.astype(float)
        synth_list.append((synth_x, np.array([cls]*len(synth_x))))
    if synth_list:
        Xs = np.vstack([X] + [s[0] for s in synth_list])
        ys = np.hstack([y] + [s[1] for s in synth_list])
    else:
        Xs, ys = X, y
    return Xs, ys, "ctgan"

# -------------------------
# PCA with Kaiser Criterion
# -------------------------
def apply_pca_kaiser(X_train):
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X_train)
    pca_full = PCA()
    Xp_all = pca_full.fit_transform(Xs)
    eigenvals = pca_full.explained_variance_
    # Kaiser criterion
    n_comp = int(np.sum(eigenvals > 1.0))
    n_comp = max(1, n_comp)  # at least 1
    pca_final = PCA(n_components=n_comp)
    Xp = pca_final.fit_transform(Xs)
    return scaler, pca_final, Xp, n_comp, pca_full.explained_variance_ratio_

# -------------------------
# Model training & evaluation
# -------------------------
def compute_metrics(y_true, y_pred, y_proba=None):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    kappa = cohen_kappa_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    # ROC-AUC: multi-class OVR if probability available
    try:
        if y_proba is not None:
            auc = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
        else:
            auc = roc_auc_score(y_true, LabelEncoder().fit_transform(y_pred), multi_class='ovr', average='weighted')
    except Exception:
        auc = 0.0
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    # specificity per-class: TN/(TN+FP)
    cm = confusion_matrix(y_true, y_pred)
    specs = []
    for i in range(cm.shape[0]):
        TP = cm[i,i]
        FP = cm[:,i].sum() - TP
        FN = cm[i,:].sum() - TP
        TN = cm.sum() - (TP + FP + FN)
        spec = TN / (TN + FP) if (TN + FP) > 0 else 0.0
        specs.append(spec)
    specificity = float(np.mean(specs))
    return {
        'accuracy': acc, 'precision': prec, 'recall': rec, 'f1_score': f1,
        'kappa': kappa, 'mcc': mcc, 'roc_auc': auc, 'mae': mae, 'mse': mse,
        'rmse': rmse, 'specificity': specificity, 'confusion_matrix': cm
    }

# -------------------------
# Full experiment pipeline for a dataset
# -------------------------
def run_pipeline(file_path, dataset_name, do_gan=True, fast_stl_ratio=3, kms_clusters=8, verbose=True):
    print(f"\n=== Running pipeline for {dataset_name} ===")
    df = pd.read_csv(file_path)
    X, y, label_map = safe_preprocess(df, verbose=verbose)
    print(f"Raw shape: {X.shape}, classes: {sorted(list(Counter(y).items()))}")

    # We'll compare 3 balancing techniques: KMS, STL and optionally GAN
    balance_methods = ['KMS', 'STL']
    if do_gan:
        balance_methods.append('GAN')

    models = {
        'DTC': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'RFC': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
        'CBC': CatBoostClassifier(verbose=0, random_state=RANDOM_STATE),
        'XGBC': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE, n_jobs=-1),
        'LGBC': LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1),
        'HGBC': HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    }

    results_records = []

    for bal in balance_methods:
        print(f"\n--- Balancing: {bal} ---")
        # 1) stratified train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE)
        print("Train class counts:", Counter(y_train))
        # 2) balancing on train only
        if bal == 'STL':
            t0 = time.time()
            Xb, yb = fast_stl(X_train, y_train, max_majority_ratio=fast_stl_ratio)
            info = 'fast_stl'
            t1 = time.time()
            print(f"STL (fast) produced shape {Xb.shape} in {t1-t0:.1f}s")
        elif bal == 'GAN' and do_gan:
            t0 = time.time()
            Xb, yb, info = gan_resample_ctgan_fallback(X_train, y_train)
            t1 = time.time()
            print(f"GAN balancing ({info}) produced shape {Xb.shape} in {t1-t0:.1f}s")
        elif bal == 'KMS':
            t0 = time.time()
            try:
                # try imblearn's KMeansSMOTE if available
                Xb, yb = kms_resample_imblearn(X_train, y_train, clusters=kms_clusters)
                info = f'imblearn_kms_{kms_clusters}'
            except Exception as e:
                print(f"KMeansSMOTE failed/unused ({str(e)}) -> falling back to safe SMOTE.")
                sampler = make_safe_smote_or_ros(y_train, default_k=5)
                Xb, yb = sampler.fit_resample(X_train, y_train)
                info = 'smote_fallback'
            t1 = time.time()
            print(f"KMS (or fallback) produced shape {Xb.shape} in {t1-t0:.1f}s (info: {info})")
        else:
            # default no balancing
            Xb, yb = X_train, y_train
            info = 'none'
            print("No balancing applied.")

        # 3) PCA with Kaiser on balanced training set
        scaler, pca_model, Xp_train, n_comp, evr = apply_pca_kaiser(Xb)
        print(f"PCA selected {n_comp} components (Kaiser). explained variance first comps sum={np.sum(evr[:n_comp]):.4f}")

        # transform test set with same scaler+pca
        Xs_test = scaler.transform(X_test)
        Xp_test = pca_model.transform(Xs_test)  # pca_model already uses n_components=n_comp

        # 4) Train & evaluate each model
        for mname, mobj in models.items():
            print(f"  Training {mname} ...", end=' ')
            t_start = time.time()
            mobj.fit(Xp_train, yb)
            t_train = time.time() - t_start
            # prediction
            t_pred0 = time.time()
            ypred = mobj.predict(Xp_test)
            t_pred = time.time() - t_pred0
            # probabilities if available
            try:
                yprob = mobj.predict_proba(Xp_test)
            except Exception:
                yprob = None
            metrics = compute_metrics(y_test, ypred, y_proba=yprob)
            metrics['training_time'] = t_train
            metrics['prediction_time'] = t_pred
            # record
            rec = {
                'dataset': dataset_name,
                'balancing': bal,
                'bal_info': info,
                'model': mname,
                'n_components': n_comp,
                'n_train': Xp_train.shape[0],
                'n_test': Xp_test.shape[0],
                **metrics
            }
            results_records.append(rec)
            print(f"done. Acc={metrics['accuracy']*100:.2f}%, F1={metrics['f1_score']*100:.2f}%, t_train={t_train:.1f}s")
    results_df = pd.DataFrame(results_records)
    return results_df

# -------------------------
# Run for both datasets
# -------------------------
wsn_path = "/content/WSN-DS.csv"
ton_path = "/content/train_test_network.csv"

assert os.path.exists(wsn_path), f"{wsn_path} not found!"
assert os.path.exists(ton_path), f"{ton_path} not found!"

# Run (WSN may be large; STL optimized and GAN fallback used)
res_ton = run_pipeline(ton_path, "TON-IoT", do_gan=True, fast_stl_ratio=3, kms_clusters=8, verbose=True)
res_wsn = run_pipeline(wsn_path, "WSN-DS", do_gan=True, fast_stl_ratio=3, kms_clusters=8, verbose=True)

# Combine & show summary table similar to paper (display top rows)
all_res = pd.concat([res_wsn, res_ton], ignore_index=True)
pd.set_option('display.max_rows', 200)
display(all_res.sort_values(['dataset','balancing','model']).reset_index(drop=True))

# Example: print the specific KMS+PCA+RFC row for WSN
row = all_res[(all_res.dataset=='WSN-DS') & (all_res.balancing=='KMS') & (all_res.model=='RFC')]
if not row.empty:
    r = row.iloc[0]
    print("\nKMS+PCA+RFC (WSN-DS) example metrics:")
    for k in ['accuracy','precision','recall','f1_score','kappa','mcc','roc_auc','mae','mse','rmse','training_time','prediction_time']:
        print(f"  {k:12}: {r[k]}")

# Save results
all_res.to_csv("/content/hybrid_ids_results_summary.csv", index=False)
print("\nAll results saved to /content/hybrid_ids_results_summary.csv")


[31mERROR: Ignored the following versions that require a different python version: 0.2.2 Requires-Python >=3.6,<3.9; 0.2.2.dev1 Requires-Python >=3.5,<3.9; 0.2.2.dev2 Requires-Python >=3.6,<3.9; 0.2.2.dev3 Requires-Python >=3.6,<3.9; 0.3.0 Requires-Python >=3.6,<3.9; 0.3.0.dev0 Requires-Python >=3.5,<3.9; 0.3.0.dev1 Requires-Python >=3.6,<3.9; 0.3.1 Requires-Python >=3.6,<3.9; 0.3.1.dev0 Requires-Python >=3.6,<3.9; 0.3.1.dev1 Requires-Python >=3.6,<3.9; 0.3.1.dev2 Requires-Python >=3.6,<3.9; 0.3.2.dev0 Requires-Python >=3.6,<3.9; 0.4.0 Requires-Python >=3.6,<3.9; 0.4.0.dev0 Requires-Python >=3.6,<3.9; 0.4.0.dev1 Requires-Python >=3.6,<3.9; 0.4.1 Requires-Python >=3.6,<3.9; 0.4.1.dev0 Requires-Python >=3.6,<3.9; 0.4.1.dev1 Requires-Python >=3.6,<3.9; 0.4.2 Requires-Python >=3.6,<3.9; 0.4.2.dev0 Requires-Python >=3.6,<3.9; 0.4.3 Requires-Python >=3.6,<3.9; 0.4.3.dev0 Requires-Python >=3.6,<3.9; 0.4.3.dev1 Requires-Python >=3.6,<3.9; 0.4.4.dev0 Requires-Python >=3.6,<3.9; 0.5.0 Requires-

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=99.98%, F1=99.98%, t_train=2.3s
  Training LGBC ... [LightGBM] [Info] Number of positive: 112730, number of negative: 112730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043101 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 225460, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




done. Acc=99.97%, F1=99.97%, t_train=4.6s
  Training HGBC ... done. Acc=99.96%, F1=99.96%, t_train=4.0s

--- Balancing: STL ---
Train class counts: Counter({np.int64(1): 112730, np.int64(0): 35000})
STL (fast) produced shape (225360, 43) in 370.7s
PCA selected 14 components (Kaiser). explained variance first comps sum=0.7883
  Training DTC ... done. Acc=99.91%, F1=99.91%, t_train=10.3s
  Training RFC ... done. Acc=99.97%, F1=99.97%, t_train=183.6s
  Training CBC ... done. Acc=99.94%, F1=99.94%, t_train=48.1s
  Training XGBC ... 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=99.98%, F1=99.98%, t_train=2.2s
  Training LGBC ... [LightGBM] [Info] Number of positive: 112680, number of negative: 112680
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 225360, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




done. Acc=99.97%, F1=99.97%, t_train=4.4s
  Training HGBC ... done. Acc=99.96%, F1=99.96%, t_train=3.9s

--- Balancing: GAN ---
Train class counts: Counter({np.int64(1): 112730, np.int64(0): 35000})
GAN balancing (smote+noise (fallback)) produced shape (225460, 43) in 9.9s
PCA selected 14 components (Kaiser). explained variance first comps sum=0.7880
  Training DTC ... done. Acc=99.90%, F1=99.90%, t_train=12.5s
  Training RFC ... done. Acc=99.97%, F1=99.97%, t_train=211.7s
  Training CBC ... done. Acc=99.95%, F1=99.95%, t_train=49.7s
  Training XGBC ... 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=99.97%, F1=99.97%, t_train=2.3s
  Training LGBC ... [LightGBM] [Info] Number of positive: 112730, number of negative: 112730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031981 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 225460, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




done. Acc=99.95%, F1=99.95%, t_train=4.6s
  Training HGBC ... done. Acc=99.96%, F1=99.96%, t_train=3.9s

=== Running pipeline for WSN-DS ===
Using target column: Attack type
Final label distribution: Counter({np.int64(0): 340066, np.int64(1): 14596, np.int64(2): 10049, np.int64(3): 6638, np.int64(4): 3312})
Raw shape: (374661, 18), classes: [(np.int64(0), 340066), (np.int64(1), 14596), (np.int64(2), 10049), (np.int64(3), 6638), (np.int64(4), 3312)]

--- Balancing: KMS ---
Train class counts: Counter({np.int64(0): 238046, np.int64(1): 10217, np.int64(2): 7034, np.int64(3): 4647, np.int64(4): 2318})
KMeansSMOTE failed/unused (No clusters found with sufficient samples of class 3. Try lowering the cluster_balance_threshold or increasing the number of clusters.) -> falling back to safe SMOTE.
KMS (or fallback) produced shape (1190230, 18) in 4.4s (info: smote_fallback)
PCA selected 4 components (Kaiser). explained variance first comps sum=0.7163
  Training DTC ... done. Acc=96.68%, F1=97.18

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=98.07%, F1=98.14%, t_train=44.6s
  Training LGBC ... [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 1190230, number of used features: 4
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




done. Acc=97.77%, F1=97.86%, t_train=34.8s
  Training HGBC ... done. Acc=97.70%, F1=97.79%, t_train=30.4s

--- Balancing: STL ---
Train class counts: Counter({np.int64(0): 238046, np.int64(1): 10217, np.int64(2): 7034, np.int64(3): 4647, np.int64(4): 2318})
STL (fast) produced shape (105008, 18) in 69.7s
PCA selected 4 components (Kaiser). explained variance first comps sum=0.7128
  Training DTC ... done. Acc=94.75%, F1=95.80%, t_train=1.3s
  Training RFC ... done. Acc=97.98%, F1=98.09%, t_train=65.1s
  Training CBC ... done. Acc=97.65%, F1=97.76%, t_train=58.6s
  Training XGBC ... 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=97.76%, F1=97.87%, t_train=3.4s
  Training LGBC ... [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 105008, number of used features: 4
[LightGBM] [Info] Start training from score -1.614048
[LightGBM] [Info] Start training from score -1.607849
[LightGBM] [Info] Start training from score -1.606139
[LightGBM] [Info] Start training from score -1.613235
[LightGBM] [Info] Start training from score -1.605949




done. Acc=97.68%, F1=97.78%, t_train=4.4s
  Training HGBC ... done. Acc=97.51%, F1=97.62%, t_train=2.7s

--- Balancing: GAN ---
Train class counts: Counter({np.int64(0): 238046, np.int64(1): 10217, np.int64(2): 7034, np.int64(3): 4647, np.int64(4): 2318})
GAN balancing (smote+noise (fallback)) produced shape (1190230, 18) in 2.6s
PCA selected 4 components (Kaiser). explained variance first comps sum=0.7163
  Training DTC ... done. Acc=96.66%, F1=97.17%, t_train=23.7s
  Training RFC ... done. Acc=98.48%, F1=98.55%, t_train=1144.3s
  Training CBC ... done. Acc=97.82%, F1=97.91%, t_train=626.4s
  Training XGBC ... 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


done. Acc=98.06%, F1=98.13%, t_train=44.3s
  Training LGBC ... [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 1190230, number of used features: 4
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




done. Acc=97.89%, F1=97.96%, t_train=36.2s
  Training HGBC ... done. Acc=97.71%, F1=97.80%, t_train=37.5s


Unnamed: 0,dataset,balancing,bal_info,model,n_components,n_train,n_test,accuracy,precision,recall,...,kappa,mcc,roc_auc,mae,mse,rmse,specificity,confusion_matrix,training_time,prediction_time
0,TON-IoT,GAN,smote+noise (fallback),CBC,14,225460,63313,0.999463,0.999463,0.999463,...,0.998515,0.998515,0.0,0.000537,0.000537,0.023174,0.999395,"[[14989, 11], [23, 48290]]",49.695147,0.396237
1,TON-IoT,GAN,smote+noise (fallback),DTC,14,225460,63313,0.998989,0.99899,0.998989,...,0.997205,0.997206,0.0,0.001011,0.001011,0.031794,0.998809,"[[14977, 23], [41, 48272]]",12.50088,0.007129
2,TON-IoT,GAN,smote+noise (fallback),HGBC,14,225460,63313,0.999558,0.999558,0.999558,...,0.998777,0.998778,0.0,0.000442,0.000442,0.02103,0.999572,"[[14994, 6], [22, 48291]]",3.920371,0.647048
3,TON-IoT,GAN,smote+noise (fallback),LGBC,14,225460,63313,0.999526,0.999526,0.999526,...,0.99869,0.99869,0.0,0.000474,0.000474,0.021768,0.999437,"[[14989, 11], [19, 48294]]",4.56568,0.215285
4,TON-IoT,GAN,smote+noise (fallback),RFC,14,225460,63313,0.999716,0.999716,0.999716,...,0.999214,0.999214,0.0,0.000284,0.000284,0.016861,0.999584,"[[14990, 10], [8, 48305]]",211.713982,0.676563
5,TON-IoT,GAN,smote+noise (fallback),XGBC,14,225460,63313,0.999731,0.999732,0.999731,...,0.999258,0.999258,0.0,0.000269,0.000269,0.016386,0.999755,"[[14997, 3], [14, 48299]]",2.287559,0.168832
6,TON-IoT,KMS,smote_fallback,CBC,14,225460,63313,0.999479,0.999479,0.999479,...,0.998559,0.998559,0.0,0.000521,0.000521,0.02283,0.999406,"[[14989, 11], [22, 48291]]",47.817182,0.401969
7,TON-IoT,KMS,smote_fallback,DTC,14,225460,63313,0.998973,0.998974,0.998973,...,0.997162,0.997163,0.0,0.001027,0.001027,0.032041,0.998845,"[[14979, 21], [44, 48269]]",10.552916,0.007142
8,TON-IoT,KMS,smote_fallback,HGBC,14,225460,63313,0.999574,0.999574,0.999574,...,0.998821,0.998821,0.0,0.000426,0.000426,0.020651,0.999606,"[[14995, 5], [22, 48291]]",4.019494,0.658915
9,TON-IoT,KMS,smote_fallback,LGBC,14,225460,63313,0.999668,0.999668,0.999668,...,0.999083,0.999083,0.0,0.000332,0.000332,0.018212,0.999668,"[[14995, 5], [16, 48297]]",4.577968,0.210315



KMS+PCA+RFC (WSN-DS) example metrics:
  accuracy    : 0.9847329602576536
  precision   : 0.9869630175360714
  recall      : 0.9847329602576536
  f1_score    : 0.9854779669165458
  kappa       : 0.9161938516776468
  mcc         : 0.9175502626823507
  roc_auc     : 0.9959869628991823
  mae         : 0.036779686652016474
  mse         : 0.10444932784099503
  rmse        : 0.32318621233121164
  training_time: 1108.10076546669
  prediction_time: 2.808807611465454

All results saved to /content/hybrid_ids_results_summary.csv
