In [None]:
# Install pytorch-tabnet, which provides the TabNetClassifier imported below.
!pip -q install pytorch-tabnet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import hashlib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, average_precision_score

from scipy import stats

# TabNet
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Seeds for the repeated runs; each seed is used for both data splits and the
# TabNetClassifier seed inside train_eval_tabnet.
SEEDS = [42, 52, 62, 72, 82]
# Fraction of the full dataset held out as the test split.
TEST_SIZE = 0.30
# Fraction of the remaining (non-test) data held out as the validation split.
VAL_SIZE  = 0.20

# Keyword arguments forwarded verbatim to TabNetClassifier.fit().
# NOTE(review): patience equals max_epochs, so early stopping looks unable to
# trigger before max_epochs is reached -- confirm this is intended.
TABNET_PARAMS = dict(
    max_epochs=10,
    patience=10,
    batch_size=512,
    virtual_batch_size=128,
    num_workers=0
)

# Base CSV from which every ablation dataset is derived.
# NOTE(review): the file name is spelled "Oroginal"; presumably it matches the
# uploaded file -- verify before renaming anything.
BASE_ORIGINAL = "/content/OroginalDatasetBalanced.csv"

In [None]:
def apply_hashing(df):
    """Overwrite identifier columns with the SHA-256 hex digest of their values.

    The columns 'Source IP', 'Destination IP' and 'Flow ID' are processed when
    present; missing ones are skipped. Each value is converted with ``str``
    before hashing. ``df`` is modified in place and also returned.
    """
    def _sha256_hex(value):
        # Hash the textual form so non-string values are handled uniformly.
        return hashlib.sha256(str(value).encode()).hexdigest()

    identifier_cols = ('Source IP', 'Destination IP', 'Flow ID')
    for name in identifier_cols:
        if name not in df.columns:
            continue
        df[name] = df[name].apply(_sha256_hex)
    return df

def get_numeric_cols(df, label_col="Label"):
    """Return the numeric column names of ``df`` with the label column removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    label_col : str, default "Label"
        Column to exclude from the result.

    Returns
    -------
    pandas.Index
        Names of the columns whose dtype is a NumPy number, minus ``label_col``.

    Notes
    -----
    ``label_col`` is only part of the numeric selection when it is itself
    numeric. ``errors="ignore"`` keeps the call from raising ``KeyError`` when
    the label is still textual or not present at all.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    return numeric_cols.drop(label_col, errors="ignore")

def apply_noise_only(df, numeric_cols, factor=0.35):
    """Add zero-mean Gaussian noise to each listed column, then clip at zero.

    For every column the noise standard deviation is ``factor`` times that
    column's own standard deviation. Noise is drawn from NumPy's global random
    state, one draw of ``len(df)`` values per column in the order given.
    Results below zero are raised to zero. ``df`` is modified in place and
    returned.
    """
    n_rows = len(df)
    for name in numeric_cols:
        sigma = factor * df[name].std()
        jitter = np.random.normal(0, sigma, size=n_rows)
        perturbed = df[name] + jitter
        df[name] = perturbed.clip(lower=0)
    return df

def apply_binning_only(df, numeric_cols, q=5):
    """Replace each listed column with its quantile-bin index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; modified in place.
    numeric_cols : iterable of str
        Columns to bin.
    q : int, default 5
        Number of quantiles requested from ``pandas.qcut``. Duplicate bin
        edges are dropped, so a column may end up with fewer bins.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    Binning is best-effort: a column that ``qcut`` cannot handle is left
    unchanged and a ``RuntimeWarning`` names it. Only ``Exception`` subclasses
    are caught, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    import warnings

    for col in numeric_cols:
        try:
            df[col] = pd.qcut(df[col], q=q, labels=False, duplicates='drop')
        except Exception as exc:
            # Keep the original column, but make the skip visible to the user.
            warnings.warn(
                f"Skipping binning for column {col!r}: {exc}",
                RuntimeWarning,
            )
    return df

def apply_masking_only(df, numeric_cols, mask_prob=0.15):
    """Set a random subset of entries in each listed column to zero.

    For every column an independent uniform draw per row (NumPy global random
    state) is compared against ``mask_prob``; rows whose draw is below it are
    overwritten with 0. ``df`` is modified in place and returned.
    """
    n_rows = len(df)
    for name in numeric_cols:
        hide = np.random.rand(n_rows) < mask_prob
        df.loc[hide, name] = 0
    return df

def apply_permutation_only(df, numeric_cols):
    """Shuffle the values of each listed column independently.

    Every column gets its own permutation drawn from NumPy's global random
    state, so values no longer line up with the other columns of their
    original row. ``df`` is modified in place and returned.
    """
    for name in numeric_cols:
        shuffled = np.random.permutation(df[name].values)
        df[name] = shuffled
    return df

In [None]:
def build_ablation_csvs(base_csv, out_dir="/content", seed=0):
    """Derive the four single-transform ablation CSVs from a base dataset.

    Parameters
    ----------
    base_csv : str
        Path of the CSV to read. It must contain a "Label" column holding
        either 'Benign'/'Malicious' or 0/1.
    out_dir : str, default "/content"
        Directory the ablation CSVs are written to (created if missing).
    seed : int, default 0
        Seed applied once to NumPy's global random state before the
        transforms run, so the random transforms are reproducible.

    Raises
    ------
    ValueError
        If no row carries a recognised label. Without this check the function
        would write empty CSVs and still report success.

    Side effects
    ------------
    Writes Ablation_NoiseOnly.csv, Ablation_BinningOnly.csv,
    Ablation_MaskingOnly.csv and Ablation_PermutationOnly.csv into ``out_dir``,
    reseeds ``np.random``, and prints a confirmation line.
    """
    import os

    df0 = pd.read_csv(base_csv)

    # Encode labels as 0/1. Textual labels are mapped; labels that are already
    # numeric are kept when they are 0 or 1 (mapping them through the text
    # dictionary would turn every one of them into NaN).
    labels = df0["Label"]
    if pd.api.types.is_numeric_dtype(labels):
        df0["Label"] = labels.where(labels.isin([0, 1]))
    else:
        df0["Label"] = labels.map({"Benign": 0, "Malicious": 1})
    df0 = df0.dropna(subset=["Label"]).copy()
    if df0.empty:
        raise ValueError(
            f"No rows with a recognised label (Benign/Malicious or 0/1) in {base_csv}"
        )

    # Hash identifier columns before any numeric transform is applied.
    df0 = apply_hashing(df0)

    numeric_cols = get_numeric_cols(df0, "Label")

    # One fixed seed for the whole generation; the transforms below consume the
    # random stream in this order: noise, (binning draws nothing), masking,
    # permutation.
    np.random.seed(seed)

    os.makedirs(out_dir, exist_ok=True)

    # Noise-only
    df_noise = apply_noise_only(df0.copy(), numeric_cols, factor=0.3)
    df_noise.to_csv(os.path.join(out_dir, "Ablation_NoiseOnly.csv"), index=False)

    # Binning-only
    df_bin = apply_binning_only(df0.copy(), numeric_cols, q=5)
    df_bin.to_csv(os.path.join(out_dir, "Ablation_BinningOnly.csv"), index=False)

    # Masking-only
    df_mask = apply_masking_only(df0.copy(), numeric_cols, mask_prob=0.2)
    df_mask.to_csv(os.path.join(out_dir, "Ablation_MaskingOnly.csv"), index=False)

    # Permutation-only
    df_perm = apply_permutation_only(df0.copy(), numeric_cols)
    df_perm.to_csv(os.path.join(out_dir, "Ablation_PermutationOnly.csv"), index=False)

    print("✅ Saved ablation datasets:",
          "Ablation_NoiseOnly.csv, Ablation_BinningOnly.csv, Ablation_MaskingOnly.csv, Ablation_PermutationOnly.csv")

build_ablation_csvs(BASE_ORIGINAL)

✅ Saved ablation datasets: Ablation_NoiseOnly.csv, Ablation_BinningOnly.csv, Ablation_MaskingOnly.csv, Ablation_PermutationOnly.csv


In [None]:
def load_and_preprocess(csv_path, label_col="Label"):
    """Load a CSV and return numeric features, binary labels and feature names.

    Parameters
    ----------
    csv_path : str or file-like
        Anything ``pandas.read_csv`` accepts.
    label_col : str, default "Label"
        Name of the label column. Numeric labels are kept when equal to 0 or 1;
        any other dtype is treated as text and only 'Benign' (0) and
        'Malicious' (1) rows are kept.

    Returns
    -------
    tuple
        ``(X, y, feature_names)`` where ``X`` is a 2-D NumPy array of the
        retained numeric columns, ``y`` is an integer NumPy array of 0/1
        labels, and ``feature_names`` lists the columns of ``X`` in order.

    Raises
    ------
    ValueError
        If the label column is missing, no row has a recognised label, there
        are no numeric feature columns, or every numeric column is constant.
    """
    df = pd.read_csv(csv_path)
    if label_col not in df.columns:
        raise ValueError(f"Label column '{label_col}' not found in {csv_path}")

    # ---- Label handling (supports [0,1] or ['Benign','Malicious']) ----
    # Branch on "is numeric" rather than "dtype == object" so string columns
    # that are not stored as object dtype are still treated as text.
    if pd.api.types.is_numeric_dtype(df[label_col]):
        df = df[df[label_col].isin([0, 1])].copy()
        y = df[label_col].astype(int).to_numpy()
    else:
        df = df[df[label_col].isin(["Benign", "Malicious"])].copy()
        y = df[label_col].map({"Benign": 0, "Malicious": 1}).astype(int).to_numpy()

    # Fail with a clear message instead of an IndexError further down.
    if df.empty:
        raise ValueError(f"No rows with a recognised label in {csv_path}")

    # ---- Numeric features only ----
    X = df.drop(columns=[label_col], errors="ignore").select_dtypes(include=[np.number]).copy()
    if len(X.columns) == 0:
        raise ValueError("No numeric features found after selecting numeric columns.")

    # Drop constant columns (every value equal to the first row's value).
    # NOTE: this runs before NaN/inf cleaning, so a column that only becomes
    # constant after the fill below is kept.
    X = X.loc[:, (X != X.iloc[0]).any()]
    if X.shape[1] == 0:
        raise ValueError("All numeric features are constant; nothing left to train on.")

    # Clean inf/nan
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    return X.to_numpy(), y, X.columns.tolist()

In [None]:
def train_eval_tabnet(X, y, seed=42):
    """Train a TabNetClassifier on one seeded split and score it on the test set.

    The data is split into train/validation/test using ``TEST_SIZE`` and
    ``VAL_SIZE`` (both stratified, both seeded with ``seed``). A
    ``StandardScaler`` is fitted on the training part only and applied to all
    three parts. The validation part is the only ``eval_set`` given to
    ``fit``; the test part is used solely for the returned metrics.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, roc_auc, average_precision, confusion_matrix)``
        computed on the test split.
    """
    # Carve off the test partition first so it never influences training.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=seed
    )

    # Split the remainder into the part used for fitting and the validation part.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_dev, y_dev, test_size=VAL_SIZE, stratify=y_dev, random_state=seed
    )

    # Scaling statistics come from the fitting part only.
    scaler = StandardScaler().fit(X_fit)
    X_fit, X_val, X_test = (
        scaler.transform(part) for part in (X_fit, X_val, X_test)
    )

    # Validation data is the only evaluation set passed to fit.
    model = TabNetClassifier(seed=seed)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        eval_name=["val"],
        eval_metric=["accuracy"],
        **TABNET_PARAMS
    )

    # Hard predictions for threshold metrics, class-1 probability for ranking metrics.
    test_pred = model.predict(X_test)
    test_prob = model.predict_proba(X_test)[:, 1]

    return (
        accuracy_score(y_test, test_pred),
        f1_score(y_test, test_pred, average="macro"),
        roc_auc_score(y_test, test_prob),
        average_precision_score(y_test, test_prob),
        confusion_matrix(y_test, test_pred),
    )

In [None]:
# Display name -> CSV path for every dataset to evaluate. "Original" is the
# base CSV itself; the other paths are the files written by build_ablation_csvs.
DATASETS = {
    "Original": BASE_ORIGINAL,
    "NoiseOnly": "/content/Ablation_NoiseOnly.csv",
    "BinningOnly": "/content/Ablation_BinningOnly.csv",
    "MaskingOnly": "/content/Ablation_MaskingOnly.csv",
    "PermutationOnly": "/content/Ablation_PermutationOnly.csv",
}

In [None]:
# One entry per (dataset, seed) run: [name, seed, accuracy, f1, roc_auc, pr_auc].
results = []

for name, path in DATASETS.items():
    print("\n" + "="*80)
    print(f"DATASET: {name}")
    print(f"FILE: {path}")
    print("="*80)

    # Load once per dataset; the same arrays are reused for every seed.
    # `feats` (feature names) is not used further in this cell.
    X, y, feats = load_and_preprocess(path)

    for seed in SEEDS:
        # The confusion matrix `cm` is returned but not stored in `results`.
        acc, f1, roc, pr, cm = train_eval_tabnet(X, y, seed=seed)
        results.append([name, seed, acc, f1, roc, pr])

# Per-run metrics table; it is the cell's displayed output.
summary = pd.DataFrame(
    results,
    columns=["Dataset", "Seed", "Accuracy", "F1_macro", "ROC_AUC", "PR_AUC"]
)
summary


DATASET: Original
FILE: /content/OroginalDatasetBalanced.csv




epoch 0  | loss: 0.56492 | val_accuracy: 0.82643 |  0:00:00s
epoch 1  | loss: 0.27631 | val_accuracy: 0.90286 |  0:00:01s
epoch 2  | loss: 0.20277 | val_accuracy: 0.94071 |  0:00:01s
epoch 3  | loss: 0.15406 | val_accuracy: 0.94071 |  0:00:02s
epoch 4  | loss: 0.1162  | val_accuracy: 0.95214 |  0:00:02s
epoch 5  | loss: 0.10919 | val_accuracy: 0.97143 |  0:00:02s
epoch 6  | loss: 0.11084 | val_accuracy: 0.97786 |  0:00:03s
epoch 7  | loss: 0.09537 | val_accuracy: 0.97571 |  0:00:03s
epoch 8  | loss: 0.08028 | val_accuracy: 0.97143 |  0:00:04s
epoch 9  | loss: 0.07261 | val_accuracy: 0.95929 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 6 and best_val_accuracy = 0.97786




epoch 0  | loss: 0.5895  | val_accuracy: 0.76714 |  0:00:00s
epoch 1  | loss: 0.32703 | val_accuracy: 0.91071 |  0:00:01s
epoch 2  | loss: 0.22179 | val_accuracy: 0.94286 |  0:00:01s
epoch 3  | loss: 0.12366 | val_accuracy: 0.97143 |  0:00:02s
epoch 4  | loss: 0.11423 | val_accuracy: 0.97214 |  0:00:03s
epoch 5  | loss: 0.07975 | val_accuracy: 0.97786 |  0:00:03s
epoch 6  | loss: 0.07099 | val_accuracy: 0.95929 |  0:00:03s
epoch 7  | loss: 0.06708 | val_accuracy: 0.96143 |  0:00:04s
epoch 8  | loss: 0.07176 | val_accuracy: 0.96214 |  0:00:04s
epoch 9  | loss: 0.06086 | val_accuracy: 0.975   |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 5 and best_val_accuracy = 0.97786




epoch 0  | loss: 0.60508 | val_accuracy: 0.78786 |  0:00:00s
epoch 1  | loss: 0.41152 | val_accuracy: 0.86786 |  0:00:00s
epoch 2  | loss: 0.23448 | val_accuracy: 0.94286 |  0:00:01s
epoch 3  | loss: 0.16077 | val_accuracy: 0.95571 |  0:00:01s
epoch 4  | loss: 0.12694 | val_accuracy: 0.97357 |  0:00:02s
epoch 5  | loss: 0.11233 | val_accuracy: 0.96571 |  0:00:02s
epoch 6  | loss: 0.089   | val_accuracy: 0.97143 |  0:00:03s
epoch 7  | loss: 0.08688 | val_accuracy: 0.97857 |  0:00:03s
epoch 8  | loss: 0.08262 | val_accuracy: 0.97929 |  0:00:04s
epoch 9  | loss: 0.06862 | val_accuracy: 0.97071 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_accuracy = 0.97929




epoch 0  | loss: 0.68625 | val_accuracy: 0.82286 |  0:00:00s
epoch 1  | loss: 0.37892 | val_accuracy: 0.91714 |  0:00:00s
epoch 2  | loss: 0.21621 | val_accuracy: 0.94714 |  0:00:01s
epoch 3  | loss: 0.15335 | val_accuracy: 0.95786 |  0:00:02s
epoch 4  | loss: 0.12591 | val_accuracy: 0.96286 |  0:00:03s
epoch 5  | loss: 0.10969 | val_accuracy: 0.96714 |  0:00:03s
epoch 6  | loss: 0.10405 | val_accuracy: 0.95643 |  0:00:04s
epoch 7  | loss: 0.09724 | val_accuracy: 0.96143 |  0:00:04s
epoch 8  | loss: 0.09005 | val_accuracy: 0.95571 |  0:00:05s
epoch 9  | loss: 0.08375 | val_accuracy: 0.97071 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.97071




epoch 0  | loss: 0.54656 | val_accuracy: 0.77214 |  0:00:00s
epoch 1  | loss: 0.2989  | val_accuracy: 0.87071 |  0:00:00s
epoch 2  | loss: 0.2006  | val_accuracy: 0.93429 |  0:00:01s
epoch 3  | loss: 0.13903 | val_accuracy: 0.95214 |  0:00:01s
epoch 4  | loss: 0.13155 | val_accuracy: 0.95357 |  0:00:02s
epoch 5  | loss: 0.0939  | val_accuracy: 0.98143 |  0:00:02s
epoch 6  | loss: 0.08773 | val_accuracy: 0.97786 |  0:00:03s
epoch 7  | loss: 0.07499 | val_accuracy: 0.97929 |  0:00:03s
epoch 8  | loss: 0.07347 | val_accuracy: 0.96286 |  0:00:04s
epoch 9  | loss: 0.08888 | val_accuracy: 0.97714 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 5 and best_val_accuracy = 0.98143





DATASET: NoiseOnly
FILE: /content/Ablation_NoiseOnly.csv




epoch 0  | loss: 0.67353 | val_accuracy: 0.73714 |  0:00:00s
epoch 1  | loss: 0.40659 | val_accuracy: 0.87286 |  0:00:00s
epoch 2  | loss: 0.29173 | val_accuracy: 0.91857 |  0:00:01s
epoch 3  | loss: 0.23515 | val_accuracy: 0.92143 |  0:00:01s
epoch 4  | loss: 0.18528 | val_accuracy: 0.95857 |  0:00:02s
epoch 5  | loss: 0.14183 | val_accuracy: 0.96857 |  0:00:03s
epoch 6  | loss: 0.1209  | val_accuracy: 0.97    |  0:00:03s
epoch 7  | loss: 0.10987 | val_accuracy: 0.97286 |  0:00:04s
epoch 8  | loss: 0.0951  | val_accuracy: 0.97071 |  0:00:05s
epoch 9  | loss: 0.09141 | val_accuracy: 0.97429 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.97429




epoch 0  | loss: 0.6271  | val_accuracy: 0.69429 |  0:00:00s
epoch 1  | loss: 0.37846 | val_accuracy: 0.88286 |  0:00:00s
epoch 2  | loss: 0.25306 | val_accuracy: 0.92929 |  0:00:01s
epoch 3  | loss: 0.21419 | val_accuracy: 0.93571 |  0:00:01s
epoch 4  | loss: 0.16333 | val_accuracy: 0.95143 |  0:00:02s
epoch 5  | loss: 0.15345 | val_accuracy: 0.96    |  0:00:02s
epoch 6  | loss: 0.13307 | val_accuracy: 0.96143 |  0:00:03s
epoch 7  | loss: 0.13511 | val_accuracy: 0.96357 |  0:00:03s
epoch 8  | loss: 0.1343  | val_accuracy: 0.95929 |  0:00:04s
epoch 9  | loss: 0.10996 | val_accuracy: 0.96286 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 7 and best_val_accuracy = 0.96357




epoch 0  | loss: 0.58789 | val_accuracy: 0.79571 |  0:00:00s
epoch 1  | loss: 0.3419  | val_accuracy: 0.88429 |  0:00:00s
epoch 2  | loss: 0.27124 | val_accuracy: 0.90214 |  0:00:01s
epoch 3  | loss: 0.21944 | val_accuracy: 0.92    |  0:00:01s
epoch 4  | loss: 0.20754 | val_accuracy: 0.93143 |  0:00:02s
epoch 5  | loss: 0.16361 | val_accuracy: 0.94    |  0:00:02s
epoch 6  | loss: 0.14083 | val_accuracy: 0.955   |  0:00:03s
epoch 7  | loss: 0.11678 | val_accuracy: 0.96143 |  0:00:04s
epoch 8  | loss: 0.10432 | val_accuracy: 0.96429 |  0:00:04s
epoch 9  | loss: 0.10252 | val_accuracy: 0.95857 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_accuracy = 0.96429




epoch 0  | loss: 0.72182 | val_accuracy: 0.68714 |  0:00:00s
epoch 1  | loss: 0.40977 | val_accuracy: 0.885   |  0:00:00s
epoch 2  | loss: 0.22895 | val_accuracy: 0.92643 |  0:00:01s
epoch 3  | loss: 0.19217 | val_accuracy: 0.93071 |  0:00:01s
epoch 4  | loss: 0.15959 | val_accuracy: 0.94071 |  0:00:02s
epoch 5  | loss: 0.13893 | val_accuracy: 0.95214 |  0:00:02s
epoch 6  | loss: 0.1225  | val_accuracy: 0.96214 |  0:00:03s
epoch 7  | loss: 0.11294 | val_accuracy: 0.95857 |  0:00:03s
epoch 8  | loss: 0.08877 | val_accuracy: 0.96071 |  0:00:04s
epoch 9  | loss: 0.07306 | val_accuracy: 0.96357 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.96357




epoch 0  | loss: 0.64544 | val_accuracy: 0.77214 |  0:00:00s
epoch 1  | loss: 0.44853 | val_accuracy: 0.85357 |  0:00:00s
epoch 2  | loss: 0.29155 | val_accuracy: 0.90214 |  0:00:01s
epoch 3  | loss: 0.26443 | val_accuracy: 0.92    |  0:00:01s
epoch 4  | loss: 0.20664 | val_accuracy: 0.94071 |  0:00:02s
epoch 5  | loss: 0.18344 | val_accuracy: 0.94857 |  0:00:02s
epoch 6  | loss: 0.17001 | val_accuracy: 0.95357 |  0:00:03s
epoch 7  | loss: 0.15664 | val_accuracy: 0.95786 |  0:00:03s
epoch 8  | loss: 0.14131 | val_accuracy: 0.95929 |  0:00:04s
epoch 9  | loss: 0.1267  | val_accuracy: 0.96071 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.96071





DATASET: BinningOnly
FILE: /content/Ablation_BinningOnly.csv




epoch 0  | loss: 0.78658 | val_accuracy: 0.72929 |  0:00:00s
epoch 1  | loss: 0.29033 | val_accuracy: 0.92    |  0:00:01s
epoch 2  | loss: 0.16281 | val_accuracy: 0.94214 |  0:00:02s
epoch 3  | loss: 0.12459 | val_accuracy: 0.96714 |  0:00:02s
epoch 4  | loss: 0.09469 | val_accuracy: 0.97143 |  0:00:02s
epoch 5  | loss: 0.07703 | val_accuracy: 0.97357 |  0:00:03s
epoch 6  | loss: 0.06535 | val_accuracy: 0.98357 |  0:00:03s
epoch 7  | loss: 0.06272 | val_accuracy: 0.97429 |  0:00:04s
epoch 8  | loss: 0.04669 | val_accuracy: 0.97071 |  0:00:04s
epoch 9  | loss: 0.03974 | val_accuracy: 0.98429 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.98429




epoch 0  | loss: 0.53237 | val_accuracy: 0.90857 |  0:00:00s
epoch 1  | loss: 0.22333 | val_accuracy: 0.94571 |  0:00:00s
epoch 2  | loss: 0.14295 | val_accuracy: 0.96143 |  0:00:01s
epoch 3  | loss: 0.10895 | val_accuracy: 0.96857 |  0:00:01s
epoch 4  | loss: 0.08473 | val_accuracy: 0.98143 |  0:00:02s
epoch 5  | loss: 0.06139 | val_accuracy: 0.97929 |  0:00:02s
epoch 6  | loss: 0.05675 | val_accuracy: 0.985   |  0:00:03s
epoch 7  | loss: 0.04042 | val_accuracy: 0.98857 |  0:00:03s
epoch 8  | loss: 0.03477 | val_accuracy: 0.99    |  0:00:04s
epoch 9  | loss: 0.03561 | val_accuracy: 0.98714 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 8 and best_val_accuracy = 0.99




epoch 0  | loss: 0.50107 | val_accuracy: 0.83643 |  0:00:02s
epoch 1  | loss: 0.17922 | val_accuracy: 0.95286 |  0:00:03s
epoch 2  | loss: 0.12981 | val_accuracy: 0.95286 |  0:00:03s
epoch 3  | loss: 0.09692 | val_accuracy: 0.95214 |  0:00:04s
epoch 4  | loss: 0.08636 | val_accuracy: 0.96643 |  0:00:04s
epoch 5  | loss: 0.06823 | val_accuracy: 0.975   |  0:00:05s
epoch 6  | loss: 0.05158 | val_accuracy: 0.98571 |  0:00:05s
epoch 7  | loss: 0.04715 | val_accuracy: 0.98214 |  0:00:06s
epoch 8  | loss: 0.04353 | val_accuracy: 0.98429 |  0:00:06s
epoch 9  | loss: 0.03831 | val_accuracy: 0.99143 |  0:00:07s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.99143




epoch 0  | loss: 0.56967 | val_accuracy: 0.89429 |  0:00:00s
epoch 1  | loss: 0.22316 | val_accuracy: 0.92571 |  0:00:00s
epoch 2  | loss: 0.13046 | val_accuracy: 0.96429 |  0:00:01s
epoch 3  | loss: 0.1051  | val_accuracy: 0.96643 |  0:00:01s
epoch 4  | loss: 0.11113 | val_accuracy: 0.97929 |  0:00:02s
epoch 5  | loss: 0.07234 | val_accuracy: 0.97857 |  0:00:02s
epoch 6  | loss: 0.05822 | val_accuracy: 0.98    |  0:00:03s
epoch 7  | loss: 0.05094 | val_accuracy: 0.98429 |  0:00:03s
epoch 8  | loss: 0.04246 | val_accuracy: 0.98571 |  0:00:04s
epoch 9  | loss: 0.03443 | val_accuracy: 0.99071 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.99071




epoch 0  | loss: 0.55594 | val_accuracy: 0.73143 |  0:00:00s
epoch 1  | loss: 0.18508 | val_accuracy: 0.92286 |  0:00:01s
epoch 2  | loss: 0.12879 | val_accuracy: 0.95643 |  0:00:02s
epoch 3  | loss: 0.10541 | val_accuracy: 0.95714 |  0:00:02s
epoch 4  | loss: 0.09496 | val_accuracy: 0.96857 |  0:00:02s
epoch 5  | loss: 0.09098 | val_accuracy: 0.975   |  0:00:03s
epoch 6  | loss: 0.0655  | val_accuracy: 0.98357 |  0:00:03s
epoch 7  | loss: 0.05246 | val_accuracy: 0.98357 |  0:00:04s
epoch 8  | loss: 0.04832 | val_accuracy: 0.98786 |  0:00:04s
epoch 9  | loss: 0.03624 | val_accuracy: 0.98929 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.98929





DATASET: MaskingOnly
FILE: /content/Ablation_MaskingOnly.csv




epoch 0  | loss: 0.60682 | val_accuracy: 0.76071 |  0:00:00s
epoch 1  | loss: 0.35084 | val_accuracy: 0.87143 |  0:00:00s
epoch 2  | loss: 0.25757 | val_accuracy: 0.91929 |  0:00:01s
epoch 3  | loss: 0.18263 | val_accuracy: 0.94643 |  0:00:01s
epoch 4  | loss: 0.16061 | val_accuracy: 0.955   |  0:00:02s
epoch 5  | loss: 0.13627 | val_accuracy: 0.96857 |  0:00:02s
epoch 6  | loss: 0.12333 | val_accuracy: 0.97143 |  0:00:03s
epoch 7  | loss: 0.11705 | val_accuracy: 0.97214 |  0:00:03s
epoch 8  | loss: 0.10569 | val_accuracy: 0.96571 |  0:00:04s
epoch 9  | loss: 0.09862 | val_accuracy: 0.96714 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 7 and best_val_accuracy = 0.97214




epoch 0  | loss: 0.58566 | val_accuracy: 0.71571 |  0:00:00s
epoch 1  | loss: 0.3965  | val_accuracy: 0.83714 |  0:00:01s
epoch 2  | loss: 0.31099 | val_accuracy: 0.87357 |  0:00:01s
epoch 3  | loss: 0.26312 | val_accuracy: 0.935   |  0:00:02s
epoch 4  | loss: 0.20142 | val_accuracy: 0.92286 |  0:00:03s
epoch 5  | loss: 0.18782 | val_accuracy: 0.94071 |  0:00:03s
epoch 6  | loss: 0.16143 | val_accuracy: 0.94071 |  0:00:04s
epoch 7  | loss: 0.14191 | val_accuracy: 0.95143 |  0:00:04s
epoch 8  | loss: 0.13292 | val_accuracy: 0.95286 |  0:00:05s
epoch 9  | loss: 0.11847 | val_accuracy: 0.95714 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.95714




epoch 0  | loss: 0.58467 | val_accuracy: 0.82071 |  0:00:00s
epoch 1  | loss: 0.37661 | val_accuracy: 0.89357 |  0:00:00s
epoch 2  | loss: 0.24926 | val_accuracy: 0.88786 |  0:00:01s
epoch 3  | loss: 0.21153 | val_accuracy: 0.91786 |  0:00:01s
epoch 4  | loss: 0.1837  | val_accuracy: 0.92143 |  0:00:02s
epoch 5  | loss: 0.15757 | val_accuracy: 0.93929 |  0:00:02s
epoch 6  | loss: 0.14262 | val_accuracy: 0.94071 |  0:00:03s
epoch 7  | loss: 0.13721 | val_accuracy: 0.94214 |  0:00:03s
epoch 8  | loss: 0.12773 | val_accuracy: 0.94571 |  0:00:04s
epoch 9  | loss: 0.10702 | val_accuracy: 0.95429 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.95429




epoch 0  | loss: 0.71452 | val_accuracy: 0.74286 |  0:00:00s
epoch 1  | loss: 0.48765 | val_accuracy: 0.84857 |  0:00:00s
epoch 2  | loss: 0.33864 | val_accuracy: 0.88714 |  0:00:01s
epoch 3  | loss: 0.25919 | val_accuracy: 0.915   |  0:00:01s
epoch 4  | loss: 0.21949 | val_accuracy: 0.93071 |  0:00:02s
epoch 5  | loss: 0.20377 | val_accuracy: 0.92786 |  0:00:03s
epoch 6  | loss: 0.20168 | val_accuracy: 0.92143 |  0:00:03s
epoch 7  | loss: 0.1824  | val_accuracy: 0.92714 |  0:00:04s
epoch 8  | loss: 0.15521 | val_accuracy: 0.92786 |  0:00:04s
epoch 9  | loss: 0.14451 | val_accuracy: 0.945   |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_val_accuracy = 0.945




epoch 0  | loss: 0.5815  | val_accuracy: 0.76286 |  0:00:00s
epoch 1  | loss: 0.38426 | val_accuracy: 0.85643 |  0:00:00s
epoch 2  | loss: 0.26503 | val_accuracy: 0.92286 |  0:00:01s
epoch 3  | loss: 0.20325 | val_accuracy: 0.92714 |  0:00:01s
epoch 4  | loss: 0.17171 | val_accuracy: 0.95071 |  0:00:02s
epoch 5  | loss: 0.15286 | val_accuracy: 0.94857 |  0:00:02s
epoch 6  | loss: 0.15737 | val_accuracy: 0.96071 |  0:00:03s
epoch 7  | loss: 0.13906 | val_accuracy: 0.96    |  0:00:03s
epoch 8  | loss: 0.1464  | val_accuracy: 0.94929 |  0:00:04s
epoch 9  | loss: 0.12995 | val_accuracy: 0.96    |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 6 and best_val_accuracy = 0.96071





DATASET: PermutationOnly
FILE: /content/Ablation_PermutationOnly.csv




epoch 0  | loss: 0.87311 | val_accuracy: 0.52643 |  0:00:00s
epoch 1  | loss: 0.73839 | val_accuracy: 0.51071 |  0:00:00s
epoch 2  | loss: 0.70252 | val_accuracy: 0.47571 |  0:00:01s
epoch 3  | loss: 0.69854 | val_accuracy: 0.50571 |  0:00:01s
epoch 4  | loss: 0.69639 | val_accuracy: 0.49714 |  0:00:02s
epoch 5  | loss: 0.6948  | val_accuracy: 0.51429 |  0:00:02s
epoch 6  | loss: 0.69257 | val_accuracy: 0.51571 |  0:00:03s
epoch 7  | loss: 0.69335 | val_accuracy: 0.49571 |  0:00:03s
epoch 8  | loss: 0.69348 | val_accuracy: 0.5     |  0:00:04s
epoch 9  | loss: 0.69167 | val_accuracy: 0.50714 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 0 and best_val_accuracy = 0.52643




epoch 0  | loss: 0.84829 | val_accuracy: 0.51929 |  0:00:00s
epoch 1  | loss: 0.71991 | val_accuracy: 0.50143 |  0:00:00s
epoch 2  | loss: 0.70656 | val_accuracy: 0.51786 |  0:00:01s
epoch 3  | loss: 0.69706 | val_accuracy: 0.51    |  0:00:01s
epoch 4  | loss: 0.69604 | val_accuracy: 0.49857 |  0:00:02s
epoch 5  | loss: 0.69542 | val_accuracy: 0.50571 |  0:00:02s
epoch 6  | loss: 0.69402 | val_accuracy: 0.49214 |  0:00:03s
epoch 7  | loss: 0.69294 | val_accuracy: 0.51    |  0:00:03s
epoch 8  | loss: 0.691   | val_accuracy: 0.50929 |  0:00:04s
epoch 9  | loss: 0.69278 | val_accuracy: 0.50857 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 0 and best_val_accuracy = 0.51929




epoch 0  | loss: 0.74087 | val_accuracy: 0.50714 |  0:00:00s
epoch 1  | loss: 0.70473 | val_accuracy: 0.50857 |  0:00:00s
epoch 2  | loss: 0.69764 | val_accuracy: 0.50286 |  0:00:01s
epoch 3  | loss: 0.69406 | val_accuracy: 0.505   |  0:00:01s
epoch 4  | loss: 0.69369 | val_accuracy: 0.49643 |  0:00:02s
epoch 5  | loss: 0.69371 | val_accuracy: 0.49786 |  0:00:02s
epoch 6  | loss: 0.69293 | val_accuracy: 0.49071 |  0:00:03s
epoch 7  | loss: 0.6928  | val_accuracy: 0.48286 |  0:00:03s
epoch 8  | loss: 0.69225 | val_accuracy: 0.46429 |  0:00:04s
epoch 9  | loss: 0.69166 | val_accuracy: 0.47286 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 1 and best_val_accuracy = 0.50857




epoch 0  | loss: 0.77149 | val_accuracy: 0.51929 |  0:00:00s
epoch 1  | loss: 0.70295 | val_accuracy: 0.52429 |  0:00:01s
epoch 2  | loss: 0.69858 | val_accuracy: 0.49714 |  0:00:01s
epoch 3  | loss: 0.69593 | val_accuracy: 0.50929 |  0:00:02s
epoch 4  | loss: 0.69422 | val_accuracy: 0.48714 |  0:00:03s
epoch 5  | loss: 0.69421 | val_accuracy: 0.49071 |  0:00:03s
epoch 6  | loss: 0.69265 | val_accuracy: 0.48643 |  0:00:03s
epoch 7  | loss: 0.69315 | val_accuracy: 0.48357 |  0:00:04s
epoch 8  | loss: 0.6929  | val_accuracy: 0.49    |  0:00:04s
epoch 9  | loss: 0.69155 | val_accuracy: 0.48357 |  0:00:05s
Stop training because you reached max_epochs = 10 with best_epoch = 1 and best_val_accuracy = 0.52429




epoch 0  | loss: 0.80065 | val_accuracy: 0.49857 |  0:00:00s
epoch 1  | loss: 0.7204  | val_accuracy: 0.48357 |  0:00:00s
epoch 2  | loss: 0.70128 | val_accuracy: 0.49357 |  0:00:01s
epoch 3  | loss: 0.69826 | val_accuracy: 0.50929 |  0:00:01s
epoch 4  | loss: 0.69446 | val_accuracy: 0.50643 |  0:00:02s
epoch 5  | loss: 0.69326 | val_accuracy: 0.505   |  0:00:02s
epoch 6  | loss: 0.69313 | val_accuracy: 0.51643 |  0:00:03s
epoch 7  | loss: 0.6929  | val_accuracy: 0.49643 |  0:00:03s
epoch 8  | loss: 0.69279 | val_accuracy: 0.48786 |  0:00:04s
epoch 9  | loss: 0.69276 | val_accuracy: 0.50786 |  0:00:04s
Stop training because you reached max_epochs = 10 with best_epoch = 6 and best_val_accuracy = 0.51643




Unnamed: 0,Dataset,Seed,Accuracy,F1_macro,ROC_AUC,PR_AUC
0,Original,42,0.978667,0.978667,0.995776,0.995355
1,Original,52,0.975,0.974999,0.994232,0.993756
2,Original,62,0.974667,0.974662,0.995375,0.99598
3,Original,72,0.972667,0.972666,0.994786,0.993895
4,Original,82,0.977333,0.977333,0.99479,0.990291
5,NoiseOnly,42,0.971333,0.971333,0.99377,0.993328
6,NoiseOnly,52,0.961667,0.961666,0.989909,0.991137
7,NoiseOnly,62,0.970333,0.97033,0.993899,0.992624
8,NoiseOnly,72,0.968333,0.968332,0.99544,0.993033
9,NoiseOnly,82,0.958333,0.958301,0.989075,0.98625


In [None]:
def mean_std_ci(x, confidence=0.95):
    """Return the mean, sample standard deviation and t-interval half-width of ``x``.

    The standard deviation uses ``ddof=1``. The half-width is the two-sided
    Student-t critical value for ``confidence`` with ``n - 1`` degrees of
    freedom, multiplied by ``std / sqrt(n)``.

    Returns
    -------
    tuple
        ``(mean, std, half_width)``.
    """
    values = np.asarray(x)
    count = len(values)
    center = values.mean()
    spread = values.std(ddof=1)
    # Upper-tail quantile for a two-sided interval at the requested confidence.
    t_crit = stats.t.ppf((1 + confidence) / 2., count - 1)
    half_width = t_crit * spread / np.sqrt(count)
    return center, spread, half_width

# One aggregated row per dataset: mean, sample std and 95% CI half-width of
# each metric across that dataset's seeds.
agg_rows = []

for dataset in summary["Dataset"].unique():
    # All per-seed runs belonging to this dataset.
    df_d = summary[summary["Dataset"] == dataset]

    acc_mean, acc_std, acc_ci = mean_std_ci(df_d["Accuracy"])
    f1_mean,  f1_std,  f1_ci  = mean_std_ci(df_d["F1_macro"])
    roc_mean, roc_std, roc_ci = mean_std_ci(df_d["ROC_AUC"])
    pr_mean,  pr_std,  pr_ci  = mean_std_ci(df_d["PR_AUC"])

    agg_rows.append([
        dataset,
        acc_mean, acc_std, acc_ci,
        f1_mean,  f1_std,  f1_ci,
        roc_mean, roc_std, roc_ci,
        pr_mean,  pr_std,  pr_ci
    ])

# Column order mirrors the order of values appended above; the *_95CI columns
# hold the half-width returned by mean_std_ci, not an interval.
agg = pd.DataFrame(
    agg_rows,
    columns=[
        "Dataset",
        "Accuracy_mean", "Accuracy_std", "Accuracy_95CI",
        "F1_macro_mean", "F1_macro_std", "F1_macro_95CI",
        "ROC_AUC_mean", "ROC_AUC_std", "ROC_AUC_95CI",
        "PR_AUC_mean",  "PR_AUC_std",  "PR_AUC_95CI"
    ]
)

agg

Unnamed: 0,Dataset,Accuracy_mean,Accuracy_std,Accuracy_95CI,F1_macro_mean,F1_macro_std,F1_macro_95CI,ROC_AUC_mean,ROC_AUC_std,ROC_AUC_95CI,PR_AUC_mean,PR_AUC_std,PR_AUC_95CI
0,Original,0.975667,0.002357,0.002927,0.975665,0.002358,0.002927,0.994992,0.000596,0.00074,0.993855,0.002207,0.00274
1,NoiseOnly,0.966,0.005706,0.007085,0.965992,0.005716,0.007097,0.992419,0.002767,0.003436,0.991274,0.002932,0.003641
2,BinningOnly,0.987,0.001667,0.002069,0.986999,0.001667,0.00207,0.998809,0.000184,0.000229,0.998857,0.000228,0.000283
3,MaskingOnly,0.958933,0.007021,0.008718,0.958909,0.007051,0.008755,0.987371,0.004894,0.006077,0.978945,0.011311,0.014044
4,PermutationOnly,0.491933,0.011663,0.014481,0.468336,0.016273,0.020206,0.491414,0.011867,0.014735,0.497042,0.009532,0.011835


In [None]:
# Aggregated metrics of the "Original" dataset, used as the reference row.
# .iloc[0] raises IndexError if no row is named "Original".
base = agg[agg["Dataset"] == "Original"].iloc[0]

# Difference between each dataset's mean metric and the Original mean
# (negative means lower than Original). These columns are added to `agg` in place.
agg["ΔAccuracy"] = agg["Accuracy_mean"] - base["Accuracy_mean"]
agg["ΔF1_macro"] = agg["F1_macro_mean"] - base["F1_macro_mean"]
agg["ΔROC_AUC"]  = agg["ROC_AUC_mean"]  - base["ROC_AUC_mean"]
agg["ΔPR_AUC"]   = agg["PR_AUC_mean"]   - base["PR_AUC_mean"]

agg

Unnamed: 0,Dataset,Accuracy_mean,Accuracy_std,Accuracy_95CI,F1_macro_mean,F1_macro_std,F1_macro_95CI,ROC_AUC_mean,ROC_AUC_std,ROC_AUC_95CI,PR_AUC_mean,PR_AUC_std,PR_AUC_95CI,ΔAccuracy,ΔF1_macro,ΔROC_AUC,ΔPR_AUC
0,Original,0.975667,0.002357,0.002927,0.975665,0.002358,0.002927,0.994992,0.000596,0.00074,0.993855,0.002207,0.00274,0.0,0.0,0.0,0.0
1,NoiseOnly,0.966,0.005706,0.007085,0.965992,0.005716,0.007097,0.992419,0.002767,0.003436,0.991274,0.002932,0.003641,-0.009667,-0.009673,-0.002573,-0.002581
2,BinningOnly,0.987,0.001667,0.002069,0.986999,0.001667,0.00207,0.998809,0.000184,0.000229,0.998857,0.000228,0.000283,0.011333,0.011334,0.003817,0.005002
3,MaskingOnly,0.958933,0.007021,0.008718,0.958909,0.007051,0.008755,0.987371,0.004894,0.006077,0.978945,0.011311,0.014044,-0.016733,-0.016756,-0.007621,-0.01491
4,PermutationOnly,0.491933,0.011663,0.014481,0.468336,0.016273,0.020206,0.491414,0.011867,0.014735,0.497042,0.009532,0.011835,-0.483733,-0.507329,-0.503578,-0.496813
