In [32]:
!pip install pandas



In [22]:
import sys
from pathlib import Path
from copy import deepcopy

import numpy as np

# Locate the directory that contains `src/` so `import src.*` resolves whether
# the notebook is started from that directory or from one level below it.
# Falls back to the current working directory when neither location has `src/`.
_cwd = Path.cwd().resolve()
REPO_ROOT = next(
    (candidate.resolve() for candidate in (_cwd, _cwd.parent) if (candidate / "src").exists()),
    _cwd,
)

# Put the chosen root at the front of the import path, once.
_root_str = str(REPO_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

from src.api import (
    load_config,
    get_dataset,
    get_model,
    train,
    generate_adversarial,
    compute_scores,
    concat_scores,
    fit_detector,
    evaluate_detector,
)


In [17]:
# Choose YAML configs (each name is passed to load_config in the sweep cell)
CONFIG_NAMES = ["fgsm_eps0p02_pca5", "fgsm_eps0p02_pca10", "fgsm_eps0p02_pca20"]

# Epsilon sweep
EPS_LIST = [0.01, 0.02, 0.05]

# Attack defaults
PGD_STEPS_DEFAULT = 10          # PGD iteration count handed to run_attack_and_eval
# NOTE(review): the two step-size knobs below are not read by
# run_attack_and_eval, which sets the PGD step size to eps / 7.0 directly.
# Either wire them in or drop them.
PGD_STEP_SIZE_MODE = "scaled"   # or "fixed"
PGD_STEP_SIZE_FIXED = 0.007     # intended for the "fixed" mode

# Dataset/model
DATASET_NAME = "synthetic_shapes_3class"
MODEL_NAME = "minicnn"
NUM_CLASSES = 3
IN_CHANNELS = 3

# Speed knobs
MAX_POINTS_FOR_SCORING = 300   # 100–500 typical for laptop
FEAT_DIM = 128                 # lower -> faster (64 or 128)
TRAIN_EPOCHS = 10              # lower -> faster (5–10)
BATCH_SIZE = 64



In [18]:
def make_subsample_idx(n, n_max):
    """Return the index array ``[0, 1, ..., k-1]`` with ``k = min(int(n), int(n_max))``.

    The indices are always the leading ones; no random sampling is performed.
    """
    available, cap = int(n), int(n_max)
    return np.arange(available if available <= cap else cap)

def run_attack_and_eval(
    *,
    cfg_base,
    model,
    bundle,
    scores_val_clean,
    scores_test_clean,
    idx_val,
    idx_test,
    attack_type: str,
    eps: float,
    pgd_steps: int = 10,
):
    """
    Attack the selected val/test points, fit a detector on the validation
    scores and evaluate it on the test scores.

    A deep copy of ``cfg_base`` is configured for the requested attack, so the
    caller's config object is left untouched.

    Args:
        cfg_base: config object exposing an ``attack`` attribute namespace.
        model: model handed to ``generate_adversarial`` and ``compute_scores``.
        bundle: dataset bundle providing ``X_val``, ``y_val``, ``X_test``,
            ``y_test`` and a ``meta`` mapping (optional ``"clip"`` entry,
            default ``(0.0, 1.0)``).
        scores_val_clean / scores_test_clean: precomputed clean scores,
            indexable by score key.
        idx_val / idx_test: indices of the points to attack.
        attack_type: ``"fgsm"`` or ``"pgd"``.
        eps: attack epsilon; for PGD the step size is ``eps / 7.0``.
        pgd_steps: number of PGD steps (ignored for FGSM).

    Returns:
        The value returned by ``evaluate_detector`` for the test split.

    Raises:
        ValueError: if ``attack_type`` is neither ``"fgsm"`` nor ``"pgd"``.
    """
    attack_cfg = deepcopy(cfg_base)
    attack_cfg.attack.attack_type = str(attack_type)
    attack_cfg.attack.epsilon = float(eps)

    if attack_type not in ("fgsm", "pgd"):
        raise ValueError(f"Unknown attack_type: {attack_type}")

    # FGSM is a single step without random start; PGD iterates from a random start.
    iterative = attack_type == "pgd"
    attack_cfg.attack.num_steps = int(pgd_steps) if iterative else 1
    attack_cfg.attack.step_size = float(eps / 7.0) if iterative else 0.0
    attack_cfg.attack.random_start = iterative

    clip_range = bundle.meta.get("clip", (0.0, 1.0))

    def _attack(inputs, targets, idx):
        # Only the selected points are attacked, which keeps the run fast.
        return generate_adversarial(model, inputs[idx], targets[idx], attack_cfg, clip=clip_range)

    def _merge_with_labels(clean, adv):
        # Label convention: 0 = clean, 1 = adversarial; clean entries come first.
        merged = concat_scores(clean, adv)
        key = next(iter(merged.keys()))
        labels = np.concatenate([
            np.zeros(len(clean[key]), dtype=int),
            np.ones(len(adv[key]), dtype=int),
        ])
        return merged, labels

    X_val_adv = _attack(bundle.X_val, bundle.y_val, idx_val)
    X_test_adv = _attack(bundle.X_test, bundle.y_test, idx_test)

    adv_val_scores = compute_scores(X_val_adv, model, bundle=bundle, cfg=attack_cfg)
    adv_test_scores = compute_scores(X_test_adv, model, bundle=bundle, cfg=attack_cfg)

    val_scores, val_labels = _merge_with_labels(scores_val_clean, adv_val_scores)
    detector = fit_detector(val_scores, val_labels, attack_cfg)

    test_scores, test_labels = _merge_with_labels(scores_test_clean, adv_test_scores)
    detector_output = np.asarray(detector.score(test_scores), dtype=float)
    return evaluate_detector(test_labels, detector_output, threshold=float(detector.threshold))


In [19]:
# Collects one metrics row per (config, attack, eps) combination; read by the
# results-table cell.
all_results = []

for cfg_name in CONFIG_NAMES:
    print("\n" + "="*80)
    print(f"CONFIG: {cfg_name}")
    print("="*80)

    # 1) Load the YAML config for this run
    cfg = load_config(cfg_name)

    # 2) Override only the settings this experiment needs
    cfg.device = "cpu"
    cfg.model.output_dim = NUM_CLASSES
    cfg.model.epochs = TRAIN_EPOCHS
    cfg.model.batch_size = BATCH_SIZE

    # 3) Load dataset + build model
    bundle = get_dataset(DATASET_NAME, cfg)

    model = get_model(
        MODEL_NAME,
        cfg,
        num_classes=NUM_CLASSES,
        in_channels=IN_CHANNELS,
        feat_dim=FEAT_DIM,
    )

    # 4) Train once per config
    # NOTE(review): the recorded output shows identical final-epoch train/val
    # metrics for the pca5 and pca10 configs, which suggests the trained model
    # may not depend on the config. If so, training could be hoisted out of
    # this loop — confirm before changing.
    model = train(model, bundle, cfg, verbose=True)

    # 5) Precompute CLEAN scores once (reused for every attack/eps below).
    #    The indices are the leading MAX_POINTS_FOR_SCORING points of each split.
    idx_val = make_subsample_idx(len(bundle.X_val), MAX_POINTS_FOR_SCORING)
    idx_test = make_subsample_idx(len(bundle.X_test), MAX_POINTS_FOR_SCORING)

    X_val_clean = bundle.X_val[idx_val]
    X_test_clean = bundle.X_test[idx_test]

    scores_val_clean = compute_scores(X_val_clean, model, bundle=bundle, cfg=cfg)
    scores_test_clean = compute_scores(X_test_clean, model, bundle=bundle, cfg=cfg)

    # 6) Sweep eps and both attacks; a fresh detector is fitted for each pair
    #    inside run_attack_and_eval.
    for eps in EPS_LIST:
        for attack_type in ["fgsm", "pgd"]:
            metrics = run_attack_and_eval(
                cfg_base=cfg,
                model=model,
                bundle=bundle,
                scores_val_clean=scores_val_clean,
                scores_test_clean=scores_test_clean,
                idx_val=idx_val,
                idx_test=idx_test,
                attack_type=attack_type,
                eps=eps,
                pgd_steps=PGD_STEPS_DEFAULT,
            )

            # Metrics missing from the returned mapping are recorded as NaN.
            row = {
                "config": cfg_name,
                "attack": attack_type,
                "eps": float(eps),
                "roc_auc": float(metrics.get("roc_auc", np.nan)),
                "pr_auc": float(metrics.get("pr_auc", np.nan)),
                "fpr_at_tpr95": float(metrics.get("fpr_at_tpr95", np.nan)),
                "accuracy": float(metrics.get("accuracy", np.nan)),
            }
            all_results.append(row)

            # One progress line per combination (accuracy is only shown in the
            # summary table).
            print(
                f"{cfg_name:22s}  {attack_type:4s}  eps={eps:<5}  "
                f"AUROC={row['roc_auc']:.3f}  AUPRC={row['pr_auc']:.3f}  FPR@95TPR={row['fpr_at_tpr95']:.3f}"
            )



CONFIG: fgsm_eps0p02_pca5
Epoch [10/10] Train Loss: 0.0290, Train Acc: 100.00%, Val Loss: 0.0275, Val Acc: 99.50%
fgsm_eps0p02_pca5       fgsm  eps=0.01   AUROC=0.734  AUPRC=0.756  FPR@95TPR=0.775
fgsm_eps0p02_pca5       pgd   eps=0.01   AUROC=0.725  AUPRC=0.748  FPR@95TPR=0.780
fgsm_eps0p02_pca5       fgsm  eps=0.02   AUROC=0.867  AUPRC=0.891  FPR@95TPR=0.675
fgsm_eps0p02_pca5       pgd   eps=0.02   AUROC=0.884  AUPRC=0.901  FPR@95TPR=0.580
fgsm_eps0p02_pca5       fgsm  eps=0.05   AUROC=0.830  AUPRC=0.870  FPR@95TPR=0.755
fgsm_eps0p02_pca5       pgd   eps=0.05   AUROC=0.911  AUPRC=0.920  FPR@95TPR=0.360

CONFIG: fgsm_eps0p02_pca10
Epoch [10/10] Train Loss: 0.0290, Train Acc: 100.00%, Val Loss: 0.0275, Val Acc: 99.50%
fgsm_eps0p02_pca10      fgsm  eps=0.01   AUROC=0.729  AUPRC=0.752  FPR@95TPR=0.775
fgsm_eps0p02_pca10      pgd   eps=0.01   AUROC=0.722  AUPRC=0.745  FPR@95TPR=0.775
fgsm_eps0p02_pca10      fgsm  eps=0.02   AUROC=0.863  AUPRC=0.888  FPR@95TPR=0.665
fgsm_eps0p02_pca10    

In [20]:
# Order a copy of the rows by (config, attack, eps) so each config's sweep
# prints contiguously; `all_results` itself keeps its insertion order.
all_results_sorted = list(all_results)
all_results_sorted.sort(key=lambda r: (r["config"], r["attack"], r["eps"]))

# Table header framed by two rules.
print("\n" + "-"*110)
print(f"{'config':22s} {'attack':6s} {'eps':>6s} {'AUROC':>8s} {'AUPRC':>8s} {'FPR@95TPR':>10s} {'acc':>8s}")
print("-"*110)

# One fixed-width line per row; config names are truncated to 22 characters.
for r in all_results_sorted:
    identity_cols = f"{r['config'][:22]:22s} {r['attack']:6s} {r['eps']:6.3f} "
    metric_cols = f"{r['roc_auc']:8.3f} {r['pr_auc']:8.3f} {r['fpr_at_tpr95']:10.3f} {r['accuracy']:8.3f}"
    print(identity_cols + metric_cols)



--------------------------------------------------------------------------------------------------------------
config                 attack    eps    AUROC    AUPRC  FPR@95TPR      acc
--------------------------------------------------------------------------------------------------------------
fgsm_eps0p02_pca10     fgsm    0.010    0.729    0.752      0.775    0.615
fgsm_eps0p02_pca10     fgsm    0.020    0.863    0.888      0.665    0.775
fgsm_eps0p02_pca10     fgsm    0.050    0.830    0.868      0.750    0.762
fgsm_eps0p02_pca10     pgd     0.010    0.722    0.745      0.775    0.605
fgsm_eps0p02_pca10     pgd     0.020    0.880    0.897      0.595    0.772
fgsm_eps0p02_pca10     pgd     0.050    0.909    0.918      0.375    0.777
fgsm_eps0p02_pca20     fgsm    0.010    0.731    0.753      0.775    0.610
fgsm_eps0p02_pca20     fgsm    0.020    0.863    0.888      0.675    0.767
fgsm_eps0p02_pca20     fgsm    0.050    0.830    0.868      0.755    0.757
fgsm_eps0p02_pca20     pgd 

## Cross-attack transfer (training on FGSM, testing on PGD and vice versa)

In [35]:
import sys
from pathlib import Path
from copy import deepcopy

import pandas as pd
import numpy as np

# Locate the directory that contains `src/` so `import src.*` resolves whether
# the notebook is started from that directory or from one level below it.
# Falls back to the current working directory when neither location has `src/`.
_cwd = Path.cwd().resolve()
REPO_ROOT = next(
    (candidate.resolve() for candidate in (_cwd, _cwd.parent) if (candidate / "src").exists()),
    _cwd,
)

# Put the chosen root at the front of the import path, once.
_root_str = str(REPO_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)


from src.api import (
    load_config,
    get_dataset,
    get_model,
    train,
    generate_adversarial,
    compute_scores,
    concat_scores,
    fit_detector,
    evaluate_detector,
)



In [39]:
# Choose ONE config to load. The attack settings stored in the YAML are
# irrelevant here: set_attack overwrites them for every train/test combination.
CFG_NAME = "fgsm_eps0p02_pca5"   # name passed to load_config
DATASET_NAME = "synthetic_shapes_3class"   # dataset key passed to get_dataset
MODEL_NAME = "minicnn"                      # model key passed to get_model

SEED = 10        # written to cfg_base.seed
DEVICE = "cpu"   # written to cfg_base.device

NUM_CLASSES = 3
IN_CHANNELS = 3   # synthetic shapes are RGB
FEAT_DIM = 128    # forwarded to get_model as feat_dim (smaller is faster)

TRAIN_EPOCHS = 20
BATCH_SIZE = 64

# Cap on the number of val/test points that are attacked and scored.
# ("PH" in the original note presumably means persistent homology — confirm.)
MAX_POINTS_FOR_SCORING = 300

# Attack hyperparams
EPS_TRAIN = 0.02            # detector training epsilon
EPS_TEST_LIST = [0.01, 0.02, 0.05]  # evaluation epsilons

PGD_STEPS = 10              # PGD iteration count used by set_attack
PGD_STEP_SIZE = 0.007       # fixed PGD step size used by set_attack, independent of eps


In [40]:
def make_subsample_idx(n, n_max):
    """Return ``np.arange(k)`` where ``k`` is the smaller of ``int(n)`` and ``int(n_max)``.

    The result always addresses the leading ``k`` points; nothing is sampled
    at random.
    """
    count = int(n_max)
    if int(n) < count:
        count = int(n)
    return np.arange(count)

def set_attack(cfg, attack_type: str, eps: float):
    """Configure ``cfg.attack`` in place for an FGSM or PGD attack.

    Inputs are validated and converted before anything is written, so ``cfg``
    is left unmodified when the call raises.

    Args:
        cfg: config object with an ``attack`` attribute namespace; mutated in place.
        attack_type: ``"fgsm"`` (single step, no random start) or ``"pgd"``
            (``PGD_STEPS`` steps of size ``PGD_STEP_SIZE`` with random start).
        eps: attack epsilon, stored as a float.

    Raises:
        ValueError: if ``attack_type`` is neither ``"fgsm"`` nor ``"pgd"``.
    """
    if attack_type not in ("fgsm", "pgd"):
        raise ValueError(f"Unknown attack_type: {attack_type}")
    epsilon = float(eps)

    cfg.attack.attack_type = attack_type
    cfg.attack.epsilon = epsilon

    if attack_type == "fgsm":
        cfg.attack.num_steps = 1
        cfg.attack.step_size = 0.0
        cfg.attack.random_start = False
    else:  # "pgd"
        cfg.attack.num_steps = int(PGD_STEPS)
        cfg.attack.step_size = float(PGD_STEP_SIZE)
        cfg.attack.random_start = True


In [43]:
# Shared setup for the cross-attack experiment. The names defined in this cell
# (bundle, model, idx_val, idx_test, scores_val_clean, scores_test_clean) are
# read as module-level globals by train_detector_on_attack and
# eval_detector_on_attack, so this cell must run before either is called.
cfg_base = load_config(CFG_NAME)

# Override only the settings this 3-class experiment needs
cfg_base.seed = SEED
cfg_base.device = DEVICE
cfg_base.model.output_dim = NUM_CLASSES
cfg_base.model.epochs = TRAIN_EPOCHS
cfg_base.model.batch_size = BATCH_SIZE

bundle = get_dataset(DATASET_NAME, cfg_base)

model = get_model(
    MODEL_NAME,
    cfg_base,
    num_classes=NUM_CLASSES,
    in_channels=IN_CHANNELS,
    feat_dim=FEAT_DIM,
)

# The model is trained once and reused for every attack/detector combination.
model = train(model, bundle, cfg_base, verbose=True)

# Indices of the leading MAX_POINTS_FOR_SCORING points of each split; the same
# points are later attacked so clean and adversarial scores line up.
idx_val = make_subsample_idx(len(bundle.X_val), MAX_POINTS_FOR_SCORING)
idx_test = make_subsample_idx(len(bundle.X_test), MAX_POINTS_FOR_SCORING)

X_val_clean = bundle.X_val[idx_val]
X_test_clean = bundle.X_test[idx_test]

# Clean scores are computed once here and reused by every detector fit/eval.
scores_val_clean = compute_scores(X_val_clean, model, bundle=bundle, cfg=cfg_base)
scores_test_clean = compute_scores(X_test_clean, model, bundle=bundle, cfg=cfg_base)

print("Model trained. Clean scores computed.")


Epoch [10/20] Train Loss: 0.0333, Train Acc: 100.00%, Val Loss: 0.0213, Val Acc: 100.00%
Epoch [20/20] Train Loss: 0.0028, Train Acc: 100.00%, Val Loss: 0.0029, Val Acc: 100.00%
Model trained. Clean scores computed.


In [44]:
def train_detector_on_attack(cfg_base, attack_train: str, eps_train: float):
    """Fit a detector on clean vs. adversarial validation scores.

    Reads the module-level ``model``, ``bundle``, ``idx_val`` and
    ``scores_val_clean``. ``cfg_base`` is deep-copied before the attack
    settings are applied, so the caller's config is not modified.

    Args:
        cfg_base: base config object.
        attack_train: attack name forwarded to ``set_attack``.
        eps_train: epsilon forwarded to ``set_attack``.

    Returns:
        The detector returned by ``fit_detector``.
    """
    attack_cfg = deepcopy(cfg_base)
    set_attack(attack_cfg, attack_train, eps_train)

    clip_range = bundle.meta.get("clip", (0.0, 1.0))

    # Attack exactly the validation points whose clean scores were precomputed.
    adv_val = generate_adversarial(
        model,
        bundle.X_val[idx_val],
        bundle.y_val[idx_val],
        attack_cfg,
        clip=clip_range,
    )
    adv_scores = compute_scores(adv_val, model, bundle=bundle, cfg=attack_cfg)

    # Clean entries first (label 0), adversarial entries second (label 1).
    pooled = concat_scores(scores_val_clean, adv_scores)
    ref_key = next(iter(pooled.keys()))
    n_clean = len(scores_val_clean[ref_key])
    n_adv = len(adv_scores[ref_key])
    y_true = np.concatenate([
        np.zeros(n_clean, dtype=int),
        np.ones(n_adv, dtype=int),
    ])

    return fit_detector(pooled, y_true, attack_cfg)


def eval_detector_on_attack(detector, cfg_base, attack_test: str, eps_test: float):
    """Score a fitted detector on clean vs. adversarial test points.

    Reads the module-level ``model``, ``bundle``, ``idx_test`` and
    ``scores_test_clean``. Adversarial test examples are generated afresh on
    every call. ``cfg_base`` is deep-copied before the attack settings are
    applied.

    Args:
        detector: object exposing ``score(scores)`` and a ``threshold`` attribute.
        cfg_base: base config object.
        attack_test: attack name forwarded to ``set_attack``.
        eps_test: epsilon forwarded to ``set_attack``.

    Returns:
        The value returned by ``evaluate_detector``.
    """
    attack_cfg = deepcopy(cfg_base)
    set_attack(attack_cfg, attack_test, eps_test)

    clip_range = bundle.meta.get("clip", (0.0, 1.0))

    adv_test = generate_adversarial(
        model,
        bundle.X_test[idx_test],
        bundle.y_test[idx_test],
        attack_cfg,
        clip=clip_range,
    )
    adv_scores = compute_scores(adv_test, model, bundle=bundle, cfg=attack_cfg)

    # Clean entries first (label 0), adversarial entries second (label 1).
    pooled = concat_scores(scores_test_clean, adv_scores)
    ref_key = next(iter(pooled.keys()))
    n_clean = len(scores_test_clean[ref_key])
    n_adv = len(adv_scores[ref_key])
    y_true = np.concatenate([
        np.zeros(n_clean, dtype=int),
        np.ones(n_adv, dtype=int),
    ])

    detector_scores = np.asarray(detector.score(pooled), dtype=float)
    return evaluate_detector(y_true, detector_scores, threshold=float(detector.threshold))


In [45]:
# Cross-attack transfer grid: fit one detector per training attack (always at
# EPS_TRAIN), then evaluate it against every (test attack, test eps) pair.
results = []

for attack_train in ["fgsm", "pgd"]:
    detector = train_detector_on_attack(cfg_base, attack_train=attack_train, eps_train=EPS_TRAIN)

    for attack_test in ["fgsm", "pgd"]:
        for eps_test in EPS_TEST_LIST:
            # NOTE(review): test adversarials are regenerated for every
            # detector. PGD uses random_start=True, so the two detectors may
            # not be scored on the same PGD examples. In the recorded output
            # the FGSM-test rows are identical for both training attacks while
            # the PGD-test rows differ slightly. Consider generating each
            # (attack_test, eps_test) set once and reusing it — TODO confirm.
            m = eval_detector_on_attack(detector, cfg_base, attack_test=attack_test, eps_test=eps_test)

            # Metrics missing from the returned mapping are recorded as NaN.
            results.append({
                "cfg": CFG_NAME,
                "train_attack": attack_train,
                "train_eps": EPS_TRAIN,
                "test_attack": attack_test,
                "test_eps": eps_test,
                "roc_auc": m.get("roc_auc", np.nan),
                "pr_auc": m.get("pr_auc", np.nan),
                "fpr_at_tpr95": m.get("fpr_at_tpr95", np.nan),
                "accuracy": m.get("accuracy", np.nan),
            })

# Build the frame once from the list of records and sort with a chained,
# non-inplace call, so re-running the cell never mutates a frame that was
# already displayed.
df = pd.DataFrame(results).sort_values(["train_attack", "test_attack", "test_eps"])
df


Unnamed: 0,cfg,train_attack,train_eps,test_attack,test_eps,roc_auc,pr_auc,fpr_at_tpr95,accuracy
0,fgsm_eps0p02_pca5,fgsm,0.02,fgsm,0.01,0.762025,0.791952,0.81,0.6575
1,fgsm_eps0p02_pca5,fgsm,0.02,fgsm,0.02,0.9021,0.925119,0.575,0.8175
2,fgsm_eps0p02_pca5,fgsm,0.02,fgsm,0.05,0.9294,0.935825,0.355,0.8075
3,fgsm_eps0p02_pca5,fgsm,0.02,pgd,0.01,0.782425,0.81464,0.81,0.68
4,fgsm_eps0p02_pca5,fgsm,0.02,pgd,0.02,0.90465,0.920168,0.47,0.78
5,fgsm_eps0p02_pca5,fgsm,0.02,pgd,0.05,0.899175,0.915689,0.535,0.7775
6,fgsm_eps0p02_pca5,pgd,0.02,fgsm,0.01,0.762025,0.791952,0.81,0.6575
7,fgsm_eps0p02_pca5,pgd,0.02,fgsm,0.02,0.9021,0.925119,0.575,0.8175
8,fgsm_eps0p02_pca5,pgd,0.02,fgsm,0.05,0.9294,0.935825,0.355,0.8075
9,fgsm_eps0p02_pca5,pgd,0.02,pgd,0.01,0.787625,0.817833,0.795,0.6825
