In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the library source are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pathlib import Path

from cupbearer import data, detectors, models, scripts, tasks, utils
from tensorboard import notebook

  from .autonotebook import tqdm as notebook_tqdm


# Training a backdoored classifier
First, we train a classifier on poisoned data:

In [3]:
# Display the dataset class name; the same `__name__` attribute is used as a
# component of the log directory built in the next cell.
data.CIFAR10.__name__

'CIFAR10'

In [4]:
# Experiment-wide choices: the later cells refer to `model`, `Dataset`,
# `Backdoor` and `path` instead of repeating these.
model = models.ResnetConfig()
Dataset = data.CIFAR10
Backdoor = data.WanetBackdoor

# Run directory: logs/<model config class>/<dataset class>/<backdoor class>
path = Path("logs").joinpath(
    type(model).__name__,
    Dataset.__name__,
    Backdoor.__name__,
)

In [5]:
# Training data: the training split with the backdoor applied to part of it.
# NOTE(review): p_backdoor / p_noise are presumably per-sample probabilities
# (backdoored vs. noise-only samples) -- confirm against the backdoor class.
poisoned_train_data = data.BackdoorData(
    original=Dataset(),
    backdoor=Backdoor(p_backdoor=0.10, p_noise=0.20),
)

# Three validation sets, all built on the train=False split.
validation_sets = {"clean": Dataset(train=False)}
# By default, the poison rate is 100%, so this will let us evaluate
# performance on completely poisoned data
validation_sets["backdoor"] = data.BackdoorData(
    original=Dataset(train=False),
    backdoor=Backdoor(),
)
# Noise-only variant: no sample is backdoored, every sample is in noise mode.
validation_sets["noisy"] = data.BackdoorData(
    original=Dataset(train=False),
    backdoor=Backdoor(p_noise=1, p_backdoor=0),
)

# Optimisation settings for the run.
optimizer_options = utils.OptimizerConfig(lr=3e-4)
training_options = utils.TrainConfig(
    num_epochs=10,
    num_workers=4,
    pbar=True,
    optimizer=optimizer_options,
)

classifier_config = scripts.TrainClassifierConfig(
    path=path,
    model=model,
    train_data=poisoned_train_data,
    val_data=validation_sets,
    train_config=training_options,
)

# Keep the call as the last expression so the returned metrics are displayed.
scripts.train_classifier(classifier_config)

[32m2024-02-21 17:57:00.748[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 17:57:00.802[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 17:57:00.804[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 17:57:00.807[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 17:57:00.809[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 17:57:00.812[0m | [34m[1mDEBUG   [0m

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


[32m2024-02-21 17:57:05.158[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mstore[0m:[36m172[0m - [34m[1mStoring control grid to logs/ResnetConfig/CIFAR10/WanetBackdoor/wanet_backdoor.pt[0m
/mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer-env/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbeare ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer-env/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer-env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 391/391 [00:15<00:00, 25.77it/s, train/loss=1.490]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 0:  20%|██        | 1/5 [00:00<00:00,  8.72it/s][A
Validation DataLoader 0:  40%|████      | 2/5 [00:00<00:00,  3.87it/s][A
Validation DataLoader 0:  60%|██████    | 3/5 [00:00<00:00,  3.28it/s][A
Validation DataLoader 0:  80%|████████  | 4/5 [00:01<00:00,  3.04it/s][A
Validation DataLoader 0: 100%|██████████| 5/5 [00:01<00:00,  3.00it/s][A
Validation DataLoader 0:   0%|          | 0/5 [00:00<?, ?it/s]        [A
Validation DataLoader 1:   0%|          | 0/5 [00:00<?, ?it/s][A
Validation DataLoader 1:  20%|██        | 1/5 [00:00<00:00,  8.70it/s][A
Validation DataLoader 1:  40%|████      | 2/5 [00:00<00:01,  2.61it/s][A
Validation DataLoader 1:  60%|██████    | 3/5 [00:01<00:00,  2.12it/s][A
Validation DataLoader 1

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 391/391 [00:19<00:00, 20.26it/s, train/loss=0.716]


{'train/loss': tensor(0.7156),
 'train/acc_step': tensor(0.7625),
 'clean/loss/dataloader_idx_0': tensor(0.5392),
 'clean/acc_step/dataloader_idx_0': tensor(0.8227),
 'backdoor/loss/dataloader_idx_1': tensor(0.0016),
 'backdoor/acc_step/dataloader_idx_1': tensor(0.9994),
 'noisy/loss/dataloader_idx_2': tensor(1.7687),
 'noisy/acc_step/dataloader_idx_2': tensor(0.3621),
 'clean/acc_epoch': tensor(0.8227),
 'backdoor/acc_epoch': tensor(0.9994),
 'noisy/acc_epoch': tensor(0.3621),
 'train/acc_epoch': tensor(0.7529)}

The training script will have automatically created TensorBoard log files. The model should be close to perfect on backdoored inputs (99.9% in the run above), and decent on clean data (~82% after the 10 epochs trained above).

In [6]:
%load_ext tensorboard
%tensorboard --logdir logs/demo

Reusing TensorBoard on port 6006 (pid 448943), started 2:13:31 ago. (Use '!kill 448943' to kill it.)

In [7]:
# Embed the TensorBoard UI in a 1000-pixel-high frame. No port is given, so
# the most recently launched TensorBoard instance is selected rather than
# whatever happens to be listening on a hard-coded port.
notebook.display(height=1000)

Selecting TensorBoard with logdir logs/demo (started 2:13:32 ago; port 6006, pid 448943).


We can also explicitly evaluate the trained model (right now this is pretty limited and doesn't support multiple datasets at once):

In [8]:
# Evaluate the classifier stored under `path` on the unmodified train=False split.
clean_eval_config = scripts.EvalClassifierConfig(
    path=path,
    data=Dataset(train=False),
)
scripts.eval_classifier(clean_eval_config)

[32m2024-02-21 18:00:42.469[0m | [34m[1mDEBUG   [0m | [36mcupbearer.scripts.eval_classifier[0m:[36mmain[0m:[36m18[0m - [34m[1mLoading transform: ToTensor()[0m


Files already downloaded and verified


/mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer-env/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbeare ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer-env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 5/5 [00:01<00:00,  3.16it/s]


In [9]:
# Evaluate on backdoored inputs: the backdoor is built with its default
# arguments and applied to the train=False split.
backdoored_eval_data = data.BackdoorData(
    original=Dataset(train=False),
    backdoor=Backdoor(),
)
backdoored_eval_config = scripts.EvalClassifierConfig(
    path=path,
    data=backdoored_eval_data,
)
scripts.eval_classifier(backdoored_eval_config)

[32m2024-02-21 18:00:46.863[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 18:00:46.866[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 18:00:46.869[0m | [34m[1mDEBUG   [0m | [36mcupbearer.scripts.eval_classifier[0m:[36mmain[0m:[36m18[0m - [34m[1mLoading transform: ToTensor()[0m
[32m2024-02-21 18:00:46.871[0m | [34m[1mDEBUG   [0m | [36mcupbearer.scripts.eval_classifier[0m:[36mmain[0m:[36m18[0m - [34m[1mLoading transform: WanetBackdoor(p_backdoor=1.0, target_class=0, p_noise=0.0, control_grid_width=4, warping_strength=0.5, grid_rescale=1.0, _control_grid=([[-0.2654159963130951, -0.8007417917251587, -0.6394952535629272, 0.5390511155128479], [-0.4819018542766571, -0.8012377619743347, -0.19970284402370453, 0.766

Files already downloaded and verified


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 5/5 [00:02<00:00,  2.29it/s]


In [10]:
# Evaluate on the noise-only variant: no sample is backdoored (p_backdoor=0)
# and every sample is in noise mode (p_noise=1).
noisy_eval_data = data.BackdoorData(
    original=Dataset(train=False),
    backdoor=Backdoor(p_backdoor=0, p_noise=1),
)
noisy_eval_config = scripts.EvalClassifierConfig(
    path=path,
    data=noisy_eval_data,
)
scripts.eval_classifier(noisy_eval_config)

[32m2024-02-21 18:00:51.070[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 18:00:51.072[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 18:00:51.075[0m | [34m[1mDEBUG   [0m | [36mcupbearer.scripts.eval_classifier[0m:[36mmain[0m:[36m18[0m - [34m[1mLoading transform: ToTensor()[0m
[32m2024-02-21 18:00:51.076[0m | [34m[1mDEBUG   [0m | [36mcupbearer.scripts.eval_classifier[0m:[36mmain[0m:[36m18[0m - [34m[1mLoading transform: WanetBackdoor(p_backdoor=0, target_class=0, p_noise=1, control_grid_width=4, warping_strength=0.5, grid_rescale=1.0, _control_grid=([[0.48124271631240845, -0.8087896108627319, -0.45163992047309875, -0.43014565110206604], [0.3139922022819519, -1.1025198698043823, -0.08548571914434433, -1.0263

Files already downloaded and verified


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


These results will also have been stored to `eval.json` in the classifier's log directory (`path`) if we want to process them further (e.g. to compare many runs):

In [11]:
# Read the stored evaluation metrics back from the run directory and print them.
eval_results = json.loads((path / "eval.json").read_text())
print(eval_results)

[{'test/loss': 1.8737095594406128, 'test/acc_step': 0.3100000023841858, 'test/acc_epoch': 0.3100000023841858}]


# Training a backdoor detector
We'll train a very simple detector using the Mahalanobis distance:

In [12]:
# Construct the abstraction-based detector config with no arguments so that
# the notebook displays its default settings.
detectors.AbstractionDetectorConfig()

AbstractionDetectorConfig(train=TrainConfig(num_epochs=10, batch_size=128, max_batch_size=2048, optimizer=OptimizerConfig(name='adam', lr=0.001), num_workers=0, pin_memory=True, max_steps=-1, check_val_every_n_epoch=1, pbar=False, log_every_n_steps=None, wandb=False, devices='auto', accelerator='auto', precision=32, monitor_device_stats=False, profiler=None), abstraction=LocallyConsistentAbstractionConfig(size_reduction=4))

In [None]:
# We pass in the path of the trained classifier, as well as what backdoor
# to use. The backdoor is the same one we used for training in this case,
# we could also have stored that.
detection_task = tasks.BackdoorDetection(
    path=path,
    backdoor=Backdoor(),
)

detector_config = scripts.TrainDetectorConfig(
    # NOTE(review): the output directory is named "lca" although the
    # Mahalanobis detector is the one configured below -- confirm the name.
    path=path / "lca",
    task=detection_task,
    detector=detectors.MahalanobisConfig(),
    # Alternative detector, kept for reference:
    # detector=detectors.AbstractionDetectorConfig(
    #     train=utils.TrainConfig(
    #         num_workers=4,
    #         num_epochs=10,
    #         optimizer=utils.OptimizerConfig(
    #             lr=0.001,
    #         ),
    #     )
    # ),
)

scripts.train_detector(detector_config)

[32m2024-02-21 18:00:55.667[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m115[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-02-21 18:00:55.669[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 18:00:55.677[0m | [34m[1mDEBUG   [0m | [36mcupbearer.utils.scripts[0m:[36mload_config[0m:[36m55[0m - [34m[1mLoading config 'train_data' from logs/ResnetConfig/CIFAR10/WanetBackdoor[0m
[32m2024-02-21 18:00:55.697[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 18:00:55.698[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m


Files already downloaded and verified


[32m2024-02-21 18:00:56.445[0m | [34m[1mDEBUG   [0m | [36mcupbearer.utils.scripts[0m:[36mload_config[0m:[36m55[0m - [34m[1mLoading config 'model' from logs/ResnetConfig/CIFAR10/WanetBackdoor[0m
[32m2024-02-21 18:00:56.465[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-02-21 18:00:56.466[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m131[0m - [34m[1mSetting new control grid for warping field.[0m
  0%|          | 0/13 [00:00<?, ?it/s]

As we can see, this was a trivial detection task. As an ablation, we can test whether the detector specifically flags backdoored inputs as anomalous, or just anything out of distribution:

In [None]:
# Ablation: score noisy-but-clean inputs against backdoored inputs with the
# detector trained above. Everything refers to this notebook's own run
# (`path`, `Dataset`, `Backdoor`) rather than to a separate hard-coded demo
# directory and dataset.
scripts.eval_detector(
    scripts.EvalDetectorConfig(
        # Directory the detector was saved to by the training cell above.
        path=path / "lca",
        task=tasks.CustomTask(
            # TODO: this won't actually be used, plausibly Tasks should be split better
            # into their training and test data.
            train_data=Dataset(),
            # Our anomalous data is the backdoor data from above, except we use the
            # test split.
            # NOTE(review): a freshly constructed Backdoor() may generate a new
            # warping field instead of the one stored during training -- confirm
            # that the stored control grid is used here.
            anomalous_data=data.BackdoorData(
                original=Dataset(train=False),
                backdoor=Backdoor(),
            ),
            # Our normal data is the test split with added noise, this makes the
            # images OOD but they shouldn't be mechanistically anomalous.
            # NOTE(review): assumes Dataset accepts `transforms` in this form -- confirm.
            normal_test_data=Dataset(
                train=False,
                transforms={
                    "to_tensor": data.ToTensor(),
                    "noise": data.GaussianNoise(0.3),
                },
            ),
            # The classifier trained at the top of this notebook.
            model=models.StoredModel(path),
        ),
    )
)

As we can see, adding noise did make the images quite a bit more "anomalous" according to our detector (the blue histogram has shifted to the right, towards higher anomaly scores). But we still have a very clear separation between these "merely noisy" inputs and the backdoored inputs. (This backdoor is very easy to detect.)