In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import json
import os
from pathlib import Path

from cupbearer import data, detectors, models, scripts, tasks, utils
from lightning.pytorch import loggers
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from tensorboard import notebook
import torch
import submitit

  from .autonotebook import tqdm as notebook_tqdm


# Training a backdoored classifier
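First, we train classifiers on poisoned data. Each combination of dataset, model architecture and backdoor is submitted as a separate Slurm job via `submitit`: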

In [3]:
# Slurm executor shared by all submissions in this notebook; submitit stores
# its submission files, outputs and logs in the "runs" folder.
SLURM_TIME_LIMIT_MINUTES = 2 * 24 * 60  # two days, expressed in minutes

executor = submitit.SlurmExecutor(folder="runs")
executor.update_parameters(
    **{
        "account": "NAISS2023-22-1064",
        "gpus_per_node": "A100:1",
        "time": SLURM_TIME_LIMIT_MINUTES,
    }
)

# Handles of submitted jobs; the submission cell appends to this list.
jobs = list()

In [4]:
# Debugging aid: prints the SlurmExecutor documentation. Not required for the
# experiment itself and safe to drop from the final notebook.
help(executor)

Help on SlurmExecutor in module submitit.slurm.slurm object:

class SlurmExecutor(submitit.core.core.PicklingExecutor)
 |  SlurmExecutor(folder: Union[pathlib.Path, str], max_num_timeout: int = 3, python: Optional[str] = None) -> None
 |  
 |  Slurm job executor
 |  This class is used to hold the parameters to run a job on slurm.
 |  In practice, it will create a batch file in the specified directory for each job,
 |  and pickle the task function and parameters. At completion, the job will also pickle
 |  the output. Logs are also dumped in the same directory.
 |  
 |  Parameters
 |  ----------
 |  folder: Path/str
 |      folder for storing job submission/output and logs.
 |  max_num_timeout: int
 |      Maximum number of time the job can be requeued after timeout (if
 |      the instance is derived from helpers.Checkpointable)
 |  python: Optional[str]
 |      Command to launch python. This allow to use singularity for example.
 |      Caller is responsible to provide a valid shell c

In [5]:
# Scratch cell for exploring the `cupbearer.data` namespace.
# The import stays so that `partial` remains available in the notebook namespace.
from functools import partial

# Instantiating the dataset is the only visible effect of this line: the
# attribute value is discarded because it is not the cell's last expression.
data.CIFAR10().transforms

# Displayed as the cell output: every name exposed by the `data` module.
vars(data).keys()

Files already downloaded and verified


dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__path__', '__file__', '__cached__', '__builtins__', 'transforms', '_shared', 'MixedData', 'TransformDataset', 'adversarial', 'AdversarialExampleDataset', 'make_adversarial_examples', 'backdoors', 'Backdoor', 'BackdoorDataset', 'CornerPixelBackdoor', 'NoiseBackdoor', 'WanetBackdoor', 'huggingface', 'IMDBDataset', 'pytorch', 'CIFAR10', 'GTSRB', 'MNIST', 'PytorchDataset', 'tampering', 'TamperingDataset', 'toy_ambiguous_features', 'ToyDataset', 'GaussianNoise', 'RandomCrop', 'RandomHorizontalFlip', 'RandomRotation', 'Resize', 'ToTensor', 'Transform'])

In [6]:
# Train data
def get_dataloaders(
    Dataset: type,
    backdoor: data.Backdoor,
    *,
    train_batch_size: int = 128,
    val_batch_size: int = 2048,
    train_num_workers: int = 12,
    val_num_workers: int = 1,
) -> tuple[
    torch.utils.data.Dataset,
    torch.utils.data.DataLoader,
    dict[str, torch.utils.data.DataLoader],
]:
    """Build the poisoned training loader and the named validation loaders.

    Args:
        Dataset: Dataset class; it is called as ``Dataset(train=True)`` and
            ``Dataset(train=False)``.
        backdoor: Backdoor applied to the training split.
        train_batch_size: Batch size of the training loader.
        val_batch_size: Batch size of every validation loader.
        train_num_workers: Worker processes of the training loader.
        val_num_workers: Worker processes of each validation loader.

    Returns:
        A tuple ``(dataset, train_loader, val_loaders)`` where ``dataset`` is
        the ``BackdoorDataset`` wrapping the training split and ``val_loaders``
        maps a split name to its loader:

        * ``"clean"``: the unmodified test split,
        * ``"backdoor"``: the test split with ``p_backdoor=1``,
        * ``"noisy"``: WaNet backdoors only, the test split with
          ``p_backdoor=0, p_noise=1``.
    """
    dataset = data.BackdoorDataset(
        original=Dataset(train=True),
        backdoor=backdoor,
    )
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=train_batch_size,
        shuffle=True,
        num_workers=train_num_workers,
        # DataLoader rejects persistent_workers=True when there are no workers.
        persistent_workers=train_num_workers > 0,
        pin_memory=True,
    )

    # isinstance (rather than an exact type comparison) so that subclasses of
    # WanetBackdoor are cloned as well instead of being re-instantiated.
    is_wanet = isinstance(backdoor, data.WanetBackdoor)
    if is_wanet:
        # Cloning derives the evaluation backdoor from the training instance
        # (the job logs show the clone receiving the instance's control grid).
        always_backdoor = backdoor.clone(p_backdoor=1, p_noise=0)
    else:
        # NOTE(review): only `p_backdoor` is passed here, so any other
        # constructor argument of the training backdoor falls back to its
        # default -- confirm this is intended.
        always_backdoor = type(backdoor)(p_backdoor=1)

    val_data = {
        "clean": Dataset(train=False),
        "backdoor": data.BackdoorDataset(
            original=Dataset(train=False),
            backdoor=always_backdoor,
        ),
    }
    if is_wanet:
        val_data["noisy"] = data.BackdoorDataset(
            original=Dataset(train=False),
            backdoor=backdoor.clone(p_backdoor=0, p_noise=1),
        )

    val_loaders = {
        name: torch.utils.data.DataLoader(
            split,
            batch_size=val_batch_size,
            shuffle=False,
            num_workers=val_num_workers,
            persistent_workers=val_num_workers > 0,
        )
        for name, split in val_data.items()
    }

    return dataset, train_loader, val_loaders

class WanetClassifier(scripts._shared.Classifier):
    """Classifier that is optimized with SGD and a multi-step learning-rate decay."""

    def configure_optimizers(self):
        """Create the optimizer and its learning-rate scheduler.

        Returns:
            A dict with the keys ``"optimizer"`` (SGD over ``self.parameters()``
            starting at ``self.lr``) and ``"lr_scheduler"`` (a ``MultiStepLR``
            that multiplies the learning rate by 0.1 at each milestone).
        """
        sgd = torch.optim.SGD(
            self.parameters(), lr=self.lr, momentum=0.9, weight_decay=5e-4
        )
        # Scheduler steps at which the learning rate is reduced.
        milestones = [100, 200, 300, 400]
        schedule = torch.optim.lr_scheduler.MultiStepLR(
            sgd, milestones=milestones, gamma=0.1
        )
        return dict(optimizer=sgd, lr_scheduler=schedule)

In [7]:
def train_classifier(
    model: torch.nn.Module,
    Dataset: type,
    backdoor: data.Backdoor,
    max_epochs: "int | None" = None,
):
    """Train ``model`` on a backdoored version of ``Dataset`` and store the run.

    All artifacts are written below
    ``logs/<model class>/<dataset class>/<backdoor class>/<run id>``, where the
    run id is the ``SLURM_JOB_ID`` environment variable or, when it is unset,
    the current timestamp in ISO format.

    Args:
        model: Network wrapped in a ``WanetClassifier``.
        Dataset: Dataset class forwarded to ``get_dataloaders``.
        backdoor: Backdoor applied to the training data.
        max_epochs: Number of training epochs. When ``None`` the notebook-level
            global ``num_epochs`` is used, which keeps existing calls working.

    Side effects:
        Writes TensorBoard logs (sub-directory ``tensorboard``), the last model
        checkpoint (sub-directory ``checkpoints``) and, if the backdoor offers a
        ``store`` method, the backdoor state into the run directory.
    """
    import warnings

    torch.set_float32_matmul_precision("high")

    if max_epochs is None:
        # Backward-compatible fallback to the global defined in the submission
        # cell; pass `max_epochs` explicitly to avoid this hidden dependency.
        max_epochs = num_epochs

    path = Path(
        f"logs/{type(model).__name__}/{Dataset.__name__}/{type(backdoor).__name__}"
    )
    cfg_path = path / os.getenv("SLURM_JOB_ID", datetime.datetime.now().isoformat())

    dataset, train_loader, val_loaders = get_dataloaders(Dataset, backdoor)

    classifier = WanetClassifier(
        model=model,
        lr=0.01,
        num_classes=dataset.original.num_classes,
        val_loader_names=list(val_loaders.keys()),
    )

    metrics_logger = loggers.TensorBoardLogger(
        save_dir=cfg_path,
        name="",
        version="",
        sub_dir="tensorboard",
    )

    # TODO: once we do longer training runs we'll want to have multiple
    # checkpoints, potentially based on validation loss
    callbacks = [
        ModelCheckpoint(
            dirpath=cfg_path / "checkpoints",
            save_last=True,
        ),
    ]

    trainer = Trainer(
        max_epochs=max_epochs,
        enable_progress_bar=False,
        callbacks=callbacks,
        logger=metrics_logger,
        default_root_dir=cfg_path,
        check_val_every_n_epoch=5,
        accelerator="auto",
        devices="auto",
    )
    trainer.fit(
        model=classifier,
        train_dataloaders=train_loader,
        # Same order as `val_loader_names` above, as both come from `val_loaders`.
        val_dataloaders=list(val_loaders.values()),
    )

    # Persisting the backdoor is best effort: backdoors without a `store`
    # method are skipped silently, as before.
    store = getattr(getattr(dataset, "backdoor", None), "store", None)
    if store is None:
        return
    try:
        store(cfg_path)
    except AttributeError as err:
        # Previously swallowed without a trace; keep the run alive but make the
        # failure visible in the job's stderr.
        warnings.warn(
            f"Could not store backdoor state in {cfg_path}: {err}",
            stacklevel=2,
        )

In [8]:
# Number of training epochs. `train_classifier` reads this as a global, so it
# must be defined before any job is submitted.
num_epochs = 1000

# How many times the full grid below is submitted.
n_replicas = 1

# Submit one training job per (dataset, model, backdoor) combination. Inside
# `executor.batch()` the submissions are collected and sent together (the job
# ids take the form "<array id>_<index>").
# NOTE: re-running this cell appends another set of jobs to `jobs`.
with executor.batch():
    for _ in range(n_replicas):
        for Dataset in [
            data.MNIST,
            data.GTSRB,
            data.CIFAR10,
        ]:
            # NOTE(review): assumes the MNIST dataset yields 3-channel 28x28
            # images and the other datasets 3-channel 32x32 images -- confirm.
            input_shape = (3, 28, 28) if Dataset == data.MNIST else (3, 32, 32)
            for model in [
                models.MLP(
                    input_shape=input_shape,
                    hidden_dims=[128, 128, 128],
                    output_dim=Dataset.num_classes,
                ),
                models.CNN(
                    input_shape=input_shape,
                    channels=[16, 32, 32, 64],
                    dense_dims=[128],
                    output_dim=Dataset.num_classes,
                ),
                models.PreActResNet(
                    block=models.models.PreActBlock,
                    num_blocks=[2, 2, 2, 2],
                    num_classes=Dataset.num_classes,
                ),
            ]:
                for backdoor in [
                    data.CornerPixelBackdoor(p_backdoor=0.1),
                    data.NoiseBackdoor(p_backdoor=0.1),
                    data.WanetBackdoor(path=None, p_backdoor=0.1, p_noise=0.2),
                ]:
                    job = executor.submit(
                        train_classifier,
                        model=model,
                        Dataset=Dataset,
                        backdoor=backdoor,
                    )
                    jobs.append(job)

[32m2024-05-16 12:14:59.683[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m140[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-05-16 12:14:59.685[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m163[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-05-16 12:14:59.764[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m140[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-05-16 12:14:59.765[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m163[0m - [34m[1mSetting new control grid for warping field.[0m
[32m2024-05-16 12:14:59.766[0m | [34m[1mDEBUG   [0m | [36mcupbearer.data.backdoors[0m:[36mcontrol_grid[0m:[36m140[0m - [34m[1mGenerating new control grid for warping field.[0m
[32m2024-05-16 12:14:59.767[0m | [34m[1mDEBUG   [0m

In [20]:
# Print one status line per submitted job.
# NOTE: after the loop, `job` stays bound to the last element of `jobs`.
for job in jobs:
    print(job)

SlurmJob<job_id=2349069_0, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_1, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_2, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_3, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_4, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_5, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_6, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_7, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_8, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_9, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_10, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_11, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_12, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_13, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_14, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_15, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_16, task_id=0, state="COMPLETED">
SlurmJob<job_id=2349069_

In [21]:
# Pick a single job whose logs are inspected in the following cells.
# NOTE(review): the index -4 depends on the order in which jobs were appended
# to `jobs` -- confirm which configuration it refers to.
job = jobs[-4]

In [22]:
# Show the captured standard output of the selected job.
print(job.stdout())

This job can be monitored from: https://scruffy.c3se.chalmers.se/d/alvis-job/alvis-job?var-jobid=2349223&from=1715858707000
submitit INFO (2024-05-16 13:25:08,346) - Starting with JobEnvironment(job_id=2349069_23, hostname=alvis3-18, local_rank=0(1), node=0(1), global_rank=0(1))
submitit INFO (2024-05-16 13:25:08,347) - Loading pickle: /mimer/NOBACKUP/groups/ml-safety/vikren/mad/cupbearer/runs/2349069_23_submitted.pkl
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
submitit INFO (2024-05-16 14:14:26,032) - Job completed successfully
submitit INFO (2024-05-16 14:14:26,033) - Exiting after successful completion



In [23]:
# Show the captured standard error of the selected job.
print(job.stderr())

2024-05-16 13:25:14.846 | DEBUG    | cupbearer.data.backdoors:control_grid:140 - Generating new control grid for warping field.
2024-05-16 13:25:14.859 | DEBUG    | cupbearer.data.backdoors:control_grid:163 - Setting new control grid for warping field.
2024-05-16 13:25:14.859 | DEBUG    | cupbearer.data.backdoors:clone:197 - Setting control grid of clone from instance.
2024-05-16 13:25:14.859 | DEBUG    | cupbearer.data.backdoors:control_grid:163 - Setting new control grid for warping field.
2024-05-16 13:25:15.537 | DEBUG    | cupbearer.data.backdoors:control_grid:140 - Generating new control grid for warping field.
2024-05-16 13:25:15.538 | DEBUG    | cupbearer.data.backdoors:control_grid:163 - Setting new control grid for warping field.
2024-05-16 13:25:15.538 | DEBUG    | cupbearer.data.backdoors:clone:197 - Setting control grid of clone from instance.
2024-05-16 13:25:15.538 | DEBUG    | cupbearer.data.backdoors:control_grid:163 - Setting new control grid for warping field.
GPU av