In [12]:
import matplotlib.pyplot as plt
# from mpl_toolkits import mplot3d  # only needed on matplotlib < 3.2, where this import registers the '3d' projection
import pprint
import json
from art.defences.detector.poison import ActivationDefence
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification
)
from data import SNLIDataset, DavidsonDataset
from main import load_args
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
import numpy as np
import logging
logger = logging.getLogger(__name__)
import sys
import os
import yaml
import argparse
from transformers.training_args import TrainingArguments


In [13]:
class ChenActivations(ActivationDefence):
    """
    Activation-clustering poison detector that also works with Hugging Face sequence classifiers.

    Two kinds of ``classifier`` are handled:

    * objects exposing ``layer_names`` and ``get_activations`` -- the activations of the last
      listed layer are requested from the classifier itself;
    * ``transformers`` models whose ``base_model_prefix`` is ``'bert'`` or ``'roberta'`` -- the
      features that are fed to the classification head are extracted batch by batch.

    :param classifier: Model whose activations are clustered.
    :param x_train: Training examples. For transformer models every element must expose
                    ``input_ids`` and ``attention_mask``.
    :param y_train: Training labels, either class indices or one-hot rows.
    :param batch_size: Number of examples pushed through the model per forward pass.
    :param activations_path: File the extracted activations are written to with ``torch.save``.
                             Pass ``None`` to skip writing. The default is the path that used to
                             be hard-coded, so existing callers behave as before.
    """

    #: Location the activations were historically dumped to; kept as the default for compatibility.
    DEFAULT_ACTIVATIONS_PATH = '/scratch/groups/nms_cdt_ai/RobuSTAI/chen/bert_ACTIVATIONS.pt'

    def __init__(self, classifier, x_train, y_train, batch_size = 64,
                 activations_path: Optional[str] = DEFAULT_ACTIVATIONS_PATH):
        super().__init__(classifier, x_train, y_train)
        self.batch_size = batch_size
        self.activations_path = activations_path

    @staticmethod
    def _to_class_indices(labels) -> np.ndarray:
        """Return a 1-D integer array of class indices from index or one-hot labels."""
        labels = np.asarray(labels)
        if labels.ndim > 1 and labels.shape[1] > 1:
            # One row per sample with one column per class: pick the class per row.
            return labels.argmax(axis=1)
        return labels.reshape(-1).astype(int)

    def _num_classes(self) -> int:
        """
        Return the number of classes of the wrapped classifier.

        ``nb_classes`` is tried first and ``num_labels`` second, because the Hugging Face model
        used in this notebook exposes the latter only.

        :raises ValueError: If the classifier exposes neither attribute.
        """
        for attribute in ("nb_classes", "num_labels"):
            value = getattr(self.classifier, attribute, None)
            if value is not None:
                return int(value)
        raise ValueError("Classifier exposes neither `nb_classes` nor `num_labels`.")

    def _segment_by_class(self, data, features) -> List[np.ndarray]:
        """
        Split ``data`` into one array per class according to the labels in ``features``.

        Defined here (rather than relying on the inherited version) so that classifiers without
        ``nb_classes`` and plain integer labels are supported.

        :param data: Per-sample values (activations, clean flags, ...), indexed like ``features``.
        :param features: Labels, either class indices or one-hot rows.
        :return: List with one array per class, in class-index order.
        """
        class_indices = self._to_class_indices(features)
        data = np.asarray(data)
        return [data[class_indices == class_idx] for class_idx in range(self._num_classes())]

    def _get_art_activations(self, x_train: Optional[np.ndarray]) -> np.ndarray:
        """Ask a classifier that exposes ``layer_names`` for the activations of its last layer."""
        protected_layer = len(self.classifier.layer_names) - 1
        if self.generator is not None:
            return self.classifier.get_activations(
                x_train, layer=protected_layer, batch_size=self.generator.batch_size
            )
        return self.classifier.get_activations(
            self.x_train, layer=protected_layer, batch_size=self.batch_size
        )

    def _get_transformer_activations(self) -> np.ndarray:
        """
        Extract the classification-head inputs of a BERT or RoBERTa sequence classifier.

        :return: Array with one row per training sample.
        :raises NotImplementedError: For any other architecture, or when a data generator is set.
        """
        import torch
        from tqdm import tqdm

        model = self.classifier
        prefix = model.base_model_prefix

        if prefix == 'bert':
            hidden_size = model.classifier.in_features  # BERT linear classifier

            def encode(batch):
                return model.bert(**batch).pooler_output
        elif prefix == 'roberta':
            hidden_size = model.classifier.dense.in_features  # RoBERTa dense classifier

            def encode(batch):
                # Hidden state of the first token of every sequence.
                return model.roberta(**batch)[0][:, 0, :]
        else:
            raise NotImplementedError('Transformer architecture not supported')

        if self.generator is not None:
            raise NotImplementedError('Data generators are not supported for transformer models')

        self.y_train_sparse = self._to_class_indices(self.y_train)

        # Materialise once so every batch is a cheap slice instead of a pass over the whole set.
        examples = list(self.x_train)
        nb_samples = len(self.y_train)
        activations = np.zeros((nb_samples, hidden_size))

        # Inputs have to live on the same device as the model weights.
        device = next(model.parameters()).device

        # Run in eval mode without autograd bookkeeping, then restore the previous mode.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                for begin in tqdm(range(0, nb_samples, self.batch_size),
                                  desc=f'Extracting activations from {prefix}'):
                    end = min(begin + self.batch_size, nb_samples)
                    batch = examples[begin:end]
                    inputs = dict(
                        input_ids=torch.tensor([ex.input_ids for ex in batch], device=device),
                        attention_mask=torch.tensor([ex.attention_mask for ex in batch], device=device),
                    )
                    activations[begin:end] = encode(inputs).detach().cpu().numpy()
        finally:
            model.train(was_training)

        return activations

    def _get_activations(self, x_train: Optional[np.ndarray] = None) -> np.ndarray:
        """
        Compute the activations that are later clustered.

        :param x_train: Batch of samples; only used when a data generator is configured.
        :return: 2-D array of activations, one row per sample.
        :raises ValueError: If the classifier exposes neither ``layer_names`` nor
                            ``base_model_prefix``.
        :raises NotImplementedError: If the transformer architecture is not supported.
        """
        logger.info("Getting activations")

        if getattr(self.classifier, "layer_names", None) is not None:
            activations = self._get_art_activations(x_train)
        elif hasattr(self.classifier, "base_model_prefix"):
            activations = self._get_transformer_activations()
        else:
            raise ValueError("No layer names identified.")

        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning(
                "Number of activations in last hidden layer is too small. Method may not work properly. " "Size: %s",
                str(nodes_last_layer),
            )

        if self.activations_path:
            # Imported here because only this optional dump needs torch on every code path.
            import torch

            torch.save(activations, self.activations_path)
        return activations

    def evaluate_defence(self, is_clean: np.ndarray, **kwargs) -> str:
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        :raises ValueError: If ``is_clean`` is ``None`` or empty.
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        if not self.activations_by_class and self.generator is None:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        (self.clusters_by_class, self.red_activations_by_class,) = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        if self.generator is not None:
            batch_size = self.generator.batch_size
            num_samples = self.generator.size
            num_classes = self._num_classes()
            self.is_clean_by_class = [np.empty(0, dtype=int) for _ in range(num_classes)]

            # calculate is_clean_by_class for each batch
            for batch_idx in range(num_samples // batch_size):  # type: ignore
                _, y_batch = self.generator.get_batch()
                is_clean_batch = is_clean[batch_idx * batch_size : batch_idx * batch_size + batch_size]
                clean_by_class_batch = self._segment_by_class(is_clean_batch, y_batch)
                self.is_clean_by_class = [
                    np.append(self.is_clean_by_class[class_idx], clean_by_class_batch[class_idx])
                    for class_idx in range(num_classes)
                ]

        else:
            self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class
        )
        return conf_matrix_json


In [14]:
def _dir_empty_or_nonexistent(path):
    """Return True when ``path`` does not exist or is a directory without entries."""
    if not os.path.exists(path):
        return True
    return os.path.isdir(path) and not os.listdir(path)


def load_args(config_path='config/chen.yaml'):
    """ Load args and run some basic checks.
        Args loaded from:
        - Huggingface transformers training args (defaults for using their model)
        - Manual args from .yaml file

        NOTE: this definition shadows the `load_args` imported from `main` at the
        top of the notebook.

        :param config_path: YAML file holding the manual args; it must define `output_dir`.
        :return: `TrainingArguments` instance with every manual arg set as an attribute.
        :raises AssertionError: If training is requested into a non-empty output directory
                                whose path does not contain 'tmp'.
    """
    # Load args from file
    with open(config_path, 'r') as f:
        manual_args = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader))

    args = TrainingArguments(output_dir=manual_args.output_dir)
    for name, value in vars(manual_args).items():
        try:
            setattr(args, name, value)
        except AttributeError:
            # Keep loading on a best-effort basis, but do not hide the skipped key.
            logger.warning("Could not set argument '%s' from %s; keeping the default.", name, config_path)

    if args.do_train and 'tmp' not in args.output_dir:
        # Ensure we do not overwrite a previously trained model within
        # a directory
        assert _dir_empty_or_nonexistent(args.output_dir), (
            f"Directory exists and not empty:\t{args.output_dir}")

    if args.do_predict and not args.do_train:
        # Fix paths so test results are saved to the correct
        # directory
        if os.path.isdir(args.model_name_or_dir):
            args.output_dir = os.path.join(args.model_name_or_dir, 'test_results')
            os.makedirs(args.output_dir, exist_ok=True)

    if args.load_best_model_at_end:
        # Dump args; makedirs also creates missing parent directories.
        os.makedirs(args.output_dir, exist_ok=True)
        with open(os.path.join(args.output_dir, 'user_args.yaml'), 'w') as f:
            yaml.dump(manual_args.__dict__, f)
        with open(os.path.join(args.output_dir, 'all_args.yaml'), 'w') as f:
            yaml.dump(args.__dict__, f)

    return args

In [17]:
# Resolve the run configuration and the dataset wrapper matching the configured task.
args = load_args()
if args.task == 'snli':
    dataset = SNLIDataset
else:
    dataset = DavidsonDataset

# Tokenizer and classifier are both loaded from the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_dir)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name_or_dir,
    num_labels=3,
)

# Init dataset
train = dataset(args, 'train', tokenizer)

# Collect the training examples and their labels as two parallel lists.
x_train, y_train = [], []
for index, element in enumerate(train.data):
    sample = train.data[index]
    x_train.append(sample)
    y_train.append(sample.labels)

In [16]:
# NOTE: the below code is primarily taken from
# https://github.com/Trusted-AI/adversarial-robustness-toolbox/blob/c311a4b26f16fc17487ad35e143b88a15d9df8e6/notebooks/poisoning_defense_activation_clustering.ipynb

# Detect Poison Using Activation Defence
#defence = ActivationDefence(model, x_train, y_train)
defence = ChenActivations(model, x_train, y_train)
report, is_clean_lst = defence.detect_poison(nb_clusters=2,
                                             nb_dims=3,
                                             reduce="PCA")

print("Analysis completed. Report:")
# Print through the configured printer; previously it was created but never used.
pp = pprint.PrettyPrinter(indent=10)
pp.pprint(report)


TrainingArguments(output_dir=/scratch/groups/nms_cdt_ai/RobuSTAI/chen/detection1, overwrite_output_dir=False, do_train=False, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Apr13_12-14-26_login4.pri.rosalind2.alces.network, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=/scratch/groups/nms_cdt_ai/RobuSTAI/chen/detection1,

In [None]:
# Evaluate Defense
# Evaluate method when ground truth is known:
print("------------------- Results using size metric -------------------")
# evaluate_defence reads `is_clean.size` and splits the flags per class, so it needs one
# flag per training sample rather than a single scalar.
# Placeholder ground truth: every sample is marked as not poisoned (the meaning of the
# former scalar 0). TODO: replace with the real poison mask of the training set.
is_poison_train = np.zeros(len(y_train), dtype=int)
is_clean = (is_poison_train == 0)
confusion_matrix = defence.evaluate_defence(is_clean)

jsonObject = json.loads(confusion_matrix)
for label in jsonObject:
    print(label)
    pprint.pprint(jsonObject[label])

In [None]:
# Visualize Activations
# Get clustering and reduce activations to 3 dimensions using PCA
[clusters_by_class, _] = defence.cluster_activations()
# The parameter is called `nb_dims` (as passed to detect_poison above), not `ndims`.
defence.set_params(**{'nb_dims': 3})
[_, red_activations_by_class] = defence.cluster_activations()

c=0
red_activations = np.asarray(red_activations_by_class[c])
clusters = np.asarray(clusters_by_class[c])
fig = plt.figure()
ax = plt.axes(projection='3d')
colors=["#0000FF", "#00FF00"]
if red_activations.size:
    # One scatter call for all points instead of one artist per point.
    point_colors = [colors[cluster] for cluster in clusters]
    ax.scatter3D(red_activations[:, 0], red_activations[:, 1], red_activations[:, 2], c=point_colors)
ax.set_title(f"Class {c}: reduced activations coloured by cluster")
ax.set_xlabel("component 1")
ax.set_ylabel("component 2")
ax.set_zlabel("component 3");
