# Task 3: Adaptive attack

## Dependencies

In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import os
import copy

### If you are using Google Colab, you need to upload this notebook and the codebase to your Google Drive. Then you need to mount your Google Drive in Colab and set your working directory. If you are running on your local machine, you can ignore the following line.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
root_dir = "/content/drive/My Drive/"
project_dir = "Assignment2" # Change to your path
os.chdir(root_dir + project_dir)

In [4]:
# Make sure the path is correct
!ls

 attack.py
 CS5562_Assignment_2_Task1.ipynb
 CS5562_Assignment_2_Task2.ipynb
 CS5562_Assignment_2_Task3.ipynb
 CS5562_Assignment_2_Task4.ipynb
 CS5562_Assignment_2_Warm_ups.ipynb
 dataset
 defense.py
'[DISCARDED]CS5562_Assignment_2_Task1.ipynb'
'[DISCARDED]CS5562_Assignment_2_Task2.ipynb'
'[DISCARDED]CS5562_Assignment_2_Task3.ipynb'
 environment.yml
 model.py
 __pycache__
 utilities.py


## Implement adaptive attacks

In [5]:
from utilities import *
from attack import Attack
from tqdm import tqdm
from model import Undefended_Model

import cvxpy as cp

In [6]:
class Adaptive_Attack(Attack):
    """
          adaptive attack: students implement their own adaptive attack here
    """
    def attack(self, eps):
        n_poison = int(eps * len(self.clean_dataset))

        ####################
        # TODO: update the following part to build your attack model based on KKT attack

        # Find decoy parameters theta_decoy
        svm_clean = copy.deepcopy(self.target_model)
        svm_clean.train(self.clean_dataset)

        test_data_flip = dataset(self.test_dataset.X, self.test_dataset.Y*(-1))
        test_loss = svm_clean.individual_loss(test_data_flip)
        gamma = np.percentile(test_loss, 50, axis=0)
        index = np.where(test_loss > gamma)[0]
        x, y = test_data_flip[index]

        # Changed value of repeat (r) from [10 - 20] and chose 20 empirically
        repeats = [20][0]
        x_flip = np.tile(x, (repeats, 1))
        y_flip = np.tile(y, repeats)

        D_flip = dataset(x_flip, y_flip)
        D_decoy = combine_datset(self.clean_dataset, D_flip)

        base_model = load_model("svm", "mnist_17")
        svm_decoy = Undefended_Model(base_model,"svm")
        svm_decoy.train(D_decoy)
        print("Decoy theta found!")

        # Grid search
        def hinge_grad(data, model):
          w = model.coef_[0]
          b = model.intercept_
          X, Y = data.X, data.Y
          grad = 0

          for (x_, y_) in zip(X, Y):
              v = y_ * (np.dot(w, x_) + b)
              grad += 0 if v > 1 else -y_ * x_
          return grad / X[0].shape

        def optimization(n_features, eps_pos, eps_neg, g_decoy, model):
          w = model.coef_
          b = model.intercept_
          x_pos = cp.Variable(n_features)
          x_neg = cp.Variable(n_features)

          error = g_decoy - cp.multiply(eps_pos, x_pos) + cp.multiply(eps_neg, x_neg)
          obj = cp.Minimize(cp.sum_squares(error))
          constraints = [
              1 - (w@x_pos + b) >= 0,
              1 + (w@x_neg + b) >= 0
          ]

          prob = cp.Problem(obj, constraints)
          prob.solve()
          x_pos = np.array(x_pos.value)
          x_neg = np.array(x_neg.value)

          return x_pos, x_neg

        print("Finding attack points")
        T = 5
        optimal_params = None
        g_decoy = hinge_grad(self.clean_dataset, svm_decoy.model)

        for t in tqdm(range(1, T-1)):
          eps_pos = t*eps / T
          eps_neg = eps - eps_pos

          # Get x_pos, x_neg
          features = self.clean_dataset.X[0].shape
          x_pos, x_neg = optimization(features, eps_pos, eps_neg, g_decoy, svm_decoy.model)

          # Create D_poison
          T = 800
          n_pos = T//2 - 1
          n_neg = T//2 - 1

          x = np.concatenate((np.tile(x_pos, (n_pos, 1)), np.tile(x_neg, (n_neg, 1))))
          y = np.concatenate((np.ones(n_pos), (-1)*np.ones(n_neg)))
          D_poison = dataset(x, y)
          assert len(D_poison) <= n_poison

          # Train svm
          D_combine = combine_datset(self.clean_dataset, D_poison)
          svm_new = copy.deepcopy(self.target_model)
          svm_new.train(D_combine)

          # Get svm, D_poison with highest test loss
          test_loss = svm_new.score(self.test_dataset)[0]

          if optimal_params is None or optimal_params['loss'] < test_loss:
            optimal_params = {
                'x_poison': x,
                'y_poison': y,
                'loss': test_loss,
                'eps_pos': eps_pos,
                'eps_neg': eps_neg
                }
          print(optimal_params['loss'], optimal_params['eps_pos'], optimal_params['eps_neg'])

        print("Length of D_poison:" , len(D_poison))
        print("Length allowed to poison: ", n_poison)

        X_modified = optimal_params['x_poison']
        Y_modified = optimal_params['y_poison']

        ####################
        return dataset(X_modified, Y_modified)

# Test your code

## Copy and Paste your data sanitizer from Task 2 here:

In [7]:
def data_sanitizer(training_data, estimate_eps):
    """
       Removes the estimate_eps fraction of points from X and Y.
    """

    n_est_poisoned = int(estimate_eps * len(training_data))

    #################
    # TODO: decide which points need to be deleted

    def calc_dist_to_centroid(training_data):
      class_map = {-1: 0, 1: 1}
      X = training_data.X
      Y = training_data.Y
      num_classes = len(set(Y))
      num_features = X.shape[1]

      centroids = np.zeros((num_classes, num_features))
      dis_to_centro = np.zeros(len(training_data))

      for y in set(Y):
          centroids[class_map[y], :] = np.median(X[Y == y, :], axis=0)

      for i in range(len(training_data)):
          dis_to_centro[i] = np.linalg.norm(X[i]-centroids[class_map[Y[i]]])

      return dis_to_centro

    # Distances of whole training data
    distances = calc_dist_to_centroid(training_data)

    # Distances after deletion
    index, ind1, ind2 = [], [], []
    threshold = 400
    values, counts = np.unique(distances, return_counts=True)
    count1_idx, count2_idx = np.argpartition(counts, -2)[-2:]

    if counts[count1_idx] > threshold:
      dist = values[count1_idx]
      ind1 = np.where(dist == distances)[0]

    if counts[count2_idx] > threshold:
      dist = values[count2_idx]
      ind2 = np.where(dist == distances)[0]

    if len(ind1) != 0 or len(ind2) != 0:
      index = np.concatenate((ind1, ind2))

    print("No. of points sanitized: ", len(index))

    ################
    training_data_copy = copy.deepcopy(training_data)
    del training_data_copy[index]
    return training_data_copy

## Helper functions

In [8]:
from model import Model


class Data_Sanitized_Model(Model):
    def __init__(self, model, model_name, estimated_eps):
        super().__init__(model, model_name)
        self.estimated_eps = estimated_eps

    def train(self, train_dataset):
        sanitized_data = data_sanitizer(training_data=train_dataset, estimate_eps=self.estimated_eps)
        self.model.fit(sanitized_data.X, sanitized_data.Y)

In [9]:
def compute_attack_grade(attack, victim_model,eps,clean_train_dataset,test_dataset):
    # target model structure is known to the adversary
    target_model = copy.deepcopy(victim_model)
    if attack == 'KKT':
        attacker = KKT_Attack(target_model,clean_train_dataset,test_dataset)
    elif attack == 'label-flip':
        attacker = Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'adaptive':
        attacker = Adaptive_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'random-label-flip':
        attacker = Random_Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    poisoned_dataset = attacker.attack(eps)
    assert len(poisoned_dataset) <= int(eps*len(clean_train_dataset))

    train_dataset = combine_datset(clean_train_dataset,poisoned_dataset)
    clean_model = copy.deepcopy(target_model)

    # performance without any attack
    clean_model.train(clean_train_dataset)
    clean_loss,clean_acc = clean_model.score(test_dataset)
    print('\nAvg loss of clean model: %0.5f, avg classification accuracy: %0.5f'%(clean_loss,clean_acc))

    # attack the victim model
    victim_model.train(train_dataset)
    poisoned_loss,poisoned_acc =victim_model.score(test_dataset)
    print('\nAvg loss of poisoned model:%0.5f, avg classification accuracy: %0.5f'%(poisoned_loss,poisoned_acc))

    grade = poisoned_loss - clean_loss

    # # for generating figures
    # distance_to_center_diff(clean_train_dataset,poisoned_dataset)
    # loss_diff(clean_train_dataset, poisoned_dataset,clean_model)

    return len(poisoned_dataset)/len(clean_train_dataset),grade

## Testing

In [10]:
train_dataset,test_dataset = load_dataset('mnist_17')
base_model = load_model("svm", "mnist_17")
target_model = Data_Sanitized_Model(base_model,"svm", 0.2)
defense_name = 'data_sanitization'
fraction, attack_grade = compute_attack_grade("adaptive", target_model, 0.2, train_dataset, test_dataset)
print('\n\n-----------result---------')
print('%s attack against %s %s model on %s dataset: %0.2f (%0.2f fraction of poisoning data)'%("adaptive",defense_name,"svm","mnist_17",attack_grade,fraction))

No. of points sanitized:  0




Decoy theta found!
Finding attack points


  0%|          | 0/3 [00:00<?, ?it/s]

No. of points sanitized:  0


 33%|███▎      | 1/3 [00:29<00:58, 29.01s/it]

0.13875530930139723 0.04 0.16
No. of points sanitized:  0


 67%|██████▋   | 2/3 [01:09<00:35, 35.68s/it]

0.142245707594638 0.0005 0.1995
No. of points sanitized:  0


100%|██████████| 3/3 [01:48<00:00, 36.25s/it]


0.14234661206340612 0.0007500000000000001 0.19925
Length of D_poison: 798
Length allowed to poison:  2601
No. of points sanitized:  0

Avg loss of clean model: 0.01694, avg classification accuracy: 0.99260
No. of points sanitized:  0

Avg loss of poisoned model:0.14235, avg classification accuracy: 0.95747


-----------result---------
adaptive attack against data_sanitization svm model on mnist_17 dataset: 0.13 (0.06 fraction of poisoning data)




# Report

**Q.1) Please describe your adaptive attack algorithm in the report, specifying how your attack bypasses the data sanitization. You can also inform us about any difficulties you faced and how you solved them.**

My adaptive is the same KKT approach as Task 1 with 2 modifications:
1. My $\theta_{decoy}$ is calculated using an undefended SVM. This is so that my combined dataset (D_clean + D_poison) which contains a lot of poison points in order to learn the optimal $\theta_{decoy}$ does not get sanitized by my data sanitizer. As a result, my optimal $\theta_{decoy}$ is learnt.

2. I'm limiting the number of poisoned points to be less than 400 for each class. The reason for this is because the defense in Task 2 assumes that the total number of poisoned points will be greater than 800 (2 points repeated > 400 times each). 


The result of this modification is that:
1. Length of D_posion < Allowed length of D_poison --> Bypassed data sanitization completely
2. 4% decrease in performance (exactly same as Task 1).

```
Avg loss of clean model: 0.01694, avg classification accuracy: 0.99260
Avg loss of poisoned model:0.14235, avg classification accuracy: 0.95747
```

Note: I have noted through experimentatioon that since I have learnt an optimal $\theta_{decoy}$, I can add only 2 poisoned points for each class and still my performance would decrease drastically.

