# Warm-ups: Implement label flipping attacks

## Dependencies

In [None]:
import copy
import numpy as np
import os

### If you are using Google Colab, upload this notebook and the codebase to your Google Drive, then mount the Drive in Colab and set the working directory to the project folder. If you are running on your local machine, skip the Colab-specific cells below.

In [None]:
# Colab only: mount Google Drive at /content/drive so that the notebook and the
# codebase uploaded there become reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab only: switch the working directory to the project folder on the mounted
# Drive so that the project modules (attack.py, utilities.py, model.py, ...)
# can be imported by the cells below.
root_dir = "/content/drive/My Drive/"
project_dir = "Assignment2" # Change to your path (relative to root_dir)
# os.path.join inserts the separator only when needed, so this keeps working
# whether or not root_dir ends with a slash.
os.chdir(os.path.join(root_dir, project_dir))

In [None]:
# Make sure the path is correct
!ls

attack.py			    dataset
CS5562_Assignment_2_Task1.ipynb     defense.py
CS5562_Assignment_2_Task2.ipynb     environment.yml
CS5562_Assignment_2_Task3.ipynb     model.py
CS5562_Assignment_2_Task4.ipynb     __pycache__
CS5562_Assignment_2_Warm_ups.ipynb  utilities.py


## Implement random label flipping attack

In [None]:
from utilities import *

In [None]:
from attack import Attack

class Random_Label_Flip_Attack(Attack):
    """
    Baseline poisoning attack: copy randomly chosen clean points and negate their labels.
    """
    def attack(self, eps):
        """
        Build a poisoned dataset from randomly sampled clean points.

        Args:
            eps: poisoning budget as a fraction of the clean dataset size; the
                number of poisoned points is int(eps * len(clean dataset)).

        Returns:
            A `dataset` holding the sampled features with their labels
            multiplied by -1.
        """
        clean = self.clean_dataset
        budget = int(eps * len(clean))

        # Sample distinct row positions (no replacement) so that no clean point
        # is duplicated inside the poisoned set.
        chosen = np.random.choice(clean.X.shape[0], size=budget, replace=False)
        features, labels = clean[chosen]

        # Assumes labels are encoded as +1/-1 so that negation flips the class
        # -- TODO confirm against the dataset loader.
        flipped_labels = labels * -1
        return dataset(features, flipped_labels)

## Implement your label flipping attack

In [None]:
class Label_Flip_Attack(Attack):
    """
        Label flipping attack: flip the labels of the clean points that lie
        farthest from the decision boundary of a model fitted on the clean data.

        The attack reads `coef_` and calls `decision_function` on
        `self.target_model.model`, so it presumably expects a linear SVM-style
        estimator -- verify before using it with other model types.
    """
    def attack(self, eps):
        """
        Build a poisoned dataset from the clean points with the largest margin.

        Args:
            eps: poisoning budget as a fraction of the clean dataset size; the
                number of poisoned points is int(eps * len(clean dataset)).

        Returns:
            A `dataset` holding the selected features with their labels
            multiplied by -1. It is empty when the budget rounds down to zero.

        Side effects:
            Trains `self.target_model` on the clean dataset.
        """
        n_poison = int(eps * len(self.clean_dataset))

        # Fit the attacker's model on clean data to obtain a reference boundary.
        self.target_model.train(self.clean_dataset)
        svm = self.target_model.model

        # |decision value| / ||w|| is the geometric distance to the hyperplane.
        w_norm = np.linalg.norm(svm.coef_)
        distances = np.abs(svm.decision_function(self.clean_dataset.X) / w_norm)

        if n_poison <= 0:
            # -0 == 0, so the slice [-0:] would select *every* point and the
            # attack would return the whole dataset instead of nothing.
            index = np.empty(0, dtype=np.intp)
        else:
            # Positions of the n_poison largest distances (in no particular order).
            index = np.argpartition(np.asarray(distances), -n_poison)[-n_poison:]

        X, Y_modified = self.clean_dataset[index]
        # Assumes labels are encoded as +1/-1 so that negation flips the class
        # -- TODO confirm against the dataset loader.
        Y_modified = Y_modified*(-1)

        return dataset(X, Y_modified)

# Test your code

## Helper functions

In [None]:
def compute_attack_grade(attack, victim_model,eps,clean_train_dataset,test_dataset):
    """
    Run one poisoning attack and measure how much it increases the test loss.

    Args:
        attack: name of the attack; one of 'KKT', 'label-flip', 'adaptive',
            'random-label-flip'.
        victim_model: model to be poisoned. It is trained in place on the
            clean + poisoned data.
        eps: poisoning budget as a fraction of the clean training set size.
        clean_train_dataset: clean training data handed to the attacker.
        test_dataset: data on which both models are scored.

    Returns:
        (fraction, grade): the number of poisoned points divided by the number
        of clean training points, and poisoned test loss minus clean test loss.

    Raises:
        ValueError: if `attack` is not one of the supported names.
        AssertionError: if the attack returns more points than the budget allows.
    """
    # target model structure is known to the adversary
    target_model = copy.deepcopy(victim_model)
    # NOTE(review): KKT_Attack and Adaptive_Attack are not defined in the visible
    # cells; selecting them requires those classes to be in scope.
    if attack == 'KKT':
        attacker = KKT_Attack(target_model,clean_train_dataset,test_dataset)
    elif attack == 'label-flip':
        attacker = Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'adaptive':
        attacker = Adaptive_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'random-label-flip':
        attacker = Random_Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    else:
        # Without this branch `attacker` stays unbound and the next line fails
        # with an opaque UnboundLocalError.
        raise ValueError(
            "unknown attack %r; expected one of 'KKT', 'label-flip', "
            "'adaptive', 'random-label-flip'" % (attack,)
        )
    poisoned_dataset = attacker.attack(eps)
    # The attacker must stay within its poisoning budget.
    assert len(poisoned_dataset) <= int(eps*len(clean_train_dataset))

    train_dataset = combine_datset(clean_train_dataset,poisoned_dataset)
    # Copy taken after the attack ran; it is retrained on clean data just below.
    clean_model = copy.deepcopy(target_model)

    # performance without any attack
    clean_model.train(clean_train_dataset)
    clean_loss,clean_acc = clean_model.score(test_dataset)
    print('\nAvg loss of clean model: %0.5f, avg classification accuracy: %0.5f'%(clean_loss,clean_acc))

    # attack the victim model
    victim_model.train(train_dataset)
    poisoned_loss,poisoned_acc =victim_model.score(test_dataset)
    print('\nAvg loss of poisoned model:%0.5f, avg classification accuracy: %0.5f'%(poisoned_loss,poisoned_acc))

    # A larger loss increase means a more effective attack.
    grade = poisoned_loss - clean_loss

    # # for generating figures
    # distance_to_center_diff(clean_train_dataset,poisoned_dataset)
    # loss_diff(clean_train_dataset, poisoned_dataset,clean_model)

    return len(poisoned_dataset)/len(clean_train_dataset),grade

## Testing

### Random label flipping

In [None]:
# Evaluate the random label-flip baseline against an undefended "nn" model on
# the mnist_17 dataset with a poisoning budget of eps = 0.2.
from model import Undefended_Model

train_dataset,test_dataset = load_dataset('mnist_17')
base_model = load_model("nn", "mnist_17")
target_model = Undefended_Model(base_model,"nn")
defense_name = 'undefended'
# NOTE(review): the attack samples with np.random and no seed is set in the
# visible cells, so the reported numbers may differ between runs.
fraction, attack_grade = compute_attack_grade("random-label-flip", target_model, 0.2, train_dataset, test_dataset)
print('\n\n-----------result---------')
# Reports the loss increase (attack grade) and the realised poisoning fraction.
print('%s attack against %s %s model on %s dataset: %0.2f (%0.2f fraction of poisoning data)'%("random-label-flip",defense_name,"nn","mnist_17",attack_grade,fraction))


Avg loss of clean model: 0.01602, avg classification accuracy: 0.99491

Avg loss of poisoned model:0.20555, avg classification accuracy: 0.99260


-----------result---------
random-label-flip attack against undefended nn model on mnist_17 dataset: 0.19 (0.20 fraction of poisoning data)


### Label flipping

In [None]:
# Evaluate the margin-based label-flip attack against an undefended "svm" model
# on the mnist_17 dataset with a poisoning budget of eps = 0.2.
from model import Undefended_Model

train_dataset,test_dataset = load_dataset('mnist_17')
base_model = load_model("svm", "mnist_17")
target_model = Undefended_Model(base_model,"svm")
defense_name = 'undefended'
fraction, attack_grade = compute_attack_grade("label-flip", target_model, 0.2, train_dataset, test_dataset)
print('\n\n-----------result---------')
# Reports the loss increase (attack grade) and the realised poisoning fraction.
print('%s attack against %s %s model on %s dataset: %0.2f (%0.2f fraction of poisoning data)'%("label-flip",defense_name,"svm","mnist_17",attack_grade,fraction))


Avg loss of clean model: 0.01694, avg classification accuracy: 0.99260

Avg loss of poisoned model:0.08222, avg classification accuracy: 0.98382


-----------result---------
label-flip attack against undefended svm model on mnist_17 dataset: 0.07 (0.20 fraction of poisoning data)




# Report