# Task 2: Data Sanitization

## Dependencies

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import os
import copy

### If you are using Google Colab, upload this notebook and the codebase to your Google Drive, then run the next two cells to mount the drive and set your working directory. If you are running on your local machine, skip those two cells.

In [None]:
# Colab only: mount Google Drive at /content/drive so the notebook can reach
# the codebase uploaded there. Skip this cell when running locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the mounted drive and of the assignment folder inside it.
root_dir = "/content/drive/My Drive/"
project_dir = "CS5562 2023 Spring/Robustness/Assignment 2" # Change to your path (relative to root_dir)
# os.path.join inserts the separator only when needed, so the path stays valid
# even if root_dir is edited and loses its trailing slash.
os.chdir(os.path.join(root_dir, project_dir))

In [None]:
# Make sure the path is correct: list the current working directory and check
# that the modules imported below (utilities, model, attack) are present.
!ls

## Implement data sanitizer

In [None]:
from utilities import *

In [None]:
def data_sanitizer(training_data, estimate_eps):
    """
    Return a copy of training_data with the most suspicious points removed.

    int(estimate_eps * len(training_data)) points are removed. A point's
    suspicion score is its Euclidean distance to the mean of all points that
    share its label, and the points with the largest scores are dropped. The
    previous placeholder picked points uniformly at random, which discards
    clean and poisoned points at the same rate and therefore sanitizes nothing.

    Args:
        training_data: dataset exposing .X, .Y, len() and index deletion.
            Assumes X is array-like with one row per sample and Y holds one
            class label per sample -- confirm against utilities.dataset.
        estimate_eps: estimated fraction of training_data that is poisoned.

    Returns:
        A deep copy of training_data without the removed points; the input
        dataset itself is left untouched.

    Raises:
        ValueError: if the number of points to remove is negative or larger
            than the dataset.
    """

    n_total = len(training_data)
    n_est_poisoned = int(estimate_eps * n_total)
    if not 0 <= n_est_poisoned <= n_total:
        raise ValueError(
            "estimate_eps=%r removes %d points from a dataset of %d"
            % (estimate_eps, n_est_poisoned, n_total)
        )

    training_data_copy = copy.deepcopy(training_data)
    if n_est_poisoned == 0:
        # Nothing to remove; skip the scoring work entirely.
        return training_data_copy

    # Flatten every sample to a vector so distances are well defined even if
    # samples are stored with more than one feature axis.
    features = np.asarray(training_data.X, dtype=float)
    features = features.reshape(features.shape[0], -1)
    labels = np.asarray(training_data.Y).ravel()

    # Score each point by its distance to the centroid of its own class.
    distances = np.empty(features.shape[0])
    for label in np.unique(labels):
        members = labels == label
        centroid = features[members].mean(axis=0)
        distances[members] = np.linalg.norm(features[members] - centroid, axis=1)

    # Stable sort keeps the choice deterministic when distances tie.
    index = np.argsort(distances, kind="stable")[-n_est_poisoned:]

    del training_data_copy[index]
    return training_data_copy

# Test your code

## Helper functions

In [None]:
from model import Model


class Data_Sanitized_Model(Model):
    """Model variant that runs data_sanitizer on the training set before fitting."""

    def __init__(self, model, model_name, estimated_eps):
        """
        Args:
            model, model_name: forwarded unchanged to Model.__init__.
            estimated_eps: fraction of the training set handed to
                data_sanitizer as the estimated poisoning rate.
        """
        super().__init__(model, model_name)
        self.estimated_eps = estimated_eps

    def train(self, train_dataset):
        """Sanitize train_dataset, then fit the wrapped model on what remains."""
        cleaned = data_sanitizer(train_dataset, self.estimated_eps)
        features, labels = cleaned.X, cleaned.Y
        # self.model is presumably set by Model.__init__ -- verify in model.py.
        self.model.fit(features, labels)

In [None]:
def compute_attack_grade(attack, victim_model, eps, clean_train_dataset, test_dataset):
    """
    Run one poisoning attack against victim_model and grade it.

    The grade is the test loss of victim_model trained on clean + poisoned
    data minus the test loss of a copy of the model trained on clean data only.

    Args:
        attack: attack name, one of 'KKT', 'label-flip', 'adaptive',
            'random-label-flip'.
        victim_model: model under attack. It is trained in place on the
            combined clean + poisoned dataset.
        eps: poisoning budget as a fraction of len(clean_train_dataset).
        clean_train_dataset: unpoisoned training data.
        test_dataset: data both models are scored on.

    Returns:
        (fraction, grade): the number of poisoned points divided by the number
        of clean points, and poisoned_loss - clean_loss.

    Raises:
        ValueError: if attack is not one of the supported names.
        AssertionError: if the attack returns more than
            int(eps * len(clean_train_dataset)) points.
    """
    # target model structure is known to the adversary
    target_model = copy.deepcopy(victim_model)
    # Keep this an if/elif chain: each attack class is looked up only when it
    # is selected, so attacks that have not been defined yet do not break the
    # ones that have.
    if attack == 'KKT':
        attacker = KKT_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'label-flip':
        attacker = Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'adaptive':
        attacker = Adaptive_Attack(target_model, clean_train_dataset, test_dataset)
    elif attack == 'random-label-flip':
        attacker = Random_Label_Flip_Attack(target_model, clean_train_dataset, test_dataset)
    else:
        # Previously an unknown name crashed later with UnboundLocalError.
        raise ValueError(
            "Unknown attack %r; expected one of 'KKT', 'label-flip', "
            "'adaptive', 'random-label-flip'" % (attack,)
        )
    poisoned_dataset = attacker.attack(eps)
    # The attacker must stay within its poisoning budget.
    assert len(poisoned_dataset) <= int(eps*len(clean_train_dataset))

    train_dataset = combine_datset(clean_train_dataset, poisoned_dataset)
    clean_model = copy.deepcopy(target_model)

    # performance without any attack
    clean_model.train(clean_train_dataset)
    clean_loss, clean_acc = clean_model.score(test_dataset)
    print('\nAvg loss of clean model: %0.5f, avg classification accuracy: %0.5f'%(clean_loss,clean_acc))

    # attack the victim model
    victim_model.train(train_dataset)
    poisoned_loss, poisoned_acc = victim_model.score(test_dataset)
    print('\nAvg loss of poisoned model:%0.5f, avg classification accuracy: %0.5f'%(poisoned_loss,poisoned_acc))

    grade = poisoned_loss - clean_loss

    # # for generating figures
    # distance_to_center_diff(clean_train_dataset,poisoned_dataset)
    # loss_diff(clean_train_dataset, poisoned_dataset,clean_model)

    return len(poisoned_dataset)/len(clean_train_dataset), grade

## Copy and paste your KKT attack here:

In [None]:
from attack import Attack

class KKT_Attack(Attack):
    """
        KKT attack (template: currently returns a random sample of clean points)
    """
    def attack(self, eps):
        """
        Build a poisoned dataset of int(eps * len(self.clean_dataset)) points.

        self.clean_dataset is presumably set by Attack.__init__ -- verify in
        attack.py.
        """
        budget = int(eps * len(self.clean_dataset))

        ####################
        # TODO: update the following part to build your attack model based on KKT attack
        # Placeholder: draw `budget` distinct clean points and return them unmodified.
        pool_size = self.clean_dataset.X.shape[0]
        picked = np.random.choice(pool_size, budget, replace=False)
        features, labels = self.clean_dataset[picked]
        ####################

        return dataset(features, labels)

## Testing

In [None]:
# End-to-end run: KKT attack against a data-sanitized SVM on mnist_17.
# Each setting is named once so the printed summary always matches what ran.
attack_name = "KKT"
model_name = "svm"
dataset_name = "mnist_17"
defense_name = 'data_sanitization'
estimated_eps = 0.2  # poisoning fraction assumed by the defender
attack_eps = 0.2     # poisoning budget granted to the attacker

train_dataset, test_dataset = load_dataset(dataset_name)
base_model = load_model(model_name, dataset_name)
target_model = Data_Sanitized_Model(base_model, model_name, estimated_eps)
fraction, attack_grade = compute_attack_grade(
    attack_name, target_model, attack_eps, train_dataset, test_dataset
)
print('\n\n-----------result---------')
summary = '%s attack against %s %s model on %s dataset: %0.2f (%0.2f fraction of poisoning data)'
print(summary % (attack_name, defense_name, model_name, dataset_name, attack_grade, fraction))

# Report