In [1]:
import os
import torch
from torch import nn
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from torchvision import transforms
from captum.robust import PGD, FGSM
import sys
import argparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from skimage.metrics import structural_similarity as ssim
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import datetime
from tqdm import tqdm
sys.path.append('../')
from saliency import *
from utils import *
from plots import *
def get_model(model_path, device):
    model = torch.load(model_path)
    model.eval()
    model.to(device)
    return model


Bad key "text.kerning_factor" on line 4 in
/home/raza.imam/.conda/envs/xaim/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution
  warn(f"Failed to load image Python extension: {e}")


In [2]:
DEVICE = 'cuda'
MODEL_PATH = '/home/raza.imam/Documents/HC701B/Project/models/vit_base_patch16_224_in21k_test-accuracy_0.96_chest.pth'
model = get_model(model_path=MODEL_PATH, device=DEVICE)

transform = transforms.Compose(
            [
                transforms.Grayscale(num_output_channels=3),
                transforms.RandomRotation((90,90)),
                transforms.CenterCrop(200),
                transforms.Resize((224, 224)),
                # transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
                transforms.ToTensor(),
                ]
            )

In [60]:
def classify_attn(img_attn_map, mean_attn_clean, mean_attn_adv, method = 'all', attacks = ['PDG', 'FGSM']): 
    """
        This function classifies an image as clean or adversarial 
        based on the distance between the test image's attention map 
        and the mean attention maps of clean and adversarial images.
    """
    test_attn_flat = img_attn_map.flatten()
    mean_attns_cln_flat = mean_attn_clean.flatten()
    assert isinstance(mean_attn_adv, dict), "mean_attn_adv must be a dict"
    if isinstance(method, str):
        method = [method]
    if "all" in method:
        method = ["sum", "euclidean", "cosine", "ssim", "kl"]
    preds = {}
    
    if "sum" in method:
        sum_distance_to_normal = np.sum((test_attn_flat - mean_attns_cln_flat))
        sum_pred = "Clean"
        for key in attacks:
            sum_distance_to_adversarial = np.sum((test_attn_flat - mean_attn_adv[key].flatten()))
            if sum_distance_to_normal > sum_distance_to_adversarial:
                sum_pred = "Adversarial"
                break
        preds["sum"] = sum_pred
    
    if "kl" in method:
        kl_to_normal = np.sum(kl_div(test_attn_flat, mean_attns_cln_flat))
        kl_pred = "Clean"
        for key in attacks:
            kl_to_adversarial = np.sum(kl_div(test_attn_flat, mean_attn_adv[key].flatten()))
            if kl_to_normal > kl_to_adversarial:
                kl_pred = "Adversarial"
                break
        preds["kl"] = kl_pred
        
    if "ssim" in method:
        structural_similarity_to_normal = ssim(test_attn_flat, mean_attns_cln_flat)
        structural_pred = "Clean"
        for key in attacks:
            structural_similarity_to_adversarial = ssim(test_attn_flat, mean_attn_adv[key].flatten())
            if structural_similarity_to_normal < structural_similarity_to_adversarial:
                structural_pred = "Adversarial"
                break
        preds["ssim"] = structural_pred

    if "euclidean" in method:
        euc_distance_to_normal = euclidean(test_attn_flat, mean_attns_cln_flat)
        euc_pred = "Clean"
        for key in attacks:
            euc_distance_to_adversarial = euclidean(test_attn_flat, mean_attn_adv[key].flatten())
            if euc_distance_to_normal > euc_distance_to_adversarial:
                euc_pred = "Adversarial"
                break
        preds["euclidean"] = euc_pred

    if "cosine" in method:
        cosine_distance_to_normal = cosine_similarity([test_attn_flat], [mean_attns_cln_flat])
        cos_pred = "Clean"
        for key in attacks:
            cosine_distance_to_adversarial = cosine_similarity([test_attn_flat], [mean_attn_adv[key].flatten()])
            if cosine_distance_to_normal < cosine_distance_to_adversarial: # cosine similarity is between 0 and 1 and greater the value, more similar the vectors
                cos_pred = "Adversarial"
                break
        preds["cosine"] = cos_pred

    return preds

# Creating references (means) using Training Set

In [4]:
#global vars
attack_type = 'PDG'
eps = 0.06
class_type = 'Normal' #Normal, TB

In [5]:
cln_folder = f'/home/raza.imam/Documents/XAIM/XAIM/data5/training/{attack_type}_{eps}/{class_type}/Succ/Clean_x'

cln_files = [f for f in os.listdir(cln_folder) if f.endswith(".png")]
images = []
for f in tqdm(cln_files):
    image_path = os.path.join(cln_folder, f)
    image = Image.open(image_path)
    image = transform(image)
    images.append(image)
images_tensor = torch.stack(images)
mean_attns = {}
all_attns = {}
mean_attn_diff = {}
attentions_clean, mean_attns_cln = apply_attn_on_images(model=model, block=-1, images = images_tensor, device=DEVICE)
mean_attns['clean'] = mean_attns_cln
all_attns['clean'] = attentions_clean
image_path = '../plots_succ/clean.png'
plt.imsave(image_path, mean_attns['clean'], cmap='inferno')

100%|██████████| 2677/2677 [00:11<00:00, 228.35it/s]


In [8]:
attack_type = ['PDG', 'FGSM']
eps_list = [0.01, 0.03, 0.06]

for attack in attack_type:
    for eps in eps_list:
        print(attack, eps)
        atk_folder = f'/home/raza.imam/Documents/XAIM/XAIM/data5/training/{attack}_{eps}/{class_type}/Succ/Succ_x'

        atk_files = [f for f in os.listdir(atk_folder) if f.endswith(".png")]
        images = []
        for f in tqdm(atk_files):
            image_path = os.path.join(atk_folder, f)
            image = Image.open(image_path)
            image = transform(image)
            images.append(image)
        adv_images_tensor = torch.stack(images)

        attentions_adv, mean_attns_adv = apply_attn_on_images(model=model, block=-1, images = adv_images_tensor, device=DEVICE)
        mean_attns_diff_adv = mean_attns_adv - mean_attns_cln
        mean_attns[f'{attack}_{eps}'] = mean_attns_adv
        all_attns[f'{attack}_{eps}'] = attentions_adv
        mean_attn_diff[f'{attack}_{eps}'] = mean_attns_diff_adv
        image_path = f'../plots_succ/{attack}_{eps}.png'
        plt.imsave(image_path, mean_attns[f'{attack}_{eps}'], cmap='inferno')

PDG 0.01


100%|██████████| 420/420 [00:02<00:00, 191.79it/s]


PDG 0.03


100%|██████████| 2036/2036 [00:10<00:00, 201.83it/s]


PDG 0.06


100%|██████████| 2677/2677 [00:13<00:00, 199.94it/s]


FGSM 0.01


100%|██████████| 261/261 [00:01<00:00, 219.11it/s]


FGSM 0.03


100%|██████████| 1075/1075 [00:04<00:00, 219.99it/s]


FGSM 0.06


100%|██████████| 2203/2203 [00:10<00:00, 215.51it/s]


# Test attentions generation

In [11]:
test_attack = 'Clean_x' # Succ_x or Clean_x  
class_name = 'Normal' # Normal or TB
attack_type_test = 'PDG'

test_folder = f'/home/raza.imam/Documents/XAIM/XAIM/data5/validation/{attack_type_test}_{eps}/{class_name}/Succ/{test_attack}'

test_files = [f for f in os.listdir(test_folder) if f.endswith(".jpg") or f.endswith(".png")]
images = []
for f in tqdm(test_files):
    image_path = os.path.join(test_folder, f)
    image = Image.open(image_path)
    image = transform(image)
    images.append(image)
test_images_tensor = torch.stack(images)
test_attns = {}
attentions_clean, _ = apply_attn_on_images(model=model, block=-1, images = test_images_tensor, device=DEVICE)
test_attns['Clean'] = attentions_clean

# ----------------------------------------------

test_attack = 'Succ_x' # Succ_x or Clean_x
class_name = 'Normal' # Normal or TB

test_folder = f'/home/raza.imam/Documents/XAIM/XAIM/data5/validation/{attack_type_test}_{eps}/{class_name}/Succ/{test_attack}'

test_files = [f for f in os.listdir(test_folder) if f.endswith(".jpg") or f.endswith(".png")]
images = []
for f in tqdm(test_files):
    image_path = os.path.join(test_folder, f)
    image = Image.open(image_path)
    image = transform(image)
    images.append(image)
test_images_tensor = torch.stack(images)
attentions_clean, _ = apply_attn_on_images(model=model, block=-1, images = test_images_tensor, device=DEVICE)
test_attns['Adversarial'] = attentions_clean

  0%|          | 0/301 [00:00<?, ?it/s]

100%|██████████| 301/301 [00:01<00:00, 237.55it/s]
100%|██████████| 301/301 [00:01<00:00, 196.72it/s]


# Classification

In [63]:
gt_labels = []
sum_preds = [] #sum_preds = ssim preds
euc_preds = []
cos_preds = []
ssim_preds = []
kl_preds = []

test_attack_type='Clean'
attacks = list(mean_attns.keys())
attacks.remove('clean')
print(attacks)

for idx, attn_map in tqdm(enumerate(test_attns[test_attack_type])):
    result = classify_attn(
        img_attn_map = attn_map,
        mean_attn_clean = mean_attns['clean'], 
        mean_attn_adv=mean_attns, 
        method='all',
        attacks = attacks
        )
    sum_preds.append(result['sum'])
    euc_preds.append(result['euclidean'])
    cos_preds.append(result['cosine'])
    ssim_preds.append(result['ssim'])
    kl_preds.append(result['kl'])
    gt_labels.append(test_attack_type)

results_dict ={
    "GT": gt_labels,
    "sum": sum_preds,
    "euclidean": euc_preds,
    "cosine": cos_preds,
    "ssim": ssim_preds,
    "kl": kl_preds,
}

# 1 = Clean
# 0 = Adversarial
gt_labels_bin = [1 if label == "Clean" else 0 for label in gt_labels]

methods = ["sum", "euclidean", "cosine", "ssim", "kl"]
for method in methods:
    pred_bin = [1 if label == "Clean" else 0 for label in results_dict[method]]
    print(f'--------------- {method} ---------------')
    print(f"Accuracy for {method}: {accuracy_score(gt_labels_bin, pred_bin)}")
    print(f"F1 score for {method}: {f1_score(gt_labels_bin, pred_bin)}")
    print(f'-------------------------------------------------------------------')
    print(f'-------------------------------------------------------------------')

['PDG_0.01', 'PDG_0.03', 'PDG_0.06', 'FGSM_0.01', 'FGSM_0.03', 'FGSM_0.06']


301it [00:05, 56.99it/s]

--------------- sum ---------------
Accuracy for sum: 0.0
F1 score for sum: 0.0
-------------------------------------------------------------------
-------------------------------------------------------------------
--------------- euclidean ---------------
Accuracy for euclidean: 0.48172757475083056
F1 score for euclidean: 0.6502242152466368
-------------------------------------------------------------------
-------------------------------------------------------------------
--------------- cosine ---------------
Accuracy for cosine: 0.47840531561461797
F1 score for cosine: 0.6471910112359551
-------------------------------------------------------------------
-------------------------------------------------------------------
--------------- ssim ---------------
Accuracy for ssim: 0.4850498338870432
F1 score for ssim: 0.6532438478747203
-------------------------------------------------------------------
-------------------------------------------------------------------
--------------




In [27]:
from skimage.metrics import structural_similarity as ssim

def classify_image_ssim(test_image, clean_image, adversarial_image, threshold=0.9):
    # Calculate SSIM between test image and clean image
    ssim_clean = ssim(test_image, clean_image, data_range=test_image.max() - test_image.min())

    # Calculate SSIM between test image and adversarial image
    ssim_adv = ssim(test_image, adversarial_image, data_range=test_image.max() - test_image.min())

    # print(f"SSIM to Clean: {ssim_clean}, SSIM to Adversarial: {ssim_adv}")

    # Compare SSIM values and classify
    if ssim_clean > ssim_adv:
        return "Clean"
    else:
        return "Adversarial"
    
def classify_image_sum(test_attn, mean_attns_cln, mean_attns_adv): #np.sum((test_attn - mean_attns_cln)) #FIXME:
    # Flatten the images to 1D arrays (if not already flattened)
    test_attn_flat = test_attn.flatten()
    mean_attns_cln_flat = mean_attns_cln.flatten()
    mean_attns_adv_flat = mean_attns_adv.flatten()
    # Calculate cosine similarities
    distance_to_normal = np.sum((test_attn_flat - mean_attns_cln_flat))
    distance_to_adversarial = np.sum((test_attn_flat - mean_attns_adv_flat))

    # print(f"distance_to_normal: {distance_to_normal}", f"distance_to_adversarial: {distance_to_adversarial}")
    # Compare distances and classify
    if distance_to_normal < distance_to_adversarial:
        return "Clean"
    else:
        return "Adversarial"
    
from scipy.spatial.distance import euclidean
# Defining function
def classify_image_euclidean(test_attn, mean_attns_cln, mean_attns_adv): # np.sqrt(np.sum(np.square(test_attn - mean_attns_cln)))
    # Flatten the images to 1D arrays (if not already flattened)
    test_attn_flat = test_attn.flatten()
    mean_attns_cln_flat = mean_attns_cln.flatten()
    mean_attns_adv_flat = mean_attns_adv.flatten()
    # Calculate Euclidean distances
    distance_to_normal = euclidean(test_attn_flat, mean_attns_cln_flat)
    distance_to_adversarial = euclidean(test_attn_flat, mean_attns_adv_flat)
    
    # print(f"distance_to_normal: {distance_to_normal}", f"distance_to_adversarial: {distance_to_adversarial}")
    # Compare distances and classify
    if distance_to_normal < distance_to_adversarial:
        return "Clean"
    else:
        return "Adversarial"
    
from sklearn.metrics.pairwise import cosine_similarity
# Defining function
def classify_image_cos_similarity(test_attn, mean_attns_cln, mean_attns_adv): #np.dot(test_image,normal_image)/(norm(test_image)*norm(normal_image))
    # Flatten the images to 1D arrays (if not already flattened)
    test_attn_flat = test_attn.flatten()
    mean_attns_cln_flat = mean_attns_cln.flatten()
    mean_attns_adv_flat = mean_attns_adv.flatten()
    # Calculate cosine similarities
    similarity_to_normal = cosine_similarity([test_attn_flat], [mean_attns_cln_flat])
    similarity_to_adversarial = cosine_similarity([test_attn_flat], [mean_attns_adv_flat])

    # print(f"similarity_to_normal: {similarity_to_normal}", f"similarity_to_adversarial: {similarity_to_adversarial}")
    # Compare similarity scores and classify
    if similarity_to_normal > similarity_to_adversarial:
        return "Clean"
    else:
        return "Adversarial"

import numpy as np
from scipy.special import kl_div

def classify_image_kl_divergence(test_attn, mean_attns_cln, mean_attns_adv):
    # Flatten the images to 1D arrays (if not already flattened)
    test_attn_flat = test_attn.flatten()
    mean_attns_cln_flat = mean_attns_cln.flatten()
    mean_attns_adv_flat = mean_attns_adv.flatten()
    
    # Calculate KL divergences
    kl_to_normal = np.sum(kl_div(test_attn_flat, mean_attns_cln_flat))
    kl_to_adversarial = np.sum(kl_div(test_attn_flat, mean_attns_adv_flat))

    # Compare KL divergences and classify
    if kl_to_normal < kl_to_adversarial:
        return "Clean"
    else :
        return "Adversarial"


In [70]:
num_test_images = len(test_attns['Adversarial'])
test_attns_ = test_attns['Adversarial']
mean_attns_cln = mean_attns['clean']
mean_attns_adv = mean_attns['FGSM_0.06']

classifications = []
# Test each test image
for test_attn in test_attns_:
    result = classify_image_kl_divergence(test_attn, mean_attns_cln, mean_attns_adv)
    classifications.append(result)

# Calculate accuracy
true_labels = ["Adversarial"] * num_test_images  # Assuming all test images should be classified as "Normal"
accuracy = sum(1 for true, predicted in zip(true_labels, classifications) if true == predicted) / num_test_images
print(f"Overall Accuracy: {accuracy * 100:.4f}%")


Overall Accuracy: 70.0997%
