In [1]:
# Notebook configuration.
# Name of the saved agent checkpoint (handed to PPO.load).
AGENT_NAME = "default_PPO_citylearn_challenge_2022_phase_2_Building_6_20_bins_500"
# Short tag for the attribution method; it is embedded in the output file names.
ATTR = "FP"
# Directory the attribution CSV files are written to.
SAVE_DIR = "attribution/"

In [62]:
import os

import numpy as np
import pandas as pd
import torch
import torchdrift.detectors as detectors
from captum.attr import FeaturePermutation
from stable_baselines3 import PPO

import KBMproject.utilities as utils

%matplotlib inline

In [3]:
# Restore the trained PPO agent from the checkpoint named by AGENT_NAME.
agent = PPO.load(path=AGENT_NAME)
print('Model loaded from storage')

Model loaded from storage


In [4]:
# Pull the actor out of the loaded PPO agent through the project helper.
# NOTE(review): extract_actor is defined outside this notebook; presumably it
# returns a torch module mapping observations to per-action scores — confirm.
actor = utils.extract_actor(agent)

In [5]:
# Feature permutation shuffles feature values across the batch at random, and
# the permutation tests further down draw random permutations as well. Seed
# torch's global RNG once here so a fresh "Restart & Run All" reproduces the
# same attributions and p-values.
SEED = 0
torch.manual_seed(SEED)

# Attribution object wrapping the actor's forward pass.
fp = FeaturePermutation(actor)

In [6]:
from art.estimators.classification import PyTorchClassifier as classifier
from torch.nn import CrossEntropyLoss

# Wrap the actor in ART's PyTorch classifier so it can be queried through
# ART's estimator interface (one class per discrete action).
# clip_values is a single scalar (min, max) pair taken over all features:
# the Brendel & Bethge attack only supports float bounds, not per-feature
# arrays.
victim_policy = classifier(
    model=actor,
    loss=CrossEntropyLoss(),
    nb_classes=agent.action_space[0].n,
    input_shape=agent.observation_space.shape,
    clip_values=(
        agent.observation_space.low.min(),
        agent.observation_space.high.max(),
    ),
    device_type='gpu',
)

In [7]:
# Clean observations and their adversarial "bb" counterparts
# (presumably the Brendel & Bethge attack — confirm). The first CSV column is
# used as the index and all values are read as float32.
df_clean_bb_obs = pd.read_csv('bb results/clean obs.csv', dtype='float32', index_col=0)
df_adv_bb_obs = pd.read_csv('bb results/bb obs.csv', dtype='float32', index_col=0)

In [8]:
# Action chosen by the victim policy for every clean / adversarial bb
# observation: the index of the highest-scoring class per row.
# predict() is documented to take a NumPy array, so the DataFrames are
# converted explicitly (the attribution cells do the same conversion).
clean_bb_a = np.argmax(victim_policy.predict(df_clean_bb_obs.to_numpy()), axis=1)
adv_bb_a = np.argmax(victim_policy.predict(df_adv_bb_obs.to_numpy()), axis=1)

In [17]:
# Clean observations and their "acg"-perturbed counterparts. The first CSV
# column is used as the index and all values are read as float32.
df_clean_acg_obs = pd.read_csv('adv_obs.csv', dtype='float32', index_col=0)
df_adv_acg_obs = pd.read_csv('adv_perturbed_obs.csv', dtype='float32', index_col=0)

In [18]:
# Action chosen by the victim policy for every clean / adversarial acg
# observation: the index of the highest-scoring class per row.
# predict() is documented to take a NumPy array, so the DataFrames are
# converted explicitly (the attribution cells do the same conversion).
clean_acg_a = np.argmax(victim_policy.predict(df_clean_acg_obs.to_numpy()), axis=1)
adv_acg_a = np.argmax(victim_policy.predict(df_adv_acg_obs.to_numpy()), axis=1)

In [56]:
# Baseline observations (first CSV column is the index, values as float32)
# and the matching actions, which are stored in a separate comma-delimited
# file and cast to int64 after loading.
df_baseline_obs = pd.read_csv('baseline_obs.csv', dtype='float32', index_col=0)
baseline_a = np.loadtxt('baseline_obs_a.csv', delimiter=',').astype('int64')

In [11]:
# Feature-permutation attributions for the clean bb observations, computed
# with respect to the action selected for each row.
# NOTE: the variable keeps its "_shap" suffix although the attribution method
# is FeaturePermutation.
clean_bb_shap = (
    fp.attribute(
        torch.from_numpy(df_clean_bb_obs.to_numpy()).to('cuda'),  # DataFrame -> CUDA tensor
        target=clean_bb_a.ravel().tolist(),  # actions as a flat Python list
    )
    .detach()
    .cpu()  # bring the result back to host memory
    .numpy()
)

In [15]:
# Feature-permutation attributions for the adversarial bb observations,
# computed with respect to the action selected for each perturbed row.
adv_bb_shap = (
    fp.attribute(
        torch.from_numpy(df_adv_bb_obs.to_numpy()).to('cuda'),
        target=adv_bb_a.ravel().tolist(),
    )
    .detach()
    .cpu()
    .numpy()
)

In [19]:
# Feature-permutation attributions for the clean acg observations, computed
# with respect to the action selected for each row.
clean_acg_shap = (
    fp.attribute(
        torch.from_numpy(df_clean_acg_obs.to_numpy()).to('cuda'),
        target=clean_acg_a.ravel().tolist(),
    )
    .detach()
    .cpu()
    .numpy()
)

In [20]:
# Feature-permutation attributions for the acg-perturbed observations,
# computed with respect to the action selected for each perturbed row.
adv_acg_shap = (
    fp.attribute(
        torch.from_numpy(df_adv_acg_obs.to_numpy()).to('cuda'),
        target=adv_acg_a.ravel().tolist(),
    )
    .detach()
    .cpu()
    .numpy()
)

In [57]:
# Feature-permutation attributions for the baseline observations, computed
# with respect to the actions loaded from 'baseline_obs_a.csv'.
baseline_shap = (
    fp.attribute(
        torch.from_numpy(df_baseline_obs.to_numpy()).to('cuda'),
        target=baseline_a.tolist(),
    )
    .detach()
    .cpu()
    .numpy()
)

In [64]:
# Settings shared by the kernel-MMD two-sample tests.
kernel = detectors.mmd.GaussianKernel()
BOOTSTRAP = 10_000  # number of permutations, passed to kernel_mmd as n_perm
PVAL = 0.05  # significance level; not referenced by the cells visible here

In [65]:
# Kernel-MMD two-sample test between the attributions of the clean and the
# adversarial bb observations; result[0] is the MMD statistic and result[1]
# the p-value obtained from BOOTSTRAP permutations.
result = detectors.kernel_mmd(
    torch.from_numpy(clean_bb_shap).to('cuda'),
    torch.from_numpy(adv_bb_shap).to('cuda'),
    n_perm=BOOTSTRAP,
    kernel=kernel,
)
# Message spelling fixed ("resulats" -> "results") to match the acg cell.
print(f'The results for the clean and untargeted bb perturbed observations are mmd:{result[0]}, p-value:{result[1]}')

The resulats for the clean and untargeted bb perturbed observations are mmd:0.0007966756820678711, p-value:0.0


In [66]:
# Kernel-MMD two-sample test between the attributions of the clean and the
# acg-perturbed observations; result[0] is the MMD statistic and result[1]
# the p-value obtained from BOOTSTRAP permutations.
result = detectors.kernel_mmd(
    torch.from_numpy(clean_acg_shap).to('cuda'),
    torch.from_numpy(adv_acg_shap).to('cuda'),
    kernel=kernel,
    n_perm=BOOTSTRAP,
)
print(f'The results for the clean and untargeted acg perturbed observations are mmd:{result[0]}, p-value:{result[1]}')

The results for the clean and untargeted acg perturbed observations are mmd:0.001306772232055664, p-value:0.0


In [69]:
# Kernel-MMD two-sample test between the attributions of the clean bb
# observations and those of the baseline observations; result[0] is the MMD
# statistic and result[1] the p-value obtained from BOOTSTRAP permutations.
result = detectors.kernel_mmd(
    torch.from_numpy(clean_bb_shap).to('cuda'),
    torch.from_numpy(baseline_shap).to('cuda'),
    n_perm=BOOTSTRAP,
    kernel=kernel,
)
# Message spelling fixed ("resulats" -> "results") to match the acg cell.
print(f'The results for the baseline unperturbed bb observations are mmd:{result[0]}, p-value:{result[1]}')

The resulats for the baseline unperturbed bb observations are mmd:0.00033092498779296875, p-value:0.0031999999191612005


In [68]:
# Kernel-MMD two-sample test between the attributions of the clean acg
# observations and those of the baseline observations; result[0] is the MMD
# statistic and result[1] the p-value obtained from BOOTSTRAP permutations.
result = detectors.kernel_mmd(
    torch.from_numpy(clean_acg_shap).to('cuda'),
    torch.from_numpy(baseline_shap).to('cuda'),
    n_perm=BOOTSTRAP,
    kernel=kernel,
)
# Message spelling fixed ("resulats" -> "results") to match the other acg cell.
print(f'The results for the baseline unperturbed acg observations are mmd:{result[0]}, p-value:{result[1]}')

The resulats for the baseline unperturbed acg observations are mmd:0.00020778179168701172, p-value:0.7908999919891357


In [70]:
# Persist every attribution matrix as a comma-delimited CSV under SAVE_DIR.
# np.savetxt does not create missing directories, so make sure the target
# directory exists before writing (an empty SAVE_DIR means the working
# directory, which needs no creation).
if SAVE_DIR:
    os.makedirs(SAVE_DIR, exist_ok=True)

# File name pattern is '<label> <ATTR>.csv'; os.path.join yields the same
# path as plain concatenation when SAVE_DIR ends with a separator and also
# works when it does not.
for label, attributions in (
    ('clean bb', clean_bb_shap),
    ('adv bb', adv_bb_shap),
    ('clean acg', clean_acg_shap),
    ('adv acg', adv_acg_shap),
    ('baseline', baseline_shap),
):
    np.savetxt(os.path.join(SAVE_DIR, f'{label} {ATTR}.csv'), attributions, delimiter=',')