In [140]:
%load_ext autoreload
%autoreload 2

import os
import sys
import itertools
import pandas as pd
from scenarios import *
from plots import *
from metrics import get_metrics

PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.append(PROJECT_ROOT)
#print(PROJECT_ROOT)
from data_preprocessing.data_interface import get_data_sklearn, DataNotAvailable

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets as skl_datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [141]:
class model_info:
    '''
    Groups the descriptive information and the metrics of a single model.

    target_model_id: identifier of the target model, so attacks and target models can be mapped to each other
    kind: type of model, can be either target, shadow or attack
    classifier: name of the classifier
    dataset: name of the dataset used to train the model
    params: dictionary of parameters used for the model, None means default parameters
    attack_scenario: if this is part of an attack scenario, state which scenario name
    metrics: calculated metrics for the model as a dictionary, None means no metrics yet

    To view the data contained in an instance call its show() method.
    To get the same data as a one-row DataFrame call its data_frame() method.
    '''
    def __init__(self, target_model_id, kind, classifier, dataset, params=None, attack_scenario=None, metrics=None):
        self.target_model_id = target_model_id #identifier of the target model, so it makes possible to map attacks and target models
        self.kind = kind #type of model: target, shadow, attack
        self.classifier = classifier #name of the classifier
        self.dataset = dataset #name of the dataset used to train the model
        self.params = params #parameters used. 'None' assumes default params
        self.attack_scenario = attack_scenario #Only relevant to the attack
        self.metrics = metrics #calculated metrics, expect a dictionary

    def show(self):
        '''
        Prints every stored field, followed by one line per metric.
        '''
        print("Target model ID:", self.target_model_id)
        print("Type:", self.kind)
        print("Classifier:", self.classifier)
        print("Parameters:", self.params)
        print("Dataset:", self.dataset)
        print("Attack scenario:", self.attack_scenario)
        print("Metrics")
        #metrics defaults to None, in that case there is nothing to list
        for metric, value in (self.metrics or {}).items():
            print(metric, value)

    def data_frame(self):
        '''
        Returns a one-row DataFrame with one column per field, parameter and metric.

        If a parameter or metric shares its name with an earlier column, the later value wins
        (fields first, then parameters, then metrics).
        '''
        d = {"Target model ID":self.target_model_id, "Type": self.kind, "Classifier":self.classifier,
                          "Dataset":self.dataset, "Attack scenario": self.attack_scenario}
        #None is treated as "nothing to add"; the instance itself is left untouched
        params = self.params or {}
        metrics = self.metrics or {}
        return pd.DataFrame.from_dict({**d, **params, **metrics}, orient='index').T

In [122]:
def create_dir(path:str):
    """
    Creates a new directory if it does not exist.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    path: directory to create.

    Raises FileExistsError if path exists but is not a directory.
    """
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(path, exist_ok=True)

Create the directory used to save images and result files, if it does not already exist.

In [123]:
# Results live in a 'results' folder directly under the project root.
results_dir = os.path.join(PROJECT_ROOT, 'results')
create_dir(results_dir)

Define available datasets

In [164]:
# Names of the datasets the experiments can use; the experiment loop passes
# each name to get_data_sklearn() to load it.
datasets = [
    'mimic2-iaccd',
    'in-hospital-mortality',
    'medical-mnist-ab-v-br-100',
    'indian liver',
    'texas hospitals 10'
]

In [165]:
# Maps a classifier name to its (uninstantiated) scikit-learn class.
# The names are also the keys of experiment_params, which holds the
# parameter values tried for each classifier.
classifiers = {
        'RandomForestClassifier':RandomForestClassifier, #bootstrap=False
        'DecisionTreeClassifier':DecisionTreeClassifier,
        'GaussianProcessClassifier':GaussianProcessClassifier,
        'MLPClassifier':MLPClassifier,
        'KNeighborsClassifier':KNeighborsClassifier,
        'SVC':SVC,#kernel='rbf', probability=True),
        'AdaBoostClassifier':AdaBoostClassifier #n_estimators=100)
}

In [166]:
# Parameter grid per classifier: classifier name -> {parameter name -> list of values}.
# The experiment loop trains one target model for every combination (Cartesian
# product) of the listed values. Commented-out entries are currently disabled.
experiment_params = {
    'RandomForestClassifier': {
        #'n_estimators': [10, 20, 100],
        #'criterion':['gini','entropy'],
        #'max_depth':[None,2,4],
        #'max_features':[None,'sqrt','log2'],
        'bootstrap': [True, False],
        'min_samples_split': [2, 10],
        #'class_weight':[None,'balanced','balanced_subsample'],
    },
    'DecisionTreeClassifier': {
        #'criterion':['gini','entropy'],
        'max_depth':[None,2,4],
        #'min_samples_split': [2, 10],
        #'max_features':[None,'sqrt','log2'],
        #'class_weight':[None,'balanced']
    },
    'GaussianProcessClassifier': {
        'max_iter_predict':[50,100,200],
        'warm_start':[True,False],
    },
    'MLPClassifier': {
        # NOTE: the scikit-learn parameter is spelled 'hidden_layer_sizes' (plural)
        #'hidden_layer_sizes':[(50,),(100,),(200,)],
        #'activation':['identity', 'logistic', 'tanh', 'relu'],
        'solver':['lbfgs', 'sgd', 'adam'],
        #'learning_rate': ['constant', 'invscaling', 'adaptive'],
        #'max_iter': [50,200,400,1000]
    },
    'KNeighborsClassifier': {
        'n_neighbors':[2,5,10,20],
        'weights':['uniform', 'distance'],
        #'algorithm':['ball_tree', 'kd_tree', 'brute']
    },
    'SVC': {
        # NOTE: the scikit-learn parameter is spelled 'kernel' (lower case)
        #'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
        #'decision_function_shape':['ovo', 'ovr'],
        #'max_iter':[-1, 2, 5],
        'probability':[True]
    },
    'AdaBoostClassifier': {
        #'n_estimators': [10, 20, 50, 100],
        # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases and
        # later removed - confirm the installed version still accepts it
        'algorithm':['SAMME', 'SAMME.R']
    }
}


In [167]:
class ResultsEntry():
    """
    One row of experiment results: what was run plus the metrics obtained.

    dataset_name: name of the dataset the target model was trained on
    scenario_name: name of the attack scenario
    classifier_name: name of the target classifier
    shadow_classifier_name: name of the shadow classifier, None if no shadow model was used
    shadow_dataset: description of the data used for the shadow model, None if not used
    params: dictionary of parameters of the target model, None means no parameters
    target_metrics: dictionary of metrics of the target model, None means no metrics
    shadow_metrics: dictionary of metrics of the shadow model, None means no metrics
    mia_metrics: dictionary of metrics of the attack model, None means no metrics
    """
    def __init__(self, dataset_name, scenario_name, classifier_name, shadow_classifier_name=None, shadow_dataset=None, params=None, target_metrics=None, shadow_metrics=None, mia_metrics=None):
        self.metadata = {
            'dataset': dataset_name,
            'scenario': scenario_name,
            'target_classifier': classifier_name,
            'shadow_classifier_name': shadow_classifier_name,
            'shadow_dataset': shadow_dataset
        }
        # None defaults (instead of {}) give every instance its own dictionaries,
        # a shared default dict would leak values from one entry into the next
        self.params = params if params is not None else {}
        self.target_metrics = target_metrics if target_metrics is not None else {}
        self.shadow_metrics = shadow_metrics if shadow_metrics is not None else {}
        self.mia_metrics = mia_metrics if mia_metrics is not None else {}

    def to_dataframe(self):
        """
        Returns a one-row DataFrame with one column per metadata field, parameter and metric.

        Columns are ordered metadata, params, target metrics, MIA metrics, shadow metrics;
        if two dictionaries share a key the later one wins.
        """
        return(
            pd.DataFrame.from_dict(
                {
                    **self.metadata,
                    **self.params,
                    **self.target_metrics,
                    **self.mia_metrics,
                    **self.shadow_metrics
                }, orient='index').T
            )

In [168]:

# Name of the classifier (a key of `classifiers`) used as the attack model in every scenario.
MIA_CLASSIFIER_NAME = "RandomForestClassifier"


def _prefixed_metrics(prefix, model, X, y):
    """
    Computes get_metrics(model, X, y) and puts '<prefix>_' in front of every metric name.

    prefix: label for the kind of model (target, shadow or mia).
    model: fitted classifier to evaluate.
    X, y: evaluation features and labels handed to get_metrics.
    """
    return {f"{prefix}_{key}": val for key, val in get_metrics(model, X, y).items()}


# Shadow data for Salem scenario 2; it does not depend on the loop variables, so load it once
X_breast_cancer, y_breast_cancer = skl_datasets.load_breast_cancer(return_X_y=True)

# One-row frames, one per (dataset, classifier, parameter combination, scenario).
# They are concatenated once at the end instead of growing a DataFrame inside the loop.
result_frames = []

# Only the first dataset is processed for now
for dataset in datasets[:1]:
    #load the data
    try:
        X, y = get_data_sklearn(dataset)
    except DataNotAvailable as e:
        print(e)
        continue
    #split into training, shadow model and validation data
    X_target_train, X_shadow_train, X_test, y_target_train, y_shadow_train, y_test = split_target_data(X.values, y.values)

    for classifier_name, clf_class in classifiers.items():
        param_names = list(experiment_params[classifier_name].keys())
        all_combinations = itertools.product(*experiment_params[classifier_name].values())
        for combination in all_combinations:

            # Turn this particular combination into a dictionary
            params = dict(zip(param_names, combination))

            # Train the target model
            target_classifier = clf_class(**params)
            target_classifier.fit(X_target_train, y_target_train)

            # Get target metrics
            target_metrics = _prefixed_metrics("target", target_classifier, X_test, y_test)

            ##########################################
            #######   Worst case scenario     ########
            ##########################################

            scenario = "Worst Case"
            mi_test_x, mi_test_y, mi_clf = worst_case_mia(
                target_classifier,
                X_target_train,
                X_test,
                mia_classifier=classifiers[MIA_CLASSIFIER_NAME]()
            )
            # Get MIA metrics
            mia_metrics = _prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y)

            new_results = ResultsEntry(
                dataset,
                scenario,
                classifier_name,
                params=params,
                target_metrics=target_metrics,
                mia_metrics=mia_metrics
            )
            result_frames.append(new_results.to_dataframe())

            ##########################################
            #######   Salem scenario 1        ########
            ##########################################

            # Shadow model: same classifier and parameters, trained on the held-out shadow split
            scenario = "Salem1"
            mi_test_x, mi_test_y, mi_clf, shadow_model, X_shadow_test, y_shadow_test = salem(
                target_classifier,
                clf_class(**params),
                X_target_train,
                X_shadow_train,
                y_shadow_train,
                X_test,
                mia_classifier=classifiers[MIA_CLASSIFIER_NAME]()
            )

            # Get Shadow and MIA metrics
            shadow_metrics = _prefixed_metrics("shadow", shadow_model, X_shadow_test, y_shadow_test)
            mia_metrics = _prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y)

            new_results = ResultsEntry(
                dataset,
                scenario,
                classifier_name,
                shadow_classifier_name=classifier_name,
                shadow_dataset='Same distribution',
                params=params,
                target_metrics=target_metrics,
                mia_metrics=mia_metrics,
                shadow_metrics=shadow_metrics,
            )
            result_frames.append(new_results.to_dataframe())

            ##########################################
            #######   Salem scenario 2        ########
            ##########################################

            # Shadow model: same classifier and parameters, trained on an unrelated dataset
            shadow_dataset = 'Breast cancer'
            scenario = "Salem2"

            mi_test_x, mi_test_y, mi_clf, shadow_model, X_shadow_test, y_shadow_test = salem(
                target_classifier,
                clf_class(**params),
                X_target_train,
                X_breast_cancer,
                y_breast_cancer,
                X_test,
                mia_classifier=classifiers[MIA_CLASSIFIER_NAME]()
            )

            # Get Shadow and MIA metrics
            shadow_metrics = _prefixed_metrics("shadow", shadow_model, X_shadow_test, y_shadow_test)
            mia_metrics = _prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y)

            new_results = ResultsEntry(
                dataset,
                scenario,
                classifier_name,
                shadow_classifier_name=classifier_name,
                shadow_dataset=shadow_dataset,
                params=params,
                target_metrics=target_metrics,
                shadow_metrics=shadow_metrics,
                mia_metrics=mia_metrics
            )
            result_frames.append(new_results.to_dataframe())

# pd.concat raises on an empty list, e.g. when no dataset could be loaded
results_df = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()


INFO:C:\Users\simonr04\git\GRAIMatter\data_preprocessing\data_interface.py:DATASET FOLDER = C:\Users\simonr04\git\GRAIMatter\data
INFO:C:\Users\simonr04\git\GRAIMatter\data_preprocessing\data_interface.py:Loading mimic2-iaccd
INFO:C:\Users\simonr04\git\GRAIMatter\data_preprocessing\data_interface.py:Preprocessing
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics[

  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['FAR'] = fp / (fp + tp) #proportion of things classified as positives that are incorrect, also known as false discovery rate
  metrics['PPV'] = tp / (tp + fp) #precision or positive predictive value
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['FAR'] = fp / (fp + tp) #proportion of things classified as positives that are incorrect, also known as false discovery rate
  metrics['PPV'] = tp / (tp + fp) #precision or positive predictive value
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / 

  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['PLR'] = metrics['TPR'] / metrics['FPR'] #positive likelihood ratio
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['NPV'] = tn / (tn + fn) #negative predictive value
  metrics['NLR'] = metrics['FNR'] / metrics['TNR'] #negative likelihood ratio
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR

  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event when there are two possible outcomes
  metrics['OR'] = metrics['PLR'] / metrics['NLR'] #odds ratio, the odds ratio is used to find the probability of an outcome of an event

In [170]:
# Preview the first rows where the attack model's TPR (column mia_TPR) is above 0.7.
results_df[results_df.mia_TPR > 0.7].head()

Unnamed: 0,dataset,scenario,target_classifier,shadow_dataset,bootstrap,min_samples_split,target_TPR,target_FPR,target_FAR,target_TNR,...,shadow_NLR,shadow_OR,max_depth,max_iter_predict,warm_start,solver,n_neighbors,weights,probability,algorithm
0,mimic2-iaccd,Worst Case,RandomForestClassifier,,True,2,1.0,0.014925,0.005882,0.985075,...,,,,,,,,,,
1,mimic2-iaccd,Salem1,RandomForestClassifier,Same distribution,True,2,1.0,0.014925,0.005882,0.985075,...,0.0,inf,,,,,,,,
6,mimic2-iaccd,Worst Case,RandomForestClassifier,,False,2,1.0,0.0,0.0,1.0,...,,,,,,,,,,
7,mimic2-iaccd,Salem1,RandomForestClassifier,Same distribution,False,2,1.0,0.0,0.0,1.0,...,0.0,inf,,,,,,,,
8,mimic2-iaccd,Salem2,RandomForestClassifier,Breast cancer,False,2,1.0,0.0,0.0,1.0,...,0.012144,1089.0,,,,,,,,


In [138]:
# List the columns collected in results_df.
results_df.columns

Index(['dataset', 'scenario', 'target_classifier', 'shadow_dataset',
       'bootstrap', 'min_samples_split', 'target_TPR', 'target_FPR',
       'target_FAR', 'target_TNR', 'target_PPV', 'target_NPV', 'target_FNR',
       'target_ACC', 'target_Advantage', 'target_PLR', 'target_NLR',
       'target_OR', 'mia_TPR', 'mia_FPR', 'mia_FAR', 'mia_TNR', 'mia_PPV',
       'mia_NPV', 'mia_FNR', 'mia_ACC', 'mia_Advantage', 'mia_PLR', 'mia_NLR',
       'mia_OR', 'shadow_TPR', 'shadow_FPR', 'shadow_FAR', 'shadow_TNR',
       'shadow_PPV', 'shadow_NPV', 'shadow_FNR', 'shadow_ACC',
       'shadow_Advantage', 'shadow_PLR', 'shadow_NLR', 'shadow_OR',
       'max_depth', 'max_iter_predict', 'warm_start', 'solver', 'n_neighbors',
       'weights'],
      dtype='object')

In [None]:
# Combine the one-row frames of every entry in `sets` into a single DataFrame.
# NOTE(review): `sets` is not defined in the visible part of this notebook; presumably it
# maps names to model_info instances - it must exist before this cell is run.
model_frames = [info.data_frame() for info in sets.values()]
# Reversed so the rows keep the order of the former prepend-in-a-loop version (last entry first);
# pd.concat raises on an empty list, hence the fallback to an empty frame.
df = pd.concat(model_frames[::-1], ignore_index=True) if model_frames else pd.DataFrame()

In [None]:
# Sum the metric columns for every combination of the grouping keys.
# The columns are selected with a list: selecting several GroupBy columns with a bare
# tuple (['TPR', 'FPR', ...] without the extra brackets) is rejected by pandas >= 2.0.
metric_columns = ['TPR', 'FPR',
                  'FAR', 'TNR',
                  'PPV', 'NPV',
                  'FNR', 'ACC',
                  'Advantage',
                  ]
                  #'PLR', 'NLR',
                  #'OR']
df.groupby(['Target model ID', 'Classifier', 'Attack scenario', 'Type', "Dataset"])[metric_columns].sum()#.reset_index()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Scratch check: build a RandomForestClassifier with min_samples_split set explicitly.
rf = RandomForestClassifier(min_samples_split = 2)

In [None]:
# Print the estimator's text representation.
print(rf)