# Notebook to loop over datasets, models, scenarios and hyper-parameters

In [1]:
import os
import sys
import json
import importlib
import hashlib
import itertools
import warnings
import logging
import pandas as pd
from metrics import get_metrics
from sklearn import datasets as skl_datasets
from scenarios import *
from plots import *
from experiments import ResultsEntry

# PROJECT_ROOT is the directory two levels above the current working
# directory ('.' is resolved against the directory the notebook is run from).
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
# Appended (not prepended), so modules already importable keep priority.
sys.path.append(PROJECT_ROOT)
#print(PROJECT_ROOT)
# NOTE(review): these two packages presumably live under PROJECT_ROOT, which
# is why they are imported only after the sys.path change -- confirm.
from data_preprocessing.data_interface import get_data_sklearn, DataNotAvailable
from synth_conditional import pategan_classifier

import seaborn as sns

INFO:/home/ec2-user/GRAIMatter/data_preprocessing/data_interface.py:ROOT PROJECT FOLDER = /home/ec2-user/GRAIMatter
DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
Instructions for updating:
non-resource variables are not supported in the long term


Load the experimental config from the config file. The config file should include a single dictionary with the following keys:
1. `datasets`: a list of the dataset names (strings) to use
1. `classifiers`: a list of the classifiers to use. Each item is itself a list with two elements, the first being the string of the module (e.g. `"sklearn.svm"`) and the second being the class name (e.g. `"SVC"`)
1. `experiment_params`: a dictionary keyed by the classifier class names from above. Each value is another dictionary whose keys are the hyper-parameters to vary and whose values are lists of the values each hyper-parameter should take. Note: in JSON, use `true`, `false`, `null` instead of `True`, `False`, `None`.
1. `results_filename`: the name of the file in which to save the results. An existing file with this name is overwritten without warning.
1. `n_reps`: how many times to repeat each combination of dataset, model, scenario, hyper-parameter
1. `mia_classifier`: the classifier to use for the attack model. Currently restricted to default hyper-params. Should be specified as a list with two items, as per `classifiers`
1. `scenarios`: a list of the scenarios to run. At the moment this can only be a subset of `"WorstCase"`, `"Salem1"` and `"Salem2"`.

An example config file is provided as `example_loop_experiment_config.json`

In [2]:
# Path of the JSON experiment config that is opened and parsed below.
# The commented-out lines are alternative configs; uncomment exactly one.
#CONFIG_FILENAME = "example_loop_experiment_config.json"
#CONFIG_FILENAME = "randomForest_config.json"
CONFIG_FILENAME = "simple_synth_experiment_config.json"

In [3]:
def _load_class(module_name, class_name):
    """Import ``module_name`` and return its attribute named ``class_name``."""
    return getattr(importlib.import_module(module_name), class_name)


# Parse the JSON experiment configuration into a dictionary.
with open(CONFIG_FILENAME, 'r') as f:
    config = json.load(f)

# Names of the datasets to loop over.
datasets = config['datasets']

# [module name, class name] pairs, resolved to a {class name: class} mapping.
classifier_strings = config['classifiers']
classifiers = {
    class_name: _load_class(module_name, class_name)
    for module_name, class_name in classifier_strings
}

# Per-classifier hyper-parameter grids, output file name and repetition count.
experiment_params = config['experiment_params']
results_filename = config['results_filename']
n_reps = config['n_reps']

# Attack model class, given as a [module name, class name] pair.
mia_classifier_module, mia_classifier_name = config['mia_classifier']
mia_classifier = _load_class(mia_classifier_module, mia_classifier_name)

# Names of the attack scenarios to run.
scenarios = config['scenarios']

@Alba: I didn't remove the commented-out cell below in case you want to keep all of those hyper-parameter lists.

In [4]:
# experiment_params = {
#     'RandomForestClassifier': {
#         #'n_estimators': [10, 20, 100],
#         #'criterion':['gini','entropy'],
#         #'max_depth':[None,2,4],
#         #'max_features':[None,'sqrt','log2'],
#         'bootstrap': [True, False],
#         'min_samples_split': [2, 10],
#         #'class_weight':[None,'balanced','balanced_subsample'],
#     },
#     'DecisionTreeClassifier': {
#         #'criterion':['gini','entropy'],
#         'max_depth':[None,2,4],
#         #'min_samples_split': [2, 10],
#         #'max_features':[None,'sqrt','log2'],
#         #'class_weight':[None,'balanced']
#     },
#     'GaussianProcessClassifier': {
#         'max_iter_predict':[50,100,200],
#         'warm_start':[True,False],
#     },
#     'MLPClassifier': {
#         #'hidden_layer_sizes':[(50,),(100,),(200,)],
#         #'activation':['identity', 'logistic', 'tanh', 'relu'],
#         'solver':['lbfgs', 'sgd', 'adam'],
#         #'learning_rate': ['constant', 'invscaling', 'adaptive'],
#         #'max_iter': [50,200,400,1000]
#     },
#     'KNeighborsClassifier': {
#         'n_neighbors':[2,5,10,20],
#         'weights':['uniform', 'distance'],
#         #'algorithm':['ball_tree', 'kd_tree', 'brute']
#     },
#     'SVC': {
#         #'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
#         #'decision_function_shape':['ovo', 'ovr'],
#         #'max_iter':[-1, 2, 5],
#         'probability':[True]
#     },
#     'AdaBoostClassifier': {
#         #'n_estimators': [10, 20, 50, 100],
#         'algorithm':['SAMME', 'SAMME.R']
#     }
# }


In [5]:
def _sha256_id(*parts):
    """Return the SHA-256 hex digest of the space-joined string forms of ``parts``."""
    joined = ' '.join(str(part) for part in parts)
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()


def _prefixed_metrics(prefix, model, X, y):
    """Return ``get_metrics(model, X, y)`` with every key renamed to ``<prefix>_<key>``."""
    return {f"{prefix}_{key}": val for key, val in get_metrics(model, X, y).items()}


def _results_frame(scenario, dataset, classifier_name, params, repetition,
                   model_data_param_id, param_id, **entry_kwargs):
    """Build the ``ResultsEntry`` for one scenario run and return it as a data frame.

    ``entry_kwargs`` are forwarded unchanged to ``ResultsEntry`` (metrics and,
    for the shadow-model scenarios, the shadow dataset / classifier names).
    The attack classifier name is taken from the module-level
    ``mia_classifier_name``.
    """
    # ID for dataset, classifier, parameters and scenario
    # (but not repetition/random split)
    full_id = _sha256_id(dataset, classifier_name, params, scenario)
    entry = ResultsEntry(
        full_id, model_data_param_id, param_id,
        dataset,
        scenario,
        classifier_name,
        attack_classifier_name=mia_classifier_name,
        repetition=repetition,
        params=params,
        **entry_kwargs
    )
    return entry.to_dataframe()


# One data frame per (dataset, repetition, classifier, params, scenario) run.
# They are concatenated once at the end instead of growing ``results_df``
# inside the loop, which re-copied every previous row on each iteration.
result_frames = []
results_df = pd.DataFrame()

#if not sys.warnoptions:
#    warnings.simplefilter("once")
    #MPLClassifier is giving a lot of warnings.
    # For each repetition are the same, so it will only show the same warning once.

# The Salem2 shadow data does not depend on any loop variable: load it once.
if "Salem2" in scenarios:
    X_breast_cancer, y_breast_cancer = skl_datasets.load_breast_cancer(return_X_y=True)

try:
    for dataset in datasets:#[:1]:
        #load the data
        try:
            X, y = get_data_sklearn(dataset)
        except DataNotAvailable as e:
            # Report the unavailable dataset and carry on with the next one.
            print(e)
            continue
        print(dataset)
        for r in range(n_reps):
            #split into training, shadow model and validation data
            # (the repetition index doubles as the random state of the split)
            (X_target_train, X_shadow_train, X_test,
             y_target_train, y_shadow_train, y_test) = split_target_data(X.values, y.values, r_state=r)

            for classifier_name, clf_class in classifiers.items():
                param_grid = experiment_params[classifier_name]
                # Every combination of the listed hyper-parameter values
                for combination in itertools.product(*param_grid.values()):

                    # Turn this particular combination into a dictionary
                    params = dict(zip(param_grid.keys(), combination))
                    target_classifier = clf_class()
                    target_classifier.set_params(**params)

                    # Train the target model
                    target_classifier.fit(X_target_train, y_target_train)

                    # Fields shared by every scenario run for this target model
                    run_info = dict(
                        dataset=dataset,
                        classifier_name=classifier_name,
                        params=params,
                        repetition=r,
                        model_data_param_id=_sha256_id(dataset, classifier_name, params),
                        param_id=_sha256_id(params),
                        target_metrics=_prefixed_metrics("target", target_classifier, X_test, y_test),
                    )

                    ##########################################
                    #######   Worst case scenario     ########
                    ##########################################
                    if "WorstCase" in scenarios:
                        mi_test_x, mi_test_y, mi_clf = worst_case_mia(
                            target_classifier,
                            X_target_train,
                            X_test,
                            mia_classifier=mia_classifier()
                        )
                        result_frames.append(_results_frame(
                            "WorstCase",
                            mia_metrics=_prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y),
                            **run_info
                        ))

                    ##########################################
                    #######   Salem scenario 1        ########
                    ##########################################
                    if "Salem1" in scenarios:
                        # Shadow model is trained on the shadow part of the same split
                        mi_test_x, mi_test_y, mi_clf, shadow_model, X_shadow_test, y_shadow_test = salem(
                            target_classifier,
                            clf_class(**params),
                            X_target_train,
                            X_shadow_train,
                            y_shadow_train,
                            X_test,
                            mia_classifier=mia_classifier()
                        )
                        result_frames.append(_results_frame(
                            "Salem1",
                            shadow_dataset='Same distribution',
                            shadow_classifier_name=classifier_name,
                            mia_metrics=_prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y),
                            shadow_metrics=_prefixed_metrics("shadow", shadow_model, X_shadow_test, y_shadow_test),
                            **run_info
                        ))

                    ##########################################
                    #######   Salem scenario 2        ########
                    ##########################################
                    if "Salem2" in scenarios:
                        # Shadow model is trained on the breast cancer dataset instead
                        mi_test_x, mi_test_y, mi_clf, shadow_model, X_shadow_test, y_shadow_test = salem(
                            target_classifier,
                            clf_class(**params),
                            X_target_train,
                            X_breast_cancer,
                            y_breast_cancer,
                            X_test,
                            mia_classifier=mia_classifier()
                        )
                        result_frames.append(_results_frame(
                            "Salem2",
                            shadow_dataset='Breast cancer',
                            shadow_classifier_name=classifier_name,
                            mia_metrics=_prefixed_metrics("mia", mi_clf, mi_test_x, mi_test_y),
                            shadow_metrics=_prefixed_metrics("shadow", shadow_model, X_shadow_test, y_shadow_test),
                            **run_info
                        ))
finally:
    # Runs even when a run above raises, so the results gathered so far are
    # still available in ``results_df`` (as they were with per-run concat).
    if result_frames:
        results_df = pd.concat(result_frames, ignore_index=True)

warnings.simplefilter("default")#enable warnings again

INFO:/home/ec2-user/GRAIMatter/data_preprocessing/data_interface.py:DATASET FOLDER = /home/ec2-user/GRAIMatter/data
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


/home/ec2-user/GRAIMatter/data/Indian Liver Patient Dataset (ILPD).csv /home/ec2-user/GRAIMatter/data
indian liver


2022-03-16 10:49:09.965889: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1 1 0 0 1 0 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 1 0 1
 1 1 1 0 1 0 1 0 1 1 1 0]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

[0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


ValueError: too many values to unpack (expected 4)

In [6]:
# Save the results file: write results_df as CSV to results_filename,
# without the data frame index as a column.
results_df.to_csv(results_filename, index=False)

  and should_run_async(code)
