In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

from mia.shadow_models import *
from mia.attack_model import *
from mia.utilities import *
from mia.wrappers import *

from tqdm import tqdm
import sys
# Report how many GPU devices TensorFlow lists on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


2022-01-11 12:17:11.577337: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-11 12:17:11.577383: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Num GPUs Available:  0


2022-01-11 12:17:14.531776: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-01-11 12:17:14.576868: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-11 12:17:14.578766: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce MX130 computeCapability: 5.0
coreClock: 1.189GHz coreCount: 3 deviceMemorySize: 1.96GiB deviceMemoryBandwidth: 37.33GiB/s
2022-01-11 12:17:14.579133: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-11 12:17:14.579301: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'l

## Data Preprocessing, D_in & D_out and Target Model Creation

OK. Let's first load our dataset and take a peek at the data.

In [2]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

--2022-01-11 12:17:14--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3,8M) [application/x-httpd-php]
Saving to: ‘adult.data.4’


2022-01-11 12:17:17 (1,97 MB/s) - ‘adult.data.4’ saved [3974305/3974305]



In [3]:
# Column names for the dataset, in file order; passed as `names=` when the CSV is read.
cols = ['age',
        'workclass',
        'fnlwgt',
        'education',
        'education-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'native-country',
        'salary']

# Columns that are later cast to the pandas 'category' dtype and integer-encoded.
cat_cols = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country'
]

# Every column that is neither categorical nor the 'salary' label.
# A comprehension over `cols` (rather than set arithmetic) keeps the file order and
# gives the same list on every run; iterating a set of strings does not guarantee
# a stable order across interpreter sessions.
num_cols = [c for c in cols if c not in cat_cols and c != 'salary']

In [4]:
# Location of the downloaded dataset file, relative to the notebook's working directory.
dataset_path = "adult.data"

In [5]:
# Load the CSV, supplying the column names from `cols` and not using any column
# as the index.
data_df = pd.read_csv(
    dataset_path,
    names=cols,
    index_col=False,
)

In [6]:
# Binarise the label: 1 when the salary string contains '>50K', 0 otherwise.
# The match is evaluated once on the untouched string column and the whole column is
# replaced in a single assignment, so it never holds a mix of strings and integers
# and ends up with a proper integer dtype.
data_df['salary'] = data_df['salary'].str.contains('>50K').astype(np.int8)

In [7]:
# Drop every row in which any categorical column contains a '?' placeholder.
# regex=False matches the question mark literally, which avoids the invalid '\?'
# escape sequence in a normal (non-raw) string literal.
# na=True counts a missing value as a match, so such rows are dropped as well.
for col in cat_cols:
  data_df = data_df[~data_df[col].str.contains('?', regex=False, na=True)]

In [8]:
# Replace each categorical column by the integer codes of its 'category' dtype.
for name in cat_cols:
  data_df[name] = data_df[name].astype('category').cat.codes

In [9]:
# Split labels from features.
# `pop` removes 'salary' from data_df in place, so X below contains only the
# remaining feature columns. NOTE: because of that mutation this cell cannot be
# re-run without first re-creating data_df.
y = data_df.pop('salary').to_numpy(dtype=np.int8)
X = data_df.to_numpy(dtype=np.float64) 

In [10]:
# First cut: one half of the records is used for the target model, the other half
# is handed to the attacker.
X_target, X_attacker, y_target, y_attacker = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0,
    shuffle=True,
)

# Second cut: 33% of the target's half is held out as its test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_target, y_target,
    test_size=0.33,
    random_state=0,
    shuffle=True,
)

In [11]:
# The target model: a decision tree fitted on the target's training split.
target_model = DecisionTreeClassifier(random_state=0)
target_model.fit(X_train, y_train.reshape(-1))

In [12]:
# Print the classification report on the training split, then on the held-out split.
print('\n'.join([
    'Training Test Scores:',
    classification_report(y_train, target_model.predict(X_train)),
    'Validation Test Scores:',
    classification_report(y_test, target_model.predict(X_test)),
]))

Training Test Scores:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7625
           1       1.00      1.00      1.00      2479

    accuracy                           1.00     10104
   macro avg       1.00      1.00      1.00     10104
weighted avg       1.00      1.00      1.00     10104

Validation Test Scores:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      3734
           1       0.60      0.62      0.61      1243

    accuracy                           0.80      4977
   macro avg       0.74      0.74      0.74      4977
weighted avg       0.80      0.80      0.80      4977



In [13]:
# From here on `target_model.predict` is the bound `predict_proba` method, so it
# returns per-class probabilities instead of hard labels.
# NOTE(review): this makes the notebook order-dependent — re-running the earlier
# classification_report cell after this one would pass probabilities, not labels,
# to classification_report.
target_model.predict = target_model.predict_proba

So we clearly see that the decision tree is overfitted to the data. It is time to run our normal attack on some data that lacks certain features.

## Attacker-Shadows Dataset separation

We will divide the attacker dataset into
- An attack evaluation dataset of instances that we know belong to $D_{out}$
- An attack evaluation dataset of instances that we know belong to $D_{in}$
- Instances from the dataset distribution that we will use to train the shadow and attack models. These will also be used as a guide to fill in null values later.



In [14]:
# Attacker data: 67% is kept for training the shadow/attack models; the remaining
# 33% is held out as evaluation records that were not used to fit the target (D_out).
(X_attacker_train, X_attacker_test_out,
 y_attacker_train, y_attacker_test_out) = train_test_split(
    X_attacker, y_attacker, test_size=0.33, random_state=0, shuffle=True)

# Keep only the 33% "test" part of a split of the target's training data: these are
# evaluation records that were used to fit the target (D_in).
_, X_attacker_test_in, _, y_attacker_test_in = train_test_split(
    X_train, y_train, test_size=0.33, random_state=0, shuffle=True)


## Filling null features from the dataset distribution.

For research purposes we will cross out some categorical and numerical datapoints with a probability of $10\%$ and fill them in using the following methods:

- for numerical features, use the mean of the column
- for categorical features, find the label, group by it, find the most common value across same-labeled instances, and assign it to the null case.

In [15]:
def nullify_features_randomly(_df, prob_of_null=0.1, columns=None, seed=None):
  """Return a DataFrame copy of `_df` in which cells are randomly replaced by NaN.

  Every cell (in every column, including a label column if one is present) is
  blanked independently with probability `prob_of_null`. The input is not modified.

  Parameters
  ----------
  _df : array-like or DataFrame
      Two-dimensional data whose columns correspond to `columns`.
  prob_of_null : float, default 0.1
      Probability that an individual cell becomes missing.
  columns : sequence of str, optional
      Column names for the result. Defaults to the notebook-level `cols` list.
  seed : int, optional
      Seed for the random generator; pass a value to get the same mask on every
      call. With the default None a freshly seeded generator is used, so results
      differ between calls (and do not depend on `np.random.seed`).

  Returns
  -------
  pandas.DataFrame
      Same shape as the input, with the selected cells set to NaN.
  """
  if columns is None:
    columns = cols  # notebook-level column list; only read, so no `global` needed
  df = pd.DataFrame(_df, columns=columns).copy()
  rng = np.random.default_rng(seed)
  # One uniform draw per cell; a cell is blanked when its draw falls below the
  # requested probability (i.e. it is kept when draw >= prob_of_null).
  drop_mask = rng.random(df.shape) < prob_of_null
  return df.mask(drop_mask)

In [16]:
# Append the labels as a last column to the D_out evaluation features, then
# randomly blank cells of the combined table.
nulled_df = nullify_features_randomly(
    np.concatenate(
        (X_attacker_test_out, y_attacker_test_out[:, np.newaxis]),
        axis=1,
    )
)

In [17]:
# Number of missing entries per column after the random nullification.
nulled_df.isna().sum()

age               508
workclass         517
fnlwgt            501
education         479
education-num     498
marital-status    463
occupation        489
relationship      479
race              530
sex               552
capital-gain      482
capital-loss      477
hours-per-week    501
native-country    524
salary            485
dtype: int64

In [18]:
# Settings for the shadow models used by the following cells.
N_SHADOWS = 5                # number of shadow models in the batch
SHADOW_EPOCHS = 50           # passed to ShadowModelBatch.fit_all
SHADOW_DATASET_SIZE = 10000  # passed to generate_shadow_dataset


In [19]:
def f_shadow():
  """Build one unfitted decision tree whose `predict` returns class probabilities."""
  shadow = DecisionTreeClassifier(random_state=0)
  # Rebind `predict` on this instance to its own bound `predict_proba` method.
  setattr(shadow, 'predict', shadow.predict_proba)
  return shadow

In [20]:
# Create the batch of N_SHADOWS shadow models, with f_shadow supplied as the model
# factory (presumably called once per shadow model — confirm in mia.shadow_models).
shadow_models = ShadowModelBatch(N_SHADOWS, f_shadow, model_type='sklearn')

In [21]:
# Build the shadow datasets from the attacker's training split.
# NOTE(review): the positional literal 2 is presumably the number of classes
# (salary is binary) — confirm against generate_shadow_dataset's signature.
D_shadows = generate_shadow_dataset(target_model, N_SHADOWS, SHADOW_DATASET_SIZE, 2, attacker_X=X_attacker_train, attacker_y=y_attacker_train)

In [22]:
# Switch on the class-level VERBOSE flag, then fit all shadow models on D_shadows.
# NOTE(review): SHADOW_EPOCHS is passed even though the shadows are decision trees;
# presumably it is ignored for model_type='sklearn' — confirm in mia.shadow_models.
ShadowModelBatch.VERBOSE=True
shadow_models.fit_all(D_shadows, SHADOW_EPOCHS)

In [23]:
# Create the attack model on top of the fitted shadow models.
# NOTE(review): 2 is presumably the number of classes and (3,) the attack model's
# input shape; 'adam' looks like the optimizer name — confirm in mia.attack_model.
attack_model = DefaultAttackModel(shadow_models, 2, (3,), 'adam')

2022-01-11 12:17:20.049640: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-11 12:17:20.050362: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-01-11 12:17:20.050391: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      


In [24]:
# Switch on the class-level VERBOSE flag, then train the attack model for 50 epochs.
DefaultAttackModel.VERBOSE = True
attack_model.fit(epochs=50)

Preparing shadow batch of size 6600
Done!
Preparing shadow batch of size 6600
Done!
Preparing shadow batch of size 6600
Done!
Preparing shadow batch of size 6600
Done!
Preparing shadow batch of size 6600
Done!
Epoch 1/50


2022-01-11 12:17:20.610023: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-11 12:17:20.630652: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2899885000 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f171f49a400>

In [25]:
# Build attack-evaluation batches by querying the target model: the last positional
# argument is True for the D_in sample and False for the D_out sample.
D_in = attack_model.prepare_batch(target_model, X_attacker_test_in, y_attacker_test_in, True)
D_out = attack_model.prepare_batch(target_model, X_attacker_test_out, y_attacker_test_out, False)
D_all = np.concatenate((D_in, D_out))
# All columns but the last are passed as inputs, the last column as the labels.
# NOTE(review): the recorded run of this cell printed "(0, 14) (0,)" and raised
# "ValueError: not enough values to unpack (expected 2, got 0)". It looks like an
# empty array reaches the mia code somewhere in this cell — inspect what
# prepare_batch receives and returns before trusting any evaluation result.
attack_model.evaluate(D_all[:, :-1], D_all[:, -1], verbose=1)

(0, 14) (0,)


ValueError: not enough values to unpack (expected 2, got 0)