# Membership Inference Attack Revisited

Attack Model:
- Fairly accurate target model $f_{target}(x)$, CIFAR-10 classification model
- Attacker knows nothing about the architecture of the target model and creates his own NN architecture
- Attacker has no information in the dataset (TODO). Attacker uses query based datapoint generation (check Shadow Datasets generation section)
- $D_{target}$ training dataset and $\cup_i D_{shadow_i}$ dataset are disjoint


The target is to prove that in a **complete black-box scenario** the attacker can exploit every minor model leak.

CHANGES from previous session: 
- Target model is more generalized
- The attack model doesn't use c-different classifier, but 1 sole classifier and the learning insances contain the datapoint's class as a feature
- (TODO) The attack model only return a label as prediction and not a prediction vector
- (TODO) Use data synthesis algorithm
 

In [33]:
import numpy as np
import matplotlib.pyplot as plt

import math
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tqdm import tqdm
import sys
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


## Basic Models

Create basic model functions
- __Target model__: A small and simple CNN
- __N Shadow Models__: Same architecture as the target model
- __Attack Models__ : One for every class

In [132]:
def f_target(X_train, y_train, X_test=None, y_test=None, epochs=10):
  """
  Returns a trained target model, if test data are specified we will evaluate the model and print its accuracy
  """
  model = models.Sequential()
  model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
  model.add(layers.Conv2D(32, (3, 3), activation='relu'))
  model.add(layers.MaxPooling2D((2, 2)))
  model.add(layers.Conv2D(64, (3, 3), activation='relu'))
  model.add(layers.MaxPooling2D((2, 2)))


  model.add(layers.Flatten())
  model.add(layers.Dense(64, activation='relu'))
  model.add(layers.Dense(10))
  
  optimizer = keras.optimizers.Adam()
  model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
  if X_test is None or y_test is None:
    history = model.fit(X_train, y_train, epochs=epochs, 
                    validation_split=0.2)
  else:
    history = model.fit(X_train, y_train, epochs=epochs, 
                    validation_data=(X_test, y_test), verbose=True)
  return model

In [133]:
def f_shadow(X_train, y_train, X_test=None, y_test=None, epochs=10):
  model = models.Sequential()
  model.add(layers.Conv2D(32, (3, 3), activation='tanh', input_shape=(32, 32, 3)))
  model.add(layers.Conv2D(32, (3, 3), activation='tanh'))
  model.add(layers.MaxPooling2D((2, 2)))
  model.add(layers.Conv2D(64, (3, 3), activation='tanh'))
  model.add(layers.MaxPooling2D((2, 2)))

  model.add(layers.Flatten())
  model.add(layers.Dense(128, activation='tanh'))
  model.add(layers.Dense(10)   )
  
  optimizer = keras.optimizers.Adam(learning_rate=0.001)
  model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
  if X_test is None or y_test is None:
    history = model.fit(X_train, y_train, epochs=epochs, 
                    validation_split=0.2)
  else:
    history = model.fit(X_train, y_train, epochs=epochs, 
                    validation_data=(X_test, y_test))
  return model

In [134]:
def __f_attack(X_train, y_train, X_test, y_test, epochs=30):
  print(X_train.shape, X_test.shape)
  model = models.Sequential()
  model.add(layers.Dense(10, activation='relu', input_shape=(X_train.shape[1], )))
  model.add(layers.Dense(100, activation='relu'))
  model.add(layers.Dense(1000, activation='relu'))
  model.add(layers.Dense(100, activation='relu'))
  model.add(layers.Dense(10, activation='softmax'))
  model.add(layers.Dense(2))
  
  optimizer = keras.optimizers.Adam()
  model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
  history = model.fit(X_train, y_train, epochs=epochs, 
                    validation_data=(X_test, y_test), verbose=True)
  
  return model


def f_attack(X, y):
  # X_i = (class, probability vector, )
  classes = np.unique(train_labels) # all class labels
  with tf.device('/gpu:0'):
  # split to train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2)
    attack_model = __f_attack(X_train, y_train, X_test, y_test) 
  return attack_model

In [104]:
with tf.device('/gpu:0'):
  (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
  train_images = train_images[:20000] # as the paper attack train wiht only 200 records
  train_labels = train_labels[:20000]
  # use the rest as testing - 'out' records
  test_images = np.concatenate((train_images[20000:], test_images))
  test_labels = np.concatenate((train_labels[20000:], test_labels))

In [119]:
with tf.device('/gpu:0'):
  target_model = f_target(train_images, train_labels) 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [94]:
# return prediction vector
def predict(model, X_i):
  prob_layer = layers.Softmax()
  return prob_layer(model(X_i.reshape((1, 32, 32, 3)))).numpy()[0]

def rand_record(X=None, k=1):
  if X is None:
    # create a whole new record
    X = np.random.randint(0, 255+1, size=32*32*3).reshape((32, 32, 3))
  else:
    X = X.reshape((32*32*3))
    # change k random features
    k_features = np.random.choice(range(X.shape[0]), size=k, replace=False)

    for i in k_features:
      X[i] += np.random.randint(-X[i], 256-X[i]) # subtract/add a number to change the feature 
  
  return X.reshape((32,32,3))

def synthesize(c, target_model, k_min, k_max, conf_min, iter_max, rej_max):
  X = rand_record()
  y_conf_star = 0.0
  j = 0
  k = k_max
  X_star = None

  for iter in range(iter_max):
    y = predict(target_model, X);
    if y[c] >= y_conf_star:
      if y[c] > conf_min and c == np.argmax(y):
        # sample to decide if we return the data
        if np.random.randint(0, 2):
          return X
      y_conf_star = y[c]
      j = 0
      X_star = X
    else:
      # reject and resample X
      j += 1

      if j > rej_max:
        k = max(k_min, math.ceil(k/2))
        j = 0
    
    X = rand_record(X_star, k)
  
  return X_star # failed. return the last successfull record

In [95]:

def divide_dataset(n_shadows, shadow_dataset_size, X, y):
  D_shadows = []
  rng = np.random.default_rng()
  for i in range(n_shadows):
    sample_i = np.random.choice(range(X.shape[0]), shadow_dataset_size, replace=False)
    assert np.unique(sample_i).shape[0] == shadow_dataset_size # sanity check
    D_shadows.append((X[sample_i, :], y[sample_i, :]))
  return D_shadows

# returns a list of 'n_shadows' datasets
def generate_shadow_dataset(target_model, n_shadows, shadow_dataset_size, n_classes, X_test=None, y_test=None):
  # in case we give test data we will just divide those to train the shadow models
  if X_test is not None and y_test is not None:
    return divide_dataset(n_shadows, shadow_dataset_size, X_test, y_test)
  
  
  # helper function to return a datapoint (for sure)
  def get_shadow_datapoint(c):
    X_i = None
    with tf.device('/gpu:0'):
      while X_i is None:
        X_i = synthesize(c, target_model, 1, 32*32*3, 0.65, 100, 5)
    return X_i
  
  D_shadows = []
  for i in range(n_shadows):
    print(f"Generating D_shadow_{i}")
    # uniformly generate X's for all of the classes (y's)
    X_shadow = np.asarray(
        [get_shadow_datapoint(i%n_classes) for i in range(shadow_dataset_size)]
    )
    y_shadow = np.asarray(
        [(i%n_classes) for i in range(shadow_dataset_size)]
    ).reshape((-1, 1))

    D_shadows.append((X_shadow, y_shadow))
  
  return D_shadows

def create_shadows(D_shadows):
  shadow_models = [] # shadow model list

  for D_shadow in D_shadows:
    # sample data to feed/evaluate the model
    X_shadow, y_shadow = D_shadow
    shadow_X_train, shadow_X_test, shadow_y_train, shadow_y_test = train_test_split(X_shadow, y_shadow, shuffle=True, test_size=0.2)

    # generate the shadow model
    shadow_model = f_shadow(shadow_X_train, shadow_y_train, shadow_X_test, shadow_y_test)

    D_shadow = (shadow_X_train, shadow_y_train), (shadow_X_test, shadow_y_test)
    shadow_models.append((shadow_model, D_shadow))

  return shadow_models # return a list where every item is (model, acc), train-data, test-data

In [96]:
# generate shadow datasets
D_shadows = generate_shadow_dataset(target_model, 15, 5000, 10, test_images, test_labels)

In [97]:
# train the shadow models
shadow_models = create_shadows(D_shadows)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [130]:
# helper function to prepare each shadow dataset batch
def prepare_batch(shadow_model, X, y, in_D=True):
  #decide membership
  y_member = np.ones(shape=(y.shape[0], 1)) if in_D else np.zeros(shape=(y.shape[0], 1))
  
  # get probability vector
  prob_layer = layers.Softmax() # probability layer implementing softmax for mapping NN results to probabilities in [0, 1]
  prob_vec = prob_layer(shadow_model(X)).numpy()
  
  # return an instance <actual class, prob_vec from shadow model, 'in'/'out' D_target membership> 
  return np.concatenate((y.reshape(-1, 1), prob_vec, y_member), axis=1)

def generate_attack_dataset(shadow_models):
  # input is a list where items are model, (X_train, y_train), (X_test, y_test)

  D_attack = None
  # D_attack_i format = <class, prob_vec, membership label (1 or 0)> 
  for shadow_model, ((X_train, y_train), (X_test, y_test)) in shadow_models:
    batch = np.concatenate((
        prepare_batch(shadow_model, X_train, y_train, True), # members of shadow dataset 
        prepare_batch(shadow_model, X_test, y_test, False)   # non members of shadow dataset
    ))   

    D_attack = np.concatenate((D_attack, batch)) if D_attack is not None else batch  

  return D_attack 

In [131]:
D_attack = generate_attack_dataset(shadow_models)

In [135]:
attack_model = f_attack(D_attack[:, :-1], D_attack[:, -1])

(60000, 11) (15000, 11)
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [136]:
def evaluate_attack(attack_model, X_attack, y_attack, n_classes):
  acc_per_class = []
  for c in range(n_classes):
    class_instances = X_attack[:, 0] == c # get same class samples
    test_loss, test_acc = attack_model.evaluate(X_attack[class_instances, :], y_attack[class_instances], verbose=0)
    acc_per_class.append(test_acc)
    print(f"class-{c+1}: {test_acc}")
  return acc_per_class



In [137]:
# create a test dataset 

D_out = prepare_batch(target_model, test_images, test_labels, False)
D_in = prepare_batch(target_model, train_images[:10000], train_labels[:10000], True)
print("Testing with 'in' data only:")
res_in = evaluate_attack(attack_model, D_in[:, :-1], D_in[:, -1], 10)

print("\nTesting with 'out' data only:")
res_out = evaluate_attack(attack_model, D_out[:, :-1], D_out[:, -1], 10)

print("\nTesting with all prev data: ")
res_all = evaluate_attack(attack_model, np.concatenate((D_out[:, :-1], D_in[:, :-1])), np.concatenate((D_out[:, -1], D_in[:, -1])), 10)


Testing with 'in' data only:
class-1: 0.7273631691932678
class-2: 0.8675564527511597
class-3: 0.5784883499145508
class-4: 0.6072834730148315
class-5: 0.8208208084106445
class-6: 0.8121665120124817
class-7: 0.9067960977554321
class-8: 0.8091908097267151
class-9: 0.8936585187911987
class-10: 0.8725789785385132

Testing with 'out' data only:
class-1: 0.5350000262260437
class-2: 0.35199999809265137
class-3: 0.6869999766349792
class-4: 0.7279999852180481
class-5: 0.5299999713897705
class-6: 0.5139999985694885
class-7: 0.27900001406669617
class-8: 0.43799999356269836
class-9: 0.3059999942779541
class-10: 0.38999998569488525

Testing with all prev data: 
class-1: 0.6314214468002319
class-2: 0.6063829660415649
class-3: 0.6318897604942322
class-4: 0.6671627163887024
class-5: 0.6753376722335815
class-6: 0.6582343578338623
class-7: 0.5975369215011597
class-8: 0.6236881613731384
class-9: 0.6034567952156067
class-10: 0.6289752721786499


In [None]:
attack_model_bundle