# Membership Inference Attack Revisited

Attack Model:
- Fairly accurate target model $f_{target}(x)$, CIFAR-10 classification model
- The attacker knows nothing about the architecture of the target model and creates their own NN architecture
- The attacker has no information about the dataset (TODO). The attacker uses query-based datapoint generation (see the Shadow dataset section)
- $D_{target}$ training dataset and $\cup_i D_{shadow_i}$ dataset are disjoint


The goal is to show that, in a **complete black-box scenario**, the attacker can exploit every minor leak of the model.

CHANGES from previous session: 
- Target model is more generalized
- The attack model doesn't use $c$ different classifiers, but a single classifier, and the learning instances contain the datapoint's class as a feature
- (TODO) The attack model only returns a label as its prediction, not a prediction vector
- (TODO) Use data synthesis algorithm
 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import math
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tqdm.notebook import tqdm
import sys
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


## F_target

We create a simple CNN model. 

In [None]:
def f_target(X_train, y_train, X_test=None, y_test=None, epochs=100):
  """
  Build, compile and train the target CNN and return the fitted model.

  The network stacks three convolutional stages (32-32, 64-64 and 128 filters,
  each followed by 2x2 max-pooling) and a 128-64-32-10 dense head. The last
  layer has no activation, so the model outputs raw logits.

  When both X_test and y_test are given they are used as validation data while
  training; otherwise 20% of the training data is held out for validation.
  """
  model = models.Sequential([
      # convolutional stage 1
      layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
      layers.Conv2D(32, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Dropout(0.2),
      # convolutional stage 2
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.Conv2D(64, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      layers.Dropout(0.2),
      # convolutional stage 3
      layers.Conv2D(128, (3, 3), activation='relu'),
      layers.MaxPooling2D((2, 2)),
      # dense classification head (logits for 10 classes)
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(32, activation='relu'),
      layers.Dense(10),
  ])

  model.compile(
      optimizer=keras.optimizers.Adam(),
      # from_logits=True because the last Dense layer has no softmax
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'],
  )

  # choose the validation source depending on whether test data was supplied
  if X_test is None or y_test is None:
    fit_options = {'validation_split': 0.2}
  else:
    fit_options = {'validation_data': (X_test, y_test), 'verbose': True}
  model.fit(X_train, y_train, epochs=epochs, **fit_options)
  return model

In [None]:
with tf.device('/gpu:0'):
  (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
  # Build the 'out' set BEFORE truncating the training arrays: slicing
  # [20000:] after the truncation would always yield an empty array and the
  # unused training records would silently be dropped.
  # use the rest as testing - 'out' records
  test_images = np.concatenate((train_images[20000:], test_images))
  test_labels = np.concatenate((train_labels[20000:], test_labels))
  # the target model is trained on the first 20000 records only
  train_images = train_images[:20000]
  train_labels = train_labels[:20000]

In [None]:
# Train the target model. No test data is passed, so f_target holds out 20% of
# the training records as validation data.
with tf.device('/gpu:0'):
  target_model = f_target(train_images, train_labels) 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Shadow dataset
Define the $f_{shadow}(x)$ models and train them.

In [None]:
def f_shadow(X_train, y_train, X_test=None, y_test=None, epochs=50):
  """
  Build, compile and train one shadow CNN and return the fitted model.

  The architecture deliberately differs from the target model: tanh
  activations, two convolutional stages (32-32 and 64 filters, each followed by
  2x2 max-pooling and dropout 0.1) and a 128-64-32-10 dense head that outputs
  raw logits.

  When both X_test and y_test are given they are used as validation data while
  training; otherwise 20% of the training data is held out for validation.
  """
  model = models.Sequential([
      # convolutional stage 1
      layers.Conv2D(32, (3, 3), activation='tanh', input_shape=(32, 32, 3)),
      layers.Conv2D(32, (3, 3), activation='tanh'),
      layers.MaxPooling2D((2, 2)),
      layers.Dropout(0.1),
      # convolutional stage 2
      layers.Conv2D(64, (3, 3), activation='tanh'),
      layers.MaxPooling2D((2, 2)),
      layers.Dropout(0.1),
      # dense classification head (logits for 10 classes)
      layers.Flatten(),
      layers.Dense(128, activation='tanh'),
      layers.Dropout(0.1),
      layers.Dense(64, activation='tanh'),
      layers.Dense(32, activation='tanh'),
      layers.Dense(10),
  ])

  model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=0.001),
      # from_logits=True because the last Dense layer has no softmax
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['accuracy'],
  )

  # choose the validation source depending on whether test data was supplied
  if X_test is None or y_test is None:
    fit_options = {'validation_split': 0.2}
  else:
    fit_options = {'validation_data': (X_test, y_test)}
  model.fit(X_train, y_train, epochs=epochs, **fit_options)
  return model

In [None]:

def divide_dataset(n_shadows, shadow_dataset_size, X, y, seed=None):
  """
  Draw `n_shadows` random subsets of (X, y), one per shadow model.

  Every subset holds `shadow_dataset_size` distinct records (sampling is done
  without replacement inside a subset). The subsets are drawn independently of
  each other, so two different subsets may share records.

  Args:
    n_shadows: number of shadow datasets to create.
    shadow_dataset_size: number of records in every shadow dataset.
    X: array of samples, indexed along its first axis.
    y: array of labels aligned with X along the first axis (1-D or 2-D).
    seed: optional seed for the random generator, for reproducible subsets.

  Returns:
    A list of `n_shadows` tuples (X_subset, y_subset).

  Raises:
    ValueError: if shadow_dataset_size exceeds the number of records in X.
  """
  rng = np.random.default_rng(seed)
  n_records = X.shape[0]
  D_shadows = []
  for _ in range(n_shadows):
    # replace=False guarantees that the indices within one subset are unique
    sample_i = rng.choice(n_records, shadow_dataset_size, replace=False)
    # index the first axis only, so 1-D and 2-D label arrays both work
    D_shadows.append((X[sample_i], y[sample_i]))
  return D_shadows

# builds 'n_shadows' shadow datasets and returns them as a list
def generate_shadow_dataset(target_model, n_shadows, shadow_dataset_size, n_classes, X_test=None, y_test=None):
  """
  Create the datasets the shadow models will be trained on.

  If both X_test and y_test are given, the shadow datasets are simply random
  subsets of that data (see divide_dataset). Otherwise every record is
  generated by querying the target model through `synthesize`.
  NOTE(review): `synthesize` is not defined in the visible part of this
  notebook (data synthesis is listed as TODO) - confirm it exists before using
  the query-based path.

  Returns a list of `n_shadows` tuples (X_shadow, y_shadow).
  """
  have_real_data = X_test is not None and y_test is not None
  if have_real_data:
    return divide_dataset(n_shadows, shadow_dataset_size, X_test, y_test)

  def get_shadow_datapoint(c):
    # keep retrying until the synthesis actually yields a record for class c
    with tf.device('/gpu:0'):
      while True:
        candidate = synthesize(c, target_model, 1, 32*32*3, 0.8, 100, 20)
        if candidate is not None:
          return candidate

  # labels cycle through 0..n_classes-1 so the classes are spread uniformly
  cyclic_labels = [position % n_classes for position in range(shadow_dataset_size)]

  D_shadows = []
  for i in range(n_shadows):
    print(f"Generating D_shadow_{i}")
    X_shadow = np.asarray([get_shadow_datapoint(label) for label in cyclic_labels])
    y_shadow = np.asarray(cyclic_labels).reshape((-1, 1))
    D_shadows.append((X_shadow, y_shadow))

  return D_shadows

def create_shadows(D_shadows):
  """
  Train one shadow model per shadow dataset.

  Every dataset is shuffled and split 67% / 33% into a train and a test part;
  the shadow model is fitted on the train part, with the test part passed to
  f_shadow as its test data.

  Returns a list whose items are
    (shadow_model, ((X_train, y_train), (X_test, y_test))).
  """
  shadow_models = []
  for X_shadow, y_shadow in D_shadows:
    # the held-out part later provides the 'out' (non-member) records
    X_fit, X_held, y_fit, y_held = train_test_split(
        X_shadow, y_shadow, shuffle=True, test_size=0.33)

    trained_model = f_shadow(X_fit, y_fit, X_held, y_held)

    split_data = ((X_fit, y_fit), (X_held, y_held))
    shadow_models.append((trained_model, split_data))

  return shadow_models

In [None]:
# generate shadow datasets
# X_test / y_test are supplied, so the 10 shadow datasets (5000 records each)
# are random subsets of the test data instead of synthesized records
D_shadows = generate_shadow_dataset(target_model, n_shadows=10, shadow_dataset_size=5000, n_classes=10, X_test=test_images, y_test=test_labels)

In [None]:
# train the shadow models (one per shadow dataset); every item of the result is
# (model, ((X_train, y_train), (X_test, y_test)))
shadow_models = create_shadows(D_shadows)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [None]:
# helper: turns one (X, y) batch into attack-dataset rows using a model's outputs
def prepare_batch(shadow_model, X, y, in_D=True):
  """
  Build attack-dataset rows <class, probability vector, membership> for a batch.

  The model is called on X and its outputs are passed through a softmax to get
  a probability vector per record. Each returned row is the record's actual
  class (from y), followed by that probability vector, followed by the
  membership flag: 1 when in_D is truthy ('in'), 0 otherwise ('out').

  Returns a 2-D numpy array with one row per record.
  """
  n_rows = y.shape[0]

  # membership column, constant for the whole batch
  membership = np.full((n_rows, 1), 1.0 if in_D else 0.0)

  # map the model outputs to probabilities in [0, 1]
  model_outputs = shadow_model(X)
  probabilities = layers.Softmax()(model_outputs).numpy()

  class_column = y.reshape(-1, 1)
  return np.concatenate((class_column, probabilities, membership), axis=1)

def generate_attack_dataset(shadow_models):
  """
  Assemble the attack model's training data from all shadow models.

  `shadow_models` is a list of (model, ((X_train, y_train), (X_test, y_test))).
  For every shadow model the same number of member (train) and non-member
  (test) records is taken, and each record becomes a row
  <class, prob_vec, membership label (1 or 0)>.

  Returns the stacked rows of all shadow models, or None for an empty list.
  """
  batches = []
  for shadow_model, ((X_train, y_train), (X_test, y_test)) in shadow_models:
    # take equally many 'in' and 'out' records
    s = min(X_train.shape[0], X_test.shape[0])
    print(s)
    members = prepare_batch(shadow_model, X_train[:s], y_train[:s], True)
    non_members = prepare_batch(shadow_model, X_test[:s], y_test[:s], False)
    batches.extend((members, non_members))

  if not batches:
    return None
  return np.concatenate(batches)

In [None]:
# build the attack training set; rows are <class, prob_vec, membership label>
D_attack = generate_attack_dataset(shadow_models)

1650
1650
1650
1650
1650
1650
1650
1650
1650
1650


In [None]:
def __f_attack(X_train, y_train, X_test, y_test, epochs=100):
  """
  Build, compile and train the binary attack classifier and return it.

  The model is a fully connected network (11 -> 110 -> 1100 -> 1 units) whose
  single sigmoid output is trained with binary cross-entropy. X_test / y_test
  are used as validation data during training. The shapes of X_train and
  X_test are printed before training starts.
  """
  print(X_train.shape, X_test.shape)

  n_features = X_train.shape[1]
  model = models.Sequential([
      layers.Dense(11, activation='relu', input_shape=(n_features, )),
      layers.Dense(110, activation='relu'),
      layers.Dense(1100, activation='relu'),
      # single probability output
      layers.Dense(1, activation='sigmoid'),
  ])

  model.compile(
      optimizer=keras.optimizers.Adam(),
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  model.fit(
      X_train, y_train,
      epochs=epochs,
      validation_data=(X_test, y_test),
      verbose=True,
  )
  return model


def f_attack(X, y):
  """
  Train the attack model on the attack dataset and return it.

  Args:
    X: 2-D feature array, one row per record: <class, probability vector>.
    y: membership label of every record.

  Returns:
    The trained attack model.
  """
  with tf.device('/gpu:0'):
    # shuffle and split into train (75%) and test (25%) datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)
    attack_model = __f_attack(X_train, y_train, X_test, y_test)
  return attack_model

In [None]:
# features = every column but the last; the last column is the membership label
attack_model = f_attack(D_attack[:, :-1], D_attack[:, -1])

(24750, 11) (8250, 11)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/10

In [None]:
def evaluate_attack(attack_model, X_attack, y_attack, n_classes):
  """
  Report the attack model's accuracy separately for every class.

  Column 0 of X_attack is taken as the record's class. For each class
  0..n_classes-1 the matching rows are passed to attack_model.evaluate; the
  accuracy is printed (with a 1-based class number) and collected.

  Returns the list of accuracies, indexed by class.
  """
  record_class = X_attack[:, 0]
  acc_per_class = []
  for c in range(n_classes):
    rows_of_class = record_class == c
    # evaluate returns (loss, accuracy); only the accuracy is kept
    _, test_acc = attack_model.evaluate(
        X_attack[rows_of_class, :], y_attack[rows_of_class], verbose=0)
    print(f"class-{c+1}: {test_acc}")
    acc_per_class.append(test_acc)
  return acc_per_class



In [None]:
# create a test dataset 
# rows use the attack-dataset format <class, prob_vec, membership>, but the
# probability vectors now come from the target model instead of a shadow model

# test records were not used to train the target model -> membership 0
D_out = prepare_batch(target_model, test_images, test_labels, False)
# the first 10000 training records -> membership 1
D_in = prepare_batch(target_model, train_images[:10000], train_labels[:10000], True)
print("Testing with 'in' data only:")
res_in = evaluate_attack(attack_model, D_in[:, :-1], D_in[:, -1], 10)

print("\nTesting with 'out' data only:")
res_out = evaluate_attack(attack_model, D_out[:, :-1], D_out[:, -1], 10)

# 'out' and 'in' rows stacked together
print("\nTesting with all prev data: ")
res_all = evaluate_attack(attack_model, np.concatenate((D_out[:, :-1], D_in[:, :-1])), np.concatenate((D_out[:, -1], D_in[:, -1])), 10)


Testing with 'in' data only:
class-1: 0.7930348515510559
class-2: 0.6878849864006042
class-3: 0.7761628031730652
class-4: 0.6594488024711609
class-5: 0.8418418169021606
class-6: 0.7598719596862793
class-7: 0.7514563202857971
class-8: 0.8031967878341675
class-9: 0.9180487990379333
class-10: 0.932721734046936

Testing with 'out' data only:
class-1: 0.5210000276565552
class-2: 0.5239999890327454
class-3: 0.6520000100135803
class-4: 0.7990000247955322
class-5: 0.5979999899864197
class-6: 0.6549999713897705
class-7: 0.5130000114440918
class-8: 0.5440000295639038
class-9: 0.34700000286102295
class-10: 0.3630000054836273

Testing with all prev data: 
class-1: 0.6573566198348999
class-2: 0.6048632264137268
class-3: 0.7150590419769287
class-4: 0.7286706566810608
class-5: 0.7198599576950073
class-6: 0.7057304978370667
class-7: 0.6339901685714722
class-8: 0.6736631393432617
class-9: 0.6360493898391724
class-10: 0.6451287269592285
