# Inference Attacks

---

## 1. Attribute Inference 

**API Reference:**
- https://adversarial-robustness-toolbox.readthedocs.io/en/latest/modules/attacks/inference/attribute_inference.html

**Related paper:**
- https://dl.acm.org/doi/10.1145/2810103.2813677

---

In [5]:
import tensorflow as tf
tf.compat.v1.disable_eager_execution() # To use the ART

from tensorflow.keras import datasets, layers, models

from art.utils import load_nursery
from art.estimators.classification import KerasClassifier

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import os

#cifar_model_path = '/archive/workspace/TTA/demo/models/baisc_cnn_cifar'

### Load dataset

In [6]:
# Load the Nursery dataset via ART's helper and unpack the train and test splits;
# the last two returned items are discarded.
# NOTE(review): test_set=0.2 presumably holds out 20% of the data for testing, and
# transform_social=True presumably makes the `social` feature binary for the
# attribute inference demo — confirm against the ART `load_nursery` docs.
(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.2, transform_social=True)

### Target model

To demonstrate the attribute inference attacks, we use a decision tree as the target model.

In [7]:
from sklearn.tree import DecisionTreeClassifier
from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier

# Fit a default scikit-learn decision tree; this is the model the attacks target.
model = DecisionTreeClassifier().fit(x_train, y_train)
# Wrap the fitted tree in ART's estimator so the attack classes can query it.
art_classifier = ScikitlearnDecisionTreeClassifier(model)

# Accuracy on the test split, printed as a sanity check of the target model.
base_accuracy = model.score(x_test, y_test)
print('Base model accuracy: ', base_accuracy)

Base model accuracy:  0.9787808641975309




### Attacks for black-box and white-box

#### Black-box attack
The black-box attack basically trains an additional classifier (called the attack model) to predict the attacked feature's value from the remaining n-1 features as well as the original (attacked) model's predictions.

##### Train attack model

In [11]:
import numpy as np
from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox

# Split the target model's training data in two: the first part trains the
# attack model, the remainder is what the attacks are evaluated on.
attack_train_ratio = 0.5
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_x_train = x_train[:attack_train_size]
attack_y_train = y_train[:attack_train_size]
attack_x_test = x_train[attack_train_size:]
attack_y_test = y_train[attack_train_size:]

attack_feature = 1  # social

# Target model's predicted class index for every evaluation sample, as a
# column vector. argmax along axis 1 replaces a per-row Python loop with a
# single vectorised call and yields the same indices.
attack_x_test_predictions = np.argmax(art_classifier.predict(attack_x_test), axis=1).reshape(-1, 1)
# True values of the attacked feature, copied out before the column is removed.
attack_x_test_feature = attack_x_test[:, attack_feature].copy().reshape(-1, 1)
# Evaluation data without the attacked feature column.
attack_x_test = np.delete(attack_x_test, attack_feature, 1)

bb_attack = AttributeInferenceBlackBox(art_classifier, attack_feature=attack_feature)

# Train the attack model on the attack-training part (attacked feature included).
bb_attack.fit(attack_x_train)

##### Infer sensitive feature and check accuracy

In [12]:
# Candidate values the attacked feature can take.
values = [-0.70718864, 1.41404987]
# Ask the attack model for its guess on every evaluation sample.
inferred_train_bb = bb_attack.infer(attack_x_test, attack_x_test_predictions, values=values)
# Accuracy: share of samples whose inferred value equals the true value.
# The true values are rounded to 8 decimals so they can equal the literals above.
true_feature_row = np.around(attack_x_test_feature, decimals=8).reshape(1, -1)
num_correct = np.sum(inferred_train_bb == true_feature_row)
train_acc = num_correct / len(inferred_train_bb)
print(train_acc)

0.6146275569278271


#### White-box attack
These two attacks do not train any additional model; they simply use information encoded within the attacked decision tree model to compute the probability of each value of the attacked feature and output the value with the highest probability.

##### First attack

In [13]:
from art.attacks.inference.attribute_inference import AttributeInferenceWhiteBoxLifestyleDecisionTree

# This attack is built directly on the target tree; no fit step is run.
wb_attack = AttributeInferenceWhiteBoxLifestyleDecisionTree(art_classifier, attack_feature=attack_feature)

# Prior probability of each entry in `values`.
# NOTE(review): presumably the counts of each feature value out of 5183 samples — confirm.
priors = [3465 / 5183, 1718 / 5183]

# Infer the attacked feature for every evaluation sample.
inferred_train_wb1 = wb_attack.infer(attack_x_test, attack_x_test_predictions, values=values, priors=priors)

# Accuracy: share of samples whose inferred value equals the (rounded) true value.
true_feature_row = np.around(attack_x_test_feature, decimals=8).reshape(1, -1)
num_correct = np.sum(inferred_train_wb1 == true_feature_row)
train_acc = num_correct / len(inferred_train_wb1)
print(train_acc)

0.6227325357005017


##### Second attack

In [14]:
from art.attacks.inference.attribute_inference import AttributeInferenceWhiteBoxDecisionTree

# Second white-box attack against the same target tree; again no fit step is run.
wb2_attack = AttributeInferenceWhiteBoxDecisionTree(art_classifier, attack_feature=attack_feature)

# Infer the attacked feature, reusing the candidate values and priors defined earlier.
inferred_train_wb2 = wb2_attack.infer(attack_x_test, attack_x_test_predictions, values=values, priors=priors)

# Accuracy: share of samples whose inferred value equals the (rounded) true value.
true_feature_row = np.around(attack_x_test_feature, decimals=8).reshape(1, -1)
num_correct = np.sum(inferred_train_wb2 == true_feature_row)
train_acc = num_correct / len(inferred_train_wb2)
print(train_acc)

0.6999228097259745


The white-box attacks are able to correctly infer the attacked feature value in 62% and 70% of the training set respectively. 

Now let's check the precision and recall:

In [16]:
def calc_precision_recall(predicted, actual, positive_value=1):
    """Compute precision and recall of ``predicted`` against ``actual``.

    Args:
        predicted: Sized sequence (list, 1-D array, ...) of predicted values.
        actual: Sized sequence of ground-truth values with the same length as
            ``predicted``. Elements may be one-element arrays (for example
            the rows of an ``(n, 1)`` array).
        positive_value: The value that counts as the positive class; matched
            with ``==``.

    Returns:
        A ``(precision, recall)`` tuple. Precision is the fraction of
        positive predictions that are correct; recall is the fraction of
        actual positives that were predicted. Each falls back to ``1.0``
        when its denominator is zero.

    Raises:
        ValueError: If ``predicted`` and ``actual`` differ in length. Without
            this check a longer ``actual`` would have its tail ignored and
            the recall would be computed from an incomplete positive count.
    """
    if len(predicted) != len(actual):
        raise ValueError(
            "predicted and actual must have the same length, got "
            f"{len(predicted)} and {len(actual)}"
        )

    true_positives = 0  # predicted positive and equal to the actual value
    num_positive_predicted = 0
    num_positive_actual = 0
    for pred, truth in zip(predicted, actual):
        if truth == positive_value:
            num_positive_actual += 1
        if pred == positive_value:
            num_positive_predicted += 1
            if pred == truth:
                true_positives += 1

    # With an empty denominator the metric is undefined; 1.0 is the convention used here.
    precision = true_positives / num_positive_predicted if num_positive_predicted else 1.0
    recall = true_positives / num_positive_actual if num_positive_actual else 1.0

    return precision, recall
    
# Precision and recall for each attack, printed in the order:
# black-box, white-box 1, white-box 2.
# The true feature values are rounded to 8 decimals so they can equal the
# positive-value literal.
for inferred_values in (inferred_train_bb, inferred_train_wb1, inferred_train_wb2):
    print(calc_precision_recall(inferred_values,
                                np.around(attack_x_test_feature, decimals=8),
                                positive_value=1.41404987))

(0.37218045112781956, 0.22956521739130434)
(0.34332425068119893, 0.14608695652173914)
(0.6299694189602446, 0.23884057971014494)


To verify the significance of these results, we now run a baseline attack that uses only the remaining features to try to predict the value of the attacked feature, with no use of the model itself.

In [18]:
from art.attacks.inference.attribute_inference import AttributeInferenceBaseline

# The baseline is constructed without the target model: only the attacked
# feature index is passed.
baseline_attack = AttributeInferenceBaseline(attack_feature=attack_feature)

# Train the baseline attack model on the attack-training data.
baseline_attack.fit(attack_x_train)
# Infer the attacked feature from the remaining features alone.
inferred_train_baseline = baseline_attack.infer(attack_x_test, values=values)
# Accuracy: share of samples whose inferred value equals the (rounded) true value.
true_feature_row = np.around(attack_x_test_feature, decimals=8).reshape(1, -1)
num_correct = np.sum(inferred_train_baseline == true_feature_row)
baseline_train_acc = num_correct / len(inferred_train_baseline)
print(baseline_train_acc)

0.5663836356619066


--- 

## 2. Membership Inference

**API Reference:**
- https://adversarial-robustness-toolbox.readthedocs.io/en/latest/modules/attacks/inference/membership_inference.html

**Related paper:**
- 

### Dataset and Target model
- Dataset: Nursery dataset (https://archive.ics.uci.edu/ml/datasets/nursery)
- Target model: Random forest model

---

import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from art.utils import load_nursery

(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.5)

In [19]:
import os
import sys

# Put the parent directory at the front of the module search path.
parent_dir = os.path.abspath('..')
sys.path.insert(0, parent_dir)

from art.utils import load_nursery

# Reload the Nursery dataset for the membership experiments (test_set=0.5);
# the last two returned items are discarded.
(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.5)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier

# Fit a default scikit-learn random forest; this is the membership-attack target.
model = RandomForestClassifier().fit(x_train, y_train)

# Wrap the fitted forest in ART's estimator so the attack classes can query it.
art_classifier = ScikitlearnRandomForestClassifier(model)

# Accuracy on the test split, printed as a sanity check of the target model.
base_accuracy = model.score(x_test, y_test)
print('Base model accuracy: ', base_accuracy)

Base model accuracy:  0.9753010188329732




### Attacks

#### Rule-based attack
The rule-based attack uses a simple rule to determine membership in the training data: if the model's prediction for a sample is correct, the sample is classified as a member; otherwise, it is classified as a non-member.

In [21]:
import numpy as np
from art.attacks.inference.membership_inference import MembershipInferenceBlackBoxRuleBased

attack = MembershipInferenceBlackBoxRuleBased(art_classifier)

# Infer membership for both splits: x_train holds the true members of the
# target model's training set, x_test the true non-members.
inferred_train = attack.infer(x_train, y_train)
inferred_test = attack.infer(x_test, y_test)

# Members should be flagged 1 and non-members 0, so the accuracy on the test
# split is the share of samples that were NOT flagged.
num_train = len(inferred_train)
num_test = len(inferred_test)
train_acc = np.sum(inferred_train) / num_train
test_acc = 1 - (np.sum(inferred_test) / num_test)
# Overall accuracy is the size-weighted mean of the two per-split accuracies.
acc = (train_acc * num_train + test_acc * num_test) / (num_train + num_test)
for accuracy in (train_acc, test_acc, acc):
    print(accuracy)

1.0
0.024698981167026846
0.5123494905835134


This means that for 51% of the data, membership status is inferred correctly.

In [22]:
def calc_precision_recall(predicted, actual, positive_value=1):
    """Compute precision and recall of ``predicted`` against ``actual``.

    Args:
        predicted: Sized sequence (list, 1-D array, ...) of predicted values.
        actual: Sized sequence of ground-truth values with the same length as
            ``predicted``.
        positive_value: The value that counts as the positive class; matched
            with ``==``.

    Returns:
        A ``(precision, recall)`` tuple. Precision is the fraction of
        positive predictions that are correct; recall is the fraction of
        actual positives that were predicted. Each falls back to ``1.0``
        when its denominator is zero.

    Raises:
        ValueError: If ``predicted`` and ``actual`` differ in length. Without
            this check a longer ``actual`` would have its tail ignored and
            the recall would be computed from an incomplete positive count.
    """
    if len(predicted) != len(actual):
        raise ValueError(
            "predicted and actual must have the same length, got "
            f"{len(predicted)} and {len(actual)}"
        )

    true_positives = 0  # predicted positive and equal to the actual value
    num_positive_predicted = 0
    num_positive_actual = 0
    for pred, truth in zip(predicted, actual):
        if truth == positive_value:
            num_positive_actual += 1
        if pred == positive_value:
            num_positive_predicted += 1
            if pred == truth:
                true_positives += 1

    # With an empty denominator the metric is undefined; 1.0 is the convention used here.
    precision = true_positives / num_positive_predicted if num_positive_predicted else 1.0
    recall = true_positives / num_positive_actual if num_positive_actual else 1.0

    return precision, recall

# rule-based
# Ground truth: training samples are members (1), test samples are non-members (0),
# concatenated in the same order as the inferred labels.
inferred_membership = np.concatenate((inferred_train, inferred_test))
true_membership = np.concatenate((np.ones(len(inferred_train)), np.zeros(len(inferred_test))))
print('Precision and Recall:', calc_precision_recall(inferred_membership, true_membership))

Precision and Recall: (0.5062519537355423, 1.0)


#### Black-box attack
The black-box attack basically trains an additional classifier (called the attack model) to predict the membership status of a sample. It can use as input to the learning process probabilities/logits or losses, depending on the type of model and provided configuration.

##### Train attack model

In [25]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox

# The leading part of each split trains the attack model; the remainder of
# each split is left for evaluating it.
attack_train_ratio = 0.5
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_test_size = int(len(x_test) * attack_train_ratio)

bb_attack = MembershipInferenceBlackBox(art_classifier)

# Train the attack model: samples taken from x_train are passed first,
# followed by samples taken from x_test.
member_x, member_y = x_train[:attack_train_size], y_train[:attack_train_size]
nonmember_x, nonmember_y = x_test[:attack_test_size], y_test[:attack_test_size]
bb_attack.fit(member_x, member_y, nonmember_x, nonmember_y)

##### Infer membership and check accuracy

In [26]:
# Infer membership on the parts of each split that were not used to train the attack.
inferred_train_bb = bb_attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])
inferred_test_bb = bb_attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])
# Members should be flagged 1 and non-members 0, so the accuracy on the test
# part is the share of samples that were NOT flagged.
num_members = len(inferred_train_bb)
num_nonmembers = len(inferred_test_bb)
train_acc = np.sum(inferred_train_bb) / num_members
test_acc = 1 - (np.sum(inferred_test_bb) / num_nonmembers)
# Overall accuracy is the size-weighted mean of the two.
acc = (train_acc * num_members + test_acc * num_nonmembers) / (num_members + num_nonmembers)
for accuracy in (train_acc, test_acc, acc):
    print(accuracy)

0.7261500463105897
0.5899969126273541
0.6580734794689719


In [27]:
# black-box
print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), 
                            np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))

(0.6391304347826087, 0.7261500463105897)
