# Adversarial Robustness Toolbox (ART) Membership Inference Attacks

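Download the data from https://physionet.org/content/mimic2-iaccd/1.0/ (direct link to the full cohort file: https://physionet.org/content/mimic2-iaccd/1.0/full_cohort_data.csv) and place `full_cohort_data.csv` in `./data/mimic2-iaccd/1.0/`, which is where the loading cell below reads it from.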

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold

### Load data

In [2]:
# Directory expected to hold the downloaded cohort file (see the download link above)
path = "./data/mimic2-iaccd/1.0/"
# Full path of the cohort CSV inside that directory
data_path = os.path.join(path, "full_cohort_data.csv")
# Read the whole CSV into a DataFrame
X = pd.read_csv(data_path)

In [3]:
# Dimensions of the loaded table as (rows, columns)
X.shape

(1776, 46)

In [4]:
# Preview the first rows of the loaded table
X.head()

Unnamed: 0,aline_flg,icu_los_day,hospital_los_day,age,gender_num,weight_first,bmi,sapsi_first,sofa_first,service_unit,...,platelet_first,sodium_first,potassium_first,tco2_first,chloride_first,bun_first,creatinine_first,po2_first,pco2_first,iv_day_1
0,1,7.63,13,72.36841,1.0,75.0,29.912791,15.0,9.0,SICU,...,354.0,138.0,4.6,15.0,109.0,41.0,1.6,196.0,39.0,2230.875
1,0,1.14,1,64.92076,0.0,55.0,20.121312,,5.0,MICU,...,,,,,,,,,,600.0
2,0,2.86,5,36.5,0.0,70.0,27.118272,16.0,5.0,MICU,...,295.0,144.0,3.9,17.0,101.0,16.0,0.8,298.0,30.0,2086.800293
3,1,0.58,3,44.49191,0.0,,,21.0,7.0,SICU,...,262.0,139.0,4.2,31.0,100.0,16.0,0.5,146.0,23.0,
4,1,1.75,5,23.74217,1.0,95.2,28.464563,18.0,7.0,SICU,...,22.0,146.0,3.4,19.0,110.0,10.0,1.0,134.0,30.0,2358.244141


In [5]:
# List the column names of the loaded table
X.columns

Index(['aline_flg', 'icu_los_day', 'hospital_los_day', 'age', 'gender_num',
       'weight_first', 'bmi', 'sapsi_first', 'sofa_first', 'service_unit',
       'service_num', 'day_icu_intime', 'day_icu_intime_num',
       'hour_icu_intime', 'hosp_exp_flg', 'icu_exp_flg', 'day_28_flg',
       'mort_day_censored', 'censor_flg', 'sepsis_flg', 'chf_flg', 'afib_flg',
       'renal_flg', 'liver_flg', 'copd_flg', 'cad_flg', 'stroke_flg',
       'mal_flg', 'resp_flg', 'map_1st', 'hr_1st', 'temp_1st', 'spo2_1st',
       'abg_count', 'wbc_first', 'hgb_first', 'platelet_first', 'sodium_first',
       'potassium_first', 'tco2_first', 'chloride_first', 'bun_first',
       'creatinine_first', 'po2_first', 'pco2_first', 'iv_day_1'],
      dtype='object')

### Preprocess data

In [6]:
# Columns removed before modelling:
#   - service_unit and day_icu_intime are non-numerical; their numerical
#     versions (service_num and day_icu_intime_num) are kept
#   - hosp_exp_flg, icu_exp_flg and day_28_flg all relate to death and are
#     somewhat repetitive with censor_flg
col = ["service_unit", "day_icu_intime", "hosp_exp_flg", "icu_exp_flg", "day_28_flg"]

# Generic cleaning up (all data sets):
# remove the columns above, then sepsis_flg (noted as holding only one value),
# and finally discard every row that still contains a missing value
cohort = X.drop(columns=col).drop(columns="sepsis_flg").dropna(axis=0)

print(cohort.shape)

# Name of the label column; change it for each dataset (e.g. "aline_flg")
var = "censor_flg"

# Split the table into labels and features, both as numpy arrays
y = np.array(cohort[var])
X = np.array(cohort.drop(columns=[var]))
print(X.shape)

# Fraction of the rows held out as the test set
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, shuffle=True, random_state=58954
)

print(f"X_train = {X_train.shape}")
print(f"y_train = {y_train.shape}")
print(f"X_test = {X_test.shape}")
print(f"y_test = {y_test.shape}")

# Number of feature columns in the training matrix
N_FEATURES = X_train.shape[1]

(1064, 40)
(1064, 39)
X_train = (851, 39)
y_train = (851,)
X_test = (213, 39)
y_test = (213,)


### Helper functions

In [7]:
def calc_precision_recall(predicted, actual, positive_value=1):
    """Calculate the precision and recall of `predicted` against `actual`.

    Parameters
    ----------
    predicted : sequence
        Predicted labels, one per sample.
    actual : sequence
        True labels, aligned element-by-element with `predicted`.
    positive_value : object, default 1
        The label value that counts as the positive class.

    Returns
    -------
    tuple of (float, float)
        `(precision, recall)`. Precision is the fraction of positive
        predictions that are correct; recall is the fraction of actual
        positives that were predicted as positive. When there are no positive
        predictions, precision is reported as 1.0; when there are no actual
        positives, recall is reported as 1.0.

    Raises
    ------
    ValueError
        If `predicted` and `actual` do not have the same length. Without this
        check, surplus entries of `actual` would be ignored silently and the
        reported recall would be wrong.
    """
    if len(predicted) != len(actual):
        raise ValueError(
            "predicted and actual must have the same length "
            f"(got {len(predicted)} and {len(actual)})"
        )

    true_positives = 0  # both predicted and actual are positive
    num_positive_predicted = 0  # predicted positive
    num_positive_actual = 0  # actual positive
    for pred, truth in zip(predicted, actual):
        if pred == positive_value:
            num_positive_predicted += 1
        if truth == positive_value:
            num_positive_actual += 1
        if pred == truth and pred == positive_value:
            true_positives += 1

    # Fall back to 1.0 when a denominator is zero so both results are always floats.
    precision = (
        true_positives / num_positive_predicted if num_positive_predicted else 1.0
    )
    recall = true_positives / num_positive_actual if num_positive_actual else 1.0
    return precision, recall

### RandomForestClassifier Target Model

In [8]:
from sklearn.ensemble import RandomForestClassifier
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier

# Target model for the membership inference attacks below.
# The random_state is fixed (same seed as the train/test split) so that the
# reported accuracy and the attack results are reproducible on Restart & Run All;
# an unseeded forest is rebuilt differently on every run.
model = RandomForestClassifier(random_state=58954)
model.fit(X_train, y_train)

# Wrap the fitted scikit-learn model in ART's estimator class, which is what
# the attack constructors below receive
art_classifier = ScikitlearnRandomForestClassifier(model)

print(f"Base model accuracy: {model.score(X_test, y_test)}")

Base model accuracy: 0.9906103286384976


### Rule-based attack on RandomForestClassifier
The rule-based attack uses the simple rule to determine membership in the training data: if the model's prediction for a sample is correct, then it is a member. Otherwise, it is not a member.

In [9]:
from art.attacks.inference.membership_inference import (
    MembershipInferenceBlackBoxRuleBased,
)

attack = MembershipInferenceBlackBoxRuleBased(art_classifier)

# Get a membership guess for every training sample and every test sample
inferred_train = attack.infer(X_train, y_train)
inferred_test = attack.infer(X_test, y_test)

n_members = len(inferred_train)
n_non_members = len(inferred_test)

# Training samples are the true members, so a guess of 1 is correct for them;
# test samples are non-members, so a guess of 0 is correct for them
train_acc = np.sum(inferred_train) / n_members
test_acc = 1 - (np.sum(inferred_test) / n_non_members)

# Overall accuracy: size-weighted average of the two, expressed as a percentage
correct_weighted = train_acc * n_members + test_acc * n_non_members
acc = 100 * (correct_weighted / (n_members + n_non_members))

print("Rule-based attack")
print(f"Train accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")
print(f"For {acc:.2f}% of the data, membership status is inferred correctly.")

# Precision and recall of the "member" guess over both sets combined
all_inferred = np.concatenate((inferred_train, inferred_test))
true_membership = np.concatenate((np.ones(n_members), np.zeros(n_non_members)))
print(calc_precision_recall(all_inferred, true_membership))

Rule-based attack
Train accuracy: 1.0
Test accuracy: 0.009389671361502372
For 80.17% of the data, membership status is inferred correctly.
(0.8013182674199624, 1.0)


### Black-box attack on RandomForestClassifier
The black-box attack trains an additional classifier (the attack model) to predict the membership status of a sample. Depending on the type of model and the provided configuration, the attack model can be trained on probabilities/logits or on losses.

In [10]:
# Build and train the attack model
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox

# Share of each set used to fit the attack model; the remainder is used to evaluate it
attack_train_ratio = 0.5
attack_train_size = int(len(X_train) * attack_train_ratio)
attack_test_size = int(len(X_test) * attack_train_ratio)

bb_attack = MembershipInferenceBlackBox(art_classifier)

# Fit the attack model on the leading slice of the training set and of the test set
bb_attack.fit(
    X_train[:attack_train_size],
    y_train[:attack_train_size],
    X_test[:attack_test_size],
    y_test[:attack_test_size],
)

# Infer membership on the samples the attack model was not fitted on
held_out_X_train = X_train[attack_train_size:]
held_out_y_train = y_train[attack_train_size:]
held_out_X_test = X_test[attack_test_size:]
held_out_y_test = y_test[attack_test_size:]

inferred_train_bb = bb_attack.infer(held_out_X_train, held_out_y_train)
inferred_test_bb = bb_attack.infer(held_out_X_test, held_out_y_test)

n_members_bb = len(inferred_train_bb)
n_non_members_bb = len(inferred_test_bb)

# Training samples are the true members, so a guess of 1 is correct for them;
# test samples are non-members, so a guess of 0 is correct for them
train_acc = np.sum(inferred_train_bb) / n_members_bb
test_acc = 1 - (np.sum(inferred_test_bb) / n_non_members_bb)

# Overall accuracy: size-weighted average of the two, expressed as a percentage
correct_weighted_bb = train_acc * n_members_bb + test_acc * n_non_members_bb
acc = 100 * (correct_weighted_bb / (n_members_bb + n_non_members_bb))

print("Black-box attack")
print(f"Train accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")
print(f"For {acc:.2f}% of the data, membership status is inferred correctly.")

# Precision and recall of the "member" guess over both held-out sets combined
all_inferred_bb = np.concatenate((inferred_train_bb, inferred_test_bb))
true_membership_bb = np.concatenate(
    (np.ones(n_members_bb), np.zeros(n_non_members_bb))
)
print(calc_precision_recall(all_inferred_bb, true_membership_bb))

Black-box attack
Train accuracy: 1.0
Test accuracy: 0.0
For 79.92% of the data, membership status is inferred correctly.
(0.799249530956848, 1.0)
