# Running attribute inference attacks on the Nursery data

Data fetched from https://www.openml.org/d/26

In [1]:
from __future__ import annotations

import logging
import pandas as pd
import numpy as np
import multiprocess as mp
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

logging.basicConfig()
logger = logging.getLogger("aia_nursery")
logger.setLevel(logging.INFO)

r_state = 1  # random seed
attack_threshold = 0  # infer atttributes only if unique highest confidence exceeds this

n_cpu = mp.cpu_count()  # number of CPU cores to use

## Utility class for storing dataset details

In [2]:
class Data:
    """UCI Nursery dataset."""

    def __init__(self, seed: int) -> None:
        """Fetches the UCI Nursery dataset and preprocesses."""

        self.data = fetch_openml(data_id=26, as_frame=True)
        self.names: list[str] = self.data.feature_names
        self.X = np.asarray(self.data.data, dtype=str)
        self.y = np.asarray(self.data.target, dtype=str)

        # target model train / test split
        (
            self.Xt_member,
            self.Xt_nonmember,
            self.yt_member,
            self.yt_nonmember,
        ) = train_test_split(
            self.X,
            self.y,
            test_size=0.5,
            stratify=self.y,
            shuffle=True,
            random_state=seed,
        )

        # one-hot encoding of features and integer encoding of labels
        self.label_encoder = LabelEncoder()
        self.feature_encoder = OneHotEncoder()

        self.X_train = self.feature_encoder.fit_transform(self.Xt_member).toarray()
        self.y_train = self.label_encoder.fit_transform(self.yt_member)
        self.X_test = self.feature_encoder.transform(self.Xt_nonmember).toarray()
        self.y_test = self.label_encoder.transform(self.yt_nonmember)
        self.n_samples: int = np.shape(self.X_train)[0]
        self.n_features: int = np.shape(self.X_train)[1]
        self.n_labels: int = len(np.unique(self.y_train))

        self.X_all = np.vstack((self.X_train, self.X_test))
        self.y_all = np.vstack((self.y_train, self.y_test))

        logger.info(f"X_train shape = {np.shape(self.X_train)}")
        logger.info(f"y_train shape = {np.shape(self.y_train)}")
        logger.info(f"X_test shape = {np.shape(self.X_test)}")
        logger.info(f"y_test shape = {np.shape(self.y_test)}")
        logger.info(f"n_samples = {self.n_samples}")
        logger.info(f"n_features = {self.n_features}")
        logger.info(f"n_labels = {self.n_labels}")

## Utility class for storing attacked feature details

In [3]:
class AttackFeature:
    """Stores details of the attacked feature."""

    def __init__(self, ds: Data, index: int, indices: list[int]) -> None:
        """Initialises the definition of an attack feature."""
        self.index: int = index  # index of attacked feature
        self.name: str = ds.names[index]  # feature name
        self.indices: list[int] = indices  # one-hot encoded indices of attacked feature
        self.unique = np.unique(ds.X[:, self.index])  # unique attribute values (str)
        self.n_unique: int = len(self.unique)  # number of unique attribute values
        self.onehot_enc = OneHotEncoder()
        self.values = self.onehot_enc.fit_transform(
            self.unique.reshape(-1, 1)
        ).toarray()  # one-hot encoded unique attribute values
        self.X_values = None
        self.y_values = None
        self.naive: float = 0
        logger.debug(f"Attacked feature unique values: {self.unique}")

    def set_inference_data(self, model, ds: Data) -> None:
        """Returns a dataset of each sample with every possible missing value for
        performing inference. Assumes we know the sample was in the target model
        training data and the target model predicted labels are available."""
        self.X_values = np.zeros(
            (ds.n_samples, self.n_unique, ds.n_features), dtype=np.float64
        )  # target model training set only
        self.y_values = model.predict(ds.X_train)
        self.y_strings = ds.label_encoder.inverse_transform(self.y_values)
        # for each sample to perform inference on
        # add each possible missing feature value
        for i, x in enumerate(ds.X_train):
            for j, value in enumerate(self.values):
                self.X_values[i][j] = np.copy(x)
                self.X_values[i][j][self.indices] = value
        _, counts = np.unique(ds.Xt_member[:, self.index], return_counts=True)
        self.naive = (np.max(counts) / ds.n_samples) * 100
        logger.debug(f"X_values shape = {np.shape(self.X_values)}")
        logger.debug(f"y_values shape = {np.shape(self.y_values)}")

    def set_inference_data_all(self, model, ds: Data) -> None:
        """Returns a dataset of each sample with every possible missing value for
        performing inference. Uses all train and test samples."""
        n_samples = len(ds.X_all)
        self.X_values = np.zeros(
            (n_samples, self.n_unique, ds.n_features), dtype=np.float64
        )  # target model training and test set
        self.y_values = model.predict(ds.X_all)
        self.y_strings = ds.label_encoder.inverse_transform(self.y_values)
        # for each sample to perform inference on
        # add each possible missing feature value
        for i, x in enumerate(ds.X_all):
            for j, value in enumerate(self.values):
                self.X_values[i][j] = np.copy(x)
                self.X_values[i][j][self.indices] = value
        _, counts = np.unique(ds.X[:, self.index], return_counts=True)
        self.naive = (np.max(counts) / n_samples) * 100
        logger.debug(f"X_values shape = {np.shape(self.X_values)}")
        logger.debug(f"y_values shape = {np.shape(self.y_values)}")

## Utility functions

In [4]:
def unique_max(confidences: list[float], threshold: float) -> bool:
    """Returns whether there is a unique maximum confidence value above threshold."""
    if len(confidences) > 0:
        max_conf = np.max(confidences)
        if max_conf < threshold:
            return False
        u, c = np.unique(confidences, return_counts=True)
        for i in range(len(c)):
            if c[i] == 1 and u[i] == max_conf:
                return True
    return False

## Load data

In [5]:
dataset = Data(r_state)

INFO:aia_nursery:X_train shape = (6480, 27)
INFO:aia_nursery:y_train shape = (6480,)
INFO:aia_nursery:X_test shape = (6480, 27)
INFO:aia_nursery:y_test shape = (6480,)
INFO:aia_nursery:n_samples = 6480
INFO:aia_nursery:n_features = 27
INFO:aia_nursery:n_labels = 5


In [6]:
dataset.data.frame.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [7]:
dataset.data.frame.describe()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
count,12960,12960,12960,12960,12960,12960,12960,12960,12960
unique,3,5,4,4,3,2,3,3,5
top,usual,proper,complete,1,convenient,convenient,nonprob,recommended,not_recom
freq,4320,2592,3240,3240,4320,6480,4320,4320,4320


## Train target model

In [8]:
model = RandomForestClassifier(bootstrap=False, random_state=r_state)
model.fit(dataset.X_train, dataset.y_train)

print(f"Base model train accuracy: {model.score(dataset.X_train, dataset.y_train)}")
print(f"Base model test accuracy: {model.score(dataset.X_test, dataset.y_test)}")

Base model train accuracy: 1.0
Base model test accuracy: 0.984104938271605


## Attribute inference using target model confidence scores and predicted labels

In [9]:
def infer(model, ds: Data, af: AttackFeature, threshold: float) -> str:
    """
    For each possible missing value, compute the confidence scores and
    label with the target model; if the label matches the known target model
    label for the original sample, and the highest confidence score is unique,
    infer that attribute if the confidence score is greater than a threshold.
    """

    # labels and attributes of correct inferences made
    correct_label: list[str] = []
    correct_attrs: list[str] = []
    correct: int = 0  # total number of correct inferences made
    total: int = 0  # total number of inferences made

    for i, x in enumerate(af.X_values):  # each sample to perform inference on
        # get target model confidence scores for the tested sample (batch: all values)
        confidences = model.predict_proba(x)
        # get known target model predicted label for the original sample
        y_label = af.y_values[i]
        conf = []  # confidences for each possible value with correct label
        attr = []  # features for each possible value with correct label
        # for each possible attribute value,
        # if the label matches the target model label
        # then store the confidence score and the tested feature vector
        for j in range(af.n_unique):
            y_candidate = np.argmax(confidences[j])
            scores = confidences[j][y_candidate]
            if y_candidate == y_label:
                conf.append(scores)
                attr.append(x[j])
        # get whether there is a unique maximum confidence score greater than threshold
        if unique_max(conf, threshold):
            total += 1
            # get attributes of the highest confidence matching label
            inf = attr[np.argmax(conf)]
            inf_str = ds.feature_encoder.inverse_transform(inf.reshape(1, -1))[0]
            inf_attr = inf_str[af.index]
            # tested sample matches the original input feature
            if (inf == ds.X_train[i]).all():
                correct_label.append(af.y_strings[i])
                correct_attrs.append(inf_attr)
                correct += 1

    if total > 0:
        msg = (
            f"Correctly inferred: {(correct / total) * 100:.2f}% "
            f"of {(total / len(ds.X_train)) * 100:.2f}% of the data set\n"
            f"Baseline: {af.naive:.2f}%\n"
            f"{pd.crosstab(correct_label, correct_attrs, rownames=['labels'], colnames=['attrs'])}"
        )
    else:
        msg = "Unable to make any inferences"
    return f"Attacking feature {af.name} with {af.n_unique} unique values:\n{msg}"

In [10]:
# create a list of features to attack

attack_features = [
    AttackFeature(dataset, 0, [0, 1, 2]),  # parents
    AttackFeature(dataset, 1, [3, 4, 5, 6, 7]),  # has_nurs
    AttackFeature(dataset, 2, [8, 9, 10, 11]),  # form
    AttackFeature(dataset, 3, [12, 13, 14, 15]),  # children
    AttackFeature(dataset, 4, [16, 17, 18]),  # housing
    AttackFeature(dataset, 5, [19, 20]),  # finance
    AttackFeature(dataset, 6, [21, 22, 23]),  # social
    AttackFeature(dataset, 7, [24, 25, 26]),  # health
]

In [11]:
# for each attacked feature build a matrix of all possible values
# uses training set samples - i.e., assumes attacker knows sample is in dataset
for af in attack_features:
    af.set_inference_data(model, dataset)

In [12]:
# utility function for collecting parallel processed inferences
results: list[str] = []


def collect_results(result: str) -> None:
    """Collects parallel processed inference results."""
    results.append(result)

In [13]:
# for each feature test each possible value and infer the attribute where possible

pool = mp.Pool(processes=n_cpu)
for af in attack_features:
    pool.apply_async(
        infer,
        args=(
            model,
            dataset,
            af,
            attack_threshold,
        ),
        callback=collect_results,
    )
pool.close()
pool.join()

for result in results:
    print(f"{result}\n")

Attacking feature form with 4 unique values:
Correctly inferred: 100.00% of 10.71% of the data set
Baseline: 25.42%
attrs       complete  completed  foster  incomplete
labels                                             
not_recom         15         12       8           7
priority         111         70      80          67
recommend          1          0       0           0
spec_prior        37         47     106          84
very_recom        23         19       1           6

Attacking feature children with 4 unique values:
Correctly inferred: 100.00% of 12.27% of the data set
Baseline: 25.90%
attrs         1    2    3  more
labels                         
not_recom     6   12   16    14
priority    179  101   63    59
recommend     1    0    0     0
spec_prior   26   48  116    90
very_recom   41   20    1     2

Attacking feature housing with 3 unique values:
Correctly inferred: 100.00% of 23.09% of the data set
Baseline: 33.47%
attrs       convenient  critical  less_conv
labels     

## Membership inference attack
#### Construct a dataset with the label of whether or not in the training

In [14]:
miX = model.predict_proba(dataset.X_all)

miY = np.vstack(
    (np.ones((len(dataset.X_train), 1), int), np.zeros((len(dataset.X_test), 1), int))
).flatten()

mi_train_x, mi_test_x, mi_train_y, mi_test_y = train_test_split(
    miX, miY, test_size=0.2, stratify=miY
)

logger.info(f"mi_train_x shape = {np.shape(mi_train_x)}")
logger.info(f"mi_train_y shape = {np.shape(mi_train_y)}")
logger.info(f"mi_test_x shape = {np.shape(mi_test_x)}")
logger.info(f"mi_test_y shape = {np.shape(mi_test_y)}")

INFO:aia_nursery:mi_train_x shape = (10368, 5)
INFO:aia_nursery:mi_train_y shape = (10368,)
INFO:aia_nursery:mi_test_x shape = (2592, 5)
INFO:aia_nursery:mi_test_y shape = (2592,)


#### Train membership inference attack model

In [15]:
mi_attack_model = RandomForestClassifier()
mi_attack_model.fit(mi_train_x, mi_train_y)

pred_probs = mi_attack_model.predict_proba(mi_test_x)
mi_auc = roc_auc_score(mi_test_y, pred_probs[:, 1])
print(f"Membership AUC = {mi_auc}")

Membership AUC = 0.9264927221460143


## Attribute inference and membership inference
Same attack as above, however training set membership is not assumed, and a membership inference model is used first to classify member/non-member.

In [16]:
def infer(model, mia_model, ds: Data, af: AttackFeature, threshold: float) -> str:
    """Infers the missing feature values."""

    # labels and attributes of correct inferences made
    correct_label: list[str] = []
    correct_attrs: list[str] = []
    correct: int = 0  # total number of correct inferences made
    total: int = 0  # total number of inferences made

    for i, x in enumerate(af.X_values):  # each sample to perform inference on
        # get target model confidence scores for the tested sample
        confidence = model.predict_proba(x)
        # get membership inference model confidence scores for the tested sample
        mi_confidence = mia_model.predict_proba(confidence)
        # get known target model predicted label for the original sample
        label = af.y_values[i]
        conf = []  # confidences for each possible value with correct label
        attr = []  # features for each possible value with correct label
        # for each possible attribute value,
        # if mia predicts memberset and label matches target model
        # then store the confidence score and the tested feature vector
        for j in range(af.n_unique):
            mia_label = np.argmax(mi_confidence[j])
            this_label = np.argmax(confidence[j])
            if mia_label == 1 and this_label == label:
                scores = confidence[j][this_label]
                conf.append(scores)
                attr.append(x[j])
        # get whether there is a unique maximum confidence score greater than threshold
        if unique_max(conf, threshold):
            total += 1
            # get attributes of the highest confidence matching label
            inf = attr[np.argmax(conf)]
            inf_str = ds.feature_encoder.inverse_transform(inf.reshape(1, -1))[0]
            inf_attr = inf_str[af.index]
            # tested sample matches the original input feature
            if (inf == ds.X_all[i]).all():
                correct_label.append(af.y_strings[i])
                correct_attrs.append(inf_attr)
                correct += 1

    if total > 0:
        msg = (
            f"Correctly inferred: {(correct / total) * 100:.2f}% "
            f"of {(total / len(ds.X_all)) * 100:.2f}% of the data set\n"
            f"Baseline: {af.naive:.2f}%\n"
            f"{pd.crosstab(correct_label, correct_attrs, rownames=['labels'], colnames=['attrs'])}"
        )
    else:
        msg = "Unable to make any inferences"
    return f"Attacking feature {af.name} with {af.n_unique} unique values:\n{msg}"

In [17]:
# uses training and testing set - i.e., does not assume dataset membership
for af in attack_features:
    af.set_inference_data_all(model, dataset)

In [18]:
# for each feature test each possible value and infer the attribute where possible

results = []

pool = mp.Pool(processes=n_cpu)
for af in attack_features:
    pool.apply_async(
        infer,
        args=(
            model,
            mi_attack_model,
            dataset,
            af,
            attack_threshold,
        ),
        callback=collect_results,
    )
pool.close()
pool.join()

for result in results:
    print(f"{result}\n")

Attacking feature children with 4 unique values:
Correctly inferred: 34.18% of 18.49% of the data set
Baseline: 25.00%
attrs         1    2    3  more
labels                         
not_recom     8   14   21    17
priority    180  103   64    61
recommend     1    0    0     0
spec_prior   26   49  118    93
very_recom   41   20    1     2

Attacking feature parents with 3 unique values:
Correctly inferred: 51.85% of 30.43% of the data set
Baseline: 33.33%
attrs       great_pret  pretentious  usual
labels                                    
not_recom           56           60     73
priority           143          247    542
recommend            0            0      1
spec_prior         612          159     56
very_recom           0           33     63

Attacking feature finance with 2 unique values:
Correctly inferred: 54.47% of 40.29% of the data set
Baseline: 50.00%
attrs       convenient  inconv
labels                        
not_recom          320     278
priority           629   

## Attribute inference using a membership inference attack model
The idea is to find the target feature value that causes the membership inference attack to classify the sample as a member with the highest confidence. Here the attacker is not sure if the sample is in the training set.

In [19]:
def infer_mia(model, mia_model, ds: Data, af: AttackFeature, threshold: float) -> str:
    """Infers the missing feature values with a membership inference attack model."""

    # labels and attributes of correct inferences made
    correct_label: list[str] = []
    correct_attrs: list[str] = []
    correct: int = 0  # total number of correct inferences made
    total: int = 0  # total number of inferences made

    for i, x in enumerate(af.X_values):  # each sample to perform inference on
        # get target model confidence scores for the tested sample
        confidence = model.predict_proba(x)
        # get membership inference model confidence scores for the tested sample
        mi_confidence = mia_model.predict_proba(confidence)
        conf = []  # confidences for each possible value with member label
        attr = []  # features for each possible value with member label
        # for each possible attribute value,
        # if the membership inference model predicts training set membership
        # then store the confidence score and the tested feature vector
        for j in range(af.n_unique):
            this_label = np.argmax(mi_confidence[j])
            scores = mi_confidence[j][this_label]
            if this_label == 1:
                conf.append(scores)
                attr.append(x[j])
        # get whether there is a unique maximum confidence score greater than threshold
        if unique_max(conf, threshold):
            total += 1
            # get attributes of the highest confidence matching label
            inf = attr[np.argmax(conf)]
            inf_str = ds.feature_encoder.inverse_transform(inf.reshape(1, -1))[0]
            inf_attr = inf_str[af.index]
            # tested sample matches the original input feature
            if (inf == ds.X_all[i]).all():
                correct_label.append(af.y_strings[i])
                correct_attrs.append(inf_attr)
                correct += 1

    if total > 0:
        msg = (
            f"Correctly inferred: {(correct / total) * 100:.2f}% "
            f"of {(total / len(ds.X_all)) * 100:.2f}% of the data set\n"
            f"Baseline: {af.naive:.2f}%\n"
            f"{pd.crosstab(correct_label, correct_attrs, rownames=['labels'], colnames=['attrs'])}"
        )
    else:
        msg = "Unable to make any inferences"
    return f"Attacking feature {af.name} with {af.n_unique} unique values:\n{msg}"

In [20]:
# uses training and testing set - i.e., does not assume dataset membership

results = []

pool = mp.Pool(processes=n_cpu)
for af in attack_features:
    pool.apply_async(
        infer_mia,
        args=(
            model,
            mi_attack_model,
            dataset,
            af,
            attack_threshold,
        ),
        callback=collect_results,
    )
pool.close()
pool.join()

for result in results:
    print(f"{result}\n")

Attacking feature form with 4 unique values:
Correctly inferred: 25.00% of 18.86% of the data set
Baseline: 25.00%
attrs       complete  completed  foster  incomplete
labels                                             
not_recom         17         19      11           9
priority         113         70      62          63
spec_prior        39         41      61          57
very_recom        23         19       1           6

Attacking feature finance with 2 unique values:
Correctly inferred: 50.00% of 42.13% of the data set
Baseline: 50.00%
attrs       convenient  inconv
labels                        
not_recom          320     278
priority           629     481
recommend            1       0
spec_prior         390     519
very_recom          83      29

Attacking feature has_nurs with 5 unique values:
Correctly inferred: 20.00% of 24.31% of the data set
Baseline: 20.00%
attrs       critical  improper  less_proper  proper  very_crit
labels                                                

## Attribute inference mapping n-1 features and labels to the missing feature
This black-box attack trains an additional classifier (the attack model) to predict the attacked feature's value from the remaining n-1 features as well as the original (target) model's predictions.

In [21]:
def get_bb_data(ds: Data, af: AttackFeature):
    """Returns data for fitting a black-box model on n-1 features plus predictions."""
    # get target vector of attacked feature
    ya = ds.Xt_member[:, af.index]
    # label encode attacked feature - for sklearn classifiers
    attacked_feature_encoder = LabelEncoder()
    ya = attacked_feature_encoder.fit_transform(ya)
    # combine predictions with n-1 features for attack model input
    Xa = np.copy(ds.X_train)
    predictions = model.predict_proba(Xa)  # get target model's confidences
    Xa = np.delete(Xa, af.indices, axis=1)  # drop attacked feature
    Xa = np.concatenate((Xa, predictions), axis=1)  # combine label predictions
    # attack model train / test split
    Xa_train, Xa_test, ya_train, ya_test = train_test_split(
        Xa, ya, test_size=0.2, shuffle=False, random_state=r_state
    )
    return Xa_train, ya_train, Xa_test, ya_test

In [22]:
attack_models = []

attack_models.append(
    [
        "MLPClassifier",
        MLPClassifier(
            hidden_layer_sizes=(100,),
            activation="relu",
            solver="adam",
            alpha=0.0001,
            batch_size="auto",
            learning_rate="constant",
            learning_rate_init=0.001,
            power_t=0.5,
            max_iter=2000,
            shuffle=True,
            random_state=r_state,
            tol=0.0001,
            verbose=False,
            warm_start=False,
            momentum=0.9,
            nesterovs_momentum=True,
            early_stopping=False,
            validation_fraction=0.1,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08,
            n_iter_no_change=10,
            max_fun=15000,
        ),
    ]
)

attack_models.append(["RandomForrestClassifier", RandomForestClassifier()])

In [23]:
for af in attack_features:
    print(f"Attacking feature: {af.name}")
    Xa_train, ya_train, Xa_test, ya_test = get_bb_data(dataset, af)
    for name, attack_model in attack_models:
        print(f"{name} attack model")
        attack_model.fit(Xa_train, ya_train)
        print(
            "(Train, Test) accuracy: "
            f"({attack_model.score(Xa_train, ya_train):.5f}, "
            f"{attack_model.score(Xa_test, ya_test):.5f})"
        )
    print("")

Attacking feature: parents
MLPClassifier attack model
(Train, Test) accuracy: (0.61285, 0.33565)
RandomForrestClassifier attack model
(Train, Test) accuracy: (0.72434, 0.25386)

Attacking feature: has_nurs
MLPClassifier attack model
(Train, Test) accuracy: (0.44753, 0.17978)
RandomForrestClassifier attack model
(Train, Test) accuracy: (0.59008, 0.10880)

Attacking feature: form
MLPClassifier attack model
(Train, Test) accuracy: (0.43133, 0.16590)
RandomForrestClassifier attack model
(Train, Test) accuracy: (0.56520, 0.06944)

Attacking feature: children
MLPClassifier attack model
(Train, Test) accuracy: (0.43576, 0.18133)
RandomForrestClassifier attack model
(Train, Test) accuracy: (0.57330, 0.08719)

Attacking feature: housing
MLPClassifier attack model
(Train, Test) accuracy: (0.55652, 0.27778)
RandomForrestClassifier attack model
(Train, Test) accuracy: (0.69155, 0.18056)

Attacking feature: finance
MLPClassifier attack model
(Train, Test) accuracy: (0.72531, 0.40741)
RandomForrestC

## Issues

* Does it matter if you can only predict missing values for **x**% of the people, as long as you can do it accurately for that **x**%?
* Following on, would there be a minimum value of vulnerable percentage **x** (or number of people) below which you didn't care?
* Should TREs have to flag which attributes they care about being vulnerable?
* Does using safe params reduce the risk of attribute vulnerability?
* Is it disclosive if you can say for attribute **A**, the value is definitely not **a**?
* Is vulnerability of an attribute linked to feature importance as predicted by the classifier?