In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from pydp.ml.mechanisms.sklearn_pipeline import LaplaceMechanism
from pydp.ml.util.accountant import BudgetAccountant

In [2]:
# DUMMY DATASET

# Generate a synthetic classification problem with a fixed random_state
X, y = make_classification(random_state=0)

# Hold out a test split, again with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Simple Sklearn Pipeline Example with Laplace Mechanism

In [3]:
# Chain the three stages in order: scaler, Laplace mechanism (default
# arguments), Gaussian Naive Bayes classifier
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('laplace', LaplaceMechanism()),
    ('nb', GaussianNB()),
])

# Fit every stage on the training split
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split (shown as the cell output)
pipe.score(X_test, y_test)

0.84

## Configuring Epsilon and Sensitivity Params

In [4]:
# Scaler and Naive Bayes estimator for the first and last pipeline stages
scaler = StandardScaler()
nb = GaussianNB()

# Laplace mechanism configured with epsilon=0.1 and a constant sensitivity of 0.5
laplace = LaplaceMechanism(epsilon=0.1, sensitivity=0.5)

# Chain the three stages in order: scaler, Laplace mechanism, classifier
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('laplace', laplace),
    ('nb', nb),
])

# Fit every stage on the training split
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split (shown as the cell output)
pipe.score(X_test, y_test)

0.48

## Sensitivity as a Function Instead of a Number

In [5]:
# Sensitivity function for numerical data
def sensitivity(x):
    """Return the range of ``x`` (max minus min) divided by ``len(x) + 1``."""
    return (max(x) - min(x)) / (len(x) + 1)


# Set laplace mechanism with epsilon 0.1; sensitivity is passed as a callable
# instead of a number
laplace = LaplaceMechanism(epsilon=0.1, sensitivity=sensitivity)

# Initialize scaler and naive bayes estimator
scaler = StandardScaler()
nb = GaussianNB()

# Create the pipeline: scaler -> Laplace mechanism -> classifier
pipe = Pipeline([('scaler', scaler), ('laplace', laplace), ('nb', nb)])

# Fit every stage on the training split
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split (shown as the cell output)
pipe.score(X_test, y_test)

0.52

## Budget Accountant

In [6]:
# Set a privacy budget accountant
# NOTE(review): 10000 is passed positionally; presumably the total privacy
# budget -- confirm against BudgetAccountant's signature
accountant = BudgetAccountant(10000)


# Sensitivity function for numerical data
def sensitivity(x):
    """Return the range of ``x`` (max minus min) divided by ``len(x) + 1``."""
    return (max(x) - min(x)) / (len(x) + 1)


# Set laplace mechanism with epsilon, sensitivity function, and accountant
laplace = LaplaceMechanism(epsilon=0.1, sensitivity=sensitivity, accountant=accountant)

# Initialize scaler and naive bayes estimator
scaler = StandardScaler()
nb = GaussianNB()

# Create the pipeline: scaler -> Laplace mechanism -> classifier
pipe = Pipeline([('scaler', scaler), ('laplace', laplace), ('nb', nb)])

# Fit every stage on the training split
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split (shown as the cell output)
pipe.score(X_test, y_test)

0.6

## Categorical Feature Support

In [7]:
import random

# Helper function to inject nominal value to dataset
def create_cat_data(idxs, X, cat_data=(0, 1, 2, 3, 4, 5), seed=None):
    """Return a copy of ``X`` whose columns ``idxs`` hold random categories.

    Parameters
    ----------
    idxs : iterable of int
        Column indices to overwrite.
    X : 2-D array
        Input data. It is copied first, so the caller's array is not modified.
    cat_data : sequence, optional
        Values to draw from; each cell gets an independent ``choice`` from it.
    seed : int or None, optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible and the global RNG state is left untouched.
        When ``None`` (default), the module-level ``random`` generator is
        used, exactly as before.

    Returns
    -------
    A new array with the same shape as ``X``.
    """
    # Fall back to the module-level generator so existing callers (and any
    # global ``random.seed`` they rely on) keep their behaviour.
    rng = random if seed is None else random.Random(seed)

    X = X.copy()
    n_rows = X.shape[0]

    for idx in idxs:
        # One draw per row, top to bottom, column by column -- the same draw
        # order as an element-by-element loop.
        X[:, idx] = [rng.choice(cat_data) for _ in range(n_rows)]

    return X

# DUMMY DATASET

# Create random dataset
X, y = make_classification(random_state=0)

# Indices of the columns to treat as categorical
cat_feat_idxs = [0, 19]

# create_cat_data draws from the stdlib `random` module; seed it so the
# injected columns are the same on every run
random.seed(0)

# Inject nominal data
X = create_cat_data(cat_feat_idxs, X)

# Split training test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


# Sensitivity function for numeric features
def calculate_sensitivity(x):
    """Return the range of ``x`` (max minus min) divided by ``len(x) + 1``."""
    return (max(x) - min(x)) / (len(x) + 1)


# Sensitivity function for categorical features
# NOTE(review): the pipeline below runs StandardScaler before the Laplace
# step, so if this is evaluated on scaled training columns the mean will be
# close to 0 -- confirm that is intended.
def calculate_sensitivity_cat(x):
    """Return the absolute value of the mean of ``x``."""
    return abs(sum(x) / len(x))


# Set laplace mechanism with categorical support
laplace = LaplaceMechanism(
    epsilon=0.1,
    sensitivity=calculate_sensitivity,
    cat_sensitivity=calculate_sensitivity_cat,
    cat_feat_idxs=cat_feat_idxs
)

# Initialize scaler and naive bayes estimator
scaler = StandardScaler()
nb = GaussianNB()

# Create the pipeline: scaler -> Laplace mechanism -> classifier
pipe = Pipeline([('scaler', scaler), ('laplace', laplace), ('nb', nb)])

# Fit every stage on the training split
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split (shown as the cell output)
pipe.score(X_test, y_test)

0.48