In [1]:
%load_ext autoreload

In [2]:
%autoreload

import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss
import warnings
import datetime
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score, auc, roc_curve

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter


from mmd_grud_utils import *

In [3]:
def sensitivity(feature):
    """
    Compute the sensitivity of a feature as its largest absolute value.

    The element-wise absolute value of `feature` is taken first, then the
    maximum over every entry is returned. NaN entries are skipped when the
    maximum is taken.

    Args:
        feature: A pandas object exposing `.abs()` (e.g. a Series or DataFrame)

    returns:
        max_abs: the largest absolute value found in `feature`
    """

    magnitudes = feature.abs()
    max_abs = np.nanmax(magnitudes)
    return max_abs

In [41]:
def add_noise(feature, epsilon, rng=None):
    """
    Add Laplace noise to a feature

    The noise is drawn from a Laplace distribution centred on 0 whose scale is
    sensitivity(values) / epsilon, where `values` is the data that actually
    gets perturbed.

    Args:
        feature: The data on which to add noise. Either a pd.Series, or a
            pd.DataFrame, in which case the column labelled 0 is perturbed.
        epsilon: The differential privacy budget to use; must be > 0
        rng: Optional seed or np.random.Generator, forwarded to
            np.random.default_rng. With the default (None) a freshly seeded
            generator is created on every call.

    returns:
        abs_noise: np.ndarray holding the absolute value of the noise drawn
            for each element
        noisy_feature: the original values with the (signed) noise added

    raises:
        ValueError: if epsilon is not strictly positive
        Warning: any warning emitted while computing the sensitivity is
            escalated to an exception instead of being silently ignored
    """

    if not epsilon > 0:
        raise ValueError("epsilon must be strictly positive, got {!r}".format(epsilon))

    # A Series has to be used as-is: indexing it with [0] would return only one
    # element, and every output row would become that scalar plus noise.
    # DataFrame callers keep the previous behaviour of perturbing column 0.
    values = feature[0] if isinstance(feature, pd.DataFrame) else feature

    # default_rng accepts None, an int seed, or an existing Generator.
    generator = np.random.default_rng(rng)

    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        # Calibrate on the perturbed values only, so unrelated columns of a
        # wider DataFrame cannot change the noise scale.
        sens = sensitivity(values)

    scale = sens / epsilon  # Definition of scale parameter for laplace noise to fulfill diff privacy
    noise = generator.laplace(0, scale, len(values))
    noisy_feature = values + noise

    return np.absolute(noise), noisy_feature

In [47]:
# Synthetic example feature: 28000 draws from a normal distribution (mean 0, std 1).
ex_data = np.random.normal(loc=0, scale=1, size=28000)

In [48]:
# Wrap the samples in a DataFrame; with no column names given the single column is labelled 0.
ex_data = pd.DataFrame(data=ex_data)

In [12]:
# Common divisor for the nominal budgets used below: add_noise is called with
# epsilon = 100 / ep_mod, 300 / ep_mod and 600 / ep_mod.
# NOTE(review): the origin of the value 105 is not shown in this notebook — confirm and document.
ep_mod = 105

In [44]:
# Rich display of the example frame (last expression of the cell).
ex_data

Unnamed: 0,0
0,-0.520589
1,2.010509
2,-0.076089
3,-1.117538
4,0.203753
5,-0.69799
6,1.594043
7,1.381536
8,1.391853
9,-1.428721


In [49]:
# Perturb only the original values (column 0). Passing the single-column slice
# keeps the noise scale stable when this cell is re-run: the noise_* / values_*
# columns assigned below no longer feed into the sensitivity computation.
raw_values = ex_data[[0]]

# One call per nominal budget; a larger epsilon gives a smaller noise scale.
noise_100, values_100 = add_noise(raw_values, 100 / ep_mod)
noise_300, values_300 = add_noise(raw_values, 300 / ep_mod)
noise_600, values_600 = add_noise(raw_values, 600 / ep_mod)

# Absolute noise magnitudes first, then the noisy values, side by side with the originals.
ex_data["noise_100"] = noise_100
ex_data["noise_300"] = noise_300
ex_data["noise_600"] = noise_600
ex_data["values_100"] = values_100
ex_data["values_300"] = values_300
ex_data["values_600"] = values_600

In [50]:
# Display the frame again, now with the noise_* and values_* columns assigned in the cell above.
ex_data

Unnamed: 0,0,noise_100,noise_300,noise_600,values_100,values_300,values_600
0,-1.221157,1.219279,0.568056,0.199246,-0.001878,-1.789213,-1.420403
1,-0.914240,0.409991,1.189433,0.618707,-0.504249,-2.103673,-0.295533
2,-0.515662,2.391288,0.656602,0.713413,-2.906950,0.140939,0.197750
3,-0.712749,0.643779,0.143346,0.026866,-1.356528,-0.569403,-0.739616
4,-0.342229,0.442129,3.735636,0.581378,-0.784358,3.393407,0.239149
...,...,...,...,...,...,...,...
995,0.088637,0.618038,2.962448,0.108431,0.706675,-2.873811,0.197068
996,1.543457,0.066891,0.217746,0.524308,1.476565,1.325711,2.067765
997,0.642904,1.134414,0.273107,1.147459,-0.491511,0.916011,1.790363
998,-1.770992,1.875094,0.144632,0.042117,-3.646087,-1.626360,-1.728875


In [52]:
# Render the first five rows as a LaTeX tabular; the cell output is the resulting string.
ex_data.iloc[:5].to_latex()

'\\begin{tabular}{lrrrrrrr}\n\\toprule\n{} &         0 &  noise\\_100 &  noise\\_300 &  noise\\_600 &  values\\_100 &  values\\_300 &  values\\_600 \\\\\n\\midrule\n0 & -1.221157 &   1.219279 &   0.568056 &   0.199246 &   -0.001878 &   -1.789213 &   -1.420403 \\\\\n1 & -0.914240 &   0.409991 &   1.189433 &   0.618707 &   -0.504249 &   -2.103673 &   -0.295533 \\\\\n2 & -0.515662 &   2.391288 &   0.656602 &   0.713413 &   -2.906950 &    0.140939 &    0.197750 \\\\\n3 & -0.712749 &   0.643779 &   0.143346 &   0.026866 &   -1.356528 &   -0.569403 &   -0.739616 \\\\\n4 & -0.342229 &   0.442129 &   3.735636 &   0.581378 &   -0.784358 &    3.393407 &    0.239149 \\\\\n\\bottomrule\n\\end{tabular}\n'