In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp, t
import seaborn as sns
import os, wget, shutil

# Fetch the participants table of OpenNeuro dataset ds004148, store it under
# data/, and load the "Height" column.
os.makedirs("data", exist_ok=True)
link_original = "https://s3.amazonaws.com/openneuro.org/ds004148/participants.tsv?versionId=wt81Mu2B3fdeiXSis5ym288A64lXRXkR"
filename = "participants.tsv"

# wget.download returns the path it actually saved to, so use that directly.
# Walking the working tree for `filename` could match a stale copy (for example
# the one already sitting in data/ from a previous run) or miss the download if
# wget had to pick a different name because the target already existed.
downloaded_path = wget.download(link_original)
file_ = [downloaded_path]  # kept as a list so the name keeps its original shape
shutil.copy2(file_[0], os.path.join("data", filename))
os.remove(file_[0])
print("\nOriginal file downloaded.")

# Raw string for the regex separator: "\s" is not a valid escape in a normal
# string literal and triggers a warning on recent Python versions.
data = np.array(pd.read_csv(os.path.join("data", filename), sep=r"\s+")["Height"]).reshape(-1, 1)
# Boolean-mask indexing drops the NaN entries and flattens the column to 1-D.
clean_data = data[~np.isnan(data)]

100% [..........................................................] 39886 / 39886
Original file downloaded.


In [32]:
def dp(clean_data, pipeline_output):
    """Add Laplace noise of magnitude >= sensitivity to ``pipeline_output``.

    The leave-one-out (LOO) means of ``clean_data`` are computed; their
    standard deviation is used as the Laplace scale, and the largest absolute
    gap between any LOO mean and ``pipeline_output`` is used as the
    sensitivity. A single noise value is then drawn from a zero-centred
    Laplace distribution conditioned on ``abs(noise) >= sensitivity`` and
    added to ``pipeline_output``.

    Parameters
    ----------
    clean_data : sequence of float
        Observations; at least two are required to form LOO means.
    pipeline_output : float or array-like
        Value(s) to perturb. Arrays must broadcast against the LOO means.

    Returns
    -------
    noisy_output : same shape as ``pipeline_output``
        ``pipeline_output`` plus one scalar noise draw.
    sensitivity : float
        ``max(abs(loo_means - pipeline_output))``.

    Raises
    ------
    ValueError
        If ``clean_data`` has fewer than two observations.
    """
    n_obs = len(clean_data)
    if n_obs < 2:
        # With fewer than two points the LOO means are undefined; previously
        # this either raised inside numpy or never left the sampling loop.
        raise ValueError("dp() needs at least two observations to form leave-one-out means")

    loo_output = np.array([np.mean(np.delete(clean_data, i)) for i in range(n_obs)])
    loo_scale = np.std(loo_output)
    sensitivity = np.max(np.abs(loo_output - pipeline_output))

    # Sample the conditioned noise directly instead of by rejection:
    # |Laplace(0, b)| is Exponential(b) with an independent fair-coin sign, and
    # by memorylessness |noise| given |noise| >= s is s + Exponential(b).
    # A rejection loop needs about exp(s / b) draws and never ends when b == 0
    # (or is NaN) while s > 0. For b == 0 the limiting value +/- s is returned.
    excess = np.random.exponential(scale=loo_scale) if loo_scale > 0 else 0.0
    sign = 1.0 if np.random.random() < 0.5 else -1.0
    noise = sign * (sensitivity + excess)

    noisy_output = pipeline_output + noise
    return noisy_output, sensitivity

def user_output_loo_stats(clean_data):
    """Compute leave-one-out (LOO) means, a spread estimate and t-like statistics.

    Parameters
    ----------
    clean_data : array-like of float, 1-D
        Observations (n values).

    Returns
    -------
    loo_mean : ndarray, shape (n,)
        ``loo_mean[i]`` is the mean of ``clean_data`` with element ``i`` removed.
    loo_std : float
        ``sqrt(sum((clean_data - loo_mean) ** 2) / (n - 1))``.
    loo_t : ndarray, shape (n,)
        ``loo_mean / (loo_std / sqrt(n - 1))``, i.e. each LOO mean divided by a
        standard error (a statistic relative to zero).

    Notes
    -----
    With fewer than two observations the results are NaN (numpy emits
    warnings); no exception is raised.
    """
    clean_data = np.asarray(clean_data)
    n_obs = len(clean_data)

    loo_mean = np.array([np.mean(np.delete(clean_data, i)) for i in range(n_obs)])
    # Divide the sum of squares by (n - 1). Without the parentheses the
    # expression is (sum / n) - 1, which subtracts 1 from the variance and
    # turns into NaN under the square root whenever sum / n < 1.
    loo_std = np.sqrt(np.sum((clean_data - loo_mean) ** 2) / (n_obs - 1))
    loo_t = loo_mean / (loo_std / np.sqrt(n_obs - 1))

    return loo_mean, loo_std, loo_t


In [115]:
# Simulation: for each replicate draw 100 normal values (mean 170, sd 10), take
# the first `size` of them, and for every leave-one-out subset record
#   t1  - the one-sample t-test p-value against 170 if it is < 0.05, else NaN
#   t1p - the mean of the < 0.05 p-values obtained from the dp()-perturbed
#         values, or NaN when none is below 0.05.
# NOTE(review): no random seed is set in this cell, so results differ between runs.
n_replicates = 2
subsample_sizes = [5, 6]
t1 = list()
t1p = list()
for run in range(n_replicates):
    data = np.random.normal(loc=170, scale=10, size=100)
    for size in subsample_sizes:

        subsample = data[:size].copy()
        loo_data = np.array([np.delete(subsample, i) for i in range(len(subsample))])

        for loo_subset in loo_data:
            # The t-test is deterministic for a fixed subset, so evaluate it
            # once; repeating it and averaging identical p-values yields the
            # same number, and an explicit NaN avoids the "Mean of empty slice"
            # warning when the test is not significant.
            pval = ttest_1samp(loo_subset, 170).pvalue
            t1.append(pval if pval < 0.05 else np.nan)

            loo_mean, loo_std, loo_tval = user_output_loo_stats(loo_subset)
            # NOTE(review): the raw observations are passed as the pipeline
            # output here, so noise is added to each data point rather than to
            # a summary statistic - confirm this is intended.
            noisy_output, sensitivity_ = dp(loo_subset, loo_subset)
            # NOTE(review): loo_subset holds size - 1 points, yet size - 1 is
            # used below for both the standard error and the degrees of
            # freedom - confirm against the intended test.
            noisy_tstats = noisy_output / (loo_std / np.sqrt(size - 1))
            pvals = 2 * (1 - t.cdf(np.abs(noisy_tstats), df=size-1))
            significant = pvals[pvals < 0.05]
            t1p.append(np.mean(significant) if significant.size else np.nan)
            
