In [1]:
import numpy as np

"""
Compute accuracy
"""
import concurrent.futures
import os
import pickle as pkl
from itertools import product

import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import shuffle
from tqdm import tqdm
import pandas as pd


class Scores():
    """
    Calculates various scoring metrics amongst training, testing and synthetic data files.

    Parameters
    ----------
    train_file : string, required
        The training file to be used.
    test_file : string, required
        The test file to be used.
    synthetic_file: list, required
        The list of various synthetic data files to be used.
    dist_file: string, optional
        The file that containts previously computed distances to omit recalculation.
    workers: int, optional
        The count of workers to use with the default value of 1.
    """

    def __init__(self, train_file, test_file, synthetic_files, dist_file=None, workers=1, n_neighbors=5, leaf_size=30):
        """
        Collect all training, testing and synthetic data files for processing
        """
        self.n_neighbors = n_neighbors
        self.leaf_size = leaf_size
        training_data = pd.read_csv(train_file)
        training_data = training_data.fillna(training_data.mean())

        testing_data = pd.read_csv(test_file)
        testing_data = testing_data.fillna(testing_data.mean())

        self.data = {
            "training_data": training_data,
            "testing_data": testing_data
        }

        self.synth_keys = []
        for i, s in enumerate(synthetic_files):
            self.data[f'synth_{i}'] = np.clip(pd.read_csv(s), 0, 1)
            self.synth_keys.append(f'synth_{i}')

        self.distances = {}

        if dist_file is not None:
            self.distances = pkl.load(open(dist_file, 'rb'))
        else:
            self.__compute_nn(workers)

    def __nearest_neighbors(self, t, s):
        # Fit to S
        nn_s = NearestNeighbors(n_neighbors=self.n_neighbors, leaf_size=self.leaf_size).fit(self.data[s])
        if t == s:
            # Find distances from s to s
            d = nn_s.kneighbors()[0]
        else:
            # Find distances from t to s
            d = nn_s.kneighbors(self.data[t])[0]
        return t, s, d

    def __compute_nn(self, workers):
        tasks = product(self.data.keys(), repeat=2)

        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = [
                executor.submit(self.__nearest_neighbors, t, s)
                for (t, s) in tasks
            ]

            # Wait for each job to finish
            for future in tqdm(concurrent.futures.as_completed(futures),
                               total=len(futures)):
                t, s, d = future.result()
                self.distances[(t, s)] = d

        if not os.path.isdir("gen_data"):
            os.mkdir("gen_data")

        pkl.dump(self.distances, open(f'gen_data/syn_dists.pkl', 'wb'))

    def __discrepancy_score(self, t, s):
        left = np.mean(self.distances[(t, s)])
        right = np.mean(self.distances[(s, t)])
        return 0.5 * (left + right)

    def compute_discrepancy(self):
        """
        Compute the standard discrepancy scores

        Outputs
        -------
        The discrepency amongst the various data files.
        """
        j_rr = self.__discrepancy_score('training_data', 'testing_data')
        j_ra = []
        j_rat = []
        j_aa = []

        # For all of the synthetic datasets
        for k in self.synth_keys:
            j_ra.append(self.__discrepancy_score('training_data', k))
            j_rat.append(self.__discrepancy_score('testing_data', k))
            # Comparison to other synthetics
            for k_2 in self.synth_keys:
                if k != k_2:
                    j_aa.append(self.__discrepancy_score(k, k_2))

        # Average across synthetics
        j_ra = np.mean(np.array(j_ra))
        j_rat = np.mean(np.array(j_rat))
        j_aa = np.mean(np.array(j_aa))

        discrepancy_training_test = np.round(j_rr, 4)
        discrepancy_training_synthetic = np.round(j_ra, 4)
        discrepancy_test_synthetic = np.round(j_rat, 4)
        discrepancy_synthetic = np.round(j_aa, 4)
        print("Discrepancy in training and test data is: {}".format(discrepancy_training_test))
        print("Discrepancy in training data and synthetic data is: {}".format(discrepancy_training_synthetic))
        print("Discrepancy in testing and synthetic data is: {}".format(discrepancy_test_synthetic))
        print("Discrepancy amongst various synthetic data files is: {}".format(discrepancy_synthetic))
        return discrepancy_training_test, discrepancy_training_synthetic, discrepancy_test_synthetic, discrepancy_synthetic

    def __divergence(self, t, s):
        left = np.mean(np.log(self.distances[(t, s)] / self.distances[(t, t)]))
        right = np.mean(np.log(self.distances[(s, t)] / self.distances[(s, s)]))
        return 0.5 * (left + right)

    def compute_divergence(self):
        """
        Compute the divergence scores

        Outputs
        -------
        The divergence score amongst the various data files.
        """
        d_tr_a = []
        d_te_a = []

        for k in self.synth_keys:
            d_tr_a.append(self.__divergence('training_data', k))
            d_te_a.append(self.__divergence('testing_data', k))

        training = np.mean(np.array(d_tr_a))
        testing = np.mean(np.array(d_te_a))

        divergence_training = np.round(training, 4)
        divergence_test = np.round(testing, 4)
        print("Divergence in training and synthetic data is: {}".format(divergence_training))
        print("Divergence in testing and synthetic data is: {}".format(divergence_test))
        return divergence_training, divergence_test

    def __adversarial_accuracy(self, t, s):
        left = np.mean(self.distances[(t, s)] > self.distances[(t, t)])
        right = np.mean(self.distances[(s, t)] > self.distances[(s, s)])
        return 0.5 * (left + right)

    def __calculate_accuracy(self):
        """
        Compute the standarad adversarial accuracy scores
        """

        train_accuracy = []
        test_accuracy = []
        for key in self.synth_keys:
            train_accuracy.append(self.__adversarial_accuracy('training_data', key))
            test_accuracy.append(self.__adversarial_accuracy('testing_data', key))

        avg_train_accuracy = np.mean(np.array(train_accuracy))
        avg_test_accuracy = np.mean(np.array(test_accuracy))
        return avg_train_accuracy, avg_test_accuracy

    def calculate_accuracy(self):
        """
        Compute the standarad adversarial accuracy scores

        Outputs
        -------
        The adversarial accuracy for the two data files along with privacy loss.
        """

        train_acc, test_acc = self.__calculate_accuracy()
        TrainResemblanceLoss = np.round(train_acc, 4)
        TestResemblanceLoss = np.round(test_acc, 4)
        PrivacyLoss = np.round(np.round(test_acc, 4) - np.round(train_acc, 4), 4)
        print("Adversarial accuracy for train data is: {}".format(TrainResemblanceLoss))
        print("Adversarial accuracy for test data is: {}".format(TestResemblanceLoss))
        print("Privacy Loss is: {}".format(PrivacyLoss))
        return TrainResemblanceLoss, TestResemblanceLoss, PrivacyLoss

In [10]:
class MemInfPlot():
    """
    Uses `matplotlib` and `seaborn` to plot the membership inference plot

    Parameters
    ----------
    train_file : string, required
        The training file to be used for generating the membership inference plot.
    test_file : string, required
        The testing file to be used for generating the membership inference plot.
    synth_file : string, required
        The synthetic data file to be used for generating the membership inference plot.
    name : string, required
        A name for the plot.
    """

    def __init__(self, train_file, test_file, synth_file, name):

        if not os.path.exists('gen_data'):
            os.makedirs('gen_data')

        if not os.path.exists('gen_data/plots'):
            os.makedirs('gen_data/plots')

        data, labels = self.__create_shuffled_data(train_file, test_file)
        self.fpr, self.tpr, self.auc = self.__compute_auc(synth_file, data, labels)
        self.name = name

        print("AUC = {}".format(self.auc))

    def __create_shuffled_data(self, train_file, test_file):

        # Read in train and test
        train_set = pd.read_csv(train_file)
        test_set = pd.read_csv(test_file)

        # Create labels
        label_train = np.empty(train_set.shape[0], dtype=int)
        label_train.fill(-1)
        label_test = np.empty(test_set.shape[0], dtype=int)
        label_test.fill(1)

        # Combine
        labels = np.concatenate([label_train, label_test], axis=0)
        data = pd.concat([train_set, test_set], axis=0)
        data['labels'] = labels.tolist()

        # Randomize
        data = shuffle(data)
        data, labels = (data.drop('labels', axis=1), data['labels'])

        return data, labels

    def __compute_auc(self, synth_file, data, labels):

        synth_data = pd.read_csv(synth_file)

        syn_dists = self.__nearest_neighbors(data, synth_data)
        fpr, tpr, _ = metrics.roc_curve(labels.ravel(), syn_dists)
        roc_auc = metrics.auc(fpr, tpr)

        return fpr, tpr, roc_auc

    def __nearest_neighbors(self, t, s):
        """
        Find nearest neighbors d_ts and d_ss
        """

        # Fit to S
        nn_s = NearestNeighbors().fit(s)

        # Find distances from t to s
        d = nn_s.kneighbors(t)[0]

        return d

    def plot(self, savefig=False):
        """
        The function plots the membership inference plot.

        Parameters
        ----------
        savefig: boolean, optional
            If set to True, the plots generated will be saved to disk.

        Outputs
        -------
        PCA Plot:
            Plots the AUC curve and saves the file as
            `membership_inference_auc_{name}.png`
        """

        pylab.rcParams['figure.figsize'] = 6, 6
        plt.title('Receiver Operating Characteristic', fontsize=24)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.plot(self.fpr, self.tpr, label=f'{self.name} AUC = {self.auc:0.2f}')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.ylabel('True Positive Rate', fontsize=18)
        plt.xlabel('False Positive Rate', fontsize=18)

        if (savefig):
            plt.savefig(f'gen_data/membership_inference_auc_{self.name}.png')
        plt.show()
        if (savefig):
            print(f"The plot has been saved as membership_inference_auc_{self.name}.png inside gen_data/plots.")

In [3]:
synthetics = {"real_train": "clean_train_0.8.csv", "real_copy_all": "clean_10k_real.csv",
              "augmented_real_train_data": "augmented_real_data.csv", "random": "random_data.csv",
              "Genome-AC-GAN 80%": "clean_polyloss_ce.csv", "Genome-AC-GAN super pop": "clean_full_pop.csv",
              "Genome-AC-GAN sub pop": "clean_full_sub_pop.csv", "RBM new": "clean_rbm_new.csv",
              "RBM old": "clean_old_rbm.csv", "WGAN": "clean_WGAN.csv"}

In [5]:
rows = []
for model_name, file_path in synthetics.items():
    print(f"start {model_name}")
    s = Scores(train_file="clean_train_0.8.csv", test_file="clean_test_0.2.csv",
               synthetic_files=[file_path], n_neighbors=3, leaf_size=10)
    TrainResemblanceLoss, TestResemblanceLoss, PrivacyLoss = s.calculate_accuracy()
    divergence_training, divergence_test = s.compute_divergence()
    discrepancy_training_test, discrepancy_training_synthetic, discrepancy_test_synthetic, discrepancy_synthetic = s.compute_discrepancy()
    rows.append(
        {"model_name": model_name, "TrainResemblanceLoss": TrainResemblanceLoss,
         "TestResemblanceLoss": TestResemblanceLoss, "PrivacyLoss": PrivacyLoss,
         "divergence_training": divergence_training, "divergence_test": divergence_test,
         "discrepancy_training_test": discrepancy_training_test,
         "discrepancy_training_synthetic": discrepancy_training_synthetic,
         "discrepancy_test_synthetic": discrepancy_test_synthetic})

start real_train


100%|██████████| 9/9 [00:16<00:00,  1.86s/it]
  left = np.mean(np.log(self.distances[(t, s)] / self.distances[(t, t)]))
  right = np.mean(np.log(self.distances[(s, t)] / self.distances[(s, s)]))
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.0
Adversarial accuracy for test data is: 0.4987
Privacy Loss is: 0.4987
Divergence in training and synthetic data is: -inf
Divergence in testing and synthetic data is: 0.0005
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 17.6286
Discrepancy in testing and synthetic data is: 27.8641
Discrepancy amongst various synthetic data files is: nan
start real_copy_all


100%|██████████| 9/9 [00:16<00:00,  1.84s/it]
  left = np.mean(np.log(self.distances[(t, s)] / self.distances[(t, t)]))
  right = np.mean(np.log(self.distances[(s, t)] / self.distances[(s, s)]))
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.0727
Adversarial accuracy for test data is: 0.4061
Privacy Loss is: 0.3334
Divergence in training and synthetic data is: -inf
Divergence in testing and synthetic data is: -inf
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 19.5079
Discrepancy in testing and synthetic data is: 23.0524
Discrepancy amongst various synthetic data files is: nan
start augmented_real_train_data


100%|██████████| 9/9 [00:10<00:00,  1.18s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.4999
Adversarial accuracy for test data is: 0.486
Privacy Loss is: -0.0139
Divergence in training and synthetic data is: 0.1181
Divergence in testing and synthetic data is: 0.02
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 38.1205
Discrepancy in testing and synthetic data is: 36.1029
Discrepancy amongst various synthetic data files is: nan
start random


100%|██████████| 9/9 [00:13<00:00,  1.53s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.9583
Adversarial accuracy for test data is: 0.9742
Privacy Loss is: 0.0159
Divergence in training and synthetic data is: 0.4881
Divergence in testing and synthetic data is: 0.4486
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 69.864
Discrepancy in testing and synthetic data is: 69.8965
Discrepancy amongst various synthetic data files is: nan
start Genome-AC-GAN 80%


100%|██████████| 9/9 [00:16<00:00,  1.82s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.645
Adversarial accuracy for test data is: 0.5684
Privacy Loss is: -0.0766
Divergence in training and synthetic data is: 0.0387
Divergence in testing and synthetic data is: 0.0358
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 27.0489
Discrepancy in testing and synthetic data is: 28.0287
Discrepancy amongst various synthetic data files is: nan
start Genome-AC-GAN super pop


100%|██████████| 9/9 [00:18<00:00,  2.09s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.6549
Adversarial accuracy for test data is: 0.5447
Privacy Loss is: -0.1102
Divergence in training and synthetic data is: 0.0579
Divergence in testing and synthetic data is: 0.0447
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 26.6331
Discrepancy in testing and synthetic data is: 27.3342
Discrepancy amongst various synthetic data files is: nan
start Genome-AC-GAN sub pop


100%|██████████| 9/9 [00:18<00:00,  2.04s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.6588
Adversarial accuracy for test data is: 0.5483
Privacy Loss is: -0.1105
Divergence in training and synthetic data is: 0.0428
Divergence in testing and synthetic data is: 0.0293
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 27.1304
Discrepancy in testing and synthetic data is: 27.8469
Discrepancy amongst various synthetic data files is: nan
start RBM new


100%|██████████| 9/9 [00:18<00:00,  2.10s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.6965
Adversarial accuracy for test data is: 0.653
Privacy Loss is: -0.0435
Divergence in training and synthetic data is: 0.0487
Divergence in testing and synthetic data is: 0.0544
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 27.4124
Discrepancy in testing and synthetic data is: 28.6122
Discrepancy amongst various synthetic data files is: nan
start RBM old


100%|██████████| 9/9 [00:20<00:00,  2.32s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Adversarial accuracy for train data is: 0.4651
Adversarial accuracy for test data is: 0.4917
Privacy Loss is: 0.0266
Divergence in training and synthetic data is: 0.0077
Divergence in testing and synthetic data is: -0.0025
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 27.0325
Discrepancy in testing and synthetic data is: 27.8353
Discrepancy amongst various synthetic data files is: nan
start WGAN


100%|██████████| 9/9 [00:19<00:00,  2.19s/it]

Adversarial accuracy for train data is: 0.811
Adversarial accuracy for test data is: 0.6545
Privacy Loss is: -0.1565
Divergence in training and synthetic data is: 0.0667
Divergence in testing and synthetic data is: 0.0483
Discrepancy in training and test data is: 27.8641
Discrepancy in training data and synthetic data is: 28.2856
Discrepancy in testing and synthetic data is: 28.8955
Discrepancy amongst various synthetic data files is: nan



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [6]:
results = pd.DataFrame(rows)
results

Unnamed: 0,model_name,TrainResemblanceLoss,TestResemblanceLoss,PrivacyLoss,divergence_training,divergence_test,discrepancy_training_test,discrepancy_training_synthetic,discrepancy_test_synthetic
0,real_train,0.0,0.4987,0.4987,-inf,0.0005,27.8641,17.6286,27.8641
1,real_copy_all,0.0727,0.4061,0.3334,-inf,-inf,27.8641,19.5079,23.0524
2,augmented_real_train_data,0.4999,0.486,-0.0139,0.1181,0.02,27.8641,38.1205,36.1029
3,random,0.9583,0.9742,0.0159,0.4881,0.4486,27.8641,69.864,69.8965
4,Genome-AC-GAN 80%,0.645,0.5684,-0.0766,0.0387,0.0358,27.8641,27.0489,28.0287
5,Genome-AC-GAN super pop,0.6549,0.5447,-0.1102,0.0579,0.0447,27.8641,26.6331,27.3342
6,Genome-AC-GAN sub pop,0.6588,0.5483,-0.1105,0.0428,0.0293,27.8641,27.1304,27.8469
7,RBM new,0.6965,0.653,-0.0435,0.0487,0.0544,27.8641,27.4124,28.6122
8,RBM old,0.4651,0.4917,0.0266,0.0077,-0.0025,27.8641,27.0325,27.8353
9,WGAN,0.811,0.6545,-0.1565,0.0667,0.0483,27.8641,28.2856,28.8955


In [11]:
for model_name, file_path in synthetics.items():
    print(f"start {model_name}")
    m = MemInfPlot(train_file="clean_train_0.8.csv", test_file="clean_test_0.2.csv", synth_file=file_path,
                   name=model_name)
    m.plot()

start real_train


ValueError: y should be a 1d array, got an array of shape (5008, 5) instead.

In [None]:
# import numpy as np
# from utils.util import get_relevant_columns
# import pandas as pd
#
# file_path = "/Users/shakedahronoviz/Genome-AC-GAN/resource/train_0.8_super_pop.csv"
# model_sequences = pd.read_csv(file_path)
# columns = get_relevant_columns(model_sequences, model_sequences.columns[:2])
# model_sequences = model_sequences[columns]
# columns = [int(i) for i in columns]
# model_sequences.columns = columns
# number_of_samples = len(model_sequences)
# model_sequences[0] = "model_name"
# x = pd.DataFrame({'label': model_sequences[0], 'ind': model_sequences[1]})
# train = pd.DataFrame(np.array(model_sequences.loc[:, 2:].astype(int)))
#
# file_path = "/Users/shakedahronoviz/Genome-AC-GAN/resource/test_0.2_super_pop.csv"
# model_sequences = pd.read_csv(file_path)
# columns = get_relevant_columns(model_sequences, model_sequences.columns[:2])
# model_sequences = model_sequences[columns]
# columns = [int(i) for i in columns]
# model_sequences.columns = columns
# number_of_samples = len(model_sequences)
# model_sequences[0] = "model_name"
# x = pd.DataFrame({'label': model_sequences[0], 'ind': model_sequences[1]})
# test = pd.DataFrame(np.array(model_sequences.loc[:, 2:].astype(int)))
#
# pd.concat([train, test]).to_csv("train_and_")

In [None]:
# file_path = "/Users/shakedahronoviz/Genome-AC-GAN/fake_genotypes_sequences/new_sequences/polyloss_ce/genotypes.hapt"
# model_sequences = pd.read_csv(file_path, sep=' ', header=None)
# model_sequences.columns = [column if column == 0 else column + 1 for column in model_sequences.columns]
# model_sequences.insert(0, 1, [f"AG{sample_id}" for sample_id in range(model_sequences.shape[0])])
# model_sequences[0] = "model_name"
# x = pd.DataFrame({'label': model_sequences[0], 'ind': model_sequences[1]})
# pd.DataFrame(np.array(model_sequences.loc[:, 2:].astype(int))).to_csv("clean_polyloss_ce.csv", index=False)


In [None]:
# file_path = "/Users/shakedahronoviz/Genome-AC-GAN/resource/10K_SNP_1000G_real.hapt"
# model_sequences = pd.read_csv(file_path, sep=' ', header=None)
# if model_sequences.shape[1] == 808:  # special case for a specific file that had an extra empty column
#     model_sequences = model_sequences.drop(columns=model_sequences.columns[-1])
# if model_sequences.shape[0] > number_of_samples:
#     model_sequences = model_sequences.drop(
#         index=np.sort(
#             np.random.choice(np.arange(model_sequences.shape[0]),
#                              size=model_sequences.shape[0] - number_of_samples,
#                              replace=False)))
# model_sequences[0] = "model_name"
# x = pd.DataFrame({'label': model_sequences[0], 'ind': model_sequences[1]})
# pd.DataFrame(np.array(model_sequences.loc[:, 2:].astype(int))).to_csv("clean_10k_real.csv", index=False)

In [None]:
# file_path = "/Users/shakedahronoviz/Genome-AC-GAN/experiment_results/old_model_80%/5001_output.hapt"
# model_sequences = pd.read_csv(file_path, sep=' ', header=None)
# model_sequences = model_sequences.drop(columns=list(model_sequences.columns)[-1], axis=1)
# model_sequences.columns = [column + 2 for column in list(model_sequences.columns)]
# model_sequences.insert(loc=0, column=0, value="none")
# model_sequences.insert(loc=1, column=1, value='none')
# model_sequences[0] = "model_name"
# x = pd.DataFrame({'label': model_sequences[0], 'ind': model_sequences[1]})
# pd.DataFrame(np.array(model_sequences.loc[:, 2:].astype(int))).to_csv("clean_old_model_80%.csv", index=False)