In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial import distance
import importlib
from scipy import stats as scs
import computeAATS
import plot_utils as plu

##  Compute pairwise distances, minimal distances, and AATS

In [None]:
# Announce the start of the DIST/AATS stage: a blank line, a banner, then a status message.
print(
    "",
    "#################### DIST AATS ####################",
    "- Computing pairwise distances, minimal distances, and AATS and saving to compressed files...",
    sep="\n",
)

In [None]:
# Re-import computeAATS so that edits made to the module since it was first imported
# are picked up (useful while debugging the module).
importlib.reload(computeAATS)

In [None]:
# Fallback used when `matching_SNPs` is falsy: only pairwise distances *within* each
# dataset are computed and plotted, then execution stops with SystemExit.
if not matching_SNPs:
    dSS_dic = dict()
    fig, (ax_all, ax_min) = plt.subplots(1, 2, figsize=(10, 4))

    # Left panel: distribution of all pairwise (cityblock) distances within each dataset.
    for cat, mat in datasets.items():
        dAB = distance.cdist(mat, mat, 'cityblock')
        # Mask self-distances so that each row minimum is the distance to the nearest *other* row.
        # (np.inf rather than the alias np.Inf, which was removed in NumPy 2.0.)
        np.fill_diagonal(dAB, np.inf)
        dSS_dic[cat] = dAB.min(axis=1)  # dSS
        # The count in the label is the number of rows whose nearest neighbour is at distance 0.
        sns.kdeplot(dAB[np.triu_indices(dAB.shape[0], k=1)],  # upper triangle: each pair once
                    linewidth=3, ax=ax_all,
                    label='{} ({} identical pairs)'.format(cat, (dSS_dic[cat] == 0).sum()))
    ax_all.set_title("Pairwise distance within each dataset")
    ax_all.legend()

    # Right panel: distribution of the nearest-neighbour distance within each dataset.
    for cat, dSS in dSS_dic.items():
        sns.kdeplot(dSS, linewidth=3, ax=ax_min, label=cat)
    ax_min.set_title("Minimal pairwise distance within each dataset")
    ax_min.legend()

    fig.savefig(outDir+"haplo_pairw_distrib_within.pdf")
    raise SystemExit("Not computing other distances because there is no correspondance bewteen SNPs of different datasets")

In [None]:
# Concatenate all datasets along their first axis, then reverse the axes
# (original note: "orientation of scikit allele").
haplo = np.transpose(np.concatenate([*datasets.values()]))

In [None]:
# Compute AATS with the test sets 'Test1' and 'Test2' as reference, for each one that
# is listed in `infiles`.
#
# `boolComputeAATS` is defined in the main notebook but could be overwritten by:
# boolComputeAATS=True # False
#
# Note: AA and MINDIST are rebound on every iteration, so after this cell they hold the
# tables of the last reference that was processed.
outFilePrefix = ''
if AATS:
    for ref in ['Test1', 'Test2']:
        if ref not in infiles.keys():
            continue
        if boolComputeAATS:
            print("Computing AATS with ref " + ref)
            AA, MINDIST = computeAATS.computeAAandDist(
                pd.DataFrame(haplo.T),
                sampleinfo.label,
                infiles.keys(),
                refCateg=ref,
                saveAllDist=True,
                outDir=outDir,
                outFilePrefix=outFilePrefix)

            # Save the AA and MINDIST DataFrames as compressed csv, without the index column.
            # Original note: the arrays of all pairwise distances are saved as npz
            # automatically when calling computeAAandDist with saveAllDist=True.
            AA.to_csv(outDir+f'AA_{ref}.csv.bz2', index=False)
            MINDIST.to_csv(outDir+f'MINDIST_{ref}.csv.bz2', index=False)
        else:
            print("Loading precomputed AATS and MINDIST")
            AA = pd.read_csv(outDir+f'AA_{ref}.csv.bz2')
            MINDIST = pd.read_csv(outDir+f'MINDIST_{ref}.csv.bz2')


In [None]:
# Compute AATS with the default reference category of computeAAandDist
# (original note: 'Real', supposed to be the label of the Training set).
#
# `boolComputeAATS` is defined in the main notebook but could be overwritten by:
# boolComputeAATS=True # False
#
# `outFilePrefix` is the value set in the preceding cell.
if AATS:
    print(f'boolComputeAATS: {boolComputeAATS}')
    if boolComputeAATS:
        print("Computing AATS")
        AA, MINDIST = computeAATS.computeAAandDist(
            pd.DataFrame(haplo.T),
            sampleinfo.label,
            infiles.keys(),
            saveAllDist=True,
            outDir=outDir,
            outFilePrefix=outFilePrefix)
        # Save the AA and MINDIST DataFrames as compressed csv, without the index column.
        # Original note: the arrays of all pairwise distances are saved as npz
        # automatically when calling computeAAandDist with saveAllDist=True.
        AA.to_csv(outDir+'AA.csv.bz2', index=False)
        MINDIST.to_csv(outDir+'MINDIST.csv.bz2', index=False)
    else:
        print("Loading precomputed AATS and MINDIST")
        AA = pd.read_csv(outDir+'AA.csv.bz2')
        MINDIST = pd.read_csv(outDir+'MINDIST.csv.bz2')
    print('AATS obtained')

In [None]:
# When AATS is enabled, (re)load the AA and MINDIST tables from the compressed csv
# files stored under outDir.
if AATS:
    AA = pd.read_csv(f'{outDir}AA.csv.bz2')
    MINDIST = pd.read_csv(f'{outDir}MINDIST.csv.bz2')

### Plot distribution of Pairwise Differences

In [None]:
#### Distribution of haplotypic pairwise distances, WITHIN each category (left panel)
#### and BETWEEN each category and 'Real' (right panel).
# For every non-'Real' category, the Wasserstein distance to the 'Real'-vs-'Real'
# distribution is recorded in W (one row per category and per comparison).
if DIST:
    def _load_dist(cat_a, cat_b):
        """Return the 'dist' array stored in '<outDir>/<outFilePrefix>dist_<cat_a>_<cat_b>.npz'."""
        with np.load('{}/{}dist_{}_{}.npz'.format(outDir, outFilePrefix, cat_a, cat_b)) as archive:
            return archive['dist']

    # Load the reference once, up front: both panels compare against the 'Real'-vs-'Real'
    # distances, and this no longer requires 'Real' to be the first category iterated.
    subsetreal = _load_dist('Real', 'Real')
    records = []  # collected rows; DataFrame.append was removed in pandas 2.0

    fig, (ax_within, ax_between) = plt.subplots(1, 2, figsize=(18, 9))

    #### Distribution WITHIN categories
    # NOTE(review): `categ` is expected to come from the main notebook here; it is only
    # rebound to infiles.keys() further down -- confirm both list the same categories.
    for cat in categ:
        if cat == 'Real':
            continue
        subset = _load_dist(cat, cat)
        sns.kdeplot(subset, linewidth=3, ax=ax_within)
        sc = scs.wasserstein_distance(subsetreal, subset)
        records.append({'stat': 'wasserstein', 'statistic': sc, 'pvalue': None,
                        'label': cat, 'comparaison': 'within'})
    ax_within.set_title("Haplotypic pairwise distance within each dataset")

    #### Distribution BETWEEN categories
    categ = infiles.keys()
    for cat in categ:
        if cat == 'Real':
            continue
        subset = _load_dist(cat, 'Real')
        sns.kdeplot(subset, linewidth=3, ax=ax_between, label='{}'.format(cat))
        sc = scs.wasserstein_distance(subsetreal, subset)
        records.append({'stat': 'wasserstein', 'statistic': sc, 'pvalue': None,
                        'label': cat, 'comparaison': 'between'})
    ax_between.set_title("Haplotypic pairwise distance between datasets and ground truth")
    # Show the category labels explicitly (skipped when no labelled curve was drawn).
    if ax_between.get_legend_handles_labels()[0]:
        ax_between.legend()

    fig.tight_layout()
    out_pdf = outDir+"haplo_pairw_distrib.pdf"
    fig.savefig(out_pdf)
    print('    - Save', out_pdf)

    W = pd.DataFrame(records, columns=['stat', 'statistic', 'pvalue', 'label', 'comparaison'])
    scores = W.copy()

    print(W)

### Plot distance to nearest neighbor (ie minimal distance)

In [None]:
# Density of every MINDIST column other than 'cat': one panel per column ('Variable'),
# one curve per category ('Cat'), in the order of infiles.
DISTmelt = MINDIST.melt(id_vars='cat').rename(columns=str.title)
g = sns.FacetGrid(DISTmelt, hue="Cat", height=7, col='Variable', hue_order=list(infiles.keys()))
# cut=0 : negative values have no meaning for distances; however, be aware that this might
# accidentally hide real peaks at zero (due to copying for example).
# Check whether the full distribution is similar or not (next cell).
# sns.kdeplot replaces the deprecated sns.distplot(hist=False, kde=True, kde_kws=...).
g.map(sns.kdeplot, "Value", linewidth=4, cut=0)
g.add_legend()
plt.savefig(outDir+"distrib_minimal_distances_cut.pdf")

In [None]:
# Same facets as the 'cut' figure, but without cut=0, so the density tails are drawn in full:
# one panel per MINDIST column other than 'cat', one curve per category.
DISTmelt = MINDIST.melt(id_vars='cat').rename(columns=str.title)
g = sns.FacetGrid(DISTmelt, hue="Cat", height=7, col='Variable', hue_order=list(infiles.keys()))
# sns.kdeplot replaces the deprecated sns.distplot(hist=False, kde=True, kde_kws=...).
g.map(sns.kdeplot, "Value", linewidth=4)
g.add_legend()
plt.savefig(outDir+"distrib_minimal_distances_full.pdf")

In [None]:
# Wasserstein distance between the 'Real' rows and each category's rows of MINDIST,
# for the 'dTS' and 'dST' columns; the result is added to `scores` and written to csv.
mindist_methods = ['dTS', 'dST']
# The 'Real' reference does not depend on the category, so select it once per column.
real_by_method = {method: MINDIST.loc[MINDIST.cat == 'Real', method] for method in mindist_methods}

records = []  # collected rows; DataFrame.append was removed in pandas 2.0
for cat in infiles.keys():
    for method in mindist_methods:
        sc = scs.wasserstein_distance(real_by_method[method], MINDIST.loc[MINDIST.cat == cat, method])
        records.append({'stat': 'wasserstein', 'statistic': sc, 'pvalue': None,
                        'label': cat, 'comparaison': method})
W = pd.DataFrame(records, columns=['stat', 'statistic', 'pvalue', 'label', 'comparaison'])

if DIST:
    # `scores` was created by the DIST cell. Drop any dTS/dST rows left by a previous
    # run of this cell so that re-running it does not duplicate them.
    scores = pd.concat([scores[~scores['comparaison'].isin(mindist_methods)], W])
else:
    # No pairwise-distribution scores were computed: only the minimal-distance scores exist.
    scores = W
scores.to_csv(outDir+"scores_pairwise_distances.csv", index=False)

## AATS

In [None]:
# Print the AA table as plain text.
print(AA)

In [None]:
#### Barplot of AATS scores (and subscores) for each dataset
# AATS = (AAtruth+AAsyn)/2
plt.figure(figsize=(1.5*len(categ), 6))

# Long format: one row per (cat, score column); 'PrivacyLoss' and 'ref' are left out when present.
aa_long = (AA.drop(columns=['PrivacyLoss', 'ref'], errors='ignore')
             .melt(id_vars='cat')
             .rename(columns=str.title))
sns.barplot(x='Cat', y='Value', hue='Variable', palette=sns.color_palette('colorblind'),
            data=aa_long)
plt.axhline(0.5, color='black')

# Dashed line at the AATS value of the 'Real_test' row, when there is one.
# (np.float was removed in NumPy 1.24; take the scalar explicitly and use the builtin float.)
real_test_aats = AA.loc[AA.cat == 'Real_test', 'AATS']
if not real_test_aats.empty:
    plt.axhline(float(real_test_aats.iloc[0]), color=sns.color_palette()[0], ls='--')
plt.ylim(0, 1.1)
plt.title("Nearest Neighbor Adversarial Accuracy on training (AATS) and its components")
plt.savefig(outDir + "AATS_scores.pdf")

In [None]:
# Bare expression: the AA table is the value of this cell.
AA

#### Privacy Loss
Privacy Loss = Test AA - Train AA

It can be computed only if a Test set is present in the dataset list.  
Below, the Test set is 'Test2', but this can be changed.


In [None]:
# Privacy loss with the Training set as Train and 'Test2' as Test.
# An empty Train suffix means the Training set is Real.
Train, Test = '', '_Test2'
dfPL = plu.plotPrivacyLoss(Train, Test, outDir, colpal, allcolpal)
dfPL

In [None]:
# Privacy loss (PL) for the real dataset Test1, with Test2 as Test.
# This is useful when an RBM with the alternative training scheme (cf. paper) is among
# the models, because Test1 served to initialise the RBM sampling in that case.
Train, Test = '_Test1', '_Test2'
dfPL = plu.plotPrivacyLoss(Train, Test, outDir, colpal, allcolpal)
dfPL

In [None]:
# Closing banner: reports that the DIST/AATS stage is finished and names the output directory.
print('************************************************************************\n*** Computation and plotting DIST/AATS DONE. Figures saved in {} ***\n************************************************************************'.format(outDir))