In [1]:
import numpy as np
import anndata as ad
from scipy.sparse import issparse
from scipy.spatial.distance import cdist

In [3]:
from scipy.sparse import csr_matrix

def hamming_dist_sparse(A, B):
    """Pairwise normalised Hamming distances between the rows of two sparse inputs.

    For 0/1 rows ``a`` and ``b`` the number of differing positions equals
    ``sum(a) + sum(b) - 2 * a.b``; dividing by the number of features gives
    the fraction of positions that differ.

    Parameters
    ----------
    A : scipy sparse matrix or sparse array, shape (m, n)
    B : scipy sparse matrix or sparse array, shape (p, n)

    Returns
    -------
    numpy.ndarray, shape (m, p)
        ``out[i, j]`` is the fraction of features on which ``A[i]`` and
        ``B[j]`` differ.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same number of columns.

    Notes
    -----
    The identity above only holds for entries in {0, 1}. Inputs are NOT
    binarised here; with other values the result is not a Hamming distance
    and can even be negative.
    """
    n_features = A.shape[1]
    if B.shape[1] != n_features:
        raise ValueError(
            f"A and B must have the same number of columns, "
            f"got {n_features} and {B.shape[1]}"
        )

    # Do the arithmetic in int64 for boolean / integer inputs so that match
    # counts cannot saturate (bool) or overflow (narrow integer dtypes).
    if A.dtype.kind in "bui":
        A = A.astype(np.int64, copy=False)
    if B.dtype.kind in "bui":
        B = B.astype(np.int64, copy=False)

    # matches[i, j] = number of positions where both rows hold a 1.
    matches = A.dot(B.T).toarray()

    # Row sums come back as np.matrix for sparse matrices and as ndarray for
    # sparse arrays; np.asarray(...).ravel() handles both (``.A1`` does not).
    A_ones = np.asarray(A.sum(axis=1)).ravel()[:, None]
    B_ones = np.asarray(B.sum(axis=1)).ravel()[None, :]

    return (A_ones + B_ones - 2 * matches) / n_features

In [4]:
def compute_fMC_scores_sparse(ST, SC, SG, r, eta=1e-12, batch_size=500):
    """Score every row of ``ST`` stacked on ``SC`` by its log-distance to nearby rows of ``SG``.

    Distances come from ``hamming_dist_sparse``. For each stacked row, the
    score is the mean of ``log(d + eta)`` over the rows of ``SG`` whose
    distance ``d`` is at most ``r``; a row with no such neighbour scores 0.0.

    Parameters
    ----------
    ST, SC : sparse inputs with the same number of columns
        Stacked vertically, ``ST`` first, so the first ``ST.shape[0]`` scores
        belong to ``ST``.
    SG : sparse input compared against every stacked row.
    r : float
        Neighbourhood radius (inclusive).
    eta : float
        Added inside the logarithm so a distance of exactly 0 stays finite.
    batch_size : int
        Number of stacked rows whose distances are materialised at once.

    Returns
    -------
    numpy.ndarray, shape (ST.shape[0] + SC.shape[0],)

    Notes
    -----
    NOTE(review): whenever ``r + eta < 1`` every neighbour-based score is
    negative, so the 0.0 given to rows without neighbours ranks above all of
    them - confirm that ordering is what callers expect.
    NOTE(review): a negative distance (possible if the inputs are not 0/1)
    makes ``np.log`` return NaN for that row - confirm inputs are binary.
    """
    from scipy.sparse import vstack

    stacked = vstack([ST, SC])
    total = stacked.shape[0]
    # Rows that never find a neighbour simply keep this initial 0.0.
    scores = np.zeros(total)

    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        # Dense (hi - lo, n_synthetic) block of distances for this batch.
        block_dists = hamming_dist_sparse(stacked[lo:hi], SG)
        neighbour_mask = block_dists <= r

        for offset, (row, mask) in enumerate(zip(block_dists, neighbour_mask)):
            if not mask.any():
                continue
            scores[lo + offset] = np.mean(np.log(row[mask] + eta))

    return scores


In [9]:
def privacy_score_sparse(ST, SC, SG, batch_size=500, verbose=True):
    """Fraction of the top-scored records that are rows of ``ST``.

    Steps:
      1. Stack ``ST`` on top of ``SC`` and, in batches, find each stacked
         row's minimum ``hamming_dist_sparse`` distance to ``SG``.
      2. Use the median of those minima as the neighbourhood radius ``r``.
      3. Score every stacked row with ``compute_fMC_scores_sparse``.
      4. Take the ``m = ST.shape[0]`` rows with the largest scores and return
         the share of them whose index is below ``m`` (i.e. rows of ``ST``).

    Parameters
    ----------
    ST, SC, SG : sparse inputs with the same number of columns.
    batch_size : int
        Number of stacked rows whose distances are materialised at once.
    verbose : bool, default True
        When True, print batch progress and the chosen radius (the original
        behaviour). Pass False to keep notebook output short.

    Returns
    -------
    float
        Value in [0, 1].

    Raises
    ------
    ValueError
        If ``ST`` or ``SG`` has no rows.

    Notes
    -----
    NOTE(review): because ``r`` is the median of the per-row minima, roughly
    half of the rows have no neighbour within ``r`` and therefore share the
    fallback score 0.0; which of those tied rows land in the top ``m`` is
    decided by ``np.argsort``'s tie order - confirm this is intended.
    NOTE(review): ``np.argsort`` places NaN last, so any NaN score is counted
    among the top ``m``.
    """
    from scipy.sparse import vstack

    m = ST.shape[0]
    # Fail fast: with m == 0 the slice [-0:] would select every record and
    # the final division would raise only after the whole distance pass.
    if m == 0:
        raise ValueError("ST must contain at least one record")
    if SG.shape[0] == 0:
        raise ValueError("SG must contain at least one record")

    combined = vstack([ST, SC])
    n_records = combined.shape[0]

    min_dists = np.zeros(n_records)
    for start in range(0, n_records, batch_size):
        if verbose:
            print(f"{start}/{n_records}")
        end = min(start + batch_size, n_records)
        dists = hamming_dist_sparse(combined[start:end], SG)
        min_dists[start:end] = np.min(dists, axis=1)

    r = np.median(min_dists)
    if verbose:
        print(f"Neighborhood radius r = {r:.4f}")

    scores = compute_fMC_scores_sparse(ST, SC, SG, r, batch_size=batch_size)

    # Indices of the m largest scores; rows 0..m-1 of the stack come from ST.
    top_indices = np.argsort(scores)[-m:]
    return np.count_nonzero(top_indices < m) / m


In [10]:
# Read the training, holdout and synthetic AnnData files (in that order)
# from the current working directory.
train_adata, holdout_adata, synthetic_adata = (
    ad.read_h5ad(path)
    for path in (
        "100_ind_1000_gene_ct0.h5ad",
        "group1_holdout.h5ad",
        "1000_genes_100_ind_subset1_out.h5ad",
    )
)

# Data matrices handed to the scoring functions.
# NOTE(review): the scoring code treats these as sparse 0/1 matrices - confirm
# that .X is sparse and binary for all three files.
ST, SC, SG = train_adata.X, holdout_adata.X, synthetic_adata.X

In [11]:
# Compute the privacy score for the loaded training (ST), holdout (SC) and
# synthetic (SG) matrices with the default batch size, then print it to four
# decimal places.
privacy = privacy_score_sparse(ST, SC, SG)
print(f"Privacy Score: {privacy:.4f}")

0/93010
500/93010
1000/93010
1500/93010
2000/93010
2500/93010
3000/93010
3500/93010
4000/93010
4500/93010
5000/93010
5500/93010
6000/93010
6500/93010
7000/93010
7500/93010
8000/93010
8500/93010
9000/93010
9500/93010
10000/93010
10500/93010
11000/93010
11500/93010
12000/93010
12500/93010
13000/93010
13500/93010
14000/93010
14500/93010
15000/93010
15500/93010
16000/93010
16500/93010
17000/93010
17500/93010
18000/93010
18500/93010
19000/93010
19500/93010
20000/93010
20500/93010
21000/93010
21500/93010
22000/93010
22500/93010
23000/93010
23500/93010
24000/93010
24500/93010
25000/93010
25500/93010
26000/93010
26500/93010
27000/93010
27500/93010
28000/93010
28500/93010
29000/93010
29500/93010
30000/93010
30500/93010
31000/93010
31500/93010
32000/93010
32500/93010
33000/93010
33500/93010
34000/93010
34500/93010
35000/93010
35500/93010
36000/93010
36500/93010
37000/93010
37500/93010
38000/93010
38500/93010
39000/93010
39500/93010
40000/93010
40500/93010
41000/93010
41500/93010
42000/93010
4250

  scores[start + i] = np.mean(np.log(dists[i, within_r[i]] + eta))


Privacy Score: 0.5780
