# MNIST (digits 0–9) — Pairwise Distance Quality

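Bifiltration (cubical): $\phi_1$ = inverted intensity, $\phi_2$ = normalized radial distance from the image center on foreground pixels (set to 1 on the background).
Compute pairwise distances with CMD (11 values of $t$), MD 121 (11 × 11 grid of $(a, b)$), and MD 10 (2 × 5 grid of $(a, b)$) for all pairs of images, separately for $H_0$ and $H_1$.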

In [6]:
import numpy as np
import gudhi as gd
from sklearn.datasets import fetch_openml
from scipy.stats import spearmanr
from tqdm import tqdm
import time, warnings
# Silence every warning for the rest of the session. Note that this also hides
# NumPy runtime warnings (e.g. divide-by-zero), so numerical problems will not
# be reported.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global random state for reproducibility.
np.random.seed(42)

In [7]:
# --- Experiment configuration ---------------------------------------------
n_ex = 100                        # number of images sampled per digit class
digit_classes = list(range(10))   # digits 0..9

# Parameter grids.
# t_values: weights t used in the combinations (1 - t) * phi1 + t * phi2.
t_values = np.linspace(0, 1, 11)
# (a, b) grids passed to phi_star_ab: a fine grid and a coarse grid.
a_fine, b_fine = np.linspace(0.05, 0.95, 11), np.linspace(0, 1, 11)
a_coarse, b_coarse = np.array([0.25, 0.75]), np.linspace(0, 1, 5)
# Third argument forwarded to the bottleneck-distance computation.
BOTTLENECK_E = 0.01

# Number of one-parameter filtrations generated per image by each scheme.
n_cmd = t_values.size
n_md_fine = a_fine.size * b_fine.size
n_md_coarse = a_coarse.size * b_coarse.size
print(f'CMD: {n_cmd} | MD fine: {n_md_fine} | MD coarse: {n_md_coarse}')

CMD: 11 | MD fine: 121 | MD coarse: 10


In [8]:
# Download MNIST and draw a fixed-size, reproducible sample from every class.
print('Fetching MNIST...')
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
X_all = mnist.data
y_all = mnist.target.astype(int)

# Dedicated RandomState so the sample does not depend on the global RNG.
rng = np.random.RandomState(42)
Data_img = []
sampled_labels = []
for cls_idx, digit in enumerate(digit_classes):
    # Positions of all images showing this digit.
    candidates = np.flatnonzero(y_all == digit)
    chosen = rng.choice(candidates, size=n_ex, replace=False)
    # Each flat row becomes a 28 x 28 float image.
    Data_img.extend(X_all[row].reshape(28, 28).astype(float) for row in chosen)
    # Labels store the index into digit_classes, not the digit itself.
    sampled_labels.extend([cls_idx] * len(chosen))
Labels = np.array(sampled_labels)
N = len(Data_img)
print(f'N = {N}')

Fetching MNIST...
N = 1000


In [9]:
def build_filtrations(img, threshold=50, max_intensity=255.0):
    """Build the two filtering functions of the bifiltration for one image.

    Parameters
    ----------
    img : array-like, shape (h, w)
        Grey-scale image.
    threshold : float, optional
        Pixels with intensity strictly greater than this value are treated as
        foreground. Defaults to 50 (the value previously hard-coded).
    max_intensity : float, optional
        Intensity used to rescale the image. Defaults to 255.0 (the value
        previously hard-coded).

    Returns
    -------
    phi1 : ndarray, shape (h, w)
        Inverted intensity, ``1 - img / max_intensity``.
    phi2 : ndarray, shape (h, w)
        On foreground pixels, the distance to the point ``(h / 2, w / 2)``
        divided by the largest such distance over the grid; 1 everywhere else.
    """
    img = np.asarray(img, dtype=float)
    h, w = img.shape
    yy, xx = np.meshgrid(np.arange(h), np.arange(w), indexing='ij')
    # Reference point for the radial function (kept at (h/2, w/2) so results
    # match the original notebook).
    cy, cx = h / 2.0, w / 2.0

    phi1 = 1.0 - img / max_intensity

    foreground = img > threshold
    radial = np.sqrt((yy - cy) ** 2 + (xx - cx) ** 2)
    radial = radial / radial.max()
    # Background pixels get the maximal value 1.
    phi2 = np.where(foreground, radial, 1.0)
    return phi1, phi2

def cubical_pd(filt_2d):
    """Compute finite H0 and H1 persistence intervals of a 2-D cubical filtration.

    Parameters
    ----------
    filt_2d : array-like, shape (h, w)
        Filtration value of every top-dimensional cell (pixel).

    Returns
    -------
    tuple of ndarray
        ``(pd_H0, pd_H1)``; each is an ``(n, 2)`` array of (birth, death)
        pairs. Intervals with a non-finite death are dropped, and an empty
        diagram is returned as an array of shape ``(0, 2)``.
    """
    filt_2d = np.asarray(filt_2d, dtype=float)
    h, w = filt_2d.shape
    # GUDHI reads the flat cell list with the FIRST listed dimension varying
    # fastest, so the array must be flattened in Fortran order to line up with
    # dimensions=[h, w]. A C-order flatten only works by accident for square
    # images (it yields the transposed image, whose persistence is the same).
    cc = gd.CubicalComplex(dimensions=[h, w],
                           top_dimensional_cells=filt_2d.flatten(order='F'))
    cc.persistence()

    diagrams = []
    for dim in range(2):
        intervals = cc.persistence_intervals_in_dimension(dim)
        if intervals is None or len(intervals) == 0:
            intervals = np.empty((0, 2))
        else:
            # Keep only intervals that die at a finite filtration value.
            intervals = intervals[np.isfinite(intervals[:, 1])].reshape(-1, 2)
        diagrams.append(intervals)
    return tuple(diagrams)

def phi_star_ab(phi1, phi2, a, b):
    """Collapse the pair (phi1, phi2) into one filtering function for (a, b).

    Computes ``min(a, 1 - a) * max((phi1 - b) / a, (phi2 + b) / (1 - a))``
    element-wise.

    Parameters
    ----------
    phi1, phi2 : ndarray
        Filtering functions of matching shape.
    a : float
        Parameter strictly between 0 and 1 (it appears in both denominators).
    b : float
        Offset subtracted from ``phi1`` and added to ``phi2``.

    Returns
    -------
    ndarray
        The combined filtering function, same shape as the inputs.

    Raises
    ------
    ValueError
        If ``a`` is not strictly between 0 and 1. Without this check the
        divisions would silently produce inf/NaN values.
    """
    a_arr = np.asarray(a)
    if not np.all((a_arr > 0) & (a_arr < 1)):
        raise ValueError(f'a must lie strictly between 0 and 1, got {a!r}')
    scale = np.minimum(a, 1 - a)
    return scale * np.maximum((phi1 - b) / a, (phi2 + b) / (1 - a))

def safe_bottleneck(pd1, pd2, e=0.01):
    """Bottleneck distance between two diagrams, with explicit empty-diagram cases.

    Parameters
    ----------
    pd1, pd2 : ndarray, shape (n, 2)
        Persistence diagrams as (birth, death) rows; either may be empty.
    e : float, optional
        Third argument passed through to ``gd.bottleneck_distance``.

    Returns
    -------
    float
        0.0 when both diagrams are empty; half of the largest persistence of
        the non-empty diagram when exactly one is empty; otherwise the value
        returned by ``gd.bottleneck_distance``.
    """
    has1, has2 = len(pd1) != 0, len(pd2) != 0
    if has1 and has2:
        return gd.bottleneck_distance(pd1, pd2, e)
    if not (has1 or has2):
        return 0.0
    # Exactly one diagram holds points.
    lone = pd1 if has1 else pd2
    half_persistence = (lone[:, 1] - lone[:, 0]) / 2
    return float(np.max(half_persistence))

In [10]:
# Build the (phi1, phi2) pair once per image and split it into two lists.
filtration_pairs = [build_filtrations(img) for img in Data_img]
Phi1_all = [pair[0] for pair in filtration_pairs]
Phi2_all = [pair[1] for pair in filtration_pairs]


def make_cmd_filts(p1, p2):
    """Return the combinations (1 - t) * p1 + t * p2, one per t in t_values."""
    return [(1 - t) * p1 + t * p2 for t in t_values]


def make_mdf_filts(p1, p2):
    """Return phi_star_ab(p1, p2, a, b) over the fine (a, b) grid, a-major."""
    return [phi_star_ab(p1, p2, a, b) for a in a_fine for b in b_fine]


def make_mdc_filts(p1, p2):
    """Return phi_star_ab(p1, p2, a, b) over the coarse (a, b) grid, a-major."""
    return [phi_star_ab(p1, p2, a, b) for a in a_coarse for b in b_coarse]


# (scheme name, filtration factory, number of filtrations per image)
params = [
    ('cmd', make_cmd_filts, n_cmd),
    ('mdf', make_mdf_filts, n_md_fine),
    ('mdc', make_mdc_filts, n_md_coarse),
]

# PDs[name][i][p] is the (H0, H1) diagram pair of image i under the p-th
# filtration of scheme `name`.
PDs = {}
for name, _, _ in params:
    PDs[name] = [None] * N

for i in tqdm(range(N), desc='PDs'):
    phi1 = Phi1_all[i]
    phi2 = Phi2_all[i]
    for name, make_filts, _ in params:
        PDs[name][i] = list(map(cubical_pd, make_filts(phi1, phi2)))

PDs: 100%|██████████| 1000/1000 [01:44<00:00,  9.61it/s]


In [11]:
# Distance matrices: separate H0 and H1
# For every pair of images, the distance is the largest bottleneck distance
# over all filtrations of the scheme, computed independently per homology
# dimension.
Ds = {}
for name, _, nparam in params:
    print(f'Distances: {name}')
    diagrams = PDs[name]
    D_H0 = np.zeros((N, N))
    D_H1 = np.zeros((N, N))
    for i in tqdm(range(N), leave=False):
        row_i = diagrams[i]
        for j in range(i + 1, N):
            row_j = diagrams[j]
            max_d0, max_d1 = 0.0, 0.0
            for p in range(nparam):
                d0 = safe_bottleneck(row_i[p][0], row_j[p][0], BOTTLENECK_E)
                d1 = safe_bottleneck(row_i[p][1], row_j[p][1], BOTTLENECK_E)
                # Running maxima; the current value is kept unless the new
                # one is strictly larger.
                max_d0 = max(max_d0, d0)
                max_d1 = max(max_d1, d1)
            # Fill both triangles so the matrices are symmetric.
            D_H0[i, j] = max_d0
            D_H0[j, i] = max_d0
            D_H1[i, j] = max_d1
            D_H1[j, i] = max_d1
    Ds[name] = {'H0': D_H0, 'H1': D_H1}

Distances: cmd


                                                   

Distances: mdf


                                                  

Distances: mdc


                                                  

In [12]:
# Gather every distance matrix under the key 'd_<scheme>_<Hk>' plus the labels.
stability_payload = {
    f'd_{scheme}_{hdim}': Ds[scheme][hdim]
    for scheme in ('cmd', 'mdf', 'mdc')
    for hdim in ('H0', 'H1')
}
stability_payload['labels'] = Labels

# NOTE(review): np.save on a dict stores a pickled object array, so reading it
# back presumably needs np.load(..., allow_pickle=True).item() — confirm with
# whatever consumes this file before changing the format.
np.save('MNIST_stability.npy', stability_payload)

In [13]:
# Quick summary
# Rank-correlate the three distance matrices with each other, using only the
# strict upper triangle so each unordered pair of images is counted once.
triu = np.triu_indices(N, k=1)
for hdim in ['H0', 'H1']:
    d_cmd, d_mdf, d_mdc = (Ds[scheme][hdim][triu] for scheme in ('cmd', 'mdf', 'mdc'))

    sp_cmd_mdf, _ = spearmanr(d_mdf, d_cmd)
    sp_mdc_mdf, _ = spearmanr(d_mdf, d_mdc)
    sp_cmd_mdc, _ = spearmanr(d_cmd, d_mdc)

    print(f'\n{hdim}:')
    print(f'  Spearman CMD vs MD121: {sp_cmd_mdf:.4f}')
    print(f'  Spearman MD10 vs MD121: {sp_mdc_mdf:.4f}')
    print(f'  Spearman CMD vs MD10: {sp_cmd_mdc:.4f}')


H0:
  Spearman CMD vs MD121: 0.9800
  Spearman MD10 vs MD121: 0.9258
  Spearman CMD vs MD10: 0.9085

H1:
  Spearman CMD vs MD121: 0.9929
  Spearman MD10 vs MD121: 0.9552
  Spearman CMD vs MD10: 0.9421
