In [1]:
%load_ext autoreload
%autoreload 2

import os
import abc
import sys
import time
sys.path.append(os.path.dirname(os.path.abspath(os.path.join('../'))))

import tqdm
import numpy
import torch
import wandb
import pandas
import joblib
import itertools
import torchvision
import gtda.images
import gtda.diagrams
import gtda.homology
import sklearn.pipeline
import sklearn.ensemble
import sklearn.metrics
import tqdm.contrib.itertools
import sklearn.decomposition
import matplotlib.pyplot as plt

import lib.topology

### Dataset

In [2]:
# Both MNIST splits are downloaded (if needed) into the same local directory.
# NOTE: the name `test` is rebound to the evaluation function defined in a later
# cell, so this cell and the next one must run before that definition.
mnist_root = 'mnist'
train = torchvision.datasets.MNIST(root = mnist_root, train = True, download = True)
test = torchvision.datasets.MNIST(root = mnist_root, train = False, download = True)

In [3]:
# Number of leading samples kept from each split.
n_train_samples = 3000
n_test_samples = 300

# Take only the leading samples instead of iterating each full dataset twice
# and slicing afterwards; islice also tolerates a split shorter than requested,
# like the original slice did.
train_subset = list(itertools.islice(train, n_train_samples))
test_subset = list(itertools.islice(test, n_test_samples))

# Each dataset item is an (image, label) pair.
train_images = numpy.array([ image for image, _ in train_subset ])
train_labels = numpy.array([ label for _, label in train_subset ])

test_images = numpy.array([ image for image, _ in test_subset ])
test_labels = numpy.array([ label for _, label in test_subset ])

In [4]:
def test(train_diagrams, test_diagrams, n_jobs: int = 1, verbose: bool = False, random_state: int = 42):
    """Score persistence-diagram features with random forests, with and without PCA.

    Features are computed from the diagrams with ``lib.topology.FeatureCalculator``.
    One random forest is trained on the full feature matrix and a second one on
    its 8-component PCA projection. Labels are read from the notebook-level
    ``train_labels`` and ``test_labels`` arrays, so those must be defined before
    calling this function.

    Args:
        train_diagrams: Persistence diagrams of the training samples.
        test_diagrams: Persistence diagrams of the test samples.
        n_jobs: Parallelism passed to the feature calculator and the forests.
        verbose: Forwarded to the feature calculator.
        random_state: Seed for the random forests and the PCA, so that repeated
            runs produce the same scores.

    Returns:
        Tuple ``(score, score_reduced)``: test accuracy on the full features and
        on the PCA-reduced features.
    """
    feature_calculator = lib.topology.FeatureCalculator(n_jobs = n_jobs, verbose = verbose)
    # Cap every feature at 1e9 (presumably to keep infinite values out of the
    # classifier; confirm against FeatureCalculator's output).
    train_features = numpy.minimum(feature_calculator.calc_features(train_diagrams).to_numpy(), 1e9)
    test_features = numpy.minimum(feature_calculator.calc_features(test_diagrams).to_numpy(), 1e9)

    def fit_and_score(fit_features, eval_features):
        # Train a seeded forest and return its accuracy on the held-out features.
        forest = sklearn.ensemble.RandomForestClassifier(n_jobs = n_jobs, random_state = random_state)
        forest.fit(fit_features, train_labels)
        return forest.score(eval_features, test_labels)

    score = fit_and_score(train_features, test_features)

    # The projection is fitted on the training features only and then applied to the test features.
    pca = sklearn.decomposition.PCA(n_components = 8, svd_solver = "full", random_state = random_state)
    train_features_reduced = pca.fit_transform(train_features)
    test_features_reduced = pca.transform(test_features)

    score_reduced = fit_and_score(train_features_reduced, test_features_reduced)

    return score, score_reduced

### No filtration

In [6]:
# Baseline: cubical persistence computed directly on the unbinarized images,
# without any filtration step in between.
cubical_persistence = gtda.homology.CubicalPersistence(n_jobs = -1)
train_diagrams, test_diagrams = (
    cubical_persistence.fit_transform(train_images),
    cubical_persistence.transform(test_images),
)

# Last expression of the cell, so the (score, score_reduced) pair is displayed.
test(train_diagrams = train_diagrams, test_diagrams = test_diagrams, n_jobs = -1, verbose = True)

Filtered diagrams: (3000, 50, 3)
Calculating Betti features


 betti: 100%|██████████| 3000/3000 [00:01<00:00, 1691.33it/s]


Calculating landscape features


 landscape: 100%|██████████| 3000/3000 [00:01<00:00, 2416.52it/s]


Calculating silhouette features


 silhouette-1: 100%|██████████| 3000/3000 [00:01<00:00, 2228.27it/s]
 silhouette-2: 100%|██████████| 3000/3000 [00:01<00:00, 2363.05it/s]


Calculating entropy features
Calculating number of points features
Calculating amplitude features


 amplitudes: 100%|██████████| 13/13 [00:00<00:00, 19.76it/s]


Calculating lifetime features


 lifetime: 100%|██████████| 3000/3000 [00:02<00:00, 1146.29it/s]


Filtered diagrams: (300, 28, 3)
Calculating Betti features


 betti: 100%|██████████| 300/300 [00:00<00:00, 1296.83it/s]


Calculating landscape features


 landscape: 100%|██████████| 300/300 [00:00<00:00, 1209.91it/s]


Calculating silhouette features


 silhouette-1: 100%|██████████| 300/300 [00:00<00:00, 1292.96it/s]
 silhouette-2: 100%|██████████| 300/300 [00:00<00:00, 1297.57it/s]


Calculating entropy features
Calculating number of points features
Calculating amplitude features


 amplitudes: 100%|██████████| 13/13 [00:00<00:00, 36.10it/s]


Calculating lifetime features


 lifetime: 100%|██████████| 300/300 [00:00<00:00, 1026.10it/s]


(0.36, 0.32)

### Filtration

In [7]:
# Parameter grids for the filtrations that take arguments.
height_filtration_directions = [
    [ -1, -1 ], [ 1, 1 ], [ 1, -1 ], [ -1, 1 ],
    [ 0, -1 ], [ 0, 1 ], [ -1, 0 ], [ 1, 0 ]
]

radial_filtration_centers = list(itertools.product([ 7, 14, 21 ], repeat = 2))
radial_filtration_metrics = [ "euclidean", "manhattan", "cosine" ]

density_filtration_metrics = [ "euclidean" , "manhattan", "cosine" ]
density_filtration_radiuses = [ 1, 5, 15 ]

# Every entry of `filtrations` is a [class, keyword-arguments] pair; the
# filtration object itself is only instantiated when the entry is evaluated.
height_filtrations = [
    [ gtda.images.HeightFiltration, { 'direction': numpy.array(direction) } ]
    for direction in height_filtration_directions
]

# Centers vary slowest, metrics fastest.
radial_filtrations = [
    [ gtda.images.RadialFiltration, { 'center': numpy.array(center), 'metric': metric } ]
    for center, metric in itertools.product(radial_filtration_centers, radial_filtration_metrics)
]

# Filtrations used with their default parameters.
default_filtrations = [
    [ filtration_class, {} ]
    for filtration_class in (
        gtda.images.DilationFiltration,
        gtda.images.ErosionFiltration,
        gtda.images.SignedDistanceFiltration,
    )
]

# Metrics vary slowest, radiuses fastest.
density_filtrations = [
    [ gtda.images.DensityFiltration, { 'radius': radius, 'metric': metric } ]
    for metric, radius in itertools.product(density_filtration_metrics, density_filtration_radiuses)
]

filtrations = height_filtrations + radial_filtrations + default_filtrations + density_filtrations

In [8]:
# Accumulators for the filtration sweep: `results` collects one row per finished
# attempt, `processed` remembers the keys of attempts that are already done.
# Re-running this cell discards both.
results, processed = [ ], set()

In [9]:
def process_one(threshold, filtration):
    """Evaluate one (binarization threshold, filtration) combination.

    The images are binarized at ``threshold``, passed through the filtration,
    turned into cubical persistence diagrams and scored with ``test``.

    Args:
        threshold: Threshold passed to ``gtda.images.Binarizer``.
        filtration: ``[filtration_class, kwargs]`` pair taken from ``filtrations``.

    Returns:
        ``(str(threshold), str(filtration_object), score, score_reduced)``, or
        ``None`` when this combination is already recorded in ``processed``.
    """
    filtration_class, filtration_kwargs = filtration
    filtration_step = filtration_class(**filtration_kwargs)

    # The key has to be built exactly like the one the driver loop below stores
    # in `processed` (threshold + repr of the instantiated filtration);
    # otherwise the skip check can never match.
    attempt_id = f"{threshold}/{filtration_step}"
    if attempt_id in processed:
        print(f"Skipped {attempt_id}")
        return None
    print(attempt_id)

    binarizer = gtda.images.Binarizer(threshold = threshold)
    train_images_bin = binarizer.fit_transform(train_images)
    test_images_bin = binarizer.transform(test_images)

    train_filtered = filtration_step.fit_transform(train_images_bin)
    test_filtered = filtration_step.transform(test_images_bin)

    # n_jobs = 1 here: the parallelism comes from the joblib driver below.
    cubical_persistence = gtda.homology.CubicalPersistence(n_jobs = 1)
    train_diagrams = cubical_persistence.fit_transform(train_filtered)
    test_diagrams = cubical_persistence.transform(test_filtered)

    score, score_reduced = test(train_diagrams, test_diagrams)
    return str(threshold), str(filtration_step), score, score_reduced

binarization_thresholds = [ 0.2, 0.4, 0.6, 0.8 ]
attempts = list(itertools.product(binarization_thresholds, filtrations))
items = joblib.Parallel(return_as = 'generator', n_jobs = -1)(
    joblib.delayed(process_one)(*attempt)
    for attempt in attempts
)
for item in tqdm.tqdm(items, total = len(attempts)):
    if item is None:
        # Attempt was skipped because it is already in `processed`.
        continue
    threshold, filtration, score, score_reduced = item
    results.append([ threshold, filtration, score, score_reduced ])
    processed.add(f"{threshold}/{filtration}")

100%|██████████| 188/188 [22:48<00:00,  7.28s/it] 


In [10]:
# Build the table once, persist it, then show it as the cell output.
results_frame = pandas.DataFrame(results)
results_frame.to_csv("filtrations.csv")
results_frame

Unnamed: 0,0,1,2,3
0,0.2,"HeightFiltration(direction=array([-1, -1]))",0.490000,0.483333
1,0.2,"HeightFiltration(direction=array([1, 1]))",0.620000,0.606667
2,0.2,"HeightFiltration(direction=array([ 1, -1]))",0.750000,0.703333
3,0.2,"HeightFiltration(direction=array([-1, 1]))",0.620000,0.553333
4,0.2,"HeightFiltration(direction=array([ 0, -1]))",0.476667,0.440000
...,...,...,...,...
183,0.8,"DensityFiltration(metric='manhattan', radius=5)",0.416667,0.390000
184,0.8,"DensityFiltration(metric='manhattan', radius=15)",0.503333,0.443333
185,0.8,"DensityFiltration(metric='cosine', radius=1)",0.413333,0.406667
186,0.8,"DensityFiltration(metric='cosine', radius=5)",0.413333,0.406667


### As point cloud with binarization

In [23]:
# Sweep over binarization thresholds: binarize, convert each image to a point
# cloud, compute Vietoris-Rips persistence and score the resulting diagrams.
binarization_levels = [ 1e-9, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 ]

for threshold in tqdm.tqdm(binarization_levels):
    binarizer = gtda.images.Binarizer(threshold = threshold)
    train_images_bin, test_images_bin = (
        binarizer.fit_transform(train_images),
        binarizer.transform(test_images),
    )

    to_point_cloud = gtda.images.ImageToPointCloud()
    train_point_cloud, test_point_cloud = (
        to_point_cloud.fit_transform(train_images_bin),
        to_point_cloud.transform(test_images_bin),
    )

    persistence = gtda.homology.VietorisRipsPersistence(homology_dimensions = [ 0, 1, 2 ], n_jobs = -1)
    train_diagrams, test_diagrams = (
        persistence.fit_transform(train_point_cloud),
        persistence.transform(test_point_cloud),
    )

    score, score_reduced = test(train_diagrams, test_diagrams, n_jobs = -1)
    print(f'{threshold}: {score}, {score_reduced}')

 10%|█         | 1/10 [01:10<10:33, 70.44s/it]

1e-09: 0.46, 0.35


 20%|██        | 2/10 [02:06<08:17, 62.15s/it]

0.1: 0.43333333333333335, 0.43333333333333335


 30%|███       | 3/10 [02:54<06:30, 55.72s/it]

0.2: 0.49333333333333335, 0.41


 40%|████      | 4/10 [03:38<05:05, 50.87s/it]

0.3: 0.4666666666666667, 0.4033333333333333


 50%|█████     | 5/10 [04:16<03:51, 46.36s/it]

0.4: 0.45666666666666667, 0.37333333333333335


 60%|██████    | 6/10 [04:51<02:49, 42.50s/it]

0.5: 0.4266666666666667, 0.32


 70%|███████   | 7/10 [05:23<01:56, 38.90s/it]

0.6: 0.47333333333333333, 0.4066666666666667


 80%|████████  | 8/10 [05:52<01:11, 35.71s/it]

0.7: 0.44, 0.29


 90%|█████████ | 9/10 [06:18<00:32, 32.75s/it]

0.8: 0.4666666666666667, 0.4


100%|██████████| 10/10 [06:42<00:00, 40.26s/it]

0.9: 0.48, 0.44333333333333336





### As point cloud

In [5]:
def make_point_cloud(image, threshold):
    """Convert a 2-D image into an array of ``[row, column, value]`` points.

    Pixels whose value is strictly below ``threshold`` are dropped; every other
    pixel contributes one point. Points are emitted in row-major order.

    Args:
        image: 2-D array of pixel values.
        threshold: Minimum value a pixel needs in order to be kept.

    Returns:
        Array of shape ``(n_points, 3)``. When no pixel passes the threshold the
        result is an empty ``(0, 3)`` array, so the column count is consistent.
    """
    image = numpy.asarray(image)
    # `~(image < threshold)` rather than `image >= threshold`: a pixel is
    # skipped only when it compares strictly below the threshold.
    keep = ~(image < threshold)
    # nonzero walks the mask in row-major order, matching a nested row/column loop.
    rows, cols = numpy.nonzero(keep)
    return numpy.column_stack([ rows, cols, image[rows, cols] ])

def make_point_clouds(images, threshold):
    """Build one point cloud per image of a batch.

    Each image is first flipped along axis 1 of the batch and then has axes 1
    and 2 swapped, before being handed to ``make_point_cloud`` together with
    ``threshold``.

    Returns:
        List with one point-cloud array per input image, in input order.
    """
    flipped = numpy.flip(images, axis = 1)
    reoriented = numpy.swapaxes(flipped, 1, 2)

    point_clouds = [ ]
    for picture in reoriented:
        point_clouds.append(make_point_cloud(picture, threshold))
    return point_clouds

In [6]:
# Rescale pixel values by 28 / 255 once, outside the loop: the result does not
# depend on the threshold. (Presumably this maps 0..255 intensities onto the
# same 0..28 range as the pixel coordinates; confirm.)
train_images_scaled = train_images / 255 * 28
test_images_scaled = test_images / 255 * 28

# Thresholds are expressed on the rescaled intensity axis.
intensity_thresholds = [ 1, 5, 9, 13, 17, 21, 25 ]

for threshold in tqdm.tqdm(intensity_thresholds):
    train_point_cloud = make_point_clouds(train_images_scaled, threshold)
    test_point_cloud = make_point_clouds(test_images_scaled, threshold)

    persistence = gtda.homology.VietorisRipsPersistence(homology_dimensions = [ 0, 1, 2 ], n_jobs = -1)
    train_diagrams = persistence.fit_transform(train_point_cloud)
    test_diagrams = persistence.transform(test_point_cloud)

    score, score_reduced = test(train_diagrams, test_diagrams, -1)
    print(f'{threshold}: {score}, {score_reduced}')

 14%|█▍        | 1/7 [01:09<06:59, 69.86s/it]

1: 0.48, 0.4166666666666667


 29%|██▊       | 2/7 [02:08<05:17, 63.45s/it]

5: 0.5, 0.43


 43%|████▎     | 3/7 [03:01<03:54, 58.67s/it]

9: 0.4866666666666667, 0.42


 57%|█████▋    | 4/7 [03:53<02:47, 55.91s/it]

13: 0.43666666666666665, 0.39


 71%|███████▏  | 5/7 [04:42<01:47, 53.59s/it]

17: 0.45, 0.36


 86%|████████▌ | 6/7 [05:28<00:50, 50.88s/it]

21: 0.41333333333333333, 0.4033333333333333


100%|██████████| 7/7 [06:12<00:00, 53.24s/it]

25: 0.43, 0.38666666666666666



