# ImageNet-10 preparation

In [None]:
import data_split
import data_converter
import vgg19bn

In [None]:
# Build the dataset split and save it. 'data' is the argument handed to
# save_split (presumably the output directory, since later cells read
# 'data/in10_split_converted.npz' — confirm in data_split).
ds = data_split.DataSplit()
ds.save_split('data')

In [None]:
%%time
# Wrap a VGG19bn instance in a DataConverter and convert the saved split.
# NOTE(review): presumably this is what produces
# 'data/in10_split_converted.npz' used by the later cells — confirm in
# data_converter.
dc = data_converter.DataConverter(vgg19bn.VGG19bn())
dc.convert_split()

# Mapper

In [65]:
import numpy as np
import mapper
import my_umap
from sklearn.cluster import DBSCAN
from gtda.mapper import FirstSimpleGap

In [66]:
# Load the converted training split. np.load on an .npz returns a lazy NpzFile
# that keeps the archive open, so the arrays are read inside a `with` block to
# guarantee the file handle is released once they are in memory.
with np.load('data/in10_split_converted.npz', allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    y_train = loaded['y_train']

# Cell output: shapes of the training features and labels.
x_train.shape, y_train.shape

((10000, 5096), (10000,))

In [67]:
# Identifier for this run. It is passed to Mapper.fit and reused below to build
# the 'experiments/...' file names. The tokens appear to mirror the settings in
# the fit cell (comp120 ~ n_components=120, int10 ~ n_intervals=10,
# rs69 ~ random_state=69, dbscan60 ~ eps=60) and must be kept in sync by hand.
experiment_name = 'comp120_int10_umap12rs69_dbscan60_uniform'

In [68]:
%%time

# Projector handed to Mapper.fit: the project-local UMAP wrapper with a fixed
# random_state.
projector = my_umap.MyUMAP(n_components=120, random_state=69)

# Clusterer handed to Mapper.fit.
# Alternative kept for reference: FirstSimpleGap(relative_gap_size=0.3)
clusterer = DBSCAN(eps=60, min_samples=1)

# Fit Mapper on the training features; the run is labelled with
# experiment_name.
m = mapper.Mapper()
m.fit(
    x_train,
    projector=projector,
    clusterer=clusterer,
    n_components=120,
    n_intervals=10,
    experiment_name=experiment_name,
)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.5s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.5s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.5s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.5s
[Pipel

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.6min


[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.5s
[Pipeline] ............. (step 3 of 3) Processing cover, total=   0.1s
[Pipeline] .... (step 1 of 3) Processing pullback_cover, total=   0.8s
[Pipeline] ........ (step 2 of 3) Processing clustering, total=  24.1s
[Pipeline] ............. (step 3 of 3) Processing nerve, total=   0.1s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.6s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.6s
[Pipeline] ............. (step 3 of 3) Processing cover, total=   0.1s
[Pipeline] .... (step 1 of 3) Processing pullback_cover, total=   0.8s
[Pipel

[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed: 12.4min finished


CPU times: user 40min 48s, sys: 10min 47s, total: 51min 36s
Wall time: 14min 27s


# Representations

In [69]:
import mapper
import numpy as np
import pickle

In [70]:
# Load the converted train/test splits. The NpzFile returned by np.load keeps
# the archive open, so the arrays are pulled out inside a `with` block and the
# file is closed as soon as they are in memory.
with np.load('data/in10_split_converted.npz', allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    x_test_none = loaded['x_test_none']
    x_test_gaussian = loaded['x_test_gaussian']
    y_train = loaded['y_train']
    y_test = loaded['y_test']

# Read the pickled Mapper results stored under this experiment's name. The file
# is opened with a context manager so the handle is always closed.
# NOTE: unpickling can execute arbitrary code — only load files produced by
# this project.
with open('experiments/{}'.format(experiment_name), 'rb') as mapper_file:
    mapper_data = pickle.load(mapper_file)
latent_space, graphs, covers = mapper_data[0], mapper_data[1], mapper_data[2]

# k is passed to Mapper.get_representations and is embedded in the name of the
# representations file.
k = 5

# Cell output: shapes of every loaded array.
x_train.shape, x_test_none.shape, x_test_gaussian.shape, y_train.shape, y_test.shape

((10000, 5096), (3000, 5096), (3000, 5096), (10000,), (3000,))

In [71]:
%%time

# Compute representations for the training set and both test sets from the
# loaded Mapper results. The last argument is the output name; the following
# cell loads 'experiments/<experiment_name>_k<k>.npz' (presumably written by
# this call — confirm in mapper).
m = mapper.Mapper()
m.get_representations(
    x_train,
    x_test_none,
    x_test_gaussian,
    y_train,
    y_test,
    k,
    latent_space,
    graphs,
    covers,
    '{}_k{}'.format(experiment_name, k),
)

[binarization]: 100%|██████████| 10000/10000 [01:21<00:00, 122.85it/s]
[wknn]: 100%|██████████| 3000/3000 [00:16<00:00, 178.02it/s]
[wknn]: 100%|██████████| 3000/3000 [00:19<00:00, 155.59it/s]


CPU times: user 11min 6s, sys: 1min 40s, total: 12min 46s
Wall time: 10min 47s


In [72]:
# Load the representations stored for this experiment and k. The NpzFile
# returned by np.load keeps the archive open, so the arrays are read inside a
# `with` block to make sure the file handle is released.
with np.load('experiments/{}_k{}.npz'.format(experiment_name, k), allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    x_test_none = loaded['x_test_none']
    x_test_gaussian = loaded['x_test_gaussian']
    y_train = loaded['y_train']
    y_test = loaded['y_test']

# Cell output: shapes of the loaded representations and labels.
x_train.shape, x_test_none.shape, x_test_gaussian.shape, y_train.shape, y_test.shape

((10000, 1200), (3000, 1200), (3000, 1200), (10000,), (3000,))