# ImageNet-10 preparation

In [None]:
import data_split
import data_converter
import vgg19bn

In [None]:
# Instantiate the project's DataSplit helper and call save_split with 'data'.
# NOTE(review): 'data' is presumably the output directory for the split —
# confirm against data_split.DataSplit.save_split.
ds = data_split.DataSplit()
ds.save_split('data')

In [None]:
%%time
# %%time is a cell magic and must stay on the first line of the cell.
# Wrap a VGG19bn instance in the project's DataConverter and run convert_split.
# NOTE(review): presumably this produces data/in10_split_converted.npz, which
# the later cells load — confirm against data_converter.DataConverter.
dc = data_converter.DataConverter(vgg19bn.VGG19bn())
dc.convert_split()

# Mapper

In [10]:
import numpy as np
import mapper
import my_umap
from sklearn.cluster import DBSCAN
from gtda.mapper import FirstSimpleGap

In [11]:
# Load the converted training arrays.
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open until it is closed, so the arrays are read inside a `with` block and the
# handle is released as soon as they are in memory.
# NOTE(review): allow_pickle=True allows pickled object arrays, which can run
# arbitrary code on load — only use it on archives produced by this project.
with np.load('data/in10_split_converted.npz', allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    y_train = loaded['y_train']

# Last expression of the cell: show both shapes as the cell output.
x_train.shape, y_train.shape

((10000, 5096), (10000,))

In [19]:
# Identifier for this Mapper run. It is passed to Mapper.fit, used as the file
# name of the pickle read from experiments/, and used as the prefix of the
# experiments/<name>_k<k>.npz representations archive.
# NOTE(review): the name ends in 'uniform' while the fit cell passes
# kind='balanced' — confirm the name matches the configuration actually run.
experiment_name = 'comp12_int10_umap12rs69_dbscan50_uniform'

In [13]:
%%time

clusterer = DBSCAN(eps=50, min_samples=1)
projector = my_umap.MyUMAP(n_components=12, random_state=69)

m = mapper.Mapper()
m.fit(x_train, projector=projector, clusterer=clusterer, n_components=12, 
      n_intervals=10, experiment_name=experiment_name, kind='balanced')

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.4s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.4s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.4s
[Pipeline] ......... (step 1 of 2) Processing projector, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] .............. (step 2 of 2) Processing proj, total=   0.0s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.4s
[Pipeline] ....... (step 2 of 3) Processing filter_func, total=   0.4s
[Pipel

[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:   42.5s finished


CPU times: user 3min 11s, sys: 41.1 s, total: 3min 52s
Wall time: 1min 34s


# Representations

In [14]:
import mapper
import numpy as np
import pickle

In [20]:
# Reload the converted features and labels for the train and both test sets.
# The NpzFile returned by np.load keeps the archive open until closed, so the
# arrays are pulled out inside a `with` block.
# NOTE(review): allow_pickle=True allows pickled object arrays, which can run
# arbitrary code on load — only use it on archives produced by this project.
with np.load('data/in10_split_converted.npz', allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    x_test_none = loaded['x_test_none']
    x_test_gaussian = loaded['x_test_gaussian']
    y_train = loaded['y_train']
    y_test = loaded['y_test']

# Read the pickle stored under experiments/<experiment_name>; the context
# manager closes the file handle, which the bare pickle.load(open(...)) leaked.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# experiment files generated locally by this notebook.
with open('experiments/{}'.format(experiment_name), 'rb') as mapper_file:
    mapper_data = pickle.load(mapper_file)
latent_space, graphs, covers = mapper_data[0], mapper_data[1], mapper_data[2]

# k is passed to get_representations and embedded in the output file name.
k = 5

# Last expression of the cell: show all shapes as the cell output.
x_train.shape, x_test_none.shape, x_test_gaussian.shape, y_train.shape, y_test.shape

((10000, 5096), (3000, 5096), (3000, 5096), (10000,), (3000,))

In [17]:
%%time

m = mapper.Mapper()
m.get_representations(x_train, x_test_none, x_test_gaussian, y_train, y_test,
                      k, latent_space, graphs, covers, experiment_name + '_k{}'.format(k))

[binarization]: 100%|██████████| 10000/10000 [00:07<00:00, 1261.71it/s]
[wknn]: 100%|██████████| 3000/3000 [01:32<00:00, 32.35it/s]
[wknn]: 100%|██████████| 3000/3000 [01:32<00:00, 32.57it/s]


CPU times: user 6min 1s, sys: 7min 43s, total: 13min 45s
Wall time: 4min 31s


In [18]:
# Load the representations archive for this experiment and k.
# This rebinds x_train / x_test_* / y_* to the arrays stored in that archive,
# replacing the arrays loaded from data/in10_split_converted.npz.
# The NpzFile returned by np.load keeps the archive open until closed, so the
# arrays are read inside a `with` block.
# NOTE(review): allow_pickle=True allows pickled object arrays, which can run
# arbitrary code on load — only use it on archives produced by this project.
with np.load('experiments/{}_k{}.npz'.format(experiment_name, k), allow_pickle=True) as loaded:
    x_train = loaded['x_train']
    x_test_none = loaded['x_test_none']
    x_test_gaussian = loaded['x_test_gaussian']
    y_train = loaded['y_train']
    y_test = loaded['y_test']

# Last expression of the cell: show all shapes as the cell output.
x_train.shape, x_test_none.shape, x_test_gaussian.shape, y_train.shape, y_test.shape

((10000, 121), (3000, 121), (3000, 121), (10000,), (3000,))