In [1]:
import h5py
import numpy as np
from sklearn import svm, ensemble, linear_model, cluster
from sklearn import preprocessing, pipeline, feature_selection
from sklearn.base import clone

In [2]:
from itertools import permutations

def accuracy(predicted, gt):
    """Return the fraction of predictions that equal the ground truth.

    Both arguments are converted to NumPy arrays before comparing so that
    plain Python sequences are compared element by element. Comparing two
    lists with ``==`` yields a single boolean, which would produce a wrong
    score.

    Args:
        predicted: Sequence or array of predicted labels.
        gt: Sequence or array of ground-truth labels aligned with
            ``predicted``.

    Returns:
        Number of matching positions divided by ``len(predicted)``.
    """
    predicted = np.asarray(predicted)
    gt = np.asarray(gt)
    return np.sum(predicted == gt) / len(predicted)

def cluster_accuracy(clusters, gt, n_classes):
    """Return the best accuracy over all relabelings of the cluster ids.

    Cluster ids carry no inherent meaning, so every permutation of
    ``range(n_classes)`` is tried as a "cluster id -> class label" mapping
    and the highest resulting accuracy is returned. The search visits
    ``n_classes!`` permutations, so it is only practical for a small number
    of classes.

    Args:
        clusters: Integer cluster ids in ``[0, n_classes)``, one per sample.
        gt: Ground-truth class labels aligned with ``clusters``.
        n_classes: Number of distinct cluster ids / class labels.

    Returns:
        The maximum of ``accuracy(mapped, gt)`` over all permutations.

    Raises:
        ValueError: If ``clusters`` is empty, or contains values that are
            not integers in ``[0, n_classes)``.
    """
    labels = np.asarray(clusters)
    if labels.size == 0:
        raise ValueError("clusters must not be empty")

    ids = labels.astype(np.intp)
    if np.any(ids != labels) or ids.min() < 0 or ids.max() >= n_classes:
        raise ValueError(
            f"cluster ids must be integers in [0, {n_classes})")

    # Index a per-permutation lookup table in one vectorized step instead of
    # calling dict.get once per element through np.vectorize.
    return max(
        accuracy(np.asarray(p)[ids], gt)
        for p in permutations(range(n_classes))
    )

In [3]:
# Load the base feature matrices and labels. Every file is opened with an
# explicit read-only mode: h5py only defaults to read-only since version 3.0,
# and this cell never needs to write.
with h5py.File('../files/mapped_data1.h5', 'r') as f:
    train1  = np.array(f["train_x"])
    train_y = np.array(f["train_y"])
    test1   = np.array(f["test_x"])
    test_y  = np.array(f["test_y"])

# Dataset names read from each metrics file, and the suffixes that select
# which metrics files are read.
metrics = ['betti', 'bottleneck', 'landscape', 'wasserstein']
files_suffixes = ['-8', '-4', '-14']

# Both dicts are keyed by "<metric><suffix>", e.g. "betti-8".
test_metrics = {}
train_metrics = {}

for suffix in files_suffixes:
    with h5py.File(f'../files/metrics-test{suffix}.h5', 'r') as f:
        test_metrics |= {name + suffix: np.array(f[name]) for name in metrics}
        # NOTE(review): overwritten on every iteration, so only the value
        # from the last suffix survives - confirm it is identical across files.
        reference_index = np.array(f['reference-index'])

    with h5py.File(f'../files/metrics-train-all{suffix}.h5', 'r') as f:
        train_metrics |= {name + suffix: np.array(f[name]) for name in metrics}
        # NOTE(review): same as above - only the last file's "train-index"
        # is kept and later used to select training rows for every variant.
        train_index = np.array(f["train-index"])

In [4]:
def create_dataset(X, Y, additional, indexes):
    """Pick a subset of samples and append extra feature columns to it.

    Args:
        X: Feature array; samples are selected with ``X[indexes]``.
        Y: Label array; labels are selected with ``Y[indexes]``.
        additional: Array stacked horizontally after the selected samples,
            so it must be compatible with ``X[indexes]`` for ``np.hstack``.
        indexes: Index applied to both ``X`` and ``Y``.

    Returns:
        A ``(features, labels)`` tuple.
    """
    selected = X[indexes]
    features = np.hstack((selected, additional))
    labels = Y[indexes]
    return features, labels

In [12]:

# Baseline entry: the base train/test arrays with no extra columns.
datasets = {'original': (train1, train_y, test1, test_y)}

# One more entry per "<metric><suffix>" key. Each value is the 4-tuple
# (train features, train labels, test features, test labels), where the
# feature arrays get the reshaped metric values appended as extra columns.
# NOTE(review): 80 and 40 are hard-coded - presumably the number of test
# samples and the number of metric values per sample; confirm against the
# metrics files.
datasets.update({
    key: (
        create_dataset(
            train1,
            train_y,
            train_metrics[key].reshape((train_index.shape[0], 40)),
            train_index)
        + create_dataset(
            test1,
            test_y,
            test_metrics[key].reshape((80, 40)),
            np.arange(80))
    )
    for key in train_metrics
})

# Width used to left-justify dataset names when printing results.
MAX_NAME_LEN = max(map(len, datasets))

In [8]:
RANDOM_STATE = 2023
# p * (1 - p) with p = 0.9, i.e. the variance of a Bernoulli variable with
# p = 0.9. Features whose variance does not exceed this value are dropped.
VARIANCE_THRESHOLD = 0.9 * (1 - 0.9)

# Template preprocessing: low-variance feature removal, then standardization.
# Pipeline.fit fits its steps in place, so a single instance shared between
# several pipelines would be silently refit whenever any of them is fitted.
# Each classifier below therefore gets its own unfitted copy via clone().
pr = pipeline.Pipeline([
    ("feature selection", feature_selection.VarianceThreshold(VARIANCE_THRESHOLD)),
    ("normalization", preprocessing.StandardScaler())])

# `.name` is an ad-hoc attribute holding the label printed for each model.
sv_clf = pipeline.Pipeline([
    ("preprocessing", clone(pr)),
    ("svm", svm.SVC(random_state=RANDOM_STATE))])
sv_clf.name = "SVM"

rf_clf = pipeline.Pipeline([
    ("preprocessing", clone(pr)),
    ("rf", ensemble.RandomForestClassifier(
        n_estimators=200, 
        random_state=RANDOM_STATE))])
rf_clf.name = "Random Forest"

rr_clf = pipeline.Pipeline([
    ("preprocessing", clone(pr)),
    ("ridge regression", linear_model.RidgeClassifier(max_iter=1000))])
rr_clf.name = "Ridge regression"

lr_clf = pipeline.Pipeline([
    ("preprocessing", clone(pr)),
    ("logistic regression", linear_model.LogisticRegression(max_iter=5000))])
lr_clf.name = "Logistic regression"


# Models evaluated on every dataset variant, in print order.
classifiers = [sv_clf, rf_clf, rr_clf, lr_clf]

In [9]:
# Fit each classifier on each dataset variant's training split and print its
# accuracy on the matching test split, one indented line per dataset.
for clf in classifiers:
    print(clf.name)
    for name, (x, y, vx, vy) in datasets.items():
        yp = clf.fit(x, y).predict(vx)
        score = accuracy(yp, vy)
        print('\t', name.ljust(MAX_NAME_LEN), score)

SVM
	 original       0.7875
	 betti-8        0.8
	 bottleneck-8   0.775
	 landscape-8    0.775
	 wasserstein-8  0.775
	 betti-4        0.8
	 bottleneck-4   0.775
	 landscape-4    0.775
	 wasserstein-4  0.775
	 betti-14       0.8
	 bottleneck-14  0.775
	 landscape-14   0.775
	 wasserstein-14 0.775
Random Forest
	 original       0.8375
	 betti-8        0.8
	 bottleneck-8   0.8125
	 landscape-8    0.8125
	 wasserstein-8  0.8125
	 betti-4        0.7875
	 bottleneck-4   0.8125
	 landscape-4    0.8125
	 wasserstein-4  0.8125
	 betti-14       0.7875
	 bottleneck-14  0.8125
	 landscape-14   0.8125
	 wasserstein-14 0.8125
Ridge regression
	 original       0.725
	 betti-8        0.7125
	 bottleneck-8   0.725
	 landscape-8    0.725
	 wasserstein-8  0.725
	 betti-4        0.7125
	 bottleneck-4   0.725
	 landscape-4    0.725
	 wasserstein-4  0.725
	 betti-14       0.7125
	 bottleneck-14  0.725
	 landscape-14   0.725
	 wasserstein-14 0.725
Logistic regression
	 original       0.775
	 betti-8        

In [10]:
# K-means on the preprocessed features. The preprocessing step is an
# unfitted copy of `pr`, so fitting this pipeline cannot refit (and thereby
# change) the preprocessing state of any other pipeline built from `pr`.
km_cls = pipeline.Pipeline([
    ("preprocessing", clone(pr)),
    ("kmeans", cluster.KMeans(
        n_clusters=4, 
        init='random', 
        random_state=RANDOM_STATE,
        n_init=20))
])

In [13]:
# Cluster each dataset variant's training split and score the clusters
# against the training labels. The score is computed on the same samples the
# model was fitted on; the test split is not used here.
# The splits are sliced rather than unpacked into `_`, because binding `_`
# at notebook top level shadows IPython's "last output" variable.
n_clusters = km_cls.named_steps["kmeans"].n_clusters
for name, splits in datasets.items():
    x, y = splits[:2]
    km_cls.fit(x)
    yp = km_cls.predict(x)
    print(name.ljust(MAX_NAME_LEN), cluster_accuracy(yp, y, n_clusters))

original       0.6289954337899544
betti-8        0.5825986968187045
bottleneck-8   0.6301264852433883
landscape-8    0.6301264852433883
wasserstein-8  0.6301264852433883
betti-4        0.5875814488309697
bottleneck-4   0.6301264852433883
landscape-4    0.6301264852433883
wasserstein-4  0.6301264852433883
betti-14       0.5875814488309697
bottleneck-14  0.6301264852433883
landscape-14   0.6301264852433883
wasserstein-14 0.6301264852433883
