In [1]:
import numpy as np
import h5py
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import pandas as pd

In [33]:
from itertools import permutations

def cluster_accuracy(clusters, gt, n_classes):
    """Return the best accuracy achievable by relabelling the clusters.

    Cluster ids are arbitrary, so every one-to-one assignment of cluster ids
    to class labels is tried and the highest resulting accuracy is returned.

    Parameters
    ----------
    clusters : array-like of int
        Cluster id per sample; every value must lie in ``range(n_classes)``.
    gt : array-like
        Ground-truth label per sample, same shape as ``clusters``.
    n_classes : int
        Number of classes/clusters. All ``n_classes!`` permutations are
        enumerated, so this is only practical for small values.

    Returns
    -------
    float
        Fraction of samples matching ``gt`` under the best relabelling.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or ``clusters`` holds a
        value outside ``range(n_classes)``.
    """
    clusters = np.asarray(clusters)
    gt = np.asarray(gt)
    if clusters.size == 0 or clusters.shape != gt.shape:
        raise ValueError("clusters and gt must be non-empty and have the same shape")

    # Cluster ids are used as array indices below, so they must be integral
    # and in range; a negative id would otherwise silently wrap around.
    cluster_ids = clusters.astype(np.intp)
    if (
        not np.array_equal(cluster_ids, clusters)
        or cluster_ids.min() < 0
        or cluster_ids.max() >= n_classes
    ):
        raise ValueError("cluster ids must be integers in range(n_classes)")

    best_hits = 0
    for p in permutations(range(n_classes)):
        # p[i] is the class assigned to cluster i; fancy indexing applies the
        # whole relabelling in one vectorized step.
        mapped = np.asarray(p)[cluster_ids]
        best_hits = max(best_hits, int(np.count_nonzero(mapped == gt)))
    return best_hits / gt.size

In [26]:
# All files are opened explicitly read-only: older h5py versions defaulted to
# an append/create mode, which could create or modify a file by accident.

# Base feature matrices and labels for the train and test sets.
with h5py.File('../files/mapped_data1.h5', 'r') as f:
    train1  = np.array(f["train_x"])
    train_y = np.array(f["train_y"])
    test1   = np.array(f["test_x"])
    test_y  = np.array(f["test_y"])

# Names of the datasets read from each of the two metrics files below.
metrics = ['betti', 'bottleneck', 'landscape', 'wasserstein']

with h5py.File('../files/metrics-test.h5', 'r') as f:
    test_metrics = {name: np.array(f[name]) for name in metrics}
    reference_index = np.array(f['reference-index'])

with h5py.File('../files/metrics-train-all.h5', 'r') as f:
    train_metrics = {name: np.array(f[name]) for name in metrics}
    # Used later to select the rows of train1/train_y that pair with train_metrics.
    train_index = np.array(f["train-index"])

In [8]:
def create_dataset(X, Y, additional, indexes):
    """Select rows of ``X``/``Y`` and append extra feature columns.

    Returns a ``(features, labels)`` pair where ``features`` is
    ``X[indexes]`` horizontally stacked with ``additional`` and ``labels``
    is ``Y[indexes]``.
    """
    selected_rows = X[indexes]
    features = np.hstack([selected_rows, additional])
    labels = Y[indexes]
    return features, labels

In [27]:
# Inspect the shape of the index array read from the 'train-index' dataset.
train_index.shape

(2609,)

In [28]:
# Every metrics array is reshaped to this many columns per sample.
n_metric_features = 40
# Use every test row, in order (derived from the data instead of hard-coding 80).
test_index = np.arange(test1.shape[0])

# name -> (train X, train y, test X, test y).
# 'original' holds the base features; each metric entry appends that metric's
# columns to the base features of the rows selected by train_index / test_index.
datasets = {
    'original': (train1, train_y, test1, test_y),
    **{
        name: (
            *create_dataset(
                train1,
                train_y,
                np.reshape(train_metrics[name], (train_index.shape[0], n_metric_features)),
                train_index),
            *create_dataset(
                test1,
                test_y,
                np.reshape(test_metrics[name], (test_index.shape[0], n_metric_features)),
                test_index),
        )
        for name in metrics
    },
}

In [22]:
# Each entry is a 4-tuple: train X, train y, test X, test y.
len(datasets['betti'])

4

In [29]:
# Two supervised baselines, each constructed with a fixed random_state.
rf_clf = RandomForestClassifier(n_estimators=300, random_state=1)
sv_clf = svm.SVC(random_state=42)

In [30]:
# Fit each classifier on every feature set and print its test-set accuracy.
for clf in (rf_clf, sv_clf):
    print(clf)
    for name, (x, y, vx, vy) in datasets.items():
        # fit() returns the estimator itself, so fit and predict can be chained.
        yp = clf.fit(x, y).predict(vx)
        accuracy = np.mean(yp == vy)
        print('\t', name, accuracy)


RandomForestClassifier(n_estimators=300, random_state=1)
	 original 0.825
	 betti 0.7875
	 bottleneck 0.8125
	 landscape 0.8125
	 wasserstein 0.775
SVC(random_state=42)
	 original 0.7625
	 betti 0.7
	 bottleneck 0.75
	 landscape 0.75
	 wasserstein 0.75


In [31]:
# random_state is fixed so the clustering results are reproducible across
# runs, like the seeded classifiers. n_init is pinned explicitly because its
# default changed from 10 to 'auto' in scikit-learn 1.4.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

In [34]:
# Cluster the training part of every feature set and score the clusters
# against the training labels. The class count comes from the estimator so
# it cannot drift from the number of clusters actually fitted.
for name, (x, y, _, _) in datasets.items():
    yp = kmeans.fit_predict(x)
    print(name, cluster_accuracy(yp, y, kmeans.n_clusters))




original 0.4337899543378995




betti 0.451513990034496




bottleneck 0.4342660022997317




landscape 0.4334994250670755




wasserstein 0.4342660022997317


In [37]:
# Combine the base features with the columns of every metric, side by side.
# Every metrics array is reshaped to this many columns per sample.
n_metric_features = 40

# Both stacks iterate the shared `metrics` list so train and test columns are
# guaranteed to appear in the same order.
train_all = np.hstack((
    train1[train_index],
    *(np.reshape(train_metrics[name], (train_index.shape[0], n_metric_features)) for name in metrics)))
test_all = np.hstack((
    test1,
    *(np.reshape(test_metrics[name], (test1.shape[0], n_metric_features)) for name in metrics)))
test_all.shape

(80, 214)

In [39]:
# Fit each classifier on the combined features and print its test accuracy.
# Accuracy is computed against test_y directly; the previous denominator read
# `vy`, a loop variable left over from an earlier cell.
for clf in [rf_clf, sv_clf]:
    clf.fit(train_all, train_y[train_index])
    yp = clf.predict(test_all)
    print(clf, np.mean(yp == test_y))

RandomForestClassifier(n_estimators=300, random_state=1) 0.7875
SVC(random_state=42) 0.7


In [41]:
# Cluster the combined feature matrix and score it against the labels of the
# same rows. The class count is taken from the estimator rather than repeated.
yp = kmeans.fit_predict(train_all)
print(cluster_accuracy(yp, train_y[train_index], kmeans.n_clusters))



0.451513990034496
