In [3]:
import numpy as np
import h5py
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

In [4]:
from itertools import permutations

def cluster_accuracy(clusters, gt, n_classes):
    """Return the best accuracy over all one-to-one relabelings of the clusters.

    Cluster ids are arbitrary, so every permutation ``p`` of
    ``range(n_classes)`` is tried as the mapping "cluster ``i`` -> class
    ``p[i]``"; the highest fraction of samples whose mapped cluster id
    equals the ground-truth label is returned.

    Parameters
    ----------
    clusters : array-like
        Cluster id per sample. Ids outside ``range(n_classes)`` never count
        as a match.
    gt : array-like
        Ground-truth label per sample, same shape as ``clusters``.
    n_classes : int
        Number of classes/clusters. All ``n_classes!`` permutations are
        enumerated, so this must stay small.

    Returns
    -------
    float
        Best achievable accuracy in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    clusters = np.asarray(clusters)
    gt = np.asarray(gt)
    if clusters.shape != gt.shape:
        raise ValueError(
            f"clusters and gt must have the same shape, got {clusters.shape} and {gt.shape}")
    if gt.size == 0:
        raise ValueError("cluster_accuracy requires at least one sample")

    # contingency[i, j] = number of samples in cluster i whose true class is j.
    # Built once with comparisons (so float-typed labels work too); each
    # permutation is then scored on this small matrix instead of on all samples.
    labels = np.arange(n_classes)
    in_cluster = (clusters.reshape(-1, 1) == labels).astype(np.int64)
    in_class = (gt.reshape(-1, 1) == labels).astype(np.int64)
    contingency = in_cluster.T @ in_class

    best_matches = max(
        contingency[labels, np.array(p, dtype=np.intp)].sum()
        for p in permutations(range(n_classes)))
    return best_matches / gt.size

In [5]:
# Load the base feature matrices and labels for the train and test splits.
# Every file is opened with an explicit read-only mode: h5py releases before
# 3.0 did not default to read-only, so a mistyped path or an accidental write
# could otherwise touch the data files.
with h5py.File('../files/mapped_data1.h5', 'r') as f:
    train1  = np.array(f["train_x"])
    train_y = np.array(f["train_y"])
    test1   = np.array(f["test_x"])
    test_y  = np.array(f["test_y"])

# Names of the per-metric datasets stored in both metric files below.
metrics = ['betti', 'bottleneck', 'landscape', 'wasserstein']

# Metric values for the test split, keyed by metric name.
with h5py.File('../files/metrics-test.h5', 'r') as f:
    test_metrics = {name: np.array(f[name]) for name in metrics}
    reference_index = np.array(f['reference-index'])

# Metric values for the training split; train_index is later used to select
# the matching rows of train1 / train_y.
with h5py.File('../files/metrics-train-all.h5', 'r') as f:
    train_metrics = {name: np.array(f[name]) for name in metrics}
    train_index = np.array(f["train-index"])

In [6]:
def create_dataset(X, Y, additional, indexes):
    """Select rows of ``X``/``Y`` by ``indexes`` and append extra feature columns.

    Returns a tuple ``(features, labels)`` where ``features`` is
    ``X[indexes]`` horizontally stacked with ``additional`` and ``labels``
    is ``Y[indexes]``.
    """
    selected_rows = X[indexes]
    selected_labels = Y[indexes]
    features = np.hstack((selected_rows, additional))
    return features, selected_labels

In [7]:
# Inspect the shape of the index array that selects rows of train1 / train_y.
train_index.shape

(2609,)

In [8]:
# Number of metric feature columns per sample in every metric block.
METRIC_WIDTH = 40


def _augment_with_metric(X, Y, metric_values, indexes):
    """Reshape one metric to (len(indexes), METRIC_WIDTH) and append it to the selected rows."""
    extra = np.reshape(metric_values, (len(indexes), METRIC_WIDTH))
    return create_dataset(X, Y, extra, indexes)


# Every test row is used, in order; derived from the data instead of a
# hard-coded sample count.
test_index = np.arange(test1.shape[0])

# Maps a dataset name to (train_x, train_y, test_x, test_y): the unmodified
# features under 'original', plus one variant per metric with that metric's
# columns appended.
datasets = {
    'original': (train1, train_y, test1, test_y),
    **{name: (
        *_augment_with_metric(train1, train_y, train_metrics[name], train_index),
        *_augment_with_metric(test1, test_y, test_metrics[name], test_index),
        ) for name in metrics}
}

In [9]:
# Sanity check: each datasets entry is a 4-tuple (train_x, train_y, test_x, test_y).
len(datasets['betti'])

4

In [64]:
from sklearn import preprocessing, pipeline, feature_selection, linear_model

RANDOM_STATE = 2023
# p * (1 - p) with p = 0.9; VarianceThreshold removes features whose
# training-set variance is below this value.
# NOTE(review): this is the variance of a 0/1 feature that is constant in 90%
# of samples - confirm it is the intended cut-off for these features.
VARIANCE_THRESHOLD = 0.9 * (1 - 0.9)


def _make_preprocessing():
    """Build a fresh, unfitted preprocessing pipeline: variance filter, then standardization."""
    return pipeline.Pipeline([
        ("feature selection", feature_selection.VarianceThreshold(VARIANCE_THRESHOLD)),
        ("normalization", preprocessing.StandardScaler())])


# Standalone preprocessing pipeline, kept under its original name.
pr = _make_preprocessing()

# Pipeline does not clone its steps, so every classifier gets its own
# preprocessing instance; sharing one object would let fitting one classifier
# silently refit the preprocessing used by all the others.
sv_clf = pipeline.Pipeline([
    ("preprocessing", _make_preprocessing()),
    ("svm", svm.SVC(random_state=RANDOM_STATE))])
rf_clf = pipeline.Pipeline([
    ("preprocessing", _make_preprocessing()),
    ("rf", RandomForestClassifier(n_estimators = 500, random_state=RANDOM_STATE))])
rr_clf = pipeline.Pipeline([
    ("preprocessing", _make_preprocessing()),
    ("ridge regression", linear_model.RidgeClassifier(max_iter=1000))])
lr_clf = pipeline.Pipeline([
    ("preprocessing", _make_preprocessing()),
    ("logistic regression", linear_model.LogisticRegression(max_iter=5000))])

In [65]:
# Fit each classifier on every dataset variant and print its accuracy on the
# corresponding test split. rf_clf is not included in this run.
for clf in (sv_clf, rr_clf, lr_clf):
    print(clf)
    for name, splits in datasets.items():
        x, y, vx, vy = splits
        yp = clf.fit(x, y).predict(vx)
        accuracy = sum(yp == vy) / len(vy)
        print('\t', name, accuracy)

Pipeline(steps=[('feature selection', VarianceThreshold(threshold=0.24)),
                ('preprocessing', StandardScaler()),
                ('svm', SVC(random_state=2023))])
	 original 0.7875
	 betti 0.8
	 bottleneck 0.775
	 landscape 0.775
	 wasserstein 0.775
Pipeline(steps=[('feature selection', VarianceThreshold(threshold=0.24)),
                ('preprocessing', StandardScaler()),
                ('ridge regression', RidgeClassifier(max_iter=1000))])
	 original 0.725
	 betti 0.7125
	 bottleneck 0.725
	 landscape 0.725
	 wasserstein 0.725
Pipeline(steps=[('feature selection', VarianceThreshold(threshold=0.24)),
                ('preprocessing', StandardScaler()),
                ('logistic regression', LogisticRegression(max_iter=5000))])
	 original 0.775
	 betti 0.7875
	 bottleneck 0.775
	 landscape 0.775
	 wasserstein 0.775


In [50]:
from sklearn import preprocessing, metrics, pipeline

# K-means (4 clusters, random init, 20 restarts) on variance-filtered,
# standardized features.
kmeans_steps = [
    ("feature selection", feature_selection.VarianceThreshold(0.6 * 0.4)),
    ("normalization", preprocessing.StandardScaler()),
    ("kmeans", KMeans(n_clusters=4, init='random', random_state=RANDOM_STATE, n_init=20)),
]
kmeans = pipeline.Pipeline(kmeans_steps)

In [66]:
# Cluster the training split of every dataset variant and print the best
# cluster-to-class accuracy against the training labels.
for name, (x, y, _, _) in datasets.items():
    yp = kmeans.fit(x).predict(x)
    accuracy = cluster_accuracy(yp, y, 4)
    print(name, accuracy)

original 0.6289954337899544
betti 0.5825986968187045
bottleneck 0.6301264852433883
landscape 0.6301264852433883
wasserstein 0.6301264852433883


In [18]:
# Combined feature matrices: base features followed by every metric block.
# A single key order is used for both splits so that train and test columns
# are guaranteed to line up (a metric missing from test_metrics raises
# KeyError instead of silently misaligning columns).
metric_names = list(train_metrics)

train_all = np.hstack((
    train1[train_index],
    *(np.reshape(train_metrics[name], (train_index.shape[0], 40)) for name in metric_names)))
# The test row count is taken from test1 rather than hard-coded.
test_all = np.hstack((
    test1,
    *(np.reshape(test_metrics[name], (test1.shape[0], 40)) for name in metric_names)))

assert train_all.shape[1] == test_all.shape[1], "train/test feature widths differ"
test_all.shape

(80, 214)

In [48]:
# Fit on the combined training features and report accuracy on the combined
# test features. The accuracy is computed from test_y only; the previous
# denominator used `vy`, a loop variable leaked from another cell, which
# fails on a fresh kernel and is only correct if that cell's last test split
# happens to have the same length.
for clf in [rf_clf, sv_clf]:
    clf.fit(train_all, train_y[train_index])
    yp = clf.predict(test_all)
    print(clf, np.mean(yp == test_y))

Pipeline(steps=[('feature selection', VarianceThreshold(threshold=0.24)),
                ('preprocessing', StandardScaler()),
                ('rf',
                 RandomForestClassifier(n_estimators=500, random_state=2023))]) 0.8
Pipeline(steps=[('feature selection', VarianceThreshold(threshold=0.24)),
                ('preprocessing', StandardScaler()),
                ('svm', SVC(random_state=2023))]) 0.8


In [49]:
# Cluster the combined training features and print the best cluster-to-class
# accuracy against the matching training labels.
yp = kmeans.fit_predict(train_all)
combined_accuracy = cluster_accuracy(yp, train_y[train_index], 4)
print(combined_accuracy)



0.5825986968187045
