In [1]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, HDBSCAN

In [2]:
with open('performance_results.csv', 'r') as f:
    results = pd.read_csv(f)
results = results.drop(
    columns=results.columns[1:17]).set_index('name').transpose()
results

name,plant-margin_R.dat,teachingAssistant.csv,ar4.csv,glass_R.dat,prnn_synth.csv,kc3.csv,energy-y2_R.dat,pittsburg-bridges-T-OR-D_R.dat
NeuralNetworkModel,0.855932,0.615385,0.769231,0.972222,0.923077,0.987342,1.0,0.888889
XGBoostClassifier,0.533898,0.153846,0.769231,0.861111,0.615385,0.962025,0.875,0.888889
AdaBoostClassifier,0.161017,1.0,0.923077,0.944444,0.730769,0.962025,0.875,1.0
BaggingClassifier,1.0,0.769231,0.923077,1.0,0.692308,0.962025,0.944444,0.888889
BernoulliNB,0.165254,0.307692,0.769231,0.305556,0.115385,0.962025,0.722222,0.888889
CalibratedClassifierCV,0.830508,0.692308,0.846154,0.916667,0.923077,1.0,0.847222,0.888889
CategoricalNB,0.0,0.307692,0.769231,0.305556,0.115385,0.0,0.722222,0.888889
ComplementNB,0.326271,0.076923,0.615385,0.666667,0.384615,1.0,0.777778,0.944444
DecisionTreeClassifier,0.012712,0.230769,0.769231,0.861111,0.115385,0.962025,0.875,0.888889
DummyClassifier,0.016949,0.307692,0.769231,0.305556,0.0,0.962025,0.0,0.888889


In [3]:
n = 7

In [4]:
clusterer = KMeans(n_clusters=n)
clusters = clusterer.fit_predict(results)
clusters

array([0, 2, 0, 0, 2, 0, 6, 2, 2, 1, 1, 4, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 0, 5, 0, 2, 2, 0, 1, 0, 2, 1, 2], dtype=int32)

In [5]:
for i in range(n):
    print(f'Cluster {i}:')
    print(results.index[clusters == i])

Cluster 0:
Index(['NeuralNetworkModel', 'AdaBoostClassifier', 'BaggingClassifier',
       'CalibratedClassifierCV', 'GaussianProcessClassifier',
       'HistGradientBoostingClassifier', 'KNeighborsClassifier',
       'LabelPropagation', 'LabelSpreading', 'LinearDiscriminantAnalysis',
       'LinearSVC', 'LogisticRegression', 'LogisticRegressionCV',
       'NearestCentroid', 'PassiveAggressiveClassifier',
       'RadiusNeighborsClassifier', 'RidgeClassifier'],
      dtype='object')
Cluster 1:
Index(['DummyClassifier', 'ExtraTreeClassifier', 'GradientBoostingClassifier',
       'RandomForestClassifier', 'SGDClassifier'],
      dtype='object')
Cluster 2:
Index(['XGBoostClassifier', 'BernoulliNB', 'ComplementNB',
       'DecisionTreeClassifier', 'MLPClassifier', 'MultinomialNB',
       'Perceptron', 'QuadraticDiscriminantAnalysis', 'RidgeClassifierCV',
       'SVC'],
      dtype='object')
Cluster 3:
Index(['GaussianNB'], dtype='object')
Cluster 4:
Index(['ExtraTreesClassifier'], dtype='obj

In [6]:
clusterer = AgglomerativeClustering(n_clusters=n)
clusters = clusterer.fit_predict(results)
clusters

array([5, 2, 4, 5, 2, 5, 0, 2, 2, 1, 1, 0, 6, 5, 1, 5, 5, 5, 5, 5, 5, 5,
       5, 2, 2, 5, 3, 5, 2, 2, 4, 1, 4, 2, 1, 2])

In [7]:
for i in range(n):
    print(f'Cluster {i}:')
    print(results.index[clusters == i])

Cluster 0:
Index(['CategoricalNB', 'ExtraTreesClassifier'], dtype='object')
Cluster 1:
Index(['DummyClassifier', 'ExtraTreeClassifier', 'GradientBoostingClassifier',
       'RandomForestClassifier', 'SGDClassifier'],
      dtype='object')
Cluster 2:
Index(['XGBoostClassifier', 'BernoulliNB', 'ComplementNB',
       'DecisionTreeClassifier', 'MLPClassifier', 'MultinomialNB',
       'Perceptron', 'QuadraticDiscriminantAnalysis', 'RidgeClassifierCV',
       'SVC'],
      dtype='object')
Cluster 3:
Index(['NuSVC'], dtype='object')
Cluster 4:
Index(['AdaBoostClassifier', 'RadiusNeighborsClassifier', 'RidgeClassifier'], dtype='object')
Cluster 5:
Index(['NeuralNetworkModel', 'BaggingClassifier', 'CalibratedClassifierCV',
       'GaussianProcessClassifier', 'HistGradientBoostingClassifier',
       'KNeighborsClassifier', 'LabelPropagation', 'LabelSpreading',
       'LinearDiscriminantAnalysis', 'LinearSVC', 'LogisticRegression',
       'LogisticRegressionCV', 'NearestCentroid',
       'Passive

In [8]:
clusterer = HDBSCAN()
clusters = clusterer.fit_predict(results)
clusters

array([ 0,  1, -1,  0, -1,  0, -1,  1,  1, -1, -1, -1, -1,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  1,  0, -1,  0, -1, -1,  0, -1,  0,  1,
       -1,  1])

In [9]:
for i in range(clusters.max() + 1):
    print(f'Cluster {i}:')
    print(results.index[clusters == i])

Cluster 0:
Index(['NeuralNetworkModel', 'BaggingClassifier', 'CalibratedClassifierCV',
       'GaussianProcessClassifier', 'HistGradientBoostingClassifier',
       'KNeighborsClassifier', 'LabelPropagation', 'LabelSpreading',
       'LinearDiscriminantAnalysis', 'LinearSVC', 'LogisticRegression',
       'LogisticRegressionCV', 'NearestCentroid',
       'PassiveAggressiveClassifier', 'RadiusNeighborsClassifier',
       'RidgeClassifier'],
      dtype='object')
Cluster 1:
Index(['XGBoostClassifier', 'ComplementNB', 'DecisionTreeClassifier',
       'MLPClassifier', 'MultinomialNB', 'RidgeClassifierCV', 'SVC'],
      dtype='object')
