In [22]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, make_scorer

# Fetch the data

In [2]:
X, y = fetch_openml('mnist_784', return_X_y=True)

In [23]:
# For the sake of time im taking a small subsample of the data
X2, _, y2, _ = train_test_split(X, y, train_size=0.025)

# Split and scale the data

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2)

mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

# Set up the sklearn [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [25]:
kmeans = KMeans()
pca = PCA()
pca_kmeans = Pipeline(steps=[
    ('PCA', pca),
    ('KMeans', kmeans),
])

# Use [silhoutte score](https://en.wikipedia.org/wiki/Silhouette_(clustering)) as the metric to optimize over in the randomized search
[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html)

In [41]:
# adapted from https://stackoverflow.com/a/44682305
def cv_silhouette_scorer(estimator, X):
    estimator.fit(X)
    cluster_labels = estimator.predict(X)
    num_labels = len(set(cluster_labels))
    num_samples = X.shape[0]
    if num_labels == 1 or num_labels == num_samples:
        return -1
    else:
        return silhouette_score(X, cluster_labels)

# Randomized Search

In [46]:
rcv = RandomizedSearchCV(pca_kmeans,
    {
        'KMeans__n_clusters': range(3,11,2),
        'PCA__n_components': range(1, 100, 10)
    },
    n_iter=3,
    scoring=cv_silhouette_scorer
)

In [47]:
rcv.fit(X_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('PCA',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('KMeans',
                                              KMeans(algorithm='auto',
                                                     copy_x=True,
                                                     init='k-means++',
                                                     max_iter=300, n_clusters=8,
                                                     n_init=10, n_jobs=N

In [48]:
pd.DataFrame(data=rcv.cv_results_)[[
    'param_PCA__n_components',
    'param_KMeans__n_clusters',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]]

Unnamed: 0,param_PCA__n_components,param_KMeans__n_clusters,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,61,9,0.063022,0.068896,0.055914,0.062611,0.005306,2
1,41,3,0.06299,0.049817,0.046,0.052943,0.007281,3
2,31,9,0.065641,0.081978,0.062208,0.069939,0.008623,1


In [49]:
best_pipeline = rcv.best_estimator_
best_pipeline

Pipeline(memory=None,
         steps=[('PCA',
                 PCA(copy=True, iterated_power='auto', n_components=31,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('KMeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=9, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)