In [9]:
# Clustering as Preprocessor

# Example using MNIST digits: 
# cluster-then-regression beats just regression.

from sklearn.datasets import load_digits
X,y = load_digits(return_X_y=True)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)
from sklearn.linear_model import LogisticRegression
# We get "fail to converge" errors unless #iterations is very high.
# Higher numbers can yield higher scores but it isn't linear.
# Error message also suggests scaling the data.
log = LogisticRegression(max_iter=5000)
log.fit(X_train,y_train)
log.score(X_test,y_test)
# Without clustering, regression gets 96% accuracy.

0.9622222222222222

In [16]:
# Now try clustering + regression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("kmeans",KMeans(n_clusters=50)),
    ("regression",LogisticRegression(max_iter=20000))
])
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)
# With clustering, regression gets 97% accuracy.

0.9688888888888889

In [24]:
# Now optimize hyperparameters.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
param = dict(kmeans__n_clusters=range(10,100))
clf = GridSearchCV(pipe,param,cv=3,verbose=2)
# This took about 15 minutes and did not improve accuracy at all.
#clf.fit(X_train,y_train)
#pipe.score(X_test,y_test)

# This took 16 minutes at 400% cpu and made things worse: 94% accuracy
pipe = Pipeline([
    ("kmeans",KMeans(n_clusters=50)),
    ("scaler",MinMaxScaler()),
    ("regression",LogisticRegression(max_iter=1000))
])
param = dict(kmeans__n_clusters=range(10,200))
clf = GridSearchCV(pipe,param,cv=3,verbose=1)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

Fitting 3 folds for each of 190 candidates, totalling 570 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 570 out of 570 | elapsed: 16.7min finished


0.9422222222222222