# Clustering as Preprocessor

## MNIST digits 
Geron claims cluster-then-regression beats just regression. We have trouble making it happen.

In [7]:
from sklearn.datasets import load_digits
X,y = load_digits(return_X_y=True)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)
from sklearn.linear_model import LogisticRegression
# We get "fail to converge" errors unless #iterations is very high.
# Higher numbers can yield higher scores but it isn't linear.
# Error message also suggests scaling the data.
log = LogisticRegression(max_iter=5000)
log.fit(X_train,y_train)
score1=log.score(X_test,y_test)
score1

0.9688888888888889

In [8]:
# Now try clustering + regression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("kmeans",KMeans(n_clusters=50)),
    ("regression",LogisticRegression(max_iter=20000))
])
pipe.fit(X_train,y_train)
score2=pipe.score(X_test,y_test)
score2

0.9777777777777777

In [None]:
# Now optimize hyperparameters.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
param = dict(kmeans__n_clusters=range(10,100))

#clf = GridSearchCV(pipe,param,cv=3,verbose=2)
# This took about 15 minutes and did not improve accuracy at all.
#clf.fit(X_train,y_train)
#pipe.score(X_test,y_test)

# With regular Kmerns, this took 16 minutes at 400% cpu.
# Reduce time (and accuracy) with minibatch.
from sklearn.cluster import MiniBatchKMeans
pipe = Pipeline([
    ("kmeans",MiniBatchKMeans(n_clusters=50)),
    ("scaler",MinMaxScaler()),
    ("regression",LogisticRegression(max_iter=1000))
])
param = dict(kmeans__n_clusters=range(10,200))
clf = GridSearchCV(pipe,param,cv=3,verbose=1)
clf.fit(X_train,y_train)
score3=clf.score(X_test,y_test)
score3

Fitting 3 folds for each of 190 candidates, totalling 570 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
print("%f Just regression."%score1)
print("%f Kmeans, regression"%score2)
print("%f Kmeans, scaling, regression"%score3)
