In [None]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np

Grid Searches
=================

<img src="figures/grid_search_cross_validation.svg" width=100%>

Grid-Search with built-in cross-validation

In [None]:
# GridSearchCV lives in ``sklearn.model_selection`` in current scikit-learn;
# the old ``sklearn.grid_search`` module is only tried on installations
# that do not have the new one.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

In [None]:
from sklearn.datasets import load_digits
# ``train_test_split`` lives in ``sklearn.model_selection`` in current
# scikit-learn; fall back to the old module when the new one is missing.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

digits = load_digits()
# A fixed random_state makes the train/test split (and every score computed
# from it further down) identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    random_state=0)

Define parameter grid:

In [None]:
import numpy as np

# Candidate values are powers of ten: six for C (1e-3 .. 1e2) and
# five for gamma (1e-5 .. 1e-1).
c_exponents = np.arange(-3, 3)
gamma_exponents = np.arange(-5, 0)
param_grid = dict(C=10. ** c_exponents, gamma=10. ** gamma_exponents)

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
print(param_grid)

In [None]:
# Configure (but do not yet run) a search over every combination in
# param_grid for a default-constructed SVC; verbose=3 requests progress
# messages while the search is fitted.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=3)

A GridSearchCV object behaves just like a normal classifier.

In [None]:
# Run the search on the training split only; the test split stays untouched
# so it can be used for evaluation afterwards.
grid_search.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test samples with the fitted search object.
grid_search.predict(X_test)

In [None]:
# Evaluate the fitted search object on the held-out test split.
grid_search.score(X_test, y_test)

In [None]:
# Parameter combination selected by the search.
grid_search.best_params_

In [None]:
# We extract just the mean cross-validated score of every candidate.
# Current scikit-learn reports results through ``cv_results_``; the older
# ``grid_scores_`` list is only read when ``cv_results_`` is not available.
if hasattr(grid_search, 'cv_results_'):
    scores = grid_search.cv_results_['mean_test_score']
else:
    scores = [x.mean_validation_score for x in grid_search.grid_scores_]

# Take the grid dimensions from param_grid instead of hard-coding 6 x 5, so
# the plot keeps working when the candidate lists change. As in the original
# layout, rows correspond to C and columns to gamma (this relies on the
# search enumerating candidates with C varying slowest and gamma fastest).
n_C = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])
scores = np.array(scores).reshape(n_C, n_gamma)

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(n_gamma), param_grid['gamma'])
plt.yticks(np.arange(n_C), param_grid['C']);

Nested Cross-validation in scikit-learn:

In [None]:
# ``cross_val_score`` lives in ``sklearn.model_selection`` in current
# scikit-learn; fall back to the old module when the new one is missing.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# The estimator being cross-validated is itself a grid search, so parameter
# selection is repeated inside every outer fold.
cross_val_score(GridSearchCV(SVC(), param_grid),
                digits.data, digits.target)

# Exercises
Use GridSearchCV to adjust max_depth and max_features of a RandomForestClassifier (from ``sklearn.ensemble``) on the digits dataset.

Visualize the results as a heat map.

Should you also adjust ``n_estimators``?

In [None]:
# %load solutions/grid_search_forest.py

# Randomized Search

<img src="figures/randomized_search.png" width=100%>

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
# NOTE: this rebinds X_train / X_test / y_train / y_test, which held the
# digits split until here. A fixed random_state keeps the split (and the
# search results below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    random_state=0)

In [None]:
from scipy.stats import expon

# Draw all 10000 samples from the exponential distribution (scale=0.001) in
# a single vectorized call instead of 10000 separate ones, then plot their
# normalized histogram. ``density=True`` replaces the ``normed`` keyword,
# which current matplotlib releases no longer accept.
samples = expon.rvs(scale=0.001, size=10000)
plt.hist(samples, bins=100, density=True);

In [None]:
# RandomizedSearchCV lives in ``sklearn.model_selection`` in current
# scikit-learn; fall back to the old module when the new one is missing.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    from sklearn.grid_search import RandomizedSearchCV

# Both parameters are sampled from a unit-scale exponential distribution.
param_distributions = {'C': expon(), 'gamma': expon()}
# n_iter is the number of parameter settings that get sampled; the fixed
# random_state makes the sampled settings the same on every run.
rs = RandomizedSearchCV(SVC(), param_distributions=param_distributions,
                        n_iter=50, random_state=0)

In [None]:
# Fit the randomized search; X_train / y_train are the iris training split
# here (they were rebound when the iris data was split).
rs.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the randomized search.
rs.best_params_

In [None]:
# Cross-validation score of the selected parameter setting.
rs.best_score_

In [None]:
# Collect, for every sampled candidate, its mean validation score and the
# C / gamma values that were tried. Current scikit-learn reports results
# through ``cv_results_``; the older ``grid_scores_`` list is only read when
# ``cv_results_`` is not available. All three results are tuples either way.
if hasattr(rs, 'cv_results_'):
    scores = tuple(rs.cv_results_['mean_test_score'])
    Cs = tuple(params['C'] for params in rs.cv_results_['params'])
    gammas = tuple(params['gamma'] for params in rs.cv_results_['params'])
else:
    scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['C'], score.parameters['gamma'])
                               for score in rs.grid_scores_])

In [None]:
# One marker per sampled candidate, positioned by its (C, gamma) values and
# coloured by its score; both axes are logarithmic.
plt.scatter(x=Cs, y=gammas, s=50, c=scores, linewidths=0)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("C")
plt.ylabel("gamma")
plt.colorbar()

# Exercise
Add parameters that are not relevant (like ``coef0``) to ``param_distributions``. Observe that it doesn't change the runtime and be happy.