In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
data = load_breast_cancer()

In [2]:
df = pd.DataFrame(data.data,columns = data.feature_names)
df['label'] = data.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
clf = RandomForestClassifier()

In [4]:
param_dist = {"max_depth": [3, None],
              "max_features": range(1, 11),
              "min_samples_split": range(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [5]:
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=10, cv=5)

In [6]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
random_search.fit(x,y)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [3, None], 'max_features': range(1, 11), 'min_samples_split': range(2, 11), 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [8]:
random_search.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 10,
 'min_samples_split': 4}

In [9]:
random_search.best_score_

0.96485061511423553

In [10]:
random_search.cv_results_



{'mean_fit_time': array([ 0.03988395,  0.01398349,  0.01349783,  0.02174363,  0.01239619,
         0.01273084,  0.01085935,  0.02757959,  0.02922487,  0.0160079 ]),
 'mean_score_time': array([ 0.00116634,  0.00104527,  0.00100217,  0.00101485,  0.00103378,
         0.00104704,  0.00098391,  0.00116444,  0.00134568,  0.00105915]),
 'mean_test_score': array([ 0.94903339,  0.95606327,  0.95079086,  0.95957821,  0.95079086,
         0.94551845,  0.93673111,  0.96485062,  0.96133568,  0.95079086]),
 'mean_train_score': array([ 0.99648543,  0.9960507 ,  0.97671574,  0.99867841,  0.96617397,
         0.97451503,  0.970131  ,  1.        ,  0.98374874,  0.97891066]),
 'param_bootstrap': masked_array(data = [True True True False True False True False False False],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_criterion': masked_array(data = ['entropy' 'entropy' 'gini' 'gini' 'entropy' 'gini' 'gini' 'gini' 'entropy'
  'gini'],

# Grid Search

In [16]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, 5],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [17]:
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(x, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
grid_search.best_score_

0.96660808435852374

In [19]:
final = grid_search.best_estimator_
final

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)