# Grid/RandomSearchCV with Pipelines

In [1]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline

In [2]:
df = pd.read_csv('../input/mobile-price-classification/train.csv')
X  = df.drop('price_range', axis = 1).values
y  = df['price_range'].values

In [3]:
scl = preprocessing.StandardScaler()
pca = decomposition.PCA()

In [4]:
#n_jobs=-1 so that it can use all the cores of the system
rf = ensemble.RandomForestClassifier(n_jobs=-1)

In [5]:
classifier = pipeline.Pipeline([("scaling", scl),("pca", pca),("rf", rf)])

In [6]:
param_grid = {
    "pca__n_components": np.arange(5, 10),
    "rf__n_estimators": np.arange(100, 1500, 100), #100 to 1500 with 100 step_size
    "rf__max_depth": np.arange(1, 20),
    "rf__criterion": ["gini", "entropy"],
}

In [7]:
model = model_selection.RandomizedSearchCV(
    estimator = classifier,
    param_distributions = param_grid,
    n_iter = 5,
    scoring = "accuracy",
    verbose = 10,  #max_value
    n_jobs = 1,
    cv = 5,   
    #stratified fold is recomended
    # if we dont specify cv = 5 but it is still going to use cv = 5
    # if we have categoris as target or its binalry then is it going to use stratified k-fold     
)

# RandomizedSearchCV

In [8]:
model.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9, score=0.405, total=   4.2s
[CV] rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV]  rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9, score=0.475, total=   2.9s
[CV] rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.0s remaining:    0.0s


[CV]  rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9, score=0.485, total=   2.8s
[CV] rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.9s remaining:    0.0s


[CV]  rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9, score=0.487, total=   3.1s
[CV] rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.0s remaining:    0.0s


[CV]  rf__n_estimators=800, rf__max_depth=4, rf__criterion=entropy, pca__n_components=9, score=0.432, total=   3.0s
[CV] rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.0s remaining:    0.0s


[CV]  rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5, score=0.350, total=   0.5s
[CV] rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   16.5s remaining:    0.0s


[CV]  rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5, score=0.375, total=   0.6s
[CV] rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   17.1s remaining:    0.0s


[CV]  rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5, score=0.287, total=   0.6s
[CV] rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.7s remaining:    0.0s


[CV]  rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5, score=0.305, total=   0.5s
[CV] rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.3s remaining:    0.0s


[CV]  rf__n_estimators=200, rf__max_depth=1, rf__criterion=gini, pca__n_components=5, score=0.285, total=   0.5s
[CV] rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9 
[CV]  rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9, score=0.405, total=   5.8s
[CV] rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9 
[CV]  rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9, score=0.455, total=   5.7s
[CV] rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9 
[CV]  rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9, score=0.485, total=   5.5s
[CV] rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9 
[CV]  rf__n_estimators=1000, rf__max_depth=11, rf__criterion=entropy, pca__n_components=9, score=0.517, total=   5.4s
[CV] rf__n_estimators=1000, rf__max_depth=11, rf__crite

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaling', StandardScaler()),
                                             ('pca', PCA()),
                                             ('rf',
                                              RandomForestClassifier(n_jobs=-1))]),
                   n_iter=5, n_jobs=1,
                   param_distributions={'pca__n_components': array([5, 6, 7, 8, 9]),
                                        'rf__criterion': ['gini', 'entropy'],
                                        'rf__max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'rf__n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
       1200, 1300, 1400])},
                   scoring='accuracy', verbose=10)

In [9]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.4575
{'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=9)), ('rf', RandomForestClassifier(criterion='entropy', max_depth=11, n_estimators=1000,
                       n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=9), 'rf': RandomForestClassifier(criterion='entropy', max_depth=11, n_estimators=1000,
                       n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 9, 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'entropy', 'rf__max_depth': 11, 'rf__max_features': 'auto', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_impurity_split': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction

# GridSearchCV

In [10]:
param_grid = {
    "pca__n_components": [5, 10],
    "rf__n_estimators": [100, 200, 300, 400],
    "rf__max_depth": [1, 3, 5, 7],
    "rf__criterion": ["gini", "entropy"]
}

In [11]:
model = model_selection.GridSearchCV(
    estimator = classifier,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 10,  #max_value
    n_jobs = 1,
    
    #stratified fold is recomended
    # if we dont specify cv = 5 but it is still going to use cv = 5
    # if we have categoris as target or its binalry then is it going to use stratified k-fold 
    cv = 5   
)

In [12]:
model.fit(X,y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100, score=0.345, total=   0.4s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100, score=0.390, total=   0.4s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100, score=0.328, total=   0.4s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100, score=0.338, total=   0.4s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.6s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=100, score=0.278, total=   0.4s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.9s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200, score=0.355, total=   0.5s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.4s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200, score=0.367, total=   0.5s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.0s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200, score=0.280, total=   0.5s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.5s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200, score=0.312, total=   0.5s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.0s remaining:    0.0s


[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=200, score=0.292, total=   0.5s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300 
[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300, score=0.365, total=   1.0s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300 
[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300, score=0.360, total=   0.8s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300 
[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300, score=0.287, total=   0.8s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300 
[CV]  pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300, score=0.312, total=   0.8s
[CV] pca__n_components=5, rf__criterion=gini, rf__max_depth=1, rf__n_estimators=300 
[CV]  pca_

[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed:  4.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaling', StandardScaler()),
                                       ('pca', PCA()),
                                       ('rf',
                                        RandomForestClassifier(n_jobs=-1))]),
             n_jobs=1,
             param_grid={'pca__n_components': [5, 10],
                         'rf__criterion': ['gini', 'entropy'],
                         'rf__max_depth': [1, 3, 5, 7],
                         'rf__n_estimators': [100, 200, 300, 400]},
             scoring='accuracy', verbose=10)

In [13]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.504
{'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=10)), ('rf', RandomForestClassifier(max_depth=5, n_estimators=200, n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=10), 'rf': RandomForestClassifier(max_depth=5, n_estimators=200, n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 10, 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'gini', 'rf__max_depth': 5, 'rf__max_features': 'auto', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_impurity_split': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction_leaf': 0.0, 'rf__n_estimators': 200, 'rf__n_jobs': -1, 'rf__oob_score': False, 'rf__random_st