#  Bayesian Optimization with Gaussian Process

In [1]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import pipeline

In [2]:
df = pd.read_csv('../input/mobile-price-classification/train.csv')
X  = df.drop('price_range', axis = 1).values
y  = df['price_range'].values

https://scikit-optimize.github.io/stable/modules/generated/skopt.gp_minimize.html

In [3]:
def optimize(params, param_names, x, y):
    print(params, param_names)
    params = dict(zip(param_names, params))  #this will not work if we are tuning the params of different models
    model  = ensemble.RandomForestClassifier(**params) #**params to read the dict
    kf     = model_selection.StratifiedKFold(n_splits = 5)
    
    accuracies = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        
        xtrain = x[train_idx]
        ytrain = y[train_idx]
        xtest = x[test_idx]
        ytest = y[test_idx]
        
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        
        accuracies.append(fold_acc)
    
    return -1*np.mean(accuracies)

In [4]:
from functools import partial
from skopt import space
from skopt import gp_minimize

In [5]:
param_space = [
    space.Integer(3,15, name = "max_depth"),
    space.Integer(100, 600, name = "n_estimators"),
    space.Categorical( ["gini", "entropy"], name = "criterion"),
    
    space.Real(0.1,1, prior = "uniform", name = "max_features")
]

In [6]:
param_names  = ["max_depth", "n_estimators", "criterion", "max_features" ]
param_names

['max_depth', 'n_estimators', 'criterion', 'max_features']

In [7]:
optimization_func = partial(optimize, param_names = param_names, x = X, y = y)

In [8]:
result = gp_minimize(optimization_func, dimensions = param_space, n_calls = 15, n_random_starts = 10, verbose = 10)

Iteration No: 1 started. Evaluating function at random point.
[9, 126, 'entropy', 0.912206898906815] ['max_depth', 'n_estimators', 'criterion', 'max_features']
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 8.0921
Function value obtained: -0.9060
Current minimum: -0.9060
Iteration No: 2 started. Evaluating function at random point.
[11, 138, 'gini', 0.5927327398488184] ['max_depth', 'n_estimators', 'criterion', 'max_features']
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 4.8718
Function value obtained: -0.8975
Current minimum: -0.9060
Iteration No: 3 started. Evaluating function at random point.
[13, 234, 'gini', 0.779772655152289] ['max_depth', 'n_estimators', 'criterion', 'max_features']
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 9.9245
Function value obtained: -0.8995
Current minimum: -0.9060
Iteration No: 4 started. Evaluating function at random point.
[4, 372, 'entropy', 0.8106684152123416] ['max_depth', 'n_estim

In [9]:
print(dict(zip(param_names, result.x)))

{'max_depth': 9, 'n_estimators': 178, 'criterion': 'entropy', 'max_features': 0.7614783548856662}


In [10]:
classifier = ensemble.RandomForestClassifier(criterion ='entropy', max_depth = 12, n_estimators = 448,
                                             max_features = 0.5306651750933808, n_jobs=-1)

In [11]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(classifier,X,y, cv=10)
print('scores\n',score)
print('\ncv values', score.shape)
print('\nScore_Mean', score.mean())

scores
 [0.92  0.915 0.935 0.92  0.91  0.92  0.885 0.905 0.89  0.89 ]

cv values (10,)

Score_Mean 0.9090000000000001
