In [1]:
# XGBoost on Otto dataset, Tune learning_rate
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot


In [2]:
# load data
data = read_csv('../../data/train-sample.csv')
dataset = data.values
# split data into X and y
X = dataset[:,0:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)


In [3]:
print(X)

[[1 1 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]
 ...
 [61876 0 0 ... 0 0 0]
 [61877 1 0 ... 3 10 0]
 [61878 0 0 ... 0 2 0]]


In [4]:
print (y)

['Class_1' 'Class_1' 'Class_1' ... 'Class_9' 'Class_9' 'Class_9']


In [5]:
print(label_encoded_y)

[0 0 0 ... 8 8 8]


In [6]:
# grid search
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, label_encoded_y)


In [7]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: -0.000832 using {'learning_rate': 0.3}
-2.155496 (0.000089) with: {'learning_rate': 0.0001}
-1.841059 (0.000739) with: {'learning_rate': 0.001}
-0.597215 (0.001007) with: {'learning_rate': 0.01}
-0.000926 (0.001211) with: {'learning_rate': 0.1}
-0.000835 (0.001185) with: {'learning_rate': 0.2}
-0.000832 (0.001178) with: {'learning_rate': 0.3}


In [9]:
# plot
pyplot.errorbar(learning_rate, means, yerr=stds)
pyplot.title("XGBoost learning_rate vs Log Loss")
pyplot.xlabel('learning_rate')
pyplot.ylabel('Log Loss')
pyplot.savefig('learning_rate.png')