In [55]:
## import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from bayes_opt import BayesianOptimization as BO
from IPython.display import display
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import xgboost as XGB
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor as GBR
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn import linear_model
import dask_searchcv as dcv
from dask.diagnostics import ProgressBar

%matplotlib inline
plt.style.use('ggplot')


In [6]:
df_train = pd.read_csv('train_new_features.csv', index_col='id')
df_train.head()

Unnamed: 0_level_0,sg,n_atoms,x_Al,x_Ga,x_In,a,b,c,alpha,beta,...,avg_size_calc,avg_EA,avg_EN,avg_HOMO,avg_IP,avg_LUMO,avg_mass,avg_rd_max,avg_rp_max,avg_rs_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,...,0.201799,-0.020559,0.172588,-0.277587,-0.593489,0.028551,4.405298,0.282055,0.147109,0.110364
2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,...,0.201425,-0.020521,0.172268,-0.277073,-0.592391,0.028498,4.397146,0.281534,0.146837,0.11016
3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,...,0.184712,-0.023364,0.168444,-0.276418,-0.591693,0.033063,3.578027,0.299762,0.150104,0.11285
4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,...,0.223062,-0.027645,0.16898,-0.278012,-0.584846,0.04599,5.004543,0.28811,0.150574,0.114528
5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,...,0.253691,-0.015645,0.152321,-0.233002,-0.483778,0.028951,7.336285,0.175926,0.114532,0.08701


In [8]:
## data
exclude_feature = ['alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al', 'sg', 'n_atoms', 'alpha', 'beta', 'gamma', 'x_Ga', 'vol']
data_train = df_train.drop(exclude_feature, axis=1)
features = list(data_train.drop(['Ef', 'Eg'], axis=1))

X = data_train[features].values
y_g = data_train['Eg'].values
y_f = data_train['Ef'].values

#df_test = pd.read_csv('test_new_features.csv', index_col='id')
#X_test = df_test.drop(exclude_feature, axis=1).values

In [9]:
## define scoring fuction rmsle
def rmsle(actual, predicted):
    return np.square(np.log(predicted + 1) - np.log(actual + 1)).mean() ** 0.5

my_score = make_scorer(rmsle, greater_is_better=False)

In [40]:
def opt_model(X, y, model, params, cv_search):
    '''
    model: any ml models;
    params: parameters corresponding to the model;
    cv_search: cross validation searching method;
    '''
    if cv_search == "GridSearchCV":
        method = dcv.GridSearchCV(model,
                            param_grid=params, 
                            scoring=my_score,
                            cv=5,
                            n_jobs=8,
                            return_train_score=False)
    
    elif cv_search == "RandomizedSearchCV":
        method = dcv.RandomizedSearchCV(model, 
                            param_distributions=params,
                            scoring=my_score,
                            cv=5,
                            n_iter=25,
                            n_jobs=8,
                            return_train_score=False)
         
    opt = method.fit(X, y)
    return opt

### Polynomial ridge regression

In [16]:
model = Pipeline([('poly', PolynomialFeatures()), 
                  ('ridge', linear_model.Ridge())])
param_grid = dict(poly__degree=[3], ridge__alpha=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5])
with ProgressBar():
    prr_g = opt_model(X, y_g, model, param_grid, "GridSearchCV")

[########################################] | 100% Completed |  3.7s


In [17]:
pd.DataFrame(prr_g.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_poly__degree,param_ridge__alpha,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,0.822763,0.320215,-0.088784,3,0.001,"{u'poly__degree': 3, u'ridge__alpha': 0.001}",1,-0.084362,-0.090244,-0.094591,-0.081379,-0.093342,0.048637,0.157831,0.005119
1,0.790816,0.246949,-0.089433,3,0.005,"{u'poly__degree': 3, u'ridge__alpha': 0.005}",2,-0.084747,-0.090125,-0.09457,-0.083153,-0.094569,0.080305,0.048203,0.004788
2,0.690965,0.313409,-0.089622,3,0.01,"{u'poly__degree': 3, u'ridge__alpha': 0.01}",3,-0.085145,-0.090125,-0.094181,-0.083943,-0.094716,0.098893,0.136186,0.004456
3,0.678107,0.236025,-0.089633,3,0.05,"{u'poly__degree': 3, u'ridge__alpha': 0.05}",4,-0.086019,-0.090233,-0.091947,-0.085475,-0.094488,0.068737,0.088787,0.003454
4,0.658248,0.185668,-0.089656,3,0.1,"{u'poly__degree': 3, u'ridge__alpha': 0.1}",5,-0.086304,-0.090308,-0.091035,-0.085925,-0.094707,0.145885,0.07468,0.003256
5,0.634407,0.218456,-0.090101,3,0.5,"{u'poly__degree': 3, u'ridge__alpha': 0.5}",6,-0.086923,-0.090625,-0.090573,-0.086705,-0.095678,0.1247,0.123114,0.003263


In [18]:
with ProgressBar():
    prr_f = opt_model(X, y_f, model, param_grid, "GridSearchCV")

[########################################] | 100% Completed |  3.7s


In [19]:
pd.DataFrame(prr_f.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_poly__degree,param_ridge__alpha,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
0,0.66577,0.156025,-0.034727,3,0.001,"{u'poly__degree': 3, u'ridge__alpha': 0.001}",1,-0.03514,-0.032987,-0.035461,-0.033966,-0.036082,0.043084,0.01686,0.001109
1,0.775056,0.167651,-0.034837,3,0.005,"{u'poly__degree': 3, u'ridge__alpha': 0.005}",2,-0.035154,-0.033328,-0.035599,-0.034248,-0.035855,0.06537,0.050774,0.000932
2,0.832565,0.223985,-0.03493,3,0.01,"{u'poly__degree': 3, u'ridge__alpha': 0.01}",3,-0.035202,-0.033487,-0.035663,-0.034373,-0.035925,0.070863,0.132896,0.000894
3,0.810626,0.141307,-0.035044,3,0.05,"{u'poly__degree': 3, u'ridge__alpha': 0.05}",4,-0.035284,-0.033745,-0.035803,-0.03462,-0.035766,0.086559,0.01971,0.000778
4,0.707572,0.223533,-0.035093,3,0.1,"{u'poly__degree': 3, u'ridge__alpha': 0.1}",5,-0.035298,-0.033807,-0.03591,-0.03473,-0.035722,0.096111,0.114656,0.00076
5,0.797008,0.275838,-0.035359,3,0.5,"{u'poly__degree': 3, u'ridge__alpha': 0.5}",6,-0.035411,-0.034036,-0.036313,-0.035047,-0.035987,0.130906,0.156453,0.000794


In [22]:
best_g = prr_g.best_estimator_
best_f = prr_f.best_estimator_

print("Best models:", best_g, best_f)

('Best models:', Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]), Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]))


In [23]:
print("Bandgap RMSLE: {:.6f}, formation RMSLE: {:.6f}".format(abs(prr_g.best_score_), abs(prr_f.best_score_)))
print("Average RMSLE: {:.6f}".format(abs(prr_g.best_score_ + prr_f.best_score_)/2))

Bandgap RMSLE: 0.088784, formation RMSLE: 0.034727
Average RMSLE: 0.061755


### SVR

In [24]:
X_scale = scale(X)
model = SVR()
param_grid = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1, 10, 100]}
with ProgressBar():
    svr_g = opt_model(X_scale, y_g, model, param_grid, "GridSearchCV")

[########################################] | 100% Completed | 42.6s


In [46]:
pd.DataFrame(svr_g.cv_results_).sort_values("rank_test_score").head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
7,0.433804,0.024051,-0.089802,10.0,rbf,"{u'kernel': u'rbf', u'C': 10}",1,-0.090638,-0.092085,-0.089724,-0.083618,-0.092943,0.02863,0.000427,0.003287
9,2.361154,0.026896,-0.091046,100.0,rbf,"{u'kernel': u'rbf', u'C': 100}",2,-0.092827,-0.092655,-0.093278,-0.08397,-0.092499,0.438512,0.005306,0.003547
5,0.208265,0.019746,-0.09385,1.0,rbf,"{u'kernel': u'rbf', u'C': 1}",3,-0.096708,-0.0962,-0.092688,-0.086796,-0.096857,0.028041,0.005635,0.003843
3,0.18008,0.023013,-0.101962,0.1,rbf,"{u'kernel': u'rbf', u'C': 0.1}",4,-0.105421,-0.107645,-0.102316,-0.09183,-0.102598,0.026318,0.006526,0.00543
6,5.900496,0.01419,-0.11213,10.0,linear,"{u'kernel': u'linear', u'C': 10}",5,-0.116095,-0.119373,-0.109472,-0.10726,-0.108448,1.114784,0.003787,0.004744


In [26]:
with ProgressBar():
    svr_f = opt_model(X_scale, y_f, model, param_grid, "GridSearchCV")

[########################################] | 100% Completed | 13.5s


In [45]:
pd.DataFrame(svr_f.cv_results_).sort_values("rank_test_score").head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
3,0.029313,0.003311,-0.049103,0.1,rbf,"{u'kernel': u'rbf', u'C': 0.1}",1,-0.048188,-0.048672,-0.050336,-0.047389,-0.050928,0.004176,0.000801,0.001328
7,0.115738,0.003113,-0.04948,10.0,rbf,"{u'kernel': u'rbf', u'C': 10}",2,-0.050915,-0.048004,-0.04651,-0.049134,-0.052837,0.012089,0.000501,0.002211
5,0.071083,0.003047,-0.050483,1.0,rbf,"{u'kernel': u'rbf', u'C': 1}",3,-0.050142,-0.050266,-0.050042,-0.04894,-0.053022,0.007574,0.000695,0.001355
9,0.292022,0.004479,-0.050853,100.0,rbf,"{u'kernel': u'rbf', u'C': 100}",4,-0.050065,-0.049997,-0.051965,-0.047646,-0.054593,0.037187,0.000434,0.002318
2,0.117186,0.001832,-0.053342,0.1,linear,"{u'kernel': u'linear', u'C': 0.1}",5,-0.052397,-0.052997,-0.055989,-0.051235,-0.054091,0.00546,0.000432,0.001614


In [28]:
best_g = prr_g.best_estimator_
best_f = prr_f.best_estimator_

print("Best models:", best_g, best_f)

('Best models:', Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]), Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('ridge', Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]))


In [29]:
print("Bandgap RMSLE: {:.6f}, formation RMSLE: {:.6f}".format(abs(svr_g.best_score_), abs(svr_f.best_score_)))
print("Average RMSLE: {:.6f}".format(abs((svr_g.best_score_ + svr_f.best_score_)/2)))

Bandgap RMSLE: 0.089802, formation RMSLE: 0.049103
Average RMSLE: 0.069452


In [41]:
## Bayes search
model = SVR()
param_grid = {'kernel':('linear', 'rbf'), 
              'C':[0.01, 0.1, 1, 10, 100],
              'degree': [1, 2, 3, 4]}
with ProgressBar():
    svr_bs_g = opt_model(X_scale, y_g, model, param_grid, "RandomizedSearchCV")


[########################################] | 100% Completed |  1min 26.4s


In [42]:
pd.DataFrame(svr_bs_g.cv_results_).sort_values("rank_test_score").head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_degree,param_kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
16,0.45199,0.024008,-0.089802,10,1,rbf,"{u'kernel': u'rbf', u'C': 10, u'degree': 1}",1,-0.090638,-0.092085,-0.089724,-0.083618,-0.092943,0.020936,0.000515,0.003287
9,0.485648,0.026406,-0.089802,10,4,rbf,"{u'kernel': u'rbf', u'C': 10, u'degree': 4}",1,-0.090638,-0.092085,-0.089724,-0.083618,-0.092943,0.065478,0.004516,0.003287
20,2.063877,0.024256,-0.091046,100,4,rbf,"{u'kernel': u'rbf', u'C': 100, u'degree': 4}",3,-0.092827,-0.092655,-0.093278,-0.08397,-0.092499,0.098253,0.000208,0.003547
17,2.043319,0.024118,-0.091046,100,2,rbf,"{u'kernel': u'rbf', u'C': 100, u'degree': 2}",3,-0.092827,-0.092655,-0.093278,-0.08397,-0.092499,0.096167,0.000501,0.003547
5,2.526914,0.029677,-0.091046,100,1,rbf,"{u'kernel': u'rbf', u'C': 100, u'degree': 1}",3,-0.092827,-0.092655,-0.093278,-0.08397,-0.092499,0.441058,0.006463,0.003547


In [43]:
with ProgressBar():
    svr_bs_f = opt_model(X_scale, y_f, model, param_grid, "RandomizedSearchCV")
pd.DataFrame(svr_bs_f.cv_results_).sort_values("rank_test_score").head()

[########################################] | 100% Completed | 27.0s


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_C,param_degree,param_kernel,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
20,0.042347,0.005344,-0.049103,0.1,3,rbf,"{u'kernel': u'rbf', u'C': 0.1, u'degree': 3}",1,-0.048188,-0.048672,-0.050336,-0.047389,-0.050928,0.002468,0.000157,0.001328
5,0.026491,0.002774,-0.049103,0.1,2,rbf,"{u'kernel': u'rbf', u'C': 0.1, u'degree': 2}",1,-0.048188,-0.048672,-0.050336,-0.047389,-0.050928,0.002048,0.000381,0.001328
17,0.033792,0.003837,-0.049103,0.1,4,rbf,"{u'kernel': u'rbf', u'C': 0.1, u'degree': 4}",1,-0.048188,-0.048672,-0.050336,-0.047389,-0.050928,0.002364,0.000237,0.001328
0,0.118054,0.003158,-0.04948,10.0,2,rbf,"{u'kernel': u'rbf', u'C': 10, u'degree': 2}",4,-0.050915,-0.048004,-0.04651,-0.049134,-0.052837,0.018816,0.000977,0.002211
7,0.119762,0.003284,-0.04948,10.0,4,rbf,"{u'kernel': u'rbf', u'C': 10, u'degree': 4}",4,-0.050915,-0.048004,-0.04651,-0.049134,-0.052837,0.021089,0.0008,0.002211


In [44]:
print("Bandgap RMSLE: {:.6f}, formation RMSLE: {:.6f}".format(abs(svr_bs_g.best_score_), abs(svr_bs_f.best_score_)))
print("Average RMSLE: {:.6f}".format(abs((svr_bs_g.best_score_ + svr_bs_f.best_score_)/2)))

Bandgap RMSLE: 0.089802, formation RMSLE: 0.049103
Average RMSLE: 0.069452


### Random forest

In [47]:
## Optimization

model = RFR(random_state=99)
param_grid = {"n_estimators": [200, 400, 600, 800, 1000],
     'max_depth': [2, 4, 6, 8],
     'min_samples_split': [5, 10, 15],
     'max_features': ['auto', 'sqrt']}

with ProgressBar():
    rfr_g = opt_model(X, y_g, model, param_grid, "RandomizedSearchCV")
print(rfr_g.best_estimator_, "\nBest score: {}".format(rfr_g.best_score_))

[########################################] | 100% Completed |  8min 21.1s
(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=1,
           oob_score=False, random_state=99, verbose=0, warm_start=False), '\nBest score: -0.0937994593445')


In [50]:
pd.DataFrame(rfr_g.cv_results_).sort_values("rank_test_score").head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
22,47.307799,1.26356,-0.093799,8,auto,10,800,"{u'max_features': u'auto', u'min_samples_split...",1,-0.098056,-0.095859,-0.092874,-0.088235,-0.093974,2.235457,1.015207,0.003293
24,26.629552,0.226457,-0.095288,8,sqrt,10,1000,"{u'max_features': u'sqrt', u'min_samples_split...",2,-0.100159,-0.098206,-0.093393,-0.087427,-0.097255,6.490922,0.050709,0.004506
20,28.572345,1.682999,-0.095307,8,sqrt,10,600,"{u'max_features': u'sqrt', u'min_samples_split...",3,-0.100077,-0.098432,-0.093419,-0.087366,-0.097243,1.217688,0.141891,0.004536
5,41.207723,1.746223,-0.095371,8,sqrt,5,1000,"{u'max_features': u'sqrt', u'min_samples_split...",4,-0.100061,-0.098085,-0.093294,-0.08779,-0.097623,1.753893,1.281988,0.004387
6,24.421139,1.726627,-0.095384,8,sqrt,5,600,"{u'max_features': u'sqrt', u'min_samples_split...",5,-0.100074,-0.09822,-0.093204,-0.087793,-0.09763,6.819907,0.455859,0.004415


In [48]:
with ProgressBar():
    rfr_f = opt_model(X, y_f, model, param_grid, "RandomizedSearchCV")
print rfr_f.best_estimator_,"\nBest score:",rfr_f.best_score_

[########################################] | 100% Completed |  9min 11.7s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=1,
           oob_score=False, random_state=99, verbose=0, warm_start=False) 
Best score: -0.035499976297408975


In [51]:
pd.DataFrame(rfr_f.cv_results_).sort_values("rank_test_score").head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
17,47.520724,1.517249,-0.0355,8,auto,5,800,"{u'max_features': u'auto', u'min_samples_split...",1,-0.037772,-0.033947,-0.036157,-0.032422,-0.037202,6.165359,0.89966,0.002019
0,15.028004,0.597484,-0.03555,8,auto,10,200,"{u'max_features': u'auto', u'min_samples_split...",2,-0.037804,-0.033884,-0.0362,-0.032612,-0.03725,0.175258,0.093602,0.00199
22,49.087515,1.611576,-0.035683,8,auto,15,800,"{u'max_features': u'auto', u'min_samples_split...",3,-0.037798,-0.034037,-0.036445,-0.032787,-0.037349,3.346601,0.730071,0.001946
18,9.917923,0.394181,-0.035707,8,auto,15,200,"{u'max_features': u'auto', u'min_samples_split...",4,-0.037836,-0.034086,-0.036351,-0.032946,-0.037318,1.123365,0.18859,0.001886
20,21.864355,1.315621,-0.035815,8,sqrt,5,400,"{u'max_features': u'sqrt', u'min_samples_split...",5,-0.037644,-0.033798,-0.036073,-0.033638,-0.037921,6.071424,0.102284,0.001825


In [49]:
print("Bandgap RMSLE: {:.6f}, formation RMSLE: {:.6f}".format(abs(rfr_g.best_score_), abs(rfr_f.best_score_)))
print("Average RMSLE: {:.6f}".format(abs((rfr_g.best_score_ + rfr_f.best_score_)/2)))

Bandgap RMSLE: 0.093799, formation RMSLE: 0.035500
Average RMSLE: 0.064650


###   Gradient Boosting Regressor (GBR)

In [58]:
model = XGBRegressor()
param_grid = {
     'learning_rate': [0.005, 0.001],
     'n_estimators': [1000, 1500],
     'max_depth': [2, 4, 6],
     'colsample_bytree': [0.4, 0.6, 0.8],
     'subsample': [0.4, 0.6, 0.8]
 }


with ProgressBar():
    gbr_g = opt_model(X, y_g, model, param_grid, "RandomizedSearchCV")

display(pd.DataFrame(gbr_g.cv_results_).sort_values("rank_test_score").head())
print gbr_g.best_estimator_,"\nBest score:",gbr_g.best_score_

[########################################] | 100% Completed | 53.1s


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
1,3.621698,0.046336,-0.092844,0.8,0.005,4,1500,0.6,"{u'n_estimators': 1500, u'subsample': 0.6, u'l...",1,-0.096519,-0.09629,-0.090771,-0.087111,-0.09353,0.438819,0.001251,0.00355
24,2.896545,0.033803,-0.092915,0.6,0.005,4,1500,0.6,"{u'n_estimators': 1500, u'subsample': 0.6, u'l...",2,-0.096535,-0.095957,-0.090922,-0.087475,-0.093684,0.222623,0.001267,0.003362
13,5.31348,0.089106,-0.093298,0.8,0.005,6,1500,0.4,"{u'n_estimators': 1500, u'subsample': 0.4, u'l...",3,-0.09856,-0.096783,-0.09241,-0.084998,-0.093739,0.108226,0.003102,0.004684
22,2.354024,0.026909,-0.093976,0.6,0.005,4,1000,0.6,"{u'n_estimators': 1000, u'subsample': 0.6, u'l...",4,-0.09798,-0.097372,-0.091524,-0.088586,-0.09442,0.08209,0.000305,0.003545
15,6.602622,0.09578,-0.094304,0.8,0.005,6,1500,0.8,"{u'n_estimators': 1500, u'subsample': 0.8, u'l...",5,-0.09811,-0.098011,-0.093389,-0.087004,-0.095008,0.202825,0.002976,0.004071


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.005,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=1500, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6) 
Best score: -0.09284429468242683


In [59]:
with ProgressBar():
    gbr_f = opt_model(X, y_f, model, param_grid, "RandomizedSearchCV")

display(pd.DataFrame(gbr_f.cv_results_).sort_values("rank_test_score").head())
print gbr_f.best_estimator_, "\nBest score:", format(gbr_f.best_score_)

[########################################] | 100% Completed | 45.9s


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,std_fit_time,std_score_time,std_test_score
14,2.958271,0.045459,-0.034919,0.6,0.005,6,1000,0.4,"{u'n_estimators': 1000, u'subsample': 0.4, u'l...",1,-0.036922,-0.032863,-0.034805,-0.033486,-0.036517,0.053102,0.002501,0.001604
23,2.50173,0.046805,-0.034927,0.4,0.005,6,1000,0.6,"{u'n_estimators': 1000, u'subsample': 0.6, u'l...",2,-0.036719,-0.032761,-0.034995,-0.033409,-0.036751,0.1023,0.006994,0.001646
24,3.863248,0.068419,-0.034958,0.6,0.005,6,1500,0.4,"{u'n_estimators': 1500, u'subsample': 0.4, u'l...",3,-0.036876,-0.032823,-0.034988,-0.033643,-0.03646,0.257099,0.001315,0.001564
19,5.329215,0.096868,-0.035055,0.6,0.005,6,1500,0.8,"{u'n_estimators': 1500, u'subsample': 0.8, u'l...",4,-0.037216,-0.032746,-0.035571,-0.033548,-0.036193,0.028052,0.003491,0.001663
18,2.347364,0.026687,-0.035167,0.6,0.005,4,1000,0.6,"{u'n_estimators': 1000, u'subsample': 0.6, u'l...",5,-0.036962,-0.033445,-0.03501,-0.033954,-0.036463,0.062542,0.001117,0.001368


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.005,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.4) 
Best score: -0.0349185775765


In [61]:
print("Bandgap RMSLE: {:.6f}, formation RMSLE: {:.6f}".format(abs(gbr_g.best_score_), abs(gbr_f.best_score_)))
print("Average RMSLE: {:.6f}".format(abs((gbr_g.best_score_ + gbr_f.best_score_)/2)))

Bandgap RMSLE: 0.092844, formation RMSLE: 0.034919
Average RMSLE: 0.063881
