In [1]:
from time import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import preprocess as datapre

In [2]:
# Load the full training table from CSV (later cells read the feature columns and "Category" from it).
origin_data = pd.read_csv("../datasets/train_preprocess.csv")

In [3]:
# Down-sample the training table with the project-local helper so the grid searches stay tractable.
# Presumably frac=0.05 keeps about 5% of the rows — confirm in preprocess.dataset_sample.
# NOTE(review): no seed is passed here; check whether the helper samples deterministically.
train_data = datapre.dataset_sample(origin_data, frac=0.05)

In [4]:
# Sanity check: dimensions of the sampled data.
train_data.shape

(44206, 22)

In [5]:
# Columns used as model inputs; the same list selects features from the
# sampled, full and test tables.
feature_names = [
    'Year', 'Month', 'Hour', 'DayOfWeekID', 'PdDistrictID',
    'HasBlock', 'RoadTypeID', 'RoadBlockID', 'RoadName1ID', 'RoadName2ID',
    'X', 'Y',
]

In [6]:
# Feature matrix for the hyper-parameter searches: the selected columns of the sampled data.
X = train_data[feature_names]

In [7]:
# Target for the hyper-parameter searches: the "Category" column of the sampled data.
y_true = train_data["Category"]

In [8]:
def neg_log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    """Return the negated log loss so that a larger value means a better model.

    Parameters mirror ``sklearn.metrics.log_loss`` and are forwarded to it
    unchanged:

    y_true: ground-truth labels.
    y_pred: predicted class probabilities.
    eps, normalize, sample_weight, labels: passed straight through.

    Returns ``-log_loss(...)``.
    """
    # Forward the tuning arguments by name: scikit-learn made everything after
    # y_pred keyword-only, so the positional form raises TypeError there.
    return -log_loss(
        y_true,
        y_pred,
        eps=eps,
        normalize=normalize,
        sample_weight=sample_weight,
        labels=labels,
    )


# Scorer used by every GridSearchCV below; needs_proba=True makes it score
# predict_proba output rather than hard class predictions.
call_neg_log_loss = make_scorer(neg_log_loss, needs_proba=True)

In [9]:
# Baseline XGBoost settings; the grid-search stages below overwrite one entry
# at a time with the cross-validated winner.
base_params = dict(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    objective="multi:softprob",
    # Was misspelled "n_job", which XGBClassifier does not recognise, so the
    # thread setting never took effect.
    # NOTE(review): GridSearchCV below also runs with n_jobs=-1; confirm the
    # nested parallelism does not oversubscribe the machine.
    n_jobs=-1,
    gamma=0.3,
    min_child_weight=5,
    subsample=0.5,
    colsample_bytree=0.5,
    random_state=10,
)

In [10]:
# Stage 1: coarse search over the number of boosting rounds with every other
# base_params entry held fixed. range(400, 1001, 400) yields 400 and 800.
param_grid = {"n_estimators": list(range(400, 1001, 400))}
# Presumably a quick smoke-test grid kept for debugging:
# param_grid = {"n_estimators": list(range(10, 20, 10))}

xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_n_estimators = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_n_estimators.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 16.2min remaining: 24.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 29.9min remaining: 12.8min


Training elapse 2503 sencond.


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 41.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 41.7min finished


In [11]:
# Best n_estimators from stage 1.
# NOTE(review): in the recorded run the winner (400) is the smallest candidate tried; a lower value may score better.
cv_clf_n_estimators.best_params_

{'n_estimators': 400}

In [12]:
# Raw per-fold timings and train/test scores for stage 1 (scores are negated log loss).
cv_clf_n_estimators.cv_results_

{'mean_fit_time': array([ 706.2623425 , 1240.52460256]),
 'std_fit_time': array([  6.15049484, 106.93194536]),
 'mean_score_time': array([50.70109897, 90.33956256]),
 'std_score_time': array([0.49489602, 2.71716984]),
 'param_n_estimators': masked_array(data=[400, 800],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 400}, {'n_estimators': 800}],
 'split0_test_score': array([-2.52965947, -2.64010256]),
 'split1_test_score': array([-2.54994375, -2.67232373]),
 'split2_test_score': array([-2.53260182, -2.64737798]),
 'split3_test_score': array([-2.53605858, -2.64986897]),
 'split4_test_score': array([-2.53708622, -2.65612748]),
 'mean_test_score': array([-2.53706921, -2.65315748]),
 'std_test_score': array([0.0069547 , 0.01087493]),
 'rank_test_score': array([1, 2]),
 'split0_train_score': array([-1.61464122, -1.21133016]),
 'split1_train_score': array([-1.60947029, -1.20253583]),
 'split2_train_score': array([-1.6152229

In [13]:
# Stage 2: tune tree depth over 5..10, carrying over the n_estimators chosen
# in stage 1.
param_grid = {"max_depth": list(range(5,11))}

base_params["n_estimators"] = cv_clf_n_estimators.best_params_["n_estimators"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_max_depth = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_max_depth.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 53.7min remaining: 16.3min


Training elapse 4311 sencond.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 71.9min finished


In [14]:
# Best max_depth from stage 2.
# NOTE(review): in the recorded run the winner (5) is the lower edge of the grid; shallower trees were not tried.
cv_clf_max_depth.best_params_

{'max_depth': 5}

In [15]:
# Raw per-fold timings and train/test scores for stage 2 (scores are negated log loss).
cv_clf_max_depth.cv_results_

{'mean_fit_time': array([626.19248233, 702.92684155, 769.43168325, 828.9511951 ,
        875.59429774, 842.25797238]),
 'std_fit_time': array([ 3.45494354,  2.82458215,  5.74190074,  2.62147023, 28.06416944,
         9.61565067]),
 'mean_score_time': array([47.0621099 , 52.13231273, 55.64878421, 55.9638371 , 60.83959661,
        58.87585282]),
 'std_score_time': array([0.09338096, 4.88887037, 1.73601142, 3.34069386, 4.68284527,
        1.80940752]),
 'param_max_depth': masked_array(data=[5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 5},
  {'max_depth': 6},
  {'max_depth': 7},
  {'max_depth': 8},
  {'max_depth': 9},
  {'max_depth': 10}],
 'split0_test_score': array([-2.50880498, -2.52965947, -2.56193835, -2.59330729, -2.6279933 ,
        -2.65627778]),
 'split1_test_score': array([-2.52031923, -2.54994375, -2.58666441, -2.62619213, -2.65724265,
        -2.69486221]),
 'spli

In [16]:
# Stage 3: tune the row subsampling ratio over 0.5..1.0 in steps of 0.1,
# carrying over the max_depth chosen in stage 2.
param_grid = {"subsample": [i/10.0 for i in range(5,11)]}

base_params["max_depth"] = cv_clf_max_depth.best_params_["max_depth"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_subsample = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_subsample.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 44.4min remaining: 13.5min


Training elapse 3391 sencond.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 56.5min finished


In [17]:
# Best subsample ratio from stage 3.
cv_clf_subsample.best_params_

{'subsample': 1.0}

In [18]:
# Raw per-fold timings and train/test scores for stage 3 (scores are negated log loss).
cv_clf_subsample.cv_results_

{'mean_fit_time': array([627.3380919 , 640.1003655 , 636.48946977, 618.588555  ,
        587.57899246, 464.12498069]),
 'std_fit_time': array([ 1.56979833,  1.20545202,  3.20307576,  4.84241038, 34.82023552,
         1.24286883]),
 'mean_score_time': array([49.89434581, 51.53903599, 51.74610453, 51.9848238 , 51.16554165,
        44.8976038 ]),
 'std_score_time': array([0.30745812, 0.30386716, 1.07445005, 0.55010802, 1.91373078,
        0.18291945]),
 'param_subsample': masked_array(data=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'subsample': 0.5},
  {'subsample': 0.6},
  {'subsample': 0.7},
  {'subsample': 0.8},
  {'subsample': 0.9},
  {'subsample': 1.0}],
 'split0_test_score': array([-2.50880498, -2.5084495 , -2.50513324, -2.50563333, -2.50351309,
        -2.5011439 ]),
 'split1_test_score': array([-2.52031923, -2.52289024, -2.52167484, -2.52212255, -2.51814985,
       

In [19]:
# Stage 4: tune the per-tree column sampling ratio over 0.5..1.0 in steps of
# 0.1, carrying over the subsample ratio chosen in stage 3.
param_grid = {"colsample_bytree": [i/10.0 for i in range(5,11)]}

base_params["subsample"] = cv_clf_subsample.best_params_["subsample"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_colsample_bytree = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_colsample_bytree.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 44.9min remaining: 13.7min


Training elapse 3567 sencond.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 59.5min finished


In [20]:
# Best colsample_bytree from stage 4.
# NOTE(review): in the recorded run the winner (0.5) is the lower edge of the grid; smaller ratios were not tried.
cv_clf_colsample_bytree.best_params_

{'colsample_bytree': 0.5}

In [21]:
# Raw per-fold timings and train/test scores for stage 4 (scores are negated log loss).
cv_clf_colsample_bytree.cv_results_

{'mean_fit_time': array([550.31487417, 592.83387299, 637.61420302, 676.87519708,
        704.27644124, 685.7923378 ]),
 'std_fit_time': array([ 1.66238957,  2.82515991,  1.85659911,  3.49916238, 34.46894857,
         7.15058799]),
 'mean_score_time': array([47.59789696, 48.41070781, 48.6649641 , 46.02194614, 47.89022465,
        43.20287366]),
 'std_score_time': array([0.28487682, 3.85917526, 1.27288079, 2.93536516, 4.65367396,
        1.56842069]),
 'param_colsample_bytree': masked_array(data=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'colsample_bytree': 0.5},
  {'colsample_bytree': 0.6},
  {'colsample_bytree': 0.7},
  {'colsample_bytree': 0.8},
  {'colsample_bytree': 0.9},
  {'colsample_bytree': 1.0}],
 'split0_test_score': array([-2.5011439 , -2.50372545, -2.50824547, -2.51176005, -2.51685107,
        -2.52096143]),
 'split1_test_score': array([-2.51336366, -2.5153409

In [22]:
# Stage 5: tune gamma over 0.1..0.5 in steps of 0.1, carrying over the
# colsample_bytree chosen in stage 4.
param_grid = {"gamma": [i/10.0 for i in range(1,6)]}

base_params["colsample_bytree"] = cv_clf_colsample_bytree.best_params_["colsample_bytree"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_gamma = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_gamma.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done  17 out of  25 | elapsed: 40.6min remaining: 19.1min


Training elapse 2943 sencond.


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 49.1min finished


In [23]:
# Best gamma from stage 5.
# NOTE(review): in the recorded run the winner (0.5) is the upper edge of the grid; larger values were not tried.
cv_clf_gamma.best_params_

{'gamma': 0.5}

In [24]:
# Raw per-fold timings and train/test scores for stage 5 (scores are negated log loss).
cv_clf_gamma.cv_results_

{'mean_fit_time': array([551.24358735, 554.29367762, 563.41414204, 554.31911249,
        510.53647475]),
 'std_fit_time': array([ 2.77376359,  6.0755266 ,  5.4145506 ,  5.91920237, 85.68212822]),
 'mean_score_time': array([51.92375593, 51.48269906, 52.5986012 , 50.23768287, 46.97435017]),
 'std_score_time': array([0.47365801, 0.428969  , 0.92783886, 1.34428278, 7.30802904]),
 'param_gamma': masked_array(data=[0.1, 0.2, 0.3, 0.4, 0.5],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'gamma': 0.1},
  {'gamma': 0.2},
  {'gamma': 0.3},
  {'gamma': 0.4},
  {'gamma': 0.5}],
 'split0_test_score': array([-2.50153481, -2.49949787, -2.5011439 , -2.49878625, -2.49698049]),
 'split1_test_score': array([-2.51486742, -2.51499955, -2.51336366, -2.51024172, -2.50924856]),
 'split2_test_score': array([-2.49890718, -2.49638023, -2.4973559 , -2.4928639 , -2.49138574]),
 'split3_test_score': array([-2.50295881, -2.50292551, -2.503922

In [25]:
# Stage 6: tune min_child_weight over 4..9, carrying over the gamma chosen in
# stage 5.
param_grid = {"min_child_weight": list(range(4,10))}

base_params["gamma"] = cv_clf_gamma.best_params_["gamma"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_min_child_weight = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_min_child_weight.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 40.0min remaining: 12.2min


Training elapse 3072 sencond.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 51.2min finished


In [26]:
# Best min_child_weight from stage 6.
# NOTE(review): in the recorded run the winner (9) is the upper edge of the grid; larger values were not tried.
cv_clf_min_child_weight.best_params_

{'min_child_weight': 9}

In [27]:
# Raw per-fold timings and train/test scores for stage 6 (scores are negated log loss).
cv_clf_min_child_weight.cv_results_

{'mean_fit_time': array([557.10483918, 549.70378079, 546.94308958, 541.70458159,
        521.92100205, 456.36973176]),
 'std_fit_time': array([ 3.436357  ,  1.39122553,  2.40576659,  1.59643441, 31.73576629,
         2.40518425]),
 'mean_score_time': array([51.47318053, 49.99020824, 50.26187272, 49.3864233 , 47.68853731,
        43.23817577]),
 'std_score_time': array([0.36121709, 0.30604972, 0.29297429, 0.31949911, 1.95143244,
        0.29821205]),
 'param_min_child_weight': masked_array(data=[4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_child_weight': 4},
  {'min_child_weight': 5},
  {'min_child_weight': 6},
  {'min_child_weight': 7},
  {'min_child_weight': 8},
  {'min_child_weight': 9}],
 'split0_test_score': array([-2.49869243, -2.49698049, -2.49290738, -2.49160397, -2.48774096,
        -2.48711578]),
 'split1_test_score': array([-2.51212317, -2.50924856, -2.50576084, -2.50417

In [28]:
# Final stage: lower the learning rate to 0.01 and re-search the number of
# boosting rounds (400..3200 in steps of 400), carrying over the
# min_child_weight chosen in stage 6.
param_grid = {"n_estimators": list(range(400, 3201, 400))}

base_params["min_child_weight"] = cv_clf_min_child_weight.best_params_["min_child_weight"]
base_params["learning_rate"] = 0.01
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_final = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only best_params_ / cv_results_ are read afterwards
)

# `time` comes from the notebook's import cell.
start = time()
cv_clf_final.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done  36 out of  40 | elapsed: 291.6min remaining: 32.4min


Training elapse 20018 sencond.


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 333.6min finished


In [29]:
# Best n_estimators at learning_rate=0.01.
cv_clf_final.best_params_

{'n_estimators': 1200}

In [30]:
# Raw per-fold timings and train/test scores for the final stage (scores are negated log loss).
cv_clf_final.cv_results_

{'mean_fit_time': array([ 552.45958552, 1140.96406412, 1705.57841015, 2252.76837096,
        2805.65646358, 3346.5563849 , 3885.43975735, 4104.11954036]),
 'std_fit_time': array([  2.12940765,   5.99545494,   7.3856671 ,  13.87726463,
         24.57331849,  31.97847775,  44.3044339 , 231.03627305]),
 'mean_score_time': array([ 47.5550982 ,  94.73566446, 143.26178946, 196.71456027,
        246.45410538, 298.92934752, 343.14996161, 374.46643195]),
 'std_score_time': array([ 0.16273386,  2.47533958,  2.25280454,  1.96533256,  3.97578127,
         5.66808751,  5.49605444, 12.01964086]),
 'param_n_estimators': masked_array(data=[400, 800, 1200, 1600, 2000, 2400, 2800, 3200],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 400},
  {'n_estimators': 800},
  {'n_estimators': 1200},
  {'n_estimators': 1600},
  {'n_estimators': 2000},
  {'n_estimators': 2400},
  {'n_estimators': 2800},
  

#### 使用整个训练集进行训练

In [31]:
# Features and target taken from the full (un-sampled) training table for the final fit.
total_X = origin_data[feature_names]
total_y_true = origin_data["Category"]

In [32]:
# Adopt the n_estimators chosen in the final search, then show the complete tuned configuration.
base_params["n_estimators"] = cv_clf_final.best_params_["n_estimators"]
print(base_params)

{'max_depth': 5, 'learning_rate': 0.01, 'n_estimators': 1200, 'objective': 'multi:softprob', 'n_job': -1, 'gamma': 0.5, 'min_child_weight': 9, 'subsample': 1.0, 'colsample_bytree': 0.5, 'random_state': 10}


In [34]:
# Re-display the tuned configuration before the manual overrides below.
# NOTE(review): this repeats the print in the previous cell.
print(base_params)

{'max_depth': 5, 'learning_rate': 0.01, 'n_estimators': 1200, 'objective': 'multi:softprob', 'n_job': -1, 'gamma': 0.5, 'min_child_weight': 9, 'subsample': 1.0, 'colsample_bytree': 0.5, 'random_state': 10}


In [39]:
# Manual overrides applied before the final full-data fit.
# NOTE(review): max_depth=6 and subsample=0.6 replace the cross-validated
# winners recorded above (5 and 1.0); the reason is not recorded — confirm
# this is intentional.
base_params["max_depth"] = 6
base_params["subsample"] = 0.6
# Drop the misspelled "n_job" key if it is still present, so only the real
# n_jobs setting is handed to XGBClassifier.
base_params.pop("n_job", None)
base_params["n_jobs"] = 8

In [40]:
# Train the final classifier on the full training table and report wall time.
# NOTE: eval_set is the training data itself, so the "validation_0-mlogloss"
# values logged per round are training loss, not a held-out estimate.
xgbclf_best = xgb.XGBClassifier(**base_params)
fit_options = dict(
    eval_set=[(total_X, total_y_true)],
    eval_metric="mlogloss",
    verbose=True,
)
start = time()
xgbclf_best.fit(total_X, total_y_true, **fit_options)
print("Training elapse %d sencond." % (time()-start))

[0]	validation_0-mlogloss:3.64363
[1]	validation_0-mlogloss:3.62512
[2]	validation_0-mlogloss:3.60692
[3]	validation_0-mlogloss:3.58909
[4]	validation_0-mlogloss:3.57189
[5]	validation_0-mlogloss:3.55548
[6]	validation_0-mlogloss:3.53974
[7]	validation_0-mlogloss:3.52385
[8]	validation_0-mlogloss:3.50859
[9]	validation_0-mlogloss:3.49386
[10]	validation_0-mlogloss:3.47974
[11]	validation_0-mlogloss:3.46577
[12]	validation_0-mlogloss:3.45253
[13]	validation_0-mlogloss:3.43883
[14]	validation_0-mlogloss:3.42585
[15]	validation_0-mlogloss:3.4136
[16]	validation_0-mlogloss:3.40107
[17]	validation_0-mlogloss:3.38941
[18]	validation_0-mlogloss:3.37776
[19]	validation_0-mlogloss:3.36635
[20]	validation_0-mlogloss:3.35504
[21]	validation_0-mlogloss:3.34406
[22]	validation_0-mlogloss:3.33319
[23]	validation_0-mlogloss:3.32284
[24]	validation_0-mlogloss:3.31248
[25]	validation_0-mlogloss:3.30226
[26]	validation_0-mlogloss:3.29248
[27]	validation_0-mlogloss:3.28254
[28]	validation_0-mlogloss:3.27

[232]	validation_0-mlogloss:2.56348
[233]	validation_0-mlogloss:2.56225
[234]	validation_0-mlogloss:2.56104
[235]	validation_0-mlogloss:2.55981
[236]	validation_0-mlogloss:2.55858
[237]	validation_0-mlogloss:2.55739
[238]	validation_0-mlogloss:2.5562
[239]	validation_0-mlogloss:2.55497
[240]	validation_0-mlogloss:2.55376
[241]	validation_0-mlogloss:2.55259
[242]	validation_0-mlogloss:2.55143
[243]	validation_0-mlogloss:2.55023
[244]	validation_0-mlogloss:2.54908
[245]	validation_0-mlogloss:2.54795
[246]	validation_0-mlogloss:2.54684
[247]	validation_0-mlogloss:2.54571
[248]	validation_0-mlogloss:2.54456
[249]	validation_0-mlogloss:2.54346
[250]	validation_0-mlogloss:2.54235
[251]	validation_0-mlogloss:2.54127
[252]	validation_0-mlogloss:2.54017
[253]	validation_0-mlogloss:2.53907
[254]	validation_0-mlogloss:2.538
[255]	validation_0-mlogloss:2.53692
[256]	validation_0-mlogloss:2.53586
[257]	validation_0-mlogloss:2.53482
[258]	validation_0-mlogloss:2.53379
[259]	validation_0-mlogloss:2.5

[461]	validation_0-mlogloss:2.41161
[462]	validation_0-mlogloss:2.41128
[463]	validation_0-mlogloss:2.41096
[464]	validation_0-mlogloss:2.41064
[465]	validation_0-mlogloss:2.4103
[466]	validation_0-mlogloss:2.40997
[467]	validation_0-mlogloss:2.40963
[468]	validation_0-mlogloss:2.40932
[469]	validation_0-mlogloss:2.40898
[470]	validation_0-mlogloss:2.40867
[471]	validation_0-mlogloss:2.40837
[472]	validation_0-mlogloss:2.40804
[473]	validation_0-mlogloss:2.40772
[474]	validation_0-mlogloss:2.4074
[475]	validation_0-mlogloss:2.40708
[476]	validation_0-mlogloss:2.40677
[477]	validation_0-mlogloss:2.40645
[478]	validation_0-mlogloss:2.40616
[479]	validation_0-mlogloss:2.40587
[480]	validation_0-mlogloss:2.40557
[481]	validation_0-mlogloss:2.40527
[482]	validation_0-mlogloss:2.40498
[483]	validation_0-mlogloss:2.40467
[484]	validation_0-mlogloss:2.40439
[485]	validation_0-mlogloss:2.40407
[486]	validation_0-mlogloss:2.40376
[487]	validation_0-mlogloss:2.40347
[488]	validation_0-mlogloss:2.

[690]	validation_0-mlogloss:2.36293
[691]	validation_0-mlogloss:2.3628
[692]	validation_0-mlogloss:2.36266
[693]	validation_0-mlogloss:2.36253
[694]	validation_0-mlogloss:2.36241
[695]	validation_0-mlogloss:2.36224
[696]	validation_0-mlogloss:2.3621
[697]	validation_0-mlogloss:2.36197
[698]	validation_0-mlogloss:2.36184
[699]	validation_0-mlogloss:2.36171
[700]	validation_0-mlogloss:2.36157
[701]	validation_0-mlogloss:2.36144
[702]	validation_0-mlogloss:2.36132
[703]	validation_0-mlogloss:2.36119
[704]	validation_0-mlogloss:2.36107
[705]	validation_0-mlogloss:2.36094
[706]	validation_0-mlogloss:2.36081
[707]	validation_0-mlogloss:2.36068
[708]	validation_0-mlogloss:2.36055
[709]	validation_0-mlogloss:2.36043
[710]	validation_0-mlogloss:2.3603
[711]	validation_0-mlogloss:2.36017
[712]	validation_0-mlogloss:2.36004
[713]	validation_0-mlogloss:2.35992
[714]	validation_0-mlogloss:2.3598
[715]	validation_0-mlogloss:2.35968
[716]	validation_0-mlogloss:2.35956
[717]	validation_0-mlogloss:2.35

[919]	validation_0-mlogloss:2.33956
[920]	validation_0-mlogloss:2.33948
[921]	validation_0-mlogloss:2.33941
[922]	validation_0-mlogloss:2.33932
[923]	validation_0-mlogloss:2.33924
[924]	validation_0-mlogloss:2.33916
[925]	validation_0-mlogloss:2.33909
[926]	validation_0-mlogloss:2.339
[927]	validation_0-mlogloss:2.33892
[928]	validation_0-mlogloss:2.33883
[929]	validation_0-mlogloss:2.33875
[930]	validation_0-mlogloss:2.33867
[931]	validation_0-mlogloss:2.3386
[932]	validation_0-mlogloss:2.33851
[933]	validation_0-mlogloss:2.33844
[934]	validation_0-mlogloss:2.33835
[935]	validation_0-mlogloss:2.33828
[936]	validation_0-mlogloss:2.33819
[937]	validation_0-mlogloss:2.33812
[938]	validation_0-mlogloss:2.33804
[939]	validation_0-mlogloss:2.33796
[940]	validation_0-mlogloss:2.33789
[941]	validation_0-mlogloss:2.33781
[942]	validation_0-mlogloss:2.33773
[943]	validation_0-mlogloss:2.33765
[944]	validation_0-mlogloss:2.33757
[945]	validation_0-mlogloss:2.3375
[946]	validation_0-mlogloss:2.33

[1144]	validation_0-mlogloss:2.32402
[1145]	validation_0-mlogloss:2.32396
[1146]	validation_0-mlogloss:2.32389
[1147]	validation_0-mlogloss:2.32382
[1148]	validation_0-mlogloss:2.32376
[1149]	validation_0-mlogloss:2.3237
[1150]	validation_0-mlogloss:2.32363
[1151]	validation_0-mlogloss:2.32358
[1152]	validation_0-mlogloss:2.32353
[1153]	validation_0-mlogloss:2.32347
[1154]	validation_0-mlogloss:2.3234
[1155]	validation_0-mlogloss:2.32335
[1156]	validation_0-mlogloss:2.32329
[1157]	validation_0-mlogloss:2.32323
[1158]	validation_0-mlogloss:2.32316
[1159]	validation_0-mlogloss:2.32311
[1160]	validation_0-mlogloss:2.32304
[1161]	validation_0-mlogloss:2.32297
[1162]	validation_0-mlogloss:2.32291
[1163]	validation_0-mlogloss:2.32285
[1164]	validation_0-mlogloss:2.32279
[1165]	validation_0-mlogloss:2.32273
[1166]	validation_0-mlogloss:2.32266
[1167]	validation_0-mlogloss:2.3226
[1168]	validation_0-mlogloss:2.32253
[1169]	validation_0-mlogloss:2.32247
[1170]	validation_0-mlogloss:2.32241
[117

#### 预测测试集

In [41]:
# Load the test table and select the same feature columns used for training.
# Note: despite the "valid_" prefix, this is read from test_preprocess.csv.
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
valid_X = valid_data[feature_names]

In [42]:
# Class probabilities for every test row, rounded to 4 decimal places.
y_pred_prob = np.round(xgbclf_best.predict_proba(valid_X), 4)

# One column per class label (in the classifier's class order), with the
# test-row "Id" as the first column, written without the DataFrame index.
csv_output = pd.DataFrame(y_pred_prob, columns=xgbclf_best.classes_)
csv_output.insert(loc=0, column="Id", value=valid_data["Id"])
csv_output.to_csv("../results/XGBClassifier_best.csv", index=False)

In [43]:
# Persist the fitted final model to disk.
xgbclf_best.save_model("../models/xgbclf_best.model")