In [11]:
import xgboost as xgb
import pandas as pd
import numpy as np
import preprocess as datapre

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [2]:
# Load the full training set (train_preprocess.csv) into a DataFrame.
# It is used both as the input to datapre.dataset_sample and directly as the
# training data of the baseline model.
origin_train_data = pd.read_csv("../datasets/train_preprocess.csv")

In [3]:
# Names of the columns used as model inputs; every feature matrix in this
# notebook is built by selecting exactly these columns, in this order.
feature_names = [
    'Year', 'Month', 'Hour', 'DayOfWeekID',
    'PdDistrictID',
    'HasBlock', 'RoadTypeID', 'RoadBlockID',
    'RoadName1ID', 'RoadName2ID',
    'X', 'Y',
]

In [4]:
# Load test_preprocess.csv; predictions for these rows are written to the
# results CSV at the end of the notebook, keyed by the 'Id' column.
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
# Keep only the model input columns, in the same order as used for training.
valid_X = valid_data[feature_names]

----

In [3]:
# Derive the frame used for hyper-parameter search from the full training set.
# NOTE(review): dataset_sample lives in the project-local `preprocess` module;
# presumably it returns a subset of the rows - confirm there, including
# whether the sampling is seeded/reproducible.
train_data = datapre.dataset_sample(origin_train_data)

In [6]:
# Feature matrix passed to the grid search: the feature_names columns of train_data.
X = train_data[feature_names]

In [7]:
# Map the "Category" labels to integer class ids 0..n_classes-1.
# The fitted encoder is kept so the ids can be mapped back to category names
# later (TargetEnc.classes_ / TargetEnc.inverse_transform).
TargetEnc = LabelEncoder().fit(train_data["Category"])
y_true = TargetEnc.transform(train_data["Category"])

In [8]:
# Base estimator for the hyper-parameter search.
# n_estimators here is only a starting value: the grid search cell overrides it
# with every value in its param_grid.
xgbclf = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    objective="multi:softprob",
    # Fixed: this was spelled `n_job`, which is not an XGBClassifier parameter.
    # The misspelled keyword ended up in **kwargs and the thread setting was
    # never applied.
    # NOTE(review): the grid search that wraps this estimator also runs with
    # n_jobs=-1; if the machine becomes oversubscribed, set one of the two to 1.
    n_jobs=-1,
    gamma=2,
    min_child_weight=10,
    max_delta_step=2,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=10,
)

In [None]:
# Search for the number of boosting rounds with 5-fold cross-validation.
from time import time

# Candidate values: 400, 800, 1200, 1600, 2000.
param_grid = {"n_estimators": list(range(400, 2001, 400))}

# GridSearchCV always treats a HIGHER score as better. The previous scorer,
# make_scorer(log_loss, needs_proba=True), kept greater_is_better=True, so the
# candidate with the largest log loss was ranked first (rank_test_score and
# best_params_ pointed at the worst model). The built-in "neg_log_loss" scorer
# negates the loss so that the ranking is correct.
# Scores in cv_results_ are therefore negative; multiply by -1 to read them
# as log loss.
# refit=False: only the CV scores are wanted, no final model is trained here.
cv_clf_n_estimators = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time()
cv_clf_n_estimators.fit(X, y_true)
print("Training with total data elapsed %d seconds." % (time() - start))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] n_estimators=400 ................................................
[CV] n_estimators=400 ................................................
[CV] n_estimators=400 ................................................
[CV] n_estimators=400 ................................................


#### XGBoost baseline model (XGBoost基础模型)

In [15]:
# Feature matrix for the baseline model: the feature_names columns of the
# full training frame (origin_train_data, not the sampled train_data).
base_X = origin_train_data[feature_names]

In [16]:
# Target for the baseline model: the raw "Category" column, used as-is
# (no LabelEncoder is applied here, unlike y_true).
base_y_true =origin_train_data["Category"]

In [17]:
# Sanity check: show how many target values the baseline model will be trained on.
base_y_true.shape

(878049,)

In [23]:
# Baseline classifier with hand-picked hyper-parameters (no search involved).
base_xgbclf = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    gamma=0.2,
    subsample=0.5,
    colsample_bytree=0.5,
    random_state=42,
    n_jobs=-1,
)

# The evaluation set is the training data itself, so the mlogloss printed per
# boosting round is the TRAINING loss, not a held-out validation loss.
# The fit call stays the last expression so the fitted estimator is displayed.
base_xgbclf.fit(
    base_X,
    base_y_true,
    eval_metric="mlogloss",
    eval_set=[(base_X, base_y_true)],
    verbose=True,
)

[0]	validation_0-mlogloss:3.4734
[1]	validation_0-mlogloss:3.34404
[2]	validation_0-mlogloss:3.24319
[3]	validation_0-mlogloss:3.16024
[4]	validation_0-mlogloss:3.09035
[5]	validation_0-mlogloss:3.03031
[6]	validation_0-mlogloss:2.97736
[7]	validation_0-mlogloss:2.92991
[8]	validation_0-mlogloss:2.88951
[9]	validation_0-mlogloss:2.85297
[10]	validation_0-mlogloss:2.81952
[11]	validation_0-mlogloss:2.78938
[12]	validation_0-mlogloss:2.7613
[13]	validation_0-mlogloss:2.73599
[14]	validation_0-mlogloss:2.71362
[15]	validation_0-mlogloss:2.69247
[16]	validation_0-mlogloss:2.67257
[17]	validation_0-mlogloss:2.65476
[18]	validation_0-mlogloss:2.63826
[19]	validation_0-mlogloss:2.62341
[20]	validation_0-mlogloss:2.60929
[21]	validation_0-mlogloss:2.596
[22]	validation_0-mlogloss:2.58357
[23]	validation_0-mlogloss:2.57197
[24]	validation_0-mlogloss:2.56103
[25]	validation_0-mlogloss:2.55101
[26]	validation_0-mlogloss:2.54132
[27]	validation_0-mlogloss:2.53243
[28]	validation_0-mlogloss:2.52452

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=-1, nthread=None,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.5)

In [24]:
# Predict class probabilities for the test rows, rounded to 4 decimals.
base_y_pred_prob = np.round(
    base_xgbclf.predict_proba(valid_X),
    decimals=4,
)

# One column per class (named by the classifier's class labels), with the
# row 'Id' from the test file placed as the first column.
csv_output = pd.DataFrame(base_y_pred_prob, columns=base_xgbclf.classes_)
csv_output.insert(loc=0, column='Id', value=valid_data['Id'])

# Write the result file without the DataFrame index.
csv_output.to_csv('../results/XGBClassifier_base.csv', index=False)