In [None]:
from time import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import preprocess as datapre

In [None]:
# Load the preprocessed training set from CSV (path is relative to the notebook's directory).
origin_data = pd.read_csv("../datasets/train_preprocess.csv")

In [None]:
# Reduce the training data with the project-local helper before tuning.
# NOTE(review): presumably frac=0.001 keeps a 0.1% random sample of the rows —
# confirm against preprocess.dataset_sample.
train_data = datapre.dataset_sample(origin_data, frac=0.001)

In [None]:
# Columns of the preprocessed data that are fed to the model as features.
feature_names = [
    'Year',
    'Month',
    'Hour',
    'DayOfWeekID',
    'PdDistrictID',
    'HasBlock',
    'RoadTypeID',
    'RoadBlockID',
    'RoadName1ID',
    'RoadName2ID',
    'X',
    'Y',
]

In [None]:
# Feature matrix for tuning: the selected feature columns of the sampled training data.
X = train_data[feature_names]

In [None]:
# Target labels for tuning: the "Category" column of the sampled training data.
y_true = train_data["Category"]

In [None]:
def neg_log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    """Return the negated log loss, so that a larger value means a better model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class probabilities.
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before the loss is
        computed. ``log_loss`` applies its own default clipping afterwards, so
        the effective clipping is never looser than scikit-learn's default.
    normalize : bool, default True
        Forwarded to ``log_loss``.
    sample_weight : array-like or None
        Forwarded to ``log_loss``.
    labels : array-like or None
        Forwarded to ``log_loss``.

    Returns
    -------
    float
        ``-log_loss(...)``.
    """
    # Clip locally instead of forwarding `eps`: the `eps` argument of log_loss
    # is deprecated in recent scikit-learn releases.
    clipped_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    # Optional arguments go by keyword; current scikit-learn rejects them
    # when they are passed positionally.
    return -log_loss(y_true, clipped_pred, normalize=normalize,
                     sample_weight=sample_weight, labels=labels)

# Scorer for GridSearchCV that scores on predicted probabilities.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method="predict_proba"` — confirm against the installed version.
call_neg_log_loss = make_scorer(neg_log_loss, needs_proba=True)

In [None]:
# Baseline XGBClassifier settings. The tuning cells below overwrite individual
# entries of this dict with the best value found by each grid search.
# `n_jobs` (not `n_job`) is the estimator's parallelism argument; the misspelt
# key was not a recognised XGBClassifier parameter.
# NOTE(review): the grid searches also use n_jobs=-1, so threads may be
# oversubscribed — consider limiting one of the two.
base_params = dict(max_depth=6, learning_rate=0.1, n_estimators=100, objective="multi:softprob",
                   n_jobs=-1, gamma=0.3, min_child_weight=5, subsample=0.5, colsample_bytree=0.5, random_state=10)

In [None]:
# Tune the number of boosting rounds with all other settings taken from base_params.
# A full-size search would use list(range(400, 2001, 400)); the single-value grid
# below is what this cell actually runs.
param_grid = {"n_estimators": list(range(10, 20, 10))}

xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_n_estimators = GridSearchCV(estimator=xgbclf,
                                   param_grid=param_grid,
                                   scoring=call_neg_log_loss, n_jobs=-1,
                                   cv=5, verbose=4, return_train_score=True,
                                   refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_n_estimators.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the n_estimators search.
cv_clf_n_estimators.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_n_estimators.cv_results_)

In [None]:
# Tune the tree depth, keeping the n_estimators value chosen by the previous search.
param_grid = {"max_depth": list(range(2,3))}

base_params["n_estimators"] = cv_clf_n_estimators.best_params_["n_estimators"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_max_depth = GridSearchCV(estimator=xgbclf,
                                param_grid=param_grid,
                                scoring=call_neg_log_loss, n_jobs=-1,
                                cv=5, verbose=4, return_train_score=True,
                                refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_max_depth.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the max_depth search.
cv_clf_max_depth.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_max_depth.cv_results_)

In [None]:
# Tune the row subsampling ratio, keeping the max_depth value chosen by the previous search.
param_grid = {"subsample": [i/10.0 for i in range(2,3)]}

base_params["max_depth"] = cv_clf_max_depth.best_params_["max_depth"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_subsample = GridSearchCV(estimator=xgbclf,
                                param_grid=param_grid,
                                scoring=call_neg_log_loss, n_jobs=-1,
                                cv=5, verbose=4, return_train_score=True,
                                refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_subsample.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the subsample search.
cv_clf_subsample.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_subsample.cv_results_)

In [None]:
# Tune the per-tree column subsampling ratio, keeping the subsample value chosen
# by the previous search.
param_grid = {"colsample_bytree": [i/10.0 for i in range(2,3)]}

base_params["subsample"] = cv_clf_subsample.best_params_["subsample"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_colsample_bytree = GridSearchCV(estimator=xgbclf,
                                       param_grid=param_grid,
                                       scoring=call_neg_log_loss, n_jobs=-1,
                                       cv=5, verbose=4, return_train_score=True,
                                       refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_colsample_bytree.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the colsample_bytree search.
cv_clf_colsample_bytree.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_colsample_bytree.cv_results_)

In [None]:
# Tune gamma, keeping the colsample_bytree value chosen by the previous search.
param_grid = {"gamma": [i/10.0 for i in range(1,2)]}

base_params["colsample_bytree"] = cv_clf_colsample_bytree.best_params_["colsample_bytree"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_gamma = GridSearchCV(estimator=xgbclf,
                            param_grid=param_grid,
                            scoring=call_neg_log_loss, n_jobs=-1,
                            cv=5, verbose=4, return_train_score=True,
                            refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_gamma.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the gamma search.
cv_clf_gamma.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_gamma.cv_results_)

In [None]:
# Tune min_child_weight, keeping the gamma value chosen by the previous search.
param_grid = {"min_child_weight": list(range(2,3))}

base_params["gamma"] = cv_clf_gamma.best_params_["gamma"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_min_child_weight = GridSearchCV(estimator=xgbclf,
                                       param_grid=param_grid,
                                       scoring=call_neg_log_loss, n_jobs=-1,
                                       cv=5, verbose=4, return_train_score=True,
                                       refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_min_child_weight.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the min_child_weight search.
cv_clf_min_child_weight.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_min_child_weight.cv_results_)

In [None]:
# Re-tune n_estimators once more, now with the min_child_weight value chosen by
# the previous search (and every earlier tuned value already in base_params).
param_grid = {"n_estimators": list(range(2, 3))}

base_params["min_child_weight"] = cv_clf_min_child_weight.best_params_["min_child_weight"]
xgbclf = xgb.XGBClassifier(**base_params)
cv_clf_final = GridSearchCV(estimator=xgbclf,
                            param_grid=param_grid,
                            scoring=call_neg_log_loss, n_jobs=-1,
                            cv=5, verbose=4, return_train_score=True,
                            refit=False)  # no final refit on the whole sample

# `time` is imported once in the notebook's import cell.
start = time()
cv_clf_final.fit(X, y_true)
print("Training elapse %d sencond." % (time()-start))

In [None]:
# Best parameter setting found by the final n_estimators search.
cv_clf_final.best_params_

In [None]:
# Show the per-candidate cross-validation results as a table instead of a raw dict of arrays.
pd.DataFrame(cv_clf_final.cv_results_)

#### 使用整个训练集进行训练

In [None]:
# Features and labels of the complete (unsampled) training set, used for the final fit.
total_X = origin_data[feature_names]
total_y_true = origin_data["Category"]

In [None]:
# Store the finally tuned n_estimators in the shared parameter dict.
# The dict is named `base_params`; the former `base_param` was an undefined name.
base_params["n_estimators"] = cv_clf_final.best_params_["n_estimators"]
print(base_params)

In [None]:
# Train the final model on the full training set with the tuned parameters.
# The evaluation metric is set on the estimator rather than passed to fit():
# recent XGBoost releases deprecate and remove the fit() keyword.
xgbclf_best = xgb.XGBClassifier(eval_metric="mlogloss", **base_params)
start = time()
# The training data doubles as the evaluation set, so the logged mlogloss is
# the training loss, not a held-out estimate.
xgbclf_best.fit(total_X, total_y_true,
                eval_set=[(total_X, total_y_true)],
                verbose=True
               )
print("Training elapse %d sencond." % (time()-start))

#### 预测测试集

In [None]:
# Load the preprocessed test set (held in `valid_data`) and select the same
# feature columns that were used for training.
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
valid_X = valid_data[feature_names]

In [None]:
# Predict class probabilities for the test set, rounded to 4 decimals.
y_pred_prob = np.round(xgbclf_best.predict_proba(valid_X), 4)
# Column labels must come from the fitted model (`xgbclf_best`). `xgbclf` is the
# unfitted grid-search template and has no `classes_`.
csv_output = pd.DataFrame(columns=xgbclf_best.classes_, data=y_pred_prob)
# Put the test-set Id first, then write the submission file.
csv_output.insert(0, "Id", valid_data["Id"])
csv_output.to_csv("../results/XGBClassifier_best.csv", index=False)