In [1]:
import pandas as pd
import numpy as np
import time
import preprocess as datapre

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss, make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed training set; the path is relative to the notebook's working directory.
origin_data = pd.read_csv("../datasets/train_preprocess.csv")

In [3]:
# Take a subset of the training data for the hyper-parameter searches below.
# NOTE(review): dataset_sample comes from the project-local `preprocess` module; presumably
# frac=0.5 means a 50% row sample — confirm, and check whether the sampling is seeded.
train_data = datapre.dataset_sample(origin_data, frac=0.5)

In [4]:
# Columns used as model inputs, in the order they are selected from the frames.
feature_names = [
    'Year',
    'Month',
    'Hour',
    'DayOfWeekID',
    'PdDistrictID',
    'HasBlock',
    'RoadTypeID',
    'RoadBlockID',
    'RoadName1ID',
    'RoadName2ID',
    'X',
    'Y',
]

In [5]:
# Split the sampled frame into the feature matrix and the un-encoded target column.
X = train_data.loc[:, feature_names]
y_true = train_data.loc[:, "Category"]

In [6]:
# Encode the category labels as integer class ids.
# The encoder is fitted on the source column rather than on `y_true` itself so that this
# cell is idempotent: re-running it no longer re-encodes already-encoded integers, which
# would replace the original label names in `TargetEnc.classes_` with integers.
TargetEnc = LabelEncoder()
y_true = TargetEnc.fit_transform(train_data["Category"])

In [7]:
def neg_log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    """Return the negated log loss, so that a larger score means a better model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class probabilities (the scorer below requests ``predict_proba`` output).
    eps : float, default 1e-15
        Probabilities are clipped to ``[eps, 1 - eps]`` before the loss is computed.
    normalize, sample_weight, labels
        Forwarded unchanged to ``sklearn.metrics.log_loss``.

    Returns
    -------
    float
        ``-log_loss(y_true, y_pred, ...)``.
    """
    # The options of log_loss are keyword-only in current scikit-learn, so forwarding them
    # positionally raises TypeError. `eps` is applied here instead of being forwarded
    # because recent releases no longer accept that argument.
    clipped = np.clip(y_pred, eps, 1 - eps)
    return -log_loss(y_true, clipped, normalize=normalize,
                     sample_weight=sample_weight, labels=labels)

# Scorer object passed as `scoring=` to every GridSearchCV below.
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour of
# response_method="predict_proba" — confirm against the installed version.
call_neg_log_loss = make_scorer(neg_log_loss, needs_proba=True)

In [9]:
# Starting hyper-parameters; later cells overwrite entries as each search finishes.
base_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "min_samples_split": 20,
    "max_features": 0.5,
    "n_jobs": -1,
    "random_state": 42,
}

In [10]:
# Search 1: compare the two split criteria with everything else at the base values.
param_grid = dict(criterion=["gini", "entropy"])

rfclf = RandomForestClassifier(**base_params)
cv_clf_criterion = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,  # only the scores are needed here, no final model
)

start = time.time()
cv_clf_criterion.fit(X, y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  9.6min remaining: 14.4min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 10.7min remaining:  4.6min


Training needs 787 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.1min finished


In [11]:
# Show the criterion that ranked first in the search above.
cv_clf_criterion.best_params_

{'criterion': 'entropy'}

In [12]:
# Show the raw cross-validation results (timings plus per-split and mean train/test scores).
cv_clf_criterion.cv_results_

{'mean_fit_time': array([377.62422113, 360.51584072]),
 'std_fit_time': array([  1.95029143, 163.90649584]),
 'mean_score_time': array([37.51303835, 28.0612236 ]),
 'std_score_time': array([ 0.62397431, 14.37338393]),
 'param_criterion': masked_array(data=['gini', 'entropy'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'gini'}, {'criterion': 'entropy'}],
 'split0_test_score': array([-2.52907413, -2.52679097]),
 'split1_test_score': array([-2.52407811, -2.52206075]),
 'split2_test_score': array([-2.52470506, -2.52167583]),
 'split3_test_score': array([-2.52755128, -2.52481451]),
 'split4_test_score': array([-2.52662738, -2.52489309]),
 'mean_test_score': array([-2.5264072 , -2.52404703]),
 'std_test_score': array([0.00183268, 0.00191851]),
 'rank_test_score': array([2, 1]),
 'split0_train_score': array([-2.52244801, -2.51980186]),
 'split1_train_score': array([-2.52358955, -2.5211375 ]),
 'split2_train_score': array([-

In [13]:
# Search 2: tune min_samples_split, carrying over the best criterion from search 1.
param_grid = {"min_samples_split": [20, 40, 60, 80, 100]}

base_params["criterion"] = cv_clf_criterion.best_params_["criterion"]
rfclf = RandomForestClassifier(**base_params)
cv_clf_min_samples_split = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time.time()
cv_clf_min_samples_split.fit(X, y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done  17 out of  25 | elapsed: 33.4min remaining: 15.7min


Training needs 2095 seconds.


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 34.9min finished


In [None]:
# Show the min_samples_split value that ranked first.
cv_clf_min_samples_split.best_params_

{'min_samples_split': 60}

In [None]:
# Show the raw cross-validation results for the min_samples_split search.
cv_clf_min_samples_split.cv_results_

{'mean_fit_time': array([464.73708339, 464.26790142, 464.3276195 , 463.22402234,
        386.73499465]),
 'std_fit_time': array([  1.41639691,   1.40555434,   0.32925884,   1.34880926,
        163.42989577]),
 'mean_score_time': array([39.81927314, 40.5610456 , 41.66956849, 40.11531024, 32.93259096]),
 'std_score_time': array([ 0.49060337,  0.98599894,  0.5183828 ,  0.6906298 , 13.53346669]),
 'param_min_samples_split': masked_array(data=[20, 40, 60, 80, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_split': 20},
  {'min_samples_split': 40},
  {'min_samples_split': 60},
  {'min_samples_split': 80},
  {'min_samples_split': 100}],
 'split0_test_score': array([-2.52679097, -2.52679097, -2.52679163, -2.52678297, -2.52681774]),
 'split1_test_score': array([-2.52206075, -2.52206075, -2.52206089, -2.52206541, -2.52205331]),
 'split2_test_score': array([-2.52167583, -2.52167583, -2.52165876, -2.5216590

In [None]:
# Search 3: tune max_features (fraction of features per split) from 0.5 to 1.0.
param_grid = {"max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

base_params["min_samples_split"] = cv_clf_min_samples_split.best_params_["min_samples_split"]
rfclf = RandomForestClassifier(**base_params)
cv_clf_max_features = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time.time()
cv_clf_max_features.fit(X, y_true)
elapsed = time.time() - start
print("Training with total dataset needs %d seconds." % elapsed)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 39.2min remaining: 11.9min


Training with total dataset needs 3045 seconds.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 50.8min finished


In [None]:
# Show the max_features value that ranked first.
cv_clf_max_features.best_params_

{'max_features': 0.6}

In [None]:
# Show the raw cross-validation results for the max_features search.
cv_clf_max_features.cv_results_

{'mean_fit_time': array([466.10893898, 521.05996537, 577.37079701, 625.0942153 ,
        658.65490699, 615.98455191]),
 'std_fit_time': array([ 0.58864978,  0.66178994,  1.96446675,  5.61541478, 48.3168318 ,
        14.03208936]),
 'mean_score_time': array([36.83551712, 39.84070601, 37.56637659, 47.3072475 , 34.50676036,
        24.62801032]),
 'std_score_time': array([ 0.63427667,  0.72726104,  0.53699966, 11.69145063,  1.60775988,
         4.52784121]),
 'param_max_features': masked_array(data=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_features': 0.5},
  {'max_features': 0.6},
  {'max_features': 0.7},
  {'max_features': 0.8},
  {'max_features': 0.9},
  {'max_features': 1.0}],
 'split0_test_score': array([-2.52679163, -2.52679075, -2.52811846, -2.53081133, -2.53307424,
        -2.54405006]),
 'split1_test_score': array([-2.52206089, -2.52156219, -2.52255916, -2.525

In [None]:
# Search 4: compare bootstrap sampling on/off, carrying over the best max_features.
param_grid = dict(bootstrap=[True, False])

base_params["max_features"] = cv_clf_max_features.best_params_["max_features"]
rfclf = RandomForestClassifier(**base_params)
cv_clf_bootstrap = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time.time()
cv_clf_bootstrap.fit(X, y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 11.9min remaining: 17.8min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 12.5min remaining:  5.4min


Training needs 930 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 15.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 15.5min finished


In [None]:
# Show the bootstrap setting that ranked first.
cv_clf_bootstrap.best_params_

{'bootstrap': True}

In [None]:
# Show the raw cross-validation results for the bootstrap search.
cv_clf_bootstrap.cv_results_

{'mean_fit_time': array([514.41487279, 414.07483211]),
 'std_fit_time': array([  1.22824183, 202.95871421]),
 'mean_score_time': array([36.44589915, 27.05701218]),
 'std_score_time': array([ 0.24104818, 13.63592581]),
 'param_bootstrap': masked_array(data=[True, False],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'bootstrap': True}, {'bootstrap': False}],
 'split0_test_score': array([-2.52679075, -2.5273457 ]),
 'split1_test_score': array([-2.52156219, -2.52226348]),
 'split2_test_score': array([-2.52140237, -2.52196708]),
 'split3_test_score': array([-2.52458029, -2.52528339]),
 'split4_test_score': array([-2.52425029, -2.52514066]),
 'mean_test_score': array([-2.5237172 , -2.52440007]),
 'std_test_score': array([0.00202387, 0.00202444]),
 'rank_test_score': array([1, 2]),
 'split0_train_score': array([-2.5195774 , -2.52015829]),
 'split1_train_score': array([-2.52060323, -2.52144707]),
 'split2_train_score': array([-2.52075189, 

In [10]:
# Hard-code the winners of the four searches above so the notebook can resume from
# this point without repeating them.
base_params.update(
    criterion="entropy",
    min_samples_split=60,
    max_features=0.6,
    bootstrap=True,
)
print(base_params)

{'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 60, 'max_features': 0.6, 'n_jobs': -1, 'random_state': 42, 'criterion': 'entropy', 'bootstrap': True}


In [11]:
# Search 5: tune max_depth with a larger forest (1000 trees).
# The bootstrap setting is already fixed in base_params by the cell above.
param_grid = dict(max_depth=[5, 8, 10])

base_params["n_estimators"] = 1000
rfclf = RandomForestClassifier(**base_params)
cv_clf_max_depth = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time.time()
cv_clf_max_depth.fit(X, y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 23.1min remaining: 63.6min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 33.0min remaining: 28.8min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 56.7min remaining: 14.2min


Training needs 3547 seconds.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 59.1min finished


In [12]:
# Show the max_depth that ranked first with 1000 trees.
cv_clf_max_depth.best_params_

{'max_depth': 10}

In [13]:
# Show the raw cross-validation results for the max_depth search (1000 trees).
cv_clf_max_depth.cv_results_

{'mean_fit_time': array([1003.51930728, 1513.28980088, 1645.8866456 ]),
 'std_fit_time': array([  2.51895593,  67.7506661 , 141.67328709]),
 'mean_score_time': array([75.08413138, 74.60860305, 38.3990098 ]),
 'std_score_time': array([ 0.7727872 ,  5.7839875 , 14.37314495]),
 'param_max_depth': masked_array(data=[5, 8, 10],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 5}, {'max_depth': 8}, {'max_depth': 10}],
 'split0_test_score': array([-2.52350063, -2.44961679, -2.40709855]),
 'split1_test_score': array([-2.52462105, -2.45054982, -2.4078899 ]),
 'split2_test_score': array([-2.52384361, -2.45078024, -2.40855196]),
 'split3_test_score': array([-2.52215904, -2.44769313, -2.40368641]),
 'split4_test_score': array([-2.52478213, -2.44908327, -2.40410059]),
 'mean_test_score': array([-2.52378129, -2.44954473, -2.40626568]),
 'std_test_score': array([0.00094031, 0.00111203, 0.00199489]),
 'rank_test_score': array([3, 2

In [14]:
# Search 6: repeat the max_depth search with 2000 trees to see whether a larger
# forest changes the ranking.
param_grid = dict(max_depth=[5, 8, 10])

base_params["n_estimators"] = 2000
rfclf = RandomForestClassifier(**base_params)
cv_clf_max_depth1 = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=False,
)

start = time.time()
cv_clf_max_depth1.fit(X, y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 45.6min remaining: 125.4min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 65.6min remaining: 57.4min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 112.7min remaining: 28.2min


Training needs 7066 seconds.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 117.8min finished


In [15]:
# Show the max_depth that ranked first with 2000 trees.
cv_clf_max_depth1.best_params_

{'max_depth': 10}

In [16]:
# Show the raw cross-validation results for the max_depth search (2000 trees).
cv_clf_max_depth1.cv_results_

{'mean_fit_time': array([1986.80162911, 3020.59122295, 3289.66810756]),
 'std_fit_time': array([  4.90335491, 134.86174621, 290.88060144]),
 'mean_score_time': array([146.23303008, 149.15595608,  76.25234532]),
 'std_score_time': array([ 1.66534833, 10.3555175 , 27.7178049 ]),
 'param_max_depth': masked_array(data=[5, 8, 10],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 5}, {'max_depth': 8}, {'max_depth': 10}],
 'split0_test_score': array([-2.52351883, -2.44938286, -2.40693083]),
 'split1_test_score': array([-2.52460509, -2.45029807, -2.4072422 ]),
 'split2_test_score': array([-2.52385268, -2.45051053, -2.4081915 ]),
 'split3_test_score': array([-2.52205703, -2.44732185, -2.40306253]),
 'split4_test_score': array([-2.52480403, -2.44885817, -2.40398719]),
 'mean_test_score': array([-2.52376753, -2.44927437, -2.40588304]),
 'std_test_score': array([0.00097677, 0.00114657, 0.00199111]),
 'rank_test_score': array([3

#### 结合已经找到的最优参数，使用整个训练集进行网格搜索交叉验证找到n_estimators的最优值（max_depth沿用前面在采样数据上搜索得到的最优值），并得到最优模型。

In [12]:
# Feature matrix and un-encoded target taken from the full (unsampled) training set.
total_X = origin_data.loc[:, feature_names]
total_y_true = origin_data.loc[:, "Category"]

In [13]:
# Encode the full-dataset category labels as integer class ids.
# The encoder is fitted on the source column rather than on `total_y_true` itself so that
# this cell is idempotent: re-running it no longer re-encodes already-encoded integers,
# which would replace the label names in `totalTargetEnc.classes_` (used later as the
# submission column headers) with integers.
totalTargetEnc = LabelEncoder()
total_y_true = totalTargetEnc.fit_transform(origin_data["Category"])

In [None]:
# Final search on the full training set: tune n_estimators with max_depth fixed to the
# best value from the sampled-data search. refit=True retrains the best candidate on all
# of the data so that `best_estimator_` is available afterwards.
param_grid = {"n_estimators": [1000, 1500, 2000]}

base_params["max_depth"] = cv_clf_max_depth.best_params_["max_depth"]
rfclf = RandomForestClassifier(**base_params)
cv_clf_final = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    n_jobs=-1,
    cv=5,
    verbose=4,
    return_train_score=True,
    refit=True,
)

start = time.time()
cv_clf_final.fit(total_X, total_y_true)
elapsed = time.time() - start
print("Training needs %d seconds." % elapsed)

In [None]:
# Show the n_estimators value that ranked first on the full training set.
cv_clf_final.best_params_

In [None]:
# Show the raw cross-validation results for the final search.
cv_clf_final.cv_results_

In [None]:
# Fold the winners of the final search back into the shared parameter dict.
final_best = cv_clf_final.best_params_
base_params["n_estimators"] = final_best["n_estimators"]
# The final grid only searches n_estimators, so best_params_ has no "max_depth" entry and
# indexing it raised KeyError. Keep the current max_depth unless the grid supplies one.
base_params["max_depth"] = final_best.get("max_depth", base_params["max_depth"])
print(base_params)

In [None]:
# Model refitted by the final grid search (available because it was built with refit=True).
best_rf_clf = cv_clf_final.best_estimator_

In [None]:
# Load the preprocessed test set and keep the same feature columns used for training.
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
valid_X = valid_data.loc[:, feature_names]

In [None]:
# Class probabilities from the tuned model, rounded to 4 decimals for the submission file.
best_proba = best_rf_clf.predict_proba(valid_X)
y_pred_prob = np.round(best_proba, decimals=4)

In [None]:
# Build the submission table: one probability column per class label, with Id first.
csv_output = pd.DataFrame(y_pred_prob, columns=totalTargetEnc.classes_)
csv_output.insert(loc=0, column='Id', value=valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_best.csv', index=False)

---

训练缺省模型

In [None]:
# Baseline: an untuned forest trained on the full training set, for comparison with
# the tuned model.
base_rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    min_samples_split=20,
    max_features=0.8,
    n_jobs=-1, random_state=42)
base_rf_clf.fit(total_X, total_y_true)
y_pred_prob_base = np.round(base_rf_clf.predict_proba(valid_X), 4)
# Write the baseline model's own predictions. The original cell passed `y_pred_prob`
# (the tuned model's output), so the "base" file duplicated the tuned submission.
# The columns come from `totalTargetEnc`, the encoder fitted on the labels this model
# was trained on, rather than `TargetEnc`, which was fitted on the 50% sample.
csv_output = pd.DataFrame(columns=totalTargetEnc.classes_, data=y_pred_prob_base)
csv_output.insert(0, 'Id', valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_base.csv', index=False)