In [1]:
import pandas as pd
import numpy as np
import time

import preprocess as datapre

In [2]:
train_data = pd.read_csv("../datasets/train_preprocess.csv")

In [3]:
train_data = datapre.dataset_sample(train_data)

In [4]:
feature_names = ['Year', 'Month', 'Hour', 'DayOfWeekID', 'PdDistrictID', \
                 'HasBlock', 'RoadTypeID', 'RoadBlockID', 'RoadName1ID', 'RoadName2ID', 'X', 'Y']

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [6]:
X = train_data[feature_names]
y_true = train_data["Category"]

In [7]:
from sklearn.preprocessing import LabelEncoder
TargetEnc = LabelEncoder()
y_true = TargetEnc.fit_transform(y_true)

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss, make_scorer, roc_auc_score

In [9]:
def neg_log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    return -log_loss(y_true, y_pred, eps, normalize, sample_weight, labels)

In [10]:
call_neg_log_loss = make_scorer(neg_log_loss, needs_proba=True)

In [11]:
param_grid = { 
#              "n_estimators": list(range(200, 1500, 200)),
             "criterion": ["gini", "entropy"],
#              "max_depth": list(range(start=6, end=11, step=1)),
#              "min_samples_split": list(range(start=20, end=51, step=10)),
#              "max_features": list(range(start=0.5, end=1.1, step=0.1)),
#              "bootstrap": [True, False],
             }

rfclf = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_split=20, max_features=0.5, bootstrap=False)
cv_clf_criterion = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_criterion.fit(X, y_true)
print("Training with total dataset needs %d seconds." % (time.time()-start))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   41.9s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.3min remaining:   32.2s


Training with total dataset needs 97 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.6min finished


In [12]:
cv_clf_criterion.best_params_

{'criterion': 'entropy'}

In [13]:
cv_clf_criterion.cv_results_

{'mean_fit_time': array([28.20599985, 59.28139911]),
 'std_fit_time': array([0.2386047 , 7.69036074]),
 'mean_score_time': array([2.34799914, 1.43119903]),
 'std_score_time': array([0.07327011, 0.18055123]),
 'param_criterion': masked_array(data=['gini', 'entropy'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'gini'}, {'criterion': 'entropy'}],
 'split0_test_score': array([-2.50628454, -2.50634108]),
 'split1_test_score': array([-2.51124175, -2.51079595]),
 'split2_test_score': array([-2.50961146, -2.50443974]),
 'split3_test_score': array([-2.5035699, -2.5003022]),
 'split4_test_score': array([-2.50659297, -2.50406911]),
 'mean_test_score': array([-2.5074607 , -2.50519092]),
 'std_test_score': array([0.00269057, 0.00341944]),
 'rank_test_score': array([2, 1]),
 'split0_train_score': array([-2.47758223, -2.4744618 ]),
 'split1_train_score': array([-2.47914046, -2.47449743]),
 'split2_train_score': array([-2.48137667, 

In [24]:
param_grid = { 
#              "n_estimators": list(range(200, 1500, 200)),
#              "criterion": ["gini", "entropy"],
#              "max_depth": list(range(start=6, end=11, step=1)),
             "min_samples_split": list(range(20, 101, 20)),
#              "max_features": list(range(start=0.5, end=1.1, step=0.1)),
#              "bootstrap": [True, False],
             }

rfclf = RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=6, 
                               max_features=0.5, bootstrap=False)
cv_clf_min_samples_split = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_min_samples_split.fit(X, y_true)
print("Training with total dataset needs %d seconds." % (time.time()-start))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  17 out of  25 | elapsed:  4.0min remaining:  1.9min


Training with total dataset needs 289 seconds.


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  4.8min finished


In [25]:
cv_clf_min_samples_split.best_params_

{'min_samples_split': 80}

In [26]:
cv_clf_min_samples_split.cv_results_

{'mean_fit_time': array([68.14419951, 67.56459689, 68.68700128, 67.9132    , 63.88400178]),
 'std_fit_time': array([ 1.16645819,  0.99905091,  1.234406  ,  0.94243593, 10.47608878]),
 'mean_score_time': array([2.83819904, 2.28400211, 2.67139912, 2.26959844, 2.77899876]),
 'std_score_time': array([0.46051421, 0.57398919, 0.52638038, 0.62955865, 0.88756768]),
 'param_min_samples_split': masked_array(data=[20, 40, 60, 80, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_split': 20},
  {'min_samples_split': 40},
  {'min_samples_split': 60},
  {'min_samples_split': 80},
  {'min_samples_split': 100}],
 'split0_test_score': array([-2.50775761, -2.50677638, -2.50836831, -2.50660572, -2.50900031]),
 'split1_test_score': array([-2.51174334, -2.51103459, -2.51110268, -2.51113005, -2.51043393]),
 'split2_test_score': array([-2.5041221 , -2.50427744, -2.50455218, -2.50526669, -2.50329724]),
 'split3_test_scor

In [17]:
param_grid = { 
#              "n_estimators": list(range(200, 1500, 200)),
#              "criterion": ["gini", "entropy"],
#              "max_depth": list(range(start=6, end=11, step=1)),
#              "min_samples_split": list(range(20, 51, 10)),
             "max_features": [round(i*0.1,1) for i in range(5, 11)],
#              "bootstrap": [True, False],
             }

rfclf = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=6, 
                               min_samples_split=cv_clf_min_samples_split.best_params_["min_samples_split"], 
                               bootstrap=False)
cv_clf_max_features = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_max_features.fit(X, y_true)
print("Training with total dataset needs %d seconds." % (time.time()-start))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  2.7min remaining:   48.7s


Training with total dataset needs 213 seconds.


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.6min finished


In [18]:
cv_clf_max_features.best_params_

{'max_features': 0.5}

In [19]:
cv_clf_max_features.cv_results_

{'mean_fit_time': array([29.6425993 , 34.13300352, 39.65259891, 42.54479885, 45.75400276,
        47.25319629]),
 'std_fit_time': array([0.32565213, 0.78440851, 0.84080009, 0.70716056, 1.24527831,
        1.4623189 ]),
 'mean_score_time': array([2.53760161, 2.78979607, 3.02099867, 2.24359741, 2.43019743,
        1.6618001 ]),
 'std_score_time': array([0.20454355, 0.67267044, 0.14643594, 0.45188484, 0.49079214,
        0.13698278]),
 'param_max_features': masked_array(data=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_features': 0.5},
  {'max_features': 0.6},
  {'max_features': 0.7},
  {'max_features': 0.8},
  {'max_features': 0.9},
  {'max_features': 1.0}],
 'split0_test_score': array([-2.50610561, -2.50649544, -2.51486803, -2.51845082, -2.52598927,
        -2.66431618]),
 'split1_test_score': array([-2.50912879, -2.50978106, -2.51236546, -2.51689317, -2.5288907 ,
    

In [20]:
param_grid = { 
#              "n_estimators": list(range(200, 1500, 200)),
#              "criterion": ["gini", "entropy"],
#              "max_depth": list(range(start=6, end=11, step=1)),
#              "min_samples_split": list(range(20, 51, 10)),
#              "max_features": [round(i*0.1,1) for i in range(5, 11)],
             "bootstrap": [True, False],
             }

rfclf = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=6, 
                               min_samples_split=cv_clf_min_samples_split.best_params_["min_samples_split"], 
                               max_features = cv_clf_max_features.best_params_["max_features"])
cv_clf_bootstrap = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_bootstrap.fit(X, y_true)
print("Training with total dataset needs %d seconds." % (time.time()-start))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   38.3s remaining:   57.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   43.6s remaining:   18.6s


Training with total dataset needs 63 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min finished


In [21]:
cv_clf_bootstrap.best_params_

{'bootstrap': True}

In [22]:
cv_clf_bootstrap.cv_results_

{'mean_fit_time': array([23.79880614, 27.24199996]),
 'std_fit_time': array([0.4637407, 6.0367362]),
 'mean_score_time': array([2.47599635, 2.53599834]),
 'std_score_time': array([0.10714402, 1.0778334 ]),
 'param_bootstrap': masked_array(data=[True, False],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'bootstrap': True}, {'bootstrap': False}],
 'split0_test_score': array([-2.50747346, -2.50635769]),
 'split1_test_score': array([-2.51041082, -2.51011742]),
 'split2_test_score': array([-2.50743949, -2.50975602]),
 'split3_test_score': array([-2.50383781, -2.50344168]),
 'split4_test_score': array([-2.50456062, -2.50624662]),
 'mean_test_score': array([-2.50674551, -2.50718446]),
 'std_test_score': array([0.00235254, 0.00248126]),
 'rank_test_score': array([1, 2]),
 'split0_train_score': array([-2.47970352, -2.47807732]),
 'split1_train_score': array([-2.47839122, -2.47794565]),
 'split2_train_score': array([-2.47967723, -2.4821203 ]

In [27]:
param_grid = { 
             "n_estimators": list(range(200, 1500, 200)),
#              "criterion": ["gini", "entropy"],
             "max_depth": list(range(6, 11, 1)),
#              "min_samples_split": list(range(20, 51, 10)),
#              "max_features": [round(i*0.1,1) for i in range(5, 11)],
#              "bootstrap": [True, False],
             }

rfclf = RandomForestClassifier(criterion="gini", 
                               min_samples_split=cv_clf_min_samples_split.best_params_["min_samples_split"], 
                               max_features = cv_clf_max_features.best_params_["max_features"],
                               bootstrap = cv_clf_bootstrap.best_params_["bootstrap"])

cv_clf_final = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_final.fit(X, y_true)
print("Training with total dataset needs %d seconds." % (time.time()-start))

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 25.8min


Training with total dataset needs 3872 seconds.


[Parallel(n_jobs=-1)]: Done 175 out of 175 | elapsed: 64.5min finished


In [28]:
cv_clf_final.best_params_

{'max_depth': 10, 'n_estimators': 1200}

In [29]:
cv_clf_final.cv_results_

{'mean_fit_time': array([ 24.65760036,  51.21040149,  78.71460047, 105.93700209,
        133.831004  , 158.75199842, 186.59739947,  30.82140031,
         63.62360497,  89.5189971 , 117.37220125, 145.21100307,
        173.40440149, 206.30680313,  32.41300025,  67.49680061,
         96.57380047, 127.80279846, 160.675002  , 192.21300106,
        231.39140511,  35.37700157,  73.58200049, 106.81480346,
        140.21700144, 175.41700048, 213.65020475, 245.88399696,
         38.55859776,  75.17020183, 112.98800106, 147.98739705,
        185.8455986 , 223.79459829, 233.88320017]),
 'std_fit_time': array([ 0.51771095,  0.8197866 ,  1.51369237,  1.70459985,  2.63219694,
         3.15518078,  3.26365441,  1.31804439,  1.21425327,  1.40346889,
         1.67045245,  1.87636457,  1.72765592,  2.22728743,  0.36628113,
         1.28454483,  1.12478465,  1.25063898,  3.0399443 ,  2.4703355 ,
         2.4857074 ,  0.86948596,  2.3025847 ,  1.1056532 ,  1.88637935,
         2.29648844,  3.46512077,  3.6

使用这些最优的参数值结合真个训练集训练出一个最终的随机森林模型。

In [31]:
total_data = pd.read_csv("../datasets/train_preprocess.csv")

In [32]:
total_X = total_data[feature_names]
total_y_true = total_data["Category"]

In [33]:
totalTargetEnc = LabelEncoder()
total_y_true = totalTargetEnc.fit_transform(total_y_true)

In [36]:
best_rf_clf = RandomForestClassifier(
    n_estimators=cv_clf_final.best_params_["n_estimators"],
    max_depth=cv_clf_final.best_params_["max_depth"],
    criterion=cv_clf_criterion.best_params_["criterion"], 
    min_samples_split=cv_clf_min_samples_split.best_params_["min_samples_split"], 
    max_features = cv_clf_max_features.best_params_["max_features"],
    bootstrap = cv_clf_bootstrap.best_params_["bootstrap"],
    n_jobs=-1,
    random_state=42
)

best_rf_clf.fit(total_X, total_y_true)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=80,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [43]:
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
valid_X = valid_data[feature_names]

In [44]:
y_pred_prob = np.round(best_rf_clf.predict_proba(valid_X), 4)

In [45]:
csv_output = pd.DataFrame(columns=TargetEnc.classes_, data=y_pred_prob)
csv_output.insert(0, 'Id', valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_best.csv', index=False)

训练缺省模型

In [46]:
base_rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    min_samples_split=20,
    max_features=0.8, 
    n_jobs=-1, random_state=42)
base_rf_clf.fit(total_X, total_y_true)
y_pred_prob_base = np.round(base_rf_clf.predict_proba(valid_X), 4)
csv_output = pd.DataFrame(columns=TargetEnc.classes_, data=y_pred_prob)
csv_output.insert(0, 'Id', valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_base.csv', index=False)