In [None]:
import pandas as pd
import numpy as np
import time
import preprocess as datapre

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss, make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the full training table from CSV (the file name suggests it was already
# preprocessed by an earlier step — NOTE(review): confirm which script writes it).
origin_data = pd.read_csv("../datasets/train_preprocess.csv")

In [None]:
# Derive the frame that the sample-based grid searches below are fitted on.
# NOTE(review): `dataset_sample` comes from the local `preprocess` module; how it
# samples (size, stratification, seed) is not visible here — confirm.
train_data = datapre.dataset_sample(origin_data)

In [None]:
# Columns fed to the model as inputs; the same list is reused for the sampled,
# full and prediction frames so the column order stays consistent.
feature_names = [
    'Year',
    'Month',
    'Hour',
    'DayOfWeekID',
    'PdDistrictID',
    'HasBlock',
    'RoadTypeID',
    'RoadBlockID',
    'RoadName1ID',
    'RoadName2ID',
    'X',
    'Y',
]

In [None]:
# Split the sampled frame into the target column and the model inputs.
y_true = train_data["Category"]
X = train_data[feature_names]

In [None]:
# Encode the `Category` labels of the sampled frame as integer class ids.
# The encoder is fitted on train_data["Category"] directly instead of on y_true,
# so re-running this cell never refits it on already-encoded integers and
# TargetEnc.classes_ always holds the original label values.
TargetEnc = LabelEncoder()
y_true = TargetEnc.fit_transform(train_data["Category"])

In [None]:
def neg_log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    """Return the negated log loss so that larger values mean better models.

    All arguments are forwarded unchanged to ``sklearn.metrics.log_loss``.

    Parameters
    ----------
    y_true : true labels, passed through to ``log_loss``.
    y_pred : predicted probabilities, passed through to ``log_loss``.
    eps, normalize, sample_weight, labels : passed through to ``log_loss``
        under the same names.

    Returns
    -------
    The value of ``log_loss(...)`` multiplied by -1.
    """
    # Everything after y_pred is passed by keyword: scikit-learn made these
    # parameters keyword-only, so the positional form raises TypeError there.
    # NOTE(review): recent scikit-learn releases deprecate `eps`; confirm against
    # the installed version before upgrading.
    return -log_loss(y_true, y_pred, eps=eps, normalize=normalize,
                     sample_weight=sample_weight, labels=labels)

In [None]:
# Wrap neg_log_loss as a scorer object for GridSearchCV; needs_proba=True asks
# for the estimator's probability output to be passed as y_pred.
# NOTE(review): `needs_proba` is reported as deprecated in recent scikit-learn in
# favour of `response_method` — confirm against the installed version.
call_neg_log_loss = make_scorer(neg_log_loss, needs_proba=True)

In [None]:
# Starting hyper-parameters for the random forest. Later cells overwrite
# individual entries with the winners of each grid search.
base_params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 20,
    "max_features": 0.5,
    "n_jobs": -1,
    "random_state": 42,
}

In [None]:
# Search 1: compare the two split criteria on the sampled data, with every
# other setting held at base_params. refit=False: only the scores are needed.
param_grid = {"criterion": ["gini", "entropy"]}
rfclf = RandomForestClassifier(**base_params)

cv_clf_criterion = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    cv=5,
    n_jobs=-1,
    verbose=4,
    refit=False,
    return_train_score=True,
)

# Time the whole search and report it in whole seconds.
start = time.time()
cv_clf_criterion.fit(X, y_true)
print("Training needs %d seconds." % (time.time()-start))

In [None]:
# Display the criterion setting chosen by the search.
cv_clf_criterion.best_params_

In [None]:
# Display the raw cross-validation results recorded for every criterion candidate.
cv_clf_criterion.cv_results_

In [None]:
# Search 2: lock in the criterion chosen by the previous search, then try
# min_samples_split values 20, 40, ..., 100 on the sampled data.
base_params["criterion"] = cv_clf_criterion.best_params_["criterion"]
param_grid = {"min_samples_split": [20, 40, 60, 80, 100]}

rfclf = RandomForestClassifier(**base_params)
cv_clf_min_samples_split = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    cv=5,
    n_jobs=-1,
    verbose=4,
    refit=False,
    return_train_score=True,
)

# Time the whole search and report it in whole seconds.
start = time.time()
cv_clf_min_samples_split.fit(X, y_true)
print("Training needs %d seconds." % (time.time()-start))

In [None]:
# Display the min_samples_split value chosen by the search.
cv_clf_min_samples_split.best_params_

In [None]:
# Display the raw cross-validation results for every min_samples_split candidate.
cv_clf_min_samples_split.cv_results_

In [None]:
# Search 3: with criterion and min_samples_split fixed, try max_features
# fractions 0.5, 0.6, ..., 1.0 on the sampled data.
param_grid = {"max_features": [round(i*0.1,1) for i in range(5, 11)]}

base_params["min_samples_split"] = cv_clf_min_samples_split.best_params_["min_samples_split"]
rfclf = RandomForestClassifier(**base_params)
cv_clf_max_features = GridSearchCV(estimator=rfclf, param_grid=param_grid, 
                                scoring=call_neg_log_loss, 
                                n_jobs=-1, cv=5, verbose=4, return_train_score=True, refit=False)

start = time.time()
cv_clf_max_features.fit(X, y_true)
# This search is fitted on the sampled X / y_true, not on the full dataset, so
# the message matches the one used by the other sample-based searches.
print("Training needs %d seconds." % (time.time()-start))

In [None]:
# Display the max_features value chosen by the search.
cv_clf_max_features.best_params_

In [None]:
# Display the raw cross-validation results for every max_features candidate.
cv_clf_max_features.cv_results_

In [None]:
# Search 4: lock in the max_features winner, then compare training with and
# without bootstrap resampling on the sampled data.
base_params["max_features"] = cv_clf_max_features.best_params_["max_features"]
param_grid = {"bootstrap": [True, False]}

rfclf = RandomForestClassifier(**base_params)
cv_clf_bootstrap = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    cv=5,
    n_jobs=-1,
    verbose=4,
    refit=False,
    return_train_score=True,
)

# Time the whole search and report it in whole seconds.
start = time.time()
cv_clf_bootstrap.fit(X, y_true)
print("Training needs %d seconds." % (time.time()-start))

In [None]:
# Display the bootstrap setting chosen by the search.
cv_clf_bootstrap.best_params_

In [None]:
# Display the raw cross-validation results for both bootstrap candidates.
cv_clf_bootstrap.cv_results_

#### 结合已经找到的最优参数，使用整个训练集进行网格搜索交叉验证找到n_estimators和max_depth的最优值，并得到最优模型。

In [None]:
# Same target/input split as for the sample, this time on the complete frame.
total_y_true = origin_data["Category"]
total_X = origin_data[feature_names]

In [None]:
# Encode the `Category` labels of the complete frame as integer class ids.
# The encoder is fitted on origin_data["Category"] directly instead of on
# total_y_true, so re-running this cell never refits it on already-encoded
# integers and totalTargetEnc.classes_ always holds the original label values.
totalTargetEnc = LabelEncoder()
total_y_true = totalTargetEnc.fit_transform(origin_data["Category"])

In [None]:
# Final search on the complete training data: lock in the bootstrap winner, then
# search n_estimators (200, 400, ..., 1400) jointly with max_depth (6..10).
# refit=True so the search object ends up holding a model trained with the
# best combination.
base_params["bootstrap"] = cv_clf_bootstrap.best_params_["bootstrap"]
param_grid = {
    "n_estimators": list(range(200, 1500, 200)),
    "max_depth": list(range(6, 11)),
}

rfclf = RandomForestClassifier(**base_params)
cv_clf_final = GridSearchCV(
    estimator=rfclf,
    param_grid=param_grid,
    scoring=call_neg_log_loss,
    cv=5,
    n_jobs=-1,
    verbose=4,
    refit=True,
    return_train_score=True,
)

# Time the whole search and report it in whole seconds.
start = time.time()
cv_clf_final.fit(total_X, total_y_true)
print("Training needs %d seconds." % (time.time()-start))

In [None]:
# Display the n_estimators / max_depth combination chosen by the final search.
cv_clf_final.best_params_

In [None]:
# Display the raw cross-validation results for every n_estimators / max_depth pair.
cv_clf_final.cv_results_

In [None]:
# Fold the winners of the final search back into base_params and show the
# complete tuned configuration.
final_best = cv_clf_final.best_params_
base_params.update(
    n_estimators=final_best["n_estimators"],
    max_depth=final_best["max_depth"],
)
print(base_params)

In [None]:
# The final search was created with refit=True, so it exposes a fitted model
# for the best parameter combination; keep a handle to it for prediction.
best_rf_clf = cv_clf_final.best_estimator_

In [None]:
# Load the rows to predict on and select the same feature columns, in the same
# order, that the models were fitted with.
# NOTE(review): despite the `valid_` prefix this frame is read from
# test_preprocess.csv and no label column is taken from it.
valid_data = pd.read_csv("../datasets/test_preprocess.csv")
valid_X = valid_data[feature_names]

In [None]:
# Class probabilities from the tuned model, rounded to 4 decimal places.
y_pred_prob = best_rf_clf.predict_proba(valid_X).round(4)

In [None]:
# Write the tuned model's probabilities, one column per class plus a leading Id.
# best_rf_clf was fitted on labels encoded by totalTargetEnc (the full-data
# encoder), not by the sample-only TargetEnc, so the headers are obtained by
# mapping the model's own class ids back to label names through that encoder.
# This keeps headers aligned with the predict_proba columns even if the sample
# did not contain every category.
csv_output = pd.DataFrame(columns=totalTargetEnc.inverse_transform(best_rf_clf.classes_),
                          data=y_pred_prob)
csv_output.insert(0, 'Id', valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_best.csv', index=False)

---

训练缺省模型

In [None]:
# Baseline for comparison: a forest with fixed, hand-picked settings, fitted on
# the complete training data.
base_rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    min_samples_split=20,
    max_features=0.8, 
    n_jobs=-1, random_state=42)
base_rf_clf.fit(total_X, total_y_true)
y_pred_prob_base = np.round(base_rf_clf.predict_proba(valid_X), 4)
# Export the baseline's own predictions (y_pred_prob_base); the tuned model's
# y_pred_prob must not be written to this file. Headers are mapped from the
# baseline model's class ids through totalTargetEnc, the encoder its training
# labels came from.
csv_output = pd.DataFrame(columns=totalTargetEnc.inverse_transform(base_rf_clf.classes_),
                          data=y_pred_prob_base)
csv_output.insert(0, 'Id', valid_data['Id'])
csv_output.to_csv('../results/RandomForestClf_base.csv', index=False)