In [None]:
## These are tree-based models. Training them is time-consuming; if you only want the results, you can load the *.pkl files directly instead of refitting the models.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import joblib

In [None]:
# Read the encoded accident table; the first CSV column is used as the index.
accident = pd.read_csv(
    "./accident_encoded.csv",
    index_col=0,
)

In [None]:
# Positional columns 2..20 form the feature matrix; positional column 21 is
# the target vector.
X = accident.iloc[:, 2:21]
y = accident.iloc[:, 21]

# Column labels of the feature matrix. The feature-importance plots further
# down use `features` for their bar labels, so it has to exist before they run.
features = X.columns

# 80/20 hold-out split with a fixed seed so that a fresh
# "Restart Kernel -> Run All" always produces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("X_shape:", X.shape, " y_shape:", y.shape)
print("X_train_shape:", X_train.shape, " y_train_shape:", y_train.shape)
print("X_test_shape:", X_test.shape,"  y_test_shape:", y_test.shape)

## XGBoost

In [None]:
# Hyper-parameter grid handed to the XGBoost grid search. It is currently
# empty, so the search has nothing to vary. The candidate ranges below are
# kept for reference and can be moved into the dict to widen the search:
#   'n_estimators': [100,200,500,750,1000],
#   'max_depth': np.arange(5,8,1),
#   'min_child_weight': [1,3,5],
#   'gamma':[i/10.0 for i in range(0,5)],
#   'subsample':[i/10.0 for i in range(6,10)],
#   'colsample_bytree':[i/10.0 for i in range(6,10)],
#   'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
#   'learning_rate': [0.01, 0.02, 0.05, 0.1]
xgb_params = dict()

In [None]:
# Grid search (5-fold CV) over `xgb_params`, starting from an XGBoost
# classifier built with its default constructor arguments. The search is run
# on the full X / y.
xgb = XGBClassifier()
xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_params, cv=5).fit(X, y)

# Report the best cross-validated score and the refitted best estimator.
for template, value in (
    ("Best score: {}", xgb_grid.best_score_),
    ("Best params: {}", xgb_grid.best_estimator_),
):
    print(template.format(value))

In [None]:
# Feature importances of the best estimator selected by the grid search.
best_xgb = xgb_grid.best_estimator_
xgb_importance = best_xgb.feature_importances_
xgb_importance  # last expression: rendered as the cell output

In [None]:
# Bar chart of the XGBoost feature importances, one bar per feature column.
sns.set_style('darkgrid')
# The title names the model whose importances are plotted (XGBoost).
plt.title('Feature Importance of XGBoost', fontsize = 20)
# Bars are labelled with the column names of X, which line up one-to-one with
# the importance array; no separately defined `features` variable is needed.
sns.barplot(x=list(X.columns), y=xgb_importance)
plt.xlabel('Features')
plt.ylabel('Feature Importance')
# Vertical tick labels keep the feature names from overlapping.
plt.xticks(rotation=90)
plt.show()

## Decision Tree

In [None]:
# Depth-limited decision tree fitted on the full X / y; the fitted `dt` is
# what later cells read feature importances from.
dt = DecisionTreeClassifier(max_depth=8).fit(X, y)

# 5-fold cross-validation of the same tree configuration, summarised as the
# mean and standard deviation of the per-fold scores.
scores = cross_val_score(estimator=dt, X=X, y=y, cv=5)
print("mean: {:.3f} (std: {:.3f})".format(np.mean(scores), np.std(scores)))

In [None]:
# Per-feature importance scores of the fitted decision tree, stored for the
# plot and echoed as the cell output.
dt_importance = dt.feature_importances_
dt_importance

In [None]:
# Bar chart of the decision-tree feature importances, one bar per feature column.
sns.set_style('darkgrid')
plt.title('Feature Importance of Decision Tree', fontsize = 20)
# Bars are labelled with the column names of X, which line up one-to-one with
# the importance array; no separately defined `features` variable is needed.
sns.barplot(x=list(X.columns), y=dt_importance)
plt.xlabel('Features')
plt.ylabel('Feature Importance')
# Vertical tick labels keep the feature names from overlapping.
plt.xticks(rotation=90)
plt.show()

## Random Forest

In [None]:
# instantiate Random Forest model
# The model is bound to `forest`, the name the following cells read. The
# original spelling `forrest` is kept as an alias so existing references to it
# keep working.
forest = RandomForestClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=6,n_estimators=50)
forrest = forest

# build and fit model
forest.fit(X, y)
# Cross-validate the random forest itself (not the decision tree `dt`), so the
# printed score describes the model built in this cell.
scores = cross_val_score(forest, X, y, cv=5)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()))


In [None]:
# Per-feature importance scores of the fitted random forest, stored for the
# plot and echoed as the cell output.
forest_importance = forest.feature_importances_
forest_importance

In [None]:
# Bar chart of the random-forest feature importances, one bar per feature column.
sns.set_style('darkgrid')
plt.title('Feature Importance of Random Forest', fontsize = 20)
# Bars are labelled with the column names of X, which line up one-to-one with
# the importance array; no separately defined `features` variable is needed.
sns.barplot(x=list(X.columns), y=forest_importance)
plt.xlabel('Features')
plt.ylabel('Feature Importance')
# Vertical tick labels keep the feature names from overlapping.
plt.xticks(rotation=90)
plt.show()

## AdaBoost

In [None]:
# instantiate AdaBoost model, using the decision tree `dt` as its weak learner
# The tree is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it is
# the first constructor argument under either name.
adaboost = AdaBoostClassifier(dt)

# build and fit model
adaboost.fit(X, y)
# Cross-validate the boosted ensemble itself (not the bare tree `dt`), so the
# printed score describes the model built in this cell.
scores = cross_val_score(adaboost, X, y, cv=5)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()))

In [None]:
# Per-feature importance scores of the fitted AdaBoost ensemble, echoed as the
# cell output.
adaboost_importance = adaboost.feature_importances_
adaboost_importance

## GBDT

In [None]:
# instantiate gradient-boosted decision trees (GBDT) model
gbdt = GradientBoostingClassifier(learning_rate=0.5, max_depth=6, n_estimators=200)

# build and fit model
gbdt.fit(X, y)
# Cross-validate the gradient-boosting model itself (not the decision tree
# `dt`), so the printed score describes the model built in this cell.
scores = cross_val_score(gbdt, X, y, cv=5)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()))

In [None]:
# Per-feature importance scores of the fitted GBDT model, stored for the plot
# and echoed as the cell output.
gbdt_importance = gbdt.feature_importances_
gbdt_importance

In [None]:
# Bar chart of the GBDT feature importances, one bar per feature column.
sns.set_style('darkgrid')
plt.title('Feature Importance of GBDT', fontsize = 20)
# Bars are labelled with the column names of X, which line up one-to-one with
# the importance array; no separately defined `features` variable is needed.
sns.barplot(x=list(X.columns), y=gbdt_importance)
plt.xlabel('Features')
plt.ylabel('Feature Importance')
# Vertical tick labels keep the feature names from overlapping.
plt.xticks(rotation=90)
plt.show()