In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, cohen_kappa_score


import warnings
import pickle
import pandas as pd
warnings.filterwarnings('ignore')

In [10]:
# Load the cleaned accidents dataset created in the Data Understanding step.
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
ACCIDENTS_CSV = r"C:\Users\DETCAO03\V-Case study\02_Dataset\Used\Cleaned_dataset_accidents.csv"
accidents = pd.read_csv(
    ACCIDENTS_CSV,
    encoding='utf-8',
    low_memory=False,
)

In [11]:
# Separate the response column from the influencing (feature) columns.
TARGET_COLUMN = "Accident_Severity"
y = accidents[TARGET_COLUMN]
X = accidents.drop(columns=[TARGET_COLUMN])

# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Candidate models, all fitted on the same training split.
# A fixed seed makes the tree-based models repeatable across a
# "Restart Kernel -> Run All"; without it every run gives different trees.
SEED = 0

dt = DecisionTreeClassifier(criterion='gini', min_samples_split=30, splitter='best', random_state=SEED)
# NOTE(review): LinearRegression is a regressor fitted on the same target as
# the classifiers -- confirm this is intended.
lin_reg = LinearRegression()
log_reg = LogisticRegression()
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=200, random_state=SEED)
# svc_model = SVC()

# fit() trains the estimator in place and returns the estimator itself,
# so re-assigning the result is unnecessary.
for model in (dt, lin_reg, log_reg, nb, rf):
    model.fit(X_train, y_train)
# svc_model.fit(X_train, y_train)

In [None]:
# Random Forest: evaluate the fitted model on the held-out test set.
# Predict once and derive every metric from the same predictions; each
# rf.score() call would run the whole forest over X_test again.
y_pred = rf.predict(X_test)
acc_rf = round(accuracy_score(y_test, y_pred) * 100, 2)
sk_report = classification_report(digits=6, y_true=y_test, y_pred=y_pred)
print("Random Forest")
print("Accuracy", acc_rf)
print("Cohen Kappa: "+str(cohen_kappa_score(y_test,y_pred)))
print("\n")
print(sk_report)

# Confusion matrix: actual classes as rows, predicted classes as columns, plus totals.
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Five most important features according to the fitted random forest.
# The importances are labelled with the columns the forest was trained on
# (X_train), so the index length always matches rf.feature_importances_.
feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
# figsize is passed straight to the pandas plot call, so no separate
# pyplot figure has to be created first.
ax = feat_importances.nlargest(5).plot(kind='barh', figsize=(12, 6))
ax.set_xlabel("Feature importance")
ax.set_title("Random Forest: top 5 feature importances");

In [None]:
# Naive Bayes: fit on the training split, then score the held-out test set.
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

nb_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
nb_kappa = cohen_kappa_score(y_test, y_pred)
sk_report = classification_report(y_true=y_test, y_pred=y_pred, digits=6)

print("NB")
print("Accuracy", nb_accuracy)
print("Cohen Kappa: " + str(nb_kappa))
print("\n")
print(sk_report)

# Confusion matrix: actual classes as rows, predicted classes as columns, plus totals.
pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Persist every fitted model to its own pickle file in the working directory.
# The context manager guarantees each file is flushed and closed, even if
# pickling raises part-way through.
models_to_save = {
    'lin_model.pkl': lin_reg,
    'log_model.pkl': log_reg,
    'dt_model.pkl': dt,
    'nb_model.pkl': nb,
    'rf_model.pkl': rf,
    # 'svc_model.pkl': svc_model,
}
for file_name, fitted_model in models_to_save.items():
    with open(file_name, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [None]:
# Preview the first five rows of the loaded dataset.
accidents.head(n=5)

In [None]:
# Hyper-parameter search space for the random forest.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [4, 5],
    'min_samples_leaf': [5, 10, 15],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}
# Separate, unfitted base estimator: binding it to `rf` would overwrite the
# fitted forest that the evaluation and pickling cells rely on.
rf_base = RandomForestClassifier(random_state=0)
# Randomized search over the space above with 3-fold cross-validation on all
# cores (n_iter is left at its default). The fixed random_state makes the
# sampled parameter combinations repeatable between runs.
grid_search = RandomizedSearchCV(estimator=rf_base, param_distributions=param_grid,
                                 cv=3, n_jobs=-1, verbose=2, random_state=0)

grid_search.fit(X_train, y_train)

In [None]:
# Evaluate the model selected by the randomized search on the held-out test set.
y_pred = grid_search.predict(X_test)
# Accuracy is derived from the predictions above instead of predicting the
# test set a second time through score().
acc_r1 = round(accuracy_score(y_test, y_pred) * 100, 2)

sk_report = classification_report(
    digits=6,
    y_true=y_test,
    y_pred=y_pred)
print("Cohen Kappa: "+str(cohen_kappa_score(y_test,y_pred)))
# Report the tuned model's accuracy (acc_r1), not the baseline forest's acc_rf.
print("Accuracy" , acc_r1)
print(sk_report)
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)