### Importing the libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

### Importing the dataset

In [2]:
dataset = pd.read_csv('Cleaned_Data.csv')
dataset.head().T

Unnamed: 0,0,1,2,3,4
VEHICLE_TYPE,station wagon/sport utility vehicle,sedan,station wagon/sport utility vehicle,station wagon/sport utility vehicle,sedan
VEHICLE_OCCUPANTS,1,2,2,1,2
DRIVER_SEX,M,M,F,M,M
PRE_CRASH,going straight ahead,merging,going straight ahead,passing,going straight ahead
POINT_OF_IMPACT,left front bumper,right front bumper,center front end,right front quarter panel,center front end
how_old,19,6,16,6,13
MAKE,toyt,merz,ford,toyt,toyt
Year,2019,2016,2016,2016,2019
Month,9,5,8,4,10
Week,4,1,3,1,4


In [3]:
dataset_trim = dataset.loc[dataset['CONTRIBUTING_FACTOR_1'].isin(dataset['CONTRIBUTING_FACTOR_1'].value_counts().index[dataset['CONTRIBUTING_FACTOR_1'].value_counts() > 40000])]

### Spliting the dataset 

In [4]:
X_training, X_test, y_training, y_test = train_test_split(dataset_trim.iloc[:,:-1], dataset_trim.iloc[:,-1],
                                                          test_size=0.05,
                                                          random_state = 42)

### Constructing a pipeline.

In [5]:
cat_columns = ['VEHICLE_TYPE','DRIVER_SEX','POINT_OF_IMPACT','MAKE','Month','Week','Hour']
num_columns = ['how_old']

num_pipeline = Pipeline([('standardize_num', StandardScaler()),
                        ])

cat_pipeline = Pipeline([('create_dummies_cats', OneHotEncoder(drop='first',handle_unknown='ignore'))
                         ])

processing_pipeline = ColumnTransformer(transformers=[('proc_numeric', num_pipeline, num_columns),
                                                      ('create_dummies', cat_pipeline, cat_columns)])

processing_pipeline

ColumnTransformer(transformers=[('proc_numeric',
                                 Pipeline(steps=[('standardize_num',
                                                  StandardScaler())]),
                                 ['how_old']),
                                ('create_dummies',
                                 Pipeline(steps=[('create_dummies_cats',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'))]),
                                 ['VEHICLE_TYPE', 'DRIVER_SEX',
                                  'POINT_OF_IMPACT', 'MAKE', 'Month', 'Week',
                                  'Hour'])])

### Modeling

In [6]:
clf1 = LogisticRegression(max_iter=5000)
clf2 = DecisionTreeClassifier()
clf3 = RandomForestClassifier()

pipe1 = Pipeline([('data_processing', processing_pipeline),('logreg', clf1)])
pipe2 = Pipeline([('data_processing', processing_pipeline),('tree', clf2)])
pipe3 = Pipeline([('data_processing', processing_pipeline),('rf', clf3)])

### Logistic regression

In [12]:
param_grid_lr= [
  {'logreg__class_weight': ['balanced'], 'logreg__C':[ 0.005, 0.01, 0.05, 0.1, 0.5 ],'logreg__solver':['saga','sag']}
 ]
gcv_lr = GridSearchCV(estimator=pipe1, param_grid=param_grid_lr, scoring=['accuracy'], cv=5, refit='accuracy',verbose=5)
gcv_lr = gcv_lr.fit(X_training, y_training)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.260) total time=   5.3s
[CV 2/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.260) total time=   5.7s
[CV 3/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.260) total time=   5.2s
[CV 4/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.260) total time=   5.6s
[CV 5/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.260) total time=   5.1s
[CV 1/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=sag; accuracy: (test=0.260) total time=   4.3s
[CV 2/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=sag; accuracy: (test=0.260) total time=   4.3s
[CV 3/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solv

In [13]:
gcv_lr.best_estimator_

Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('standardize_num',
                                                                   StandardScaler())]),
                                                  ['how_old']),
                                                 ('create_dummies',
                                                  Pipeline(steps=[('create_dummies_cats',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['VEHICLE_TYPE', 'DRIVER_SEX',
                                                   'POINT_OF_IMPACT', 'MAKE',
                                                   'Month', 'Week',
                                                   'Hour'])])),
      

In [14]:
ypred_lr = gcv_lr.predict(X_test)
print(classification_report(y_test, ypred_lr, digits=3))

                                precision    recall  f1-score   support

driver inattention/distraction      0.601     0.031     0.058     12589
 failure to yield right-of-way      0.210     0.370     0.268      3444
         following too closely      0.425     0.627     0.506      4853
                         other      0.167     0.258     0.203      1986
passing or lane usage improper      0.157     0.615     0.251      2047

                      accuracy                          0.260     24919
                     macro avg      0.312     0.380     0.257     24919
                  weighted avg      0.442     0.260     0.202     24919



In [15]:
confusion_matrix(y_test, ypred_lr)

array([[ 387, 3002, 3009, 1603, 4588],
       [  50, 1273,  568,  295, 1258],
       [ 101,  876, 3041,  443,  392],
       [  57,  434,  480,  513,  502],
       [  49,  464,   64,  211, 1259]], dtype=int64)

### Observations
- When we use a logistic regression to classify the contributing factor we get a macro average of 0.312
- When it comes to driver's inattention the model has a 60% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42.5% precision.
- When it comes to improper lane usage the model has a 15.7% precision.
- For others the model has a 16.7% precision.

### Decision tree 

In [16]:
param_grid_tree = [
  {'tree__max_depth': [10, 25, 50],
   'tree__min_samples_split':[10, 25, 50, 100],
   'tree__min_samples_leaf': [50,100],
   'tree__class_weight':['balanced']
  }
 ]

gcv_tree = GridSearchCV(estimator=pipe2, param_grid=param_grid_tree, scoring='accuracy', refit=True, verbose=10)
gcv_tree = gcv_tree.fit(X_training, y_training)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 1/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.248 total time=   2.6s
[CV 2/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 2/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.250 total time=   2.6s
[CV 3/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 3/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.250 total time=   2.7s
[CV 4/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree

[CV 5/5; 6/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=25;, score=0.251 total time=   2.6s
[CV 1/5; 7/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50
[CV 1/5; 7/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50;, score=0.246 total time=   2.7s
[CV 2/5; 7/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50
[CV 2/5; 7/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50;, score=0.250 total time=   2.7s
[CV 3/5; 7/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50
[CV 3/5; 7/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=100, tree__min_samples_split=50;, score=0.249 total time=   2.6s
[

[CV 5/5; 12/24] END tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=50, tree__min_samples_split=100;, score=0.257 total time=  13.8s
[CV 1/5; 13/24] START tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10
[CV 1/5; 13/24] END tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10;, score=0.257 total time=  12.0s
[CV 2/5; 13/24] START tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10
[CV 2/5; 13/24] END tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10;, score=0.256 total time=  11.7s
[CV 3/5; 13/24] START tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10
[CV 3/5; 13/24] END tree__class_weight=balanced, tree__max_depth=25, tree__min_samples_leaf=100, tree__min_samples_split=10;, score=0.257 total time=  

[CV 4/5; 18/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=25;, score=0.259 total time=  18.3s
[CV 5/5; 18/24] START tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=25
[CV 5/5; 18/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=25;, score=0.261 total time=  18.4s
[CV 1/5; 19/24] START tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=50
[CV 1/5; 19/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=50;, score=0.259 total time=  18.4s
[CV 2/5; 19/24] START tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=50
[CV 2/5; 19/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=50, tree__min_samples_split=50;, score=0.256 total time=  18.6s
[

[CV 3/5; 24/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=100, tree__min_samples_split=100;, score=0.259 total time=  13.3s
[CV 4/5; 24/24] START tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=100, tree__min_samples_split=100
[CV 4/5; 24/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=100, tree__min_samples_split=100;, score=0.260 total time=  13.8s
[CV 5/5; 24/24] START tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=100, tree__min_samples_split=100
[CV 5/5; 24/24] END tree__class_weight=balanced, tree__max_depth=50, tree__min_samples_leaf=100, tree__min_samples_split=100;, score=0.258 total time=  13.6s


In [17]:
gcv_tree.best_estimator_

Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('standardize_num',
                                                                   StandardScaler())]),
                                                  ['how_old']),
                                                 ('create_dummies',
                                                  Pipeline(steps=[('create_dummies_cats',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['VEHICLE_TYPE', 'DRIVER_SEX',
                                                   'POINT_OF_IMPACT', 'MAKE',
                                                   'Month', 'Week',
                                                   'Hour'])])),
      

In [18]:
ypred_tree = gcv_tree.predict(X_test)
print(classification_report(y_test, ypred_tree, digits=3))

                                precision    recall  f1-score   support

driver inattention/distraction      0.563     0.057     0.104     12589
 failure to yield right-of-way      0.211     0.317     0.254      3444
         following too closely      0.411     0.615     0.493      4853
                         other      0.138     0.300     0.189      1986
passing or lane usage improper      0.151     0.509     0.233      2047

                      accuracy                          0.258     24919
                     macro avg      0.295     0.360     0.255     24919
                  weighted avg      0.417     0.258     0.218     24919



In [19]:
confusion_matrix(y_test, ypred_tree)

array([[ 720, 2599, 3101, 2241, 3928],
       [ 191, 1093,  584,  520, 1056],
       [ 158,  605, 2987,  639,  464],
       [  86,  390,  509,  596,  405],
       [ 123,  484,   80,  318, 1042]], dtype=int64)

### Observations
- When we use a decision tree to classify the contributing factor we get a macro average of 0.295
- When it comes to driver's inattention the model has a 56% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42% precision.
- When it comes to improper lane usage the model has a 15% precision.
- For others the model has a 13.8% precision.

### Random Forest

In [20]:
param_grid_rf = [{'rf__max_depth': [5, 10, 15],
               'rf__n_estimators': [10, 50, 100],
               'rf__class_weight': ['balanced', 'balanced_subsample'],
               'rf__max_samples': [1000, 2000, 5000]
              }]

gcv_rf = GridSearchCV(estimator=pipe3, param_grid=param_grid_rf, scoring='accuracy', refit=True, verbose=10)
gcv_rf = gcv_rf.fit(X_training, y_training)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 1/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.255 total time=   1.5s
[CV 2/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 2/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.264 total time=   1.5s
[CV 3/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 3/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.258 total time=   1.5s
[CV 4/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 4/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estim

[CV 4/5; 7/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10;, score=0.279 total time=   1.6s
[CV 5/5; 7/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10
[CV 5/5; 7/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10;, score=0.253 total time=   1.5s
[CV 1/5; 8/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=50
[CV 1/5; 8/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=50;, score=0.248 total time=   3.0s
[CV 2/5; 8/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=50
[CV 2/5; 8/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=50;, score=0.249 total time=   3.0s
[CV 3/5; 8/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=50
[CV 3/5; 8/54] END r

[CV 3/5; 14/54] END rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=50;, score=0.327 total time=   3.8s
[CV 4/5; 14/54] START rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=50
[CV 4/5; 14/54] END rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=50;, score=0.333 total time=   3.8s
[CV 5/5; 14/54] START rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=50
[CV 5/5; 14/54] END rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=50;, score=0.334 total time=   3.8s
[CV 1/5; 15/54] START rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=100
[CV 1/5; 15/54] END rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=100;, score=0.342 total time=   6.4s
[CV 2/5; 15/54] START rf__class_weight=balanced, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=100
[

[CV 1/5; 21/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100;, score=0.485 total time=   7.4s
[CV 2/5; 21/54] START rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100
[CV 2/5; 21/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100;, score=0.482 total time=   7.5s
[CV 3/5; 21/54] START rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100
[CV 3/5; 21/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100;, score=0.489 total time=   7.4s
[CV 4/5; 21/54] START rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100
[CV 4/5; 21/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=100;, score=0.486 total time=   7.5s
[CV 5/5; 21/54] START rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=

[CV 4/5; 27/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=100;, score=0.372 total time=  11.3s
[CV 5/5; 27/54] START rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=100
[CV 5/5; 27/54] END rf__class_weight=balanced, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=100;, score=0.363 total time=  11.4s
[CV 1/5; 28/54] START rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 1/5; 28/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.265 total time=   1.4s
[CV 2/5; 28/54] START rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 2/5; 28/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.253 total time=   1.4s
[CV 3/5; 28/54] START rf__class_weight=balanced_subsample, rf__max_depth=5

[CV 5/5; 33/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=2000, rf__n_estimators=100;, score=0.263 total time=   6.0s
[CV 1/5; 34/54] START rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10
[CV 1/5; 34/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10;, score=0.242 total time=   1.4s
[CV 2/5; 34/54] START rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10
[CV 2/5; 34/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10;, score=0.254 total time=   1.4s
[CV 3/5; 34/54] START rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10
[CV 3/5; 34/54] END rf__class_weight=balanced_subsample, rf__max_depth=5, rf__max_samples=5000, rf__n_estimators=10;, score=0.264 total time=   1.4s
[CV 4/5; 34/54] START rf__class_weight=balanced_s

[CV 1/5; 40/54] END rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10;, score=0.310 total time=   1.6s
[CV 2/5; 40/54] START rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10
[CV 2/5; 40/54] END rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10;, score=0.326 total time=   1.5s
[CV 3/5; 40/54] START rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10
[CV 3/5; 40/54] END rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10;, score=0.311 total time=   1.6s
[CV 4/5; 40/54] START rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10
[CV 4/5; 40/54] END rf__class_weight=balanced_subsample, rf__max_depth=10, rf__max_samples=2000, rf__n_estimators=10;, score=0.302 total time=   1.6s
[CV 5/5; 40/54] START rf__class_weight=bala

[CV 2/5; 46/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10;, score=0.400 total time=   1.7s
[CV 3/5; 46/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10
[CV 3/5; 46/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10;, score=0.394 total time=   1.7s
[CV 4/5; 46/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10
[CV 4/5; 46/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10;, score=0.400 total time=   1.7s
[CV 5/5; 46/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10
[CV 5/5; 46/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=1000, rf__n_estimators=10;, score=0.401 total time=   1.7s
[CV 1/5; 47/54] START rf__class_weight=bala

[CV 3/5; 52/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=10;, score=0.337 total time=   2.0s
[CV 4/5; 52/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=10
[CV 4/5; 52/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=10;, score=0.344 total time=   2.0s
[CV 5/5; 52/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=10
[CV 5/5; 52/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=10;, score=0.334 total time=   2.0s
[CV 1/5; 53/54] START rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=50
[CV 1/5; 53/54] END rf__class_weight=balanced_subsample, rf__max_depth=15, rf__max_samples=5000, rf__n_estimators=50;, score=0.370 total time=   6.7s
[CV 2/5; 53/54] START rf__class_weight=bala

In [21]:
gcv_rf.best_estimator_

Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('standardize_num',
                                                                   StandardScaler())]),
                                                  ['how_old']),
                                                 ('create_dummies',
                                                  Pipeline(steps=[('create_dummies_cats',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['VEHICLE_TYPE', 'DRIVER_SEX',
                                                   'POINT_OF_IMPACT', 'MAKE',
                                                   'Month', 'Week',
                                                   'Hour'])])),
      

In [22]:
ypred_rf = gcv_rf.predict(X_test)
print(classification_report(y_test, ypred_rf, digits=3))

                                precision    recall  f1-score   support

driver inattention/distraction      0.542     0.705     0.613     12589
 failure to yield right-of-way      0.210     0.048     0.078      3444
         following too closely      0.419     0.643     0.507      4853
                         other      0.311     0.016     0.031      1986
passing or lane usage improper      0.166     0.018     0.033      2047

                      accuracy                          0.490     24919
                     macro avg      0.329     0.286     0.252     24919
                  weighted avg      0.423     0.490     0.424     24919



In [29]:
confusion_matrix(y_test, ypred_rf)

array([[8870,  388, 3147,   52,  132],
       [2665,  164,  578,    4,   33],
       [1590,  125, 3119,    6,   13],
       [1356,   47,  543,   32,    8],
       [1880,   57,   64,    9,   37]], dtype=int64)

### Observations
- When we use a Random Forest to classify the contributing factor we get a macro average of 0.329
- When it comes to driver's inattention the model has a 54% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42% precision.
- When it comes to improper lane usage the model has a 16.6% precision.
- For others the model has a 31% precision.

### Gradient Boosting Classifier

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
pipe5 = Pipeline([('processing', processing_pipeline),
             ('gb', GradientBoostingClassifier())])

params_gb = {'gb__max_depth': [1, 3],
         'gb__n_estimators': [10, 25]
         }

gcv_gb = GridSearchCV(pipe5, param_grid = params_gb, cv=2, scoring='accuracy', verbose=10)
gcv_gb = gcv_gb.fit(X_training, y_training)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2; 1/4] START gb__max_depth=1, gb__n_estimators=10........................
[CV 1/2; 1/4] END gb__max_depth=1, gb__n_estimators=10;, score=0.504 total time=   4.5s
[CV 2/2; 1/4] START gb__max_depth=1, gb__n_estimators=10........................
[CV 2/2; 1/4] END gb__max_depth=1, gb__n_estimators=10;, score=0.504 total time=   4.5s
[CV 1/2; 2/4] START gb__max_depth=1, gb__n_estimators=25........................
[CV 1/2; 2/4] END gb__max_depth=1, gb__n_estimators=25;, score=0.504 total time=  10.4s
[CV 2/2; 2/4] START gb__max_depth=1, gb__n_estimators=25........................
[CV 2/2; 2/4] END gb__max_depth=1, gb__n_estimators=25;, score=0.504 total time=  10.1s
[CV 1/2; 3/4] START gb__max_depth=3, gb__n_estimators=10........................
[CV 1/2; 3/4] END gb__max_depth=3, gb__n_estimators=10;, score=0.504 total time=   8.3s
[CV 2/2; 3/4] START gb__max_depth=3, gb__n_estimators=10........................
[CV 2/2; 3/4] E

In [8]:
gcv_gb.best_estimator_

Pipeline(steps=[('processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('standardize_num',
                                                                   StandardScaler())]),
                                                  ['how_old']),
                                                 ('create_dummies',
                                                  Pipeline(steps=[('create_dummies_cats',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['VEHICLE_TYPE', 'DRIVER_SEX',
                                                   'POINT_OF_IMPACT', 'MAKE',
                                                   'Month', 'Week',
                                                   'Hour'])])),
           

In [9]:
ypred_gb = gcv_gb.predict(X_test)
print(classification_report(y_test, ypred_gb, digits=3))

                                precision    recall  f1-score   support

driver inattention/distraction      0.511     0.966     0.668     12589
 failure to yield right-of-way      0.000     0.000     0.000      3444
         following too closely      0.454     0.104     0.169      4853
                         other      0.333     0.001     0.001      1986
passing or lane usage improper      0.000     0.000     0.000      2047

                      accuracy                          0.508     24919
                     macro avg      0.260     0.214     0.168     24919
                  weighted avg      0.373     0.508     0.371     24919



In [16]:
confusion_matrix(y_test, ypred_gb)

array([[12157,     0,   430,     2,     0],
       [ 3359,     0,    85,     0,     0],
       [ 4349,     0,   504,     0,     0],
       [ 1902,     0,    83,     1,     0],
       [ 2039,     0,     8,     0,     0]], dtype=int64)

### Observations
- When we use a gradient boosting to classify the contributing factor we get a macro average of 0.260
- When it comes to driver's inattention the model has a 51% precision.
- When it comes to failure to yield right-of-way the model has a 0% precision.
- When it comes to following too closely the model has a 45.4% precision.
- When it comes to improper lane usage the model has a 0% precision.
- For others the model has a 33.3% precision.

In [10]:
tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=1)
abc = AdaBoostClassifier(base_estimator=tree, n_estimators=500, learning_rate=0.1, random_state=1)
pipe4 = Pipeline([('data_processing', processing_pipeline),('tree', abc)])
pipe4.fit(X_training,y_training)
abc_predicted = pipe4.predict(X_test)

In [11]:
print(classification_report(y_test, abc_predicted, digits=3))

                                precision    recall  f1-score   support

driver inattention/distraction      0.522     0.866     0.651     12589
 failure to yield right-of-way      0.000     0.000     0.000      3444
         following too closely      0.436     0.361     0.395      4853
                         other      0.000     0.000     0.000      1986
passing or lane usage improper      0.000     0.000     0.000      2047

                      accuracy                          0.508     24919
                     macro avg      0.192     0.245     0.209     24919
                  weighted avg      0.349     0.508     0.406     24919



### Observations
- When we use a Adaboosting classifier to classify the contributing factor we get a macro average of 0.192
- When it comes to driver's inattention the model has a 54% precision.
- When it comes to failure to yield right-of-way the model has a 0% precision.
- When it comes to following too closely the model has a 43.6% precision.
- When it comes to improper lane usage the model has a 0% precision.
- For others the model has a 0% precision.

### Summary
I would use random forest or logistic regression as the final model. As both of these model identify the less frequently occuring classes more accurately compared to the complex algorithms like gradient boosting algortihm or adaboosting algorithm.