### Importing the libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

### Importing the dataset

In [2]:
dataset = pd.read_csv('Cleaned_Data.csv')
dataset.head().T

Unnamed: 0,0,1,2,3,4
VEHICLE_TYPE,station wagon/sport utility vehicle,sedan,station wagon/sport utility vehicle,station wagon/sport utility vehicle,sedan
VEHICLE_OCCUPANTS,1.0,2.0,2.0,1.0,2.0
DRIVER_SEX,M,M,F,M,M
PRE_CRASH,going straight ahead,merging,going straight ahead,passing,going straight ahead
POINT_OF_IMPACT,left front bumper,right front bumper,center front end,right front quarter panel,center front end
CONTRIBUTING_FACTOR_1,driver inattention/distraction,driver inattention/distraction,driver inattention/distraction,unsafe lane changing,driver inattention/distraction
VEHICLE_DAMAGE_cleaned,left front quarter panel,right front bumper,center front end,no damage,no damage
how_old,19.0,6.0,16.0,6.0,13.0
MAKE,toyt,merz,ford,toyt,toyt
Year,2019,2016,2016,2016,2019


### Splitting the dataset

In [3]:
dataset_trim = dataset.loc[dataset['CONTRIBUTING_FACTOR_1'].isin(dataset['CONTRIBUTING_FACTOR_1'].value_counts().index[dataset['CONTRIBUTING_FACTOR_1'].value_counts() > 500])]
                                                                                                                               

In [4]:
dataset_trim

Unnamed: 0,VEHICLE_TYPE,VEHICLE_OCCUPANTS,DRIVER_SEX,PRE_CRASH,POINT_OF_IMPACT,CONTRIBUTING_FACTOR_1,VEHICLE_DAMAGE_cleaned,how_old,MAKE,Year,Month,Week,Hour
0,station wagon/sport utility vehicle,1.0,M,going straight ahead,left front bumper,driver inattention/distraction,left front quarter panel,19.0,toyt,2019,9,4,8
1,sedan,2.0,M,merging,right front bumper,driver inattention/distraction,right front bumper,6.0,merz,2016,5,1,17
2,station wagon/sport utility vehicle,2.0,F,going straight ahead,center front end,driver inattention/distraction,center front end,16.0,ford,2016,8,3,12
3,station wagon/sport utility vehicle,1.0,M,passing,right front quarter panel,unsafe lane changing,no damage,6.0,toyt,2016,4,1,15
4,sedan,2.0,M,going straight ahead,center front end,driver inattention/distraction,no damage,13.0,toyt,2019,10,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
779251,other,1.0,M,going straight ahead,center front end,failure to yield right-of-way,no damage,24.0,ptr,2021,11,3,14
779252,box truck,1.0,M,going straight ahead,right front bumper,driver inattention/distraction,right front bumper,1.0,frht,2021,11,1,14
779253,sedan,1.0,F,going straight ahead,no damage,other,no damage,13.0,toyt,2021,11,3,17
779254,station wagon/sport utility vehicle,1.0,M,making u turn,right front bumper,turning improperly,right front bumper,17.0,niss,2021,12,1,19


### Constructing  a pipeline

In [5]:
y = dataset_trim['CONTRIBUTING_FACTOR_1']
dataset_trim.drop(['CONTRIBUTING_FACTOR_1'],inplace=True, axis = 1)
x = dataset_trim

In [6]:
X_training, X_test, y_training, y_test = train_test_split(x, y,
                                                          test_size=0.05,
                                                          random_state = 42)

In [7]:
y_training

431917                             other
51255     driver inattention/distraction
55537     driver inattention/distraction
143733    passing or lane usage improper
201196               driver inexperience
                       ...              
259178    driver inattention/distraction
365838                  backing unsafely
131932     failure to yield right-of-way
671155                             other
121958    passing or lane usage improper
Name: CONTRIBUTING_FACTOR_1, Length: 740293, dtype: object

In [8]:
cat_columns = ['VEHICLE_TYPE','DRIVER_SEX','POINT_OF_IMPACT','MAKE','Month','Week','Hour']
num_columns = ['how_old']

num_pipeline = Pipeline([('standardize_num', StandardScaler()),
                        ])

cat_pipeline = Pipeline([('create_dummies_cats', OneHotEncoder(drop='first',handle_unknown='ignore'))
                         ])

processing_pipeline = ColumnTransformer(transformers=[('proc_numeric', num_pipeline, num_columns),
                                                      ('create_dummies', cat_pipeline, cat_columns)])

processing_pipeline

### Modeling

In [9]:
clf1 = LogisticRegression(max_iter=5000)
clf2 = DecisionTreeClassifier()
clf3 = RandomForestClassifier()

pipe1 = Pipeline([('data_processing', processing_pipeline),('logreg', clf1)])
pipe2 = Pipeline([('data_processing', processing_pipeline),('tree', clf2)])
pipe3 = Pipeline([('data_processing', processing_pipeline),('rf', clf3)])

### Logistic Regression

In [10]:
param_grid_lr= [
  {'logreg__class_weight': ['balanced'], 'logreg__C':[ 0.005, 0.01, 0.05, 0.1, 0.5 ],'logreg__solver':['saga','sag']}
 ]
gcv_lr = GridSearchCV(estimator=pipe1, param_grid=param_grid_lr, scoring=['accuracy'], cv=5, refit='accuracy',verbose=5)
gcv_lr = gcv_lr.fit(X_training, y_training)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.163) total time=  16.0s
[CV 2/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.164) total time=  15.0s
[CV 3/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.165) total time=  15.1s
[CV 4/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.162) total time=  15.1s
[CV 5/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=saga; accuracy: (test=0.164) total time=  14.5s
[CV 1/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=sag; accuracy: (test=0.163) total time=  11.9s
[CV 2/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solver=sag; accuracy: (test=0.164) total time=  11.3s
[CV 3/5] END logreg__C=0.005, logreg__class_weight=balanced, logreg__solv

In [11]:
gcv_lr.best_estimator_

In [12]:
ypred_lr = gcv_lr.predict(X_test)
print(classification_report(y_test, ypred_lr, digits=3))

                                precision    recall  f1-score   support

           alcohol involvement      0.071     0.424     0.122       596
              backing unsafely      0.318     0.669     0.431      1931
driver inattention/distraction      1.000     0.000     0.000     12486
           driver inexperience      0.041     0.028     0.033       894
 failure to yield right-of-way      0.157     0.196     0.174      3467
         following too closely      0.380     0.519     0.438      4994
                         other      0.142     0.027     0.045      2061
               other vehicular      0.084     0.025     0.038      1899
             oversized vehicle      0.085     0.746     0.152       334
passing or lane usage improper      0.111     0.106     0.108      2136
           passing too closely      0.125     0.247     0.166      1681
             pavement slippery      0.046     0.249     0.077       546
reaction to uninvolved vehicle      0.176     0.064     0.094  

In [13]:

confusion_matrix(y_test, ypred_lr)

array([[ 253,   52,    0,    4,   47,   56,    7,    1,   11,   18,   12,
          57,   12,    4,    8,   20,   28,    6],
       [  35, 1291,    0,    9,   53,   14,    5,   21,  158,   33,   73,
          22,   49,    5,   37,  108,    9,    9],
       [1105,  947,    2,  216, 1569, 2375,  124,  207,  806,  746, 1002,
         965,   57,   67,  255, 1361,  463,  219],
       [ 110,   77,    0,   25,  123,  113,    9,   13,   61,   40,   65,
          60,    6,    7,   36,  101,   38,   10],
       [ 250,  143,    0,   89,  679,  441,   19,   47,  122,  235,  313,
         319,   10,   29,  104,  458,  127,   82],
       [ 366,  214,    0,   53,  404, 2590,   35,   17,  161,   45,  105,
         606,   36,    8,   17,  120,  152,   65],
       [ 341,  141,    0,   31,  192,  397,   55,   36,   72,   92,  158,
         192,   14,   18,   29,  140,  107,   46],
       [ 163,  364,    0,   22,  149,  128,   31,   47,  170,  102,  196,
          81,   40,   16,   51,  268,   48,   23],


### Observations
- When we use a logistic regression to classify the contributing factor we get a macro average of 0.312
- When it comes to driver's inattention the model has a 60% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42.5% precision.
- When it comes to improper lane usage the model has a 15.7% precision.
- For others the model has a 16.7% precision.

### Decision Tree

In [14]:
param_grid_tree = [
  {'tree__max_depth': [10, 25, 50],
   'tree__min_samples_split':[10, 25, 50, 100],
   'tree__min_samples_leaf': [50,100],
   
   'tree__class_weight':['balanced']
  }
 ]

gcv_tree = GridSearchCV(estimator=pipe2, param_grid=param_grid_tree, scoring='accuracy', refit=True, verbose=10)
gcv_tree = gcv_tree.fit(X_training, y_training)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 1/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.149 total time=   4.6s
[CV 2/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 2/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.151 total time=   4.6s
[CV 3/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10
[CV 3/5; 1/24] END tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree__min_samples_split=10;, score=0.152 total time=   4.6s
[CV 4/5; 1/24] START tree__class_weight=balanced, tree__max_depth=10, tree__min_samples_leaf=50, tree

In [15]:
gcv_tree.best_estimator_

In [16]:
ypred_tree = gcv_tree.predict(X_test)
print(classification_report(y_test, ypred_tree, digits=3))

                                precision    recall  f1-score   support

           alcohol involvement      0.077     0.295     0.122       596
              backing unsafely      0.331     0.592     0.425      1931
driver inattention/distraction      0.000     0.000     0.000     12486
           driver inexperience      0.040     0.037     0.038       894
 failure to yield right-of-way      0.147     0.224     0.177      3467
         following too closely      0.372     0.459     0.411      4994
                         other      0.137     0.036     0.057      2061
               other vehicular      0.073     0.022     0.033      1899
             oversized vehicle      0.080     0.713     0.143       334
passing or lane usage improper      0.106     0.044     0.063      2136
           passing too closely      0.127     0.223     0.162      1681
             pavement slippery      0.042     0.165     0.067       546
reaction to uninvolved vehicle      0.104     0.131     0.116  

In [17]:
confusion_matrix(y_test, ypred_tree)

array([[ 176,   45,    0,   10,   74,   62,    3,    3,   16,    5,   23,
          39,   26,   16,    6,   32,   56,    4],
       [   7, 1144,    0,   13,   62,   11,    5,   18,  151,   24,   67,
          18,  213,   17,   23,  124,   23,   11],
       [ 742,  792,    0,  275, 1847, 2145,  186,  195,  806,  308,  917,
         726,  276,  338,  303, 1448,  899,  283],
       [  66,   58,    0,   33,  151,  106,   13,   11,   66,   17,   66,
          49,   27,   27,   24,  101,   57,   22],
       [ 154,  128,    0,  114,  778,  389,   34,   66,  141,   95,  283,
         177,   33,  114,  117,  467,  278,   99],
       [ 271,  156,    0,   67,  517, 2290,   31,   11,  161,   21,   96,
         528,  104,  131,   26,  129,  382,   73],
       [ 223,  124,    0,   44,  257,  372,   75,   27,   92,   44,  116,
         143,   51,   58,   36,  166,  186,   47],
       [  93,  270,    0,   36,  187,  111,   41,   41,  177,   74,  145,
          54,  141,   52,   61,  274,  104,   38],


### Observations
- When we use a decision tree to classify the contributing factor we get a macro average of 0.295
- When it comes to driver's inattention the model has a 56% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42% precision.
- When it comes to improper lane usage the model has a 15% precision.
- For others the model has a 13.8% precision.

### Random Forest

In [18]:
param_grid_rf = [{'rf__max_depth': [5, 10, 15],
               'rf__n_estimators': [10, 50, 100],
               'rf__class_weight': ['balanced', 'balanced_subsample'],
               'rf__max_samples': [1000, 2000, 5000]
              }]

gcv_rf = GridSearchCV(estimator=pipe3, param_grid=param_grid_rf, scoring='accuracy', refit=True, verbose=10)
gcv_rf = gcv_rf.fit(X_training, y_training)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 1/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.113 total time=   3.4s
[CV 2/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 2/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.055 total time=   3.4s
[CV 3/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 3/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10;, score=0.039 total time=   3.3s
[CV 4/5; 1/54] START rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estimators=10
[CV 4/5; 1/54] END rf__class_weight=balanced, rf__max_depth=5, rf__max_samples=1000, rf__n_estim

In [19]:
gcv_rf.best_estimator_

In [20]:
ypred_rf = gcv_rf.predict(X_test)
print(classification_report(y_test, ypred_rf, digits=3))

                                precision    recall  f1-score   support

           alcohol involvement      0.086     0.012     0.021       596
              backing unsafely      0.280     0.247     0.262      1931
driver inattention/distraction      0.323     0.670     0.436     12486
           driver inexperience      0.000     0.000     0.000       894
 failure to yield right-of-way      0.120     0.056     0.076      3467
         following too closely      0.336     0.452     0.385      4994
                         other      0.097     0.015     0.026      2061
               other vehicular      0.046     0.006     0.011      1899
             oversized vehicle      0.112     0.063     0.080       334
passing or lane usage improper      0.094     0.028     0.043      2136
           passing too closely      0.132     0.048     0.070      1681
             pavement slippery      0.034     0.002     0.003       546
reaction to uninvolved vehicle      0.045     0.002     0.004  

In [21]:
confusion_matrix(y_test, ypred_rf)

array([[   7,   16,  399,    0,   26,  111,   12,    2,    0,    4,    3,
           2,    1,    0,    1,    6,    5,    1],
       [   3,  476, 1170,    1,   62,   46,   10,   25,   23,   28,   43,
           1,    5,    2,    1,   25,    9,    1],
       [  19,  414, 8369,    4,  532, 2275,  101,   80,   55,  177,  194,
           8,   15,   12,   28,  179,   15,    9],
       [   2,   32,  649,    0,   37,  113,    9,    4,    3,   14,   10,
           4,    2,    1,    3,    9,    2,    0],
       [   6,   77, 2435,    1,  194,  470,   34,   30,    5,   69,   34,
           2,    1,    4,   10,   80,   13,    2],
       [   7,   53, 2400,    1,  129, 2258,   19,   13,    9,   25,   23,
           1,    2,    1,    6,   36,   11,    0],
       [   9,   48, 1357,    3,   79,  405,   31,   11,    7,   40,   17,
           3,    3,    2,    4,   29,   13,    0],
       [   7,  130, 1361,    1,   85,  157,   10,   12,    8,   45,   32,
           0,    2,    0,    2,   45,    1,    1],


### Observations
- When we use a Random Forest to classify the contributing factor we get a macro average of 0.329
- When it comes to driver's inattention the model has a 54% precision.
- When it comes to failure to yield right-of-way the model has a 21% precision.
- When it comes to following too closely the model has a 42% precision.
- When it comes to improper lane usage the model has a 16.6% precision.
- For others the model has a 31% precision

### Gradient Booster

In [22]:
from sklearn.ensemble import GradientBoostingClassifier
pipe5 = Pipeline([('processing', processing_pipeline),
             ('gb', GradientBoostingClassifier())])

params_gb = {'gb__max_depth': [1, 3],
         'gb__n_estimators': [10, 25]
         }

gcv_gb = GridSearchCV(pipe5, param_grid = params_gb, cv=2, scoring='accuracy', verbose=10)
gcv_gb = gcv_gb.fit(X_training, y_training)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2; 1/4] START gb__max_depth=1, gb__n_estimators=10........................
[CV 1/2; 1/4] END gb__max_depth=1, gb__n_estimators=10;, score=0.325 total time=  25.6s
[CV 2/2; 1/4] START gb__max_depth=1, gb__n_estimators=10........................
[CV 2/2; 1/4] END gb__max_depth=1, gb__n_estimators=10;, score=0.325 total time=  25.5s
[CV 1/2; 2/4] START gb__max_depth=1, gb__n_estimators=25........................
[CV 1/2; 2/4] END gb__max_depth=1, gb__n_estimators=25;, score=0.325 total time= 1.0min
[CV 2/2; 2/4] START gb__max_depth=1, gb__n_estimators=25........................
[CV 2/2; 2/4] END gb__max_depth=1, gb__n_estimators=25;, score=0.325 total time=14.7min
[CV 1/2; 3/4] START gb__max_depth=3, gb__n_estimators=10........................
[CV 1/2; 3/4] END gb__max_depth=3, gb__n_estimators=10;, score=0.326 total time=  42.8s
[CV 2/2; 3/4] START gb__max_depth=3, gb__n_estimators=10........................
[CV 2/2; 3/4] E

In [23]:
gcv_gb.best_estimator_

In [24]:
ypred_gb = gcv_gb.predict(X_test)
print(classification_report(y_test, ypred_gb, digits=3))

                                precision    recall  f1-score   support

           alcohol involvement      0.000     0.000     0.000       596
              backing unsafely      0.322     0.356     0.338      1931
driver inattention/distraction      0.325     0.873     0.474     12486
           driver inexperience      0.667     0.002     0.004       894
 failure to yield right-of-way      0.000     0.000     0.000      3467
         following too closely      0.396     0.263     0.316      4994
                         other      0.333     0.000     0.001      2061
               other vehicular      0.000     0.000     0.000      1899
             oversized vehicle      0.182     0.006     0.012       334
passing or lane usage improper      0.000     0.000     0.000      2136
           passing too closely      0.000     0.000     0.000      1681
             pavement slippery      0.000     0.000     0.000       546
reaction to uninvolved vehicle      0.000     0.000     0.000  

In [25]:
confusion_matrix(y_test, ypred_gb)

array([[    0,    23,   544,     0,     0,    29,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [    0,   688,  1240,     0,     0,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [    0,   499, 10895,     0,     0,  1085,     0,     0,     6,
            0,     0,     0,     0,     0,     0,     0,     0,     1],
       [    0,    37,   798,     2,     0,    55,     1,     0,     0,
            0,     0,     0,     0,     0,     1,     0,     0,     0],
       [    0,    78,  3218,     0,     0,   171,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [    0,    57,  3625,     0,     0,  1311,     0,     0,     0,
            0,     0,     0,     1,     0,     0,     0,     0,     0],
       [    0,    56,  1802,     1,     0,   199,     1,     0,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     1],

### Observations
- When we use a gradient boosting to classify the contributing factor we get a macro average of 0.260
- When it comes to driver's inattention the model has a 51% precision.
- When it comes to failure to yield right-of-way the model has a 0% precision.
- When it comes to following too closely the model has a 45.4% precision.
- When it comes to improper lane usage the model has a 0% precision.
- For others the model has a 33.3% precision.

In [28]:
tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=1)
abc = AdaBoostClassifier(base_estimator=tree, n_estimators=500, learning_rate=0.1, random_state=1)
pipe4 = Pipeline([('data_processing', processing_pipeline),('tree', abc)])
pipe4.fit(X_training,y_training)
abc_predicted = pipe4.predict(X_test)

In [29]:
print(classification_report(y_test, abc_predicted, digits=3))

                                precision    recall  f1-score   support

           alcohol involvement      0.000     0.000     0.000       596
              backing unsafely      0.319     0.068     0.112      1931
driver inattention/distraction      0.321     0.976     0.483     12486
           driver inexperience      0.000     0.000     0.000       894
 failure to yield right-of-way      0.000     0.000     0.000      3467
         following too closely      0.361     0.042     0.075      4994
                         other      0.000     0.000     0.000      2061
               other vehicular      0.000     0.000     0.000      1899
             oversized vehicle      0.375     0.009     0.018       334
passing or lane usage improper      0.000     0.000     0.000      2136
           passing too closely      0.000     0.000     0.000      1681
             pavement slippery      0.000     0.000     0.000       546
reaction to uninvolved vehicle      0.000     0.000     0.000  

### Observations
- When we use a Adaboosting classifier to classify the contributing factor we get a macro average of 0.192
- When it comes to driver's inattention the model has a 54% precision.
- When it comes to failure to yield right-of-way the model has a 0% precision.
- When it comes to following too closely the model has a 43.6% precision.
- When it comes to improper lane usage the model has a 0% precision.
- For others the model has a 0% precision.

### Summary
I would use random forest or logistic regression as the final model. As both of these model identify the less frequently occuring classes more accurately compared to the complex algorithms like gradient boosting algortihm or adaboosting algorithm.

 