In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data = pd.read_csv('G:/IISc Bangalore/sample/Accdataset_hk_PS_BAEL_Combined.csv')

In [3]:
data.head()

Unnamed: 0,Sl No,Accident_Index,Date,Day_of_Week,Time_of_Accident,Accident_Location_A,Accident_Location_A_Chainage_km,Accident_Location_A_Chainage_km_RoadSide,Nature_of_Accident_B1,Nature_of_Accident_B2,...,Vehicle_Type_Involved_J_V2,Vehicle_Type_Involved_J_V3,Vehicle_Type_Involved_J_V4,Number_of_Vehicles,Number_of_Casualties_Fatel,Number_of_Casualties_GrievousInjury,Number_of_Casualties_MinorInjury,Number_of_Casualties_NotInjured,Number_of_Casualties,Remarks
0,1,201300001,41518,7,13:00,2,187.6,RHS,8,,...,,,,1,,,2.0,,2,
1,2,201300002,41521,3,15:25,2,152.3,LHS,4,,...,,,,1,,,,1.0,0,
2,3,201300003,41526,1,00:40,2,177.8,RHS,4,,...,,,,1,,,,1.0,0,
3,4,201300004,41528,3,07:30,2,186.8,LHS,8,,...,,,,1,,,1.0,,1,
4,5,201300005,41534,2,13:00,2,173.4,LHS,5,,...,,,,1,,,,1.0,0,


In [4]:
y = data['Accident_Severity_C'].fillna(0)


# In[23]:


# Select feature variables
X = data.drop(['Accident_Severity_C','Accident_Index','Date','Day_of_Week','Time_of_Accident','Accident_Location_A','Accident_Location_A_Chainage_km','Accident_Location_A_Chainage_km_RoadSide','Nature_of_Accident_B1','Nature_of_Accident_B2','Nature_of_Accident_B3','Classification_of_Accident_C1','Classification_of_Accident_C2','Classification_of_Accident_C3','Causes_D1','Causes_D2','Causes_D3','Causes_D4','Causes_D5','Road_Feature_E','Road_Condition_F','Intersection_Type_G','Weather_Conditions_H','Vehicle_Type_Involved_J_V1','Vehicle_Type_Involved_J_V2','Vehicle_Type_Involved_J_V3','Vehicle_Type_Involved_J_V4','Number_of_Vehicles','Number_of_Casualties_Fatel','Number_of_Casualties_GrievousInjury','Number_of_Casualties_MinorInjury','Number_of_Casualties_NotInjured','Number_of_Casualties','Remarks'], axis=1)
X = X.fillna(0)

In [5]:
y.head()

0    3
1    4
2    4
3    3
4    4
Name: Accident_Severity_C, dtype: int64

In [6]:
# Mapping feature variables
accident_severity_map = {'1': 'Fatal', '2': 'Grevious Injury', '3': 'Minor Injury', '4': 'No Injury'}
data['Accident_Severity_C'] = data['Accident_Severity_C'].map(accident_severity_map)

day_of_week_map = {'1': 'Sunday', '2': 'Monday', '3': 'Tuesday', '4': 'Wednesday', '5': 'Thursday', '6': 'Friday', '7': 'Saturday'}
data['Day_of_Week'] = data['Day_of_Week'].map(day_of_week_map)

In [7]:
# Mapping remaining feature variables
mapping = {'1': 'Urban', '2': 'Rural', '3': 'Unallocated'}
data['Accident_Location_A'] = data['Accident_Location_A'].map(mapping)

mapping = {'1': 'Overturning', '2': 'Head on collision', '3': 'Rear End Collision',
           '4': 'Collision Brush/Side Wipe', '5': 'Right Turn Collision', '6': 'Skidding',
           '7a': 'Others-Hit Cyclist', '7b': 'Others-Hit Pedestrian', '7C': 'Others-Hit Parked Vehicle',
           '7d': 'Others-Hit Fixed Object', '7e': 'Others-Wrong Side Driving', '7f': 'Others-Hit Animal',
           '7g': 'Others-Hit Two Wheeler', '7h': 'Others-Unknown', '7i': 'Others-Fallen down',
           '8': 'Overtaking vehicle', '9': 'Left Turn Collision'}

columns = ['Nature_of_Accident_B1', 'Nature_of_Accident_B2', 'Nature_of_Accident_B3']
for col in columns:
    data[col] = data[col].map(mapping)
    
mapping = {'1': 'Fatal', '2': 'Grevious Injury', '3': 'Minor Injury', '4': 'Non - Injury (Damage only)'}

columns = ['Classification_of_Accident_C1', 'Classification_of_Accident_C2', 'Classification_of_Accident_C3']
for col in columns:
    data[col] = data[col].map(mapping)

mapping = {'1': 'Drunken', '2': 'Overspeeding', '3': 'Vehicle out of control',
'4a': 'Fault of driver of motor vehicle', '4b': 'Driver of other vehicle', '4C': 'Cyclist',
'4d': 'Pedestrian', '4e': 'Passenger', '4f': 'Animal',
'5a': 'Defect in mechanical condition of motor vehicle', '5b': 'Road condition'}

columns = ['Causes_D1', 'Causes_D2', 'Causes_D3', 'Causes_D4', 'Causes_D5']
for col in columns:
    data[col] = data[col].map(mapping)
    
mapping = {'1': 'Single lane', '2': 'Two lanes', '3': 'Three lanes or more without central divider median',
           '4': 'Four lanes or more with central divider alongwith carriageway width'}
data['Road_Feature_E'] = data['Road_Feature_E'].map(mapping)

mapping = {'1': 'Straight Road', '2': 'Slight Curve', '3': 'Sharp Curve', '4': 'Flat Road', '5': 'Gentle incline',
           '6': 'Steep incline', '7': 'Hump', '8': 'Dip'}
data['Road_Condition_F'] = data['Road_Condition_F'].map(mapping)

mapping = {'1': 'T Junction', '2': 'Y Junction', '3': 'Four arm junction', '4': 'Staggered junction',
           '5': 'Roundabout', '6': 'Uncontrolled junction'}
data['Intersection_Type_G'] = data['Intersection_Type_G'].map(mapping)

mapping = {'1': 'Fine', '2': 'Mist/Fog', '3': 'Cloud', '4': 'Light Rain',
           '5': 'Heavy Rain', '6': 'Hail/sleet', '7': 'Snow', '8': 'Strong Wind', 
           '9': 'Dust Storm', '10': 'Very Hot', '11': 'Very Cold', '12': 'Other extraordinary weather condition'}
data['Weather_Conditions_H'] = data['Weather_Conditions_H'].map(mapping)

mapping = {'1': 'Car/Jeep/Van', '2': 'SUV', '3': 'Bus', '4': 'Mini Bus', '5': 'Truck', '6': 'Two Wheeler',
           '7': 'Three Wheeler', '8': 'Cycle', '9': 'Pedestrian', '10': 'Tractor', '11': 'Unknown', '12': 'Animal',
           '13': 'Objects', '14': 'LCV', '15': 'MAV'}

columns = ['Vehicle_Type_Involved_J_V1', 'Vehicle_Type_Involved_J_V2', 'Vehicle_Type_Involved_J_V3', 'Vehicle_Type_Involved_J_V4']
for col in columns:
    data[col] = data[col].map(mapping)

In [8]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
y_train.head()

379     4
512     1
1472    3
2471    2
1676    2
Name: Accident_Severity_C, dtype: int64

**RANDOM FOREST MODEL**

In [10]:
# Define the model
model = RandomForestClassifier(random_state=42)

In [11]:
# Define the hyperparameters to be tuned
param_grid = {'n_estimators': [100, 500, 1000, 5000],
              'max_depth': [2, 4, 6, 8]}


In [12]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [13]:
# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'max_depth': 4, 'n_estimators': 5000}
Best score: 0.4265967813376642


In [14]:
results = grid_search.cv_results_
results

{'mean_fit_time': array([ 0.1796412 ,  0.89055271,  1.66118727,  8.22564735,  0.21405101,
         1.02242813,  2.24531116, 10.41813641,  0.22928829,  1.17706537,
         2.3345355 , 11.5954236 ,  0.2680553 ,  1.34247503,  2.66561313,
        13.20540481]),
 'std_fit_time': array([0.01964196, 0.06646808, 0.01446292, 0.15610077, 0.00817357,
        0.01846907, 0.09806919, 0.53519049, 0.00190745, 0.02972002,
        0.05309099, 0.09963177, 0.00836512, 0.01690042, 0.04245623,
        0.21906501]),
 'mean_score_time': array([0.01020203, 0.04340777, 0.08281217, 0.39628386, 0.01080384,
        0.0432106 , 0.08841581, 0.49911194, 0.01020374, 0.048211  ,
        0.09461751, 0.46131158, 0.01100526, 0.05101566, 0.10043411,
        0.50891285]),
 'std_score_time': array([4.02134069e-04, 2.80602648e-03, 2.22274458e-03, 2.03958942e-03,
        4.00928913e-04, 4.06841884e-04, 4.12702820e-03, 1.16129273e-01,
        3.99048679e-04, 9.72072453e-04, 5.78593196e-03, 9.81931990e-03,
        8.73874834e-

In [15]:
# Use the best parameters to fit the model
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)


In [16]:
# Make predictions on the test data
y_pred = clf.predict(X_test)


In [17]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
#Save the predictions as an Excel file
df = pd.DataFrame({'Predictions': y_pred})
df.to_excel('G:/IISc Bangalore/sample/predicted_output3.xlsx', index=False)



In [19]:
# Print the accuracy of the model
print(clf.score(X_test, y_test))

0.43164362519201227


In [20]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[  0  27  16   3]
 [  0 176  67   6]
 [  0 150  99   6]
 [  0  61  34   6]]
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        46
           2       0.43      0.71      0.53       249
           3       0.46      0.39      0.42       255
           4       0.29      0.06      0.10       101

    accuracy                           0.43       651
   macro avg       0.29      0.29      0.26       651
weighted avg       0.39      0.43      0.38       651



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import joblib

# Save the trained model
joblib.dump(clf, 'G:/IISc Bangalore/sample/random_forest_model.joblib')

# Load the model when needed
loaded_model = joblib.load('G:/IISc Bangalore/sample/random_forest_model.joblib')

**ADABOOST MODEL**

In [22]:
from sklearn.ensemble import AdaBoostClassifier
# Define the AdaBoost model
model = AdaBoostClassifier(random_state=0)

In [23]:
# Define the hyperparameters to be tuned
param_grid = {'n_estimators': [50, 100, 200],
              'learning_rate': [0.01, 0.1, 0.5, 1.0]}

In [24]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [25]:
# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'learning_rate': 1.0, 'n_estimators': 200}
Best score: 0.42160785471725976


In [26]:
# Use the best parameters to fit the model
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)

In [27]:
# Make predictions on the test data
y_pred = clf.predict(X_test)

In [28]:
# Save the predictions as an Excel file
df = pd.DataFrame({'Predictions': y_pred})
df.to_excel('predicted_output_adaboost.xlsx', index=False)


In [29]:
# Print the accuracy of the model
print(clf.score(X_test, y_test))


0.4162826420890937


In [30]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[  0  28  17   1]
 [  2 169  75   3]
 [  1 151  98   5]
 [  0  67  30   4]]
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        46
           2       0.41      0.68      0.51       249
           3       0.45      0.38      0.41       255
           4       0.31      0.04      0.07       101

    accuracy                           0.42       651
   macro avg       0.29      0.28      0.25       651
weighted avg       0.38      0.42      0.37       651



**XGBOOST MODEL**

In [31]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/99.8 MB 1.3 MB/s eta 0:01:18
     ---------------------------------------- 0.1/99.8 MB 1.0 MB/s eta 0:01:37
     ---------------------------------------- 0.2/99.8 MB 1.2 MB/s eta 0:01:22
     --------------------------------------- 0.2/99.8 MB 871.5 kB/s eta 0:01:55
     ---------------------------------------- 0.2/99.8 MB 1.2 MB/s eta 0:01:27
     --------------------------------------- 0.3/99.8 MB 983.0 kB/s eta 0:01:42
     --------------------------------------- 0.3/99.8 MB 996.0 kB/s eta 0:01:40
     ---------------------------------------- 0.4/99.8 MB 1.1 MB/s eta 0:01:30
     ---------------------------------------- 0.5/99.8 MB 1.1 MB/s eta 0:01:32
     ---------------------------------------- 0.5/99.8 MB 1.1 MB/s eta 0:01:29
     ---------------------------------------- 0.6/99.8 MB 1.1 MB/s eta 0:

In [32]:
from xgboost import XGBClassifier
# Define the XGBoost model
model = XGBClassifier(random_state=0)

In [33]:
# Subtract 1 from the values in y to map them to [0, 1, 2, 3]
y_train = y_train.astype(int) - 1
y_test = y_test.astype(int) - 1

In [34]:
# Define the hyperparameters to be tuned
param_grid = {'n_estimators': [50, 100, 200],
              'learning_rate': [0.01, 0.1, 0.5, 1.0],
              'max_depth': [3, 4, 5]}

In [35]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [36]:
# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best score: 0.42621364240366155


In [37]:
# Use the best parameters to fit the model
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)

In [38]:
# Make predictions on the test data
y_pred = clf.predict(X_test)

In [39]:
# Map predicted classes back to the original range if needed
y_pred_original_range = y_pred + 1

In [40]:
# Save the predictions as an Excel file
df = pd.DataFrame({'Predictions': y_pred_original_range})
df.to_excel('predicted_output_xgboost.xlsx', index=False)

In [41]:
# Print the accuracy of the model
print(clf.score(X_test, y_test))

0.41781874039938555


In [42]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[  0  22  21   3]
 [  0 140 103   6]
 [  0 124 127   4]
 [  0  56  40   5]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        46
           1       0.41      0.56      0.47       249
           2       0.44      0.50      0.47       255
           3       0.28      0.05      0.08       101

    accuracy                           0.42       651
   macro avg       0.28      0.28      0.26       651
weighted avg       0.37      0.42      0.38       651



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
from sklearn.svm import SVC
# Define the SVM model
model = SVC(random_state=0)

In [44]:
# Define the hyperparameters to be tuned
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1]}

In [45]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [46]:
# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'C': 10, 'gamma': 0.01}
Best score: 0.42160342536542156


In [47]:
# Use the best parameters to fit the model
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)

In [48]:
# Make predictions on the test data
y_pred = clf.predict(X_test)

In [49]:
# Save the predictions as an Excel file
df = pd.DataFrame({'Predictions': y_pred})
df.to_excel('predicted_output_svm.xlsx', index=False)

In [50]:
# Print the accuracy of the model
accuracy = clf.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.3655913978494624


In [51]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[  2  19  19   6]
 [  5 126  94  24]
 [  8 127  98  22]
 [  2  38  49  12]]
Classification Report:
               precision    recall  f1-score   support

           0       0.12      0.04      0.06        46
           1       0.41      0.51      0.45       249
           2       0.38      0.38      0.38       255
           3       0.19      0.12      0.15       101

    accuracy                           0.37       651
   macro avg       0.27      0.26      0.26       651
weighted avg       0.34      0.37      0.35       651

