In [1]:
# Mount Google Drive at /content/drive; the dataset loaded further down is
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the CSV from the mounted Drive folder into a DataFrame.
DATA_PATH = r"/content/drive/MyDrive/Colab Notebooks/cleaned_data.csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw dataset.
data.head(n=5)

Unnamed: 0,ROAD_CLASS,DISTRICT,ACCLOC,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,INITDIR,VEHTYPE,MANOEUVER,DRIVACT,DRIVCOND,DIVISION
0,Major Arterial,Toronto and East York,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,D55
1,Major Arterial,Toronto and East York,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,15 to 19,Minor,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,D55
2,Major Arterial,Toronto and East York,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Driver,55 to 59,Minor,North,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,D55
3,Major Arterial,Toronto and East York,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,20 to 24,Minor,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,D55
4,Major Arterial,Toronto and East York,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,15 to 19,Minor,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,D55


### LABEL ENCODING AND HYPERPARAMETER TUNING USING GRID SEARCH

In [5]:
# Label-encode every column (features and the INJURY target) to integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, so any
# column's codes can be mapped back to the original category strings with
# `label_encoders[col].inverse_transform(...)`. Sharing a single encoder through
# `data.apply(le.fit_transform)` refits it on each column in turn, so only the
# mapping of the last column processed is retained.
label_encoders = {col: LabelEncoder().fit(data[col]) for col in data.columns}
label_encoded = pd.DataFrame(
    {col: encoder.transform(data[col]) for col, encoder in label_encoders.items()},
    index=data.index,
)
# `le` stays defined for any later use; it holds the target's mapping so that
# predictions can be decoded with `le.inverse_transform(y_pred_label)`.
le = label_encoders['INJURY']
label_encoded.head()

Unnamed: 0,ROAD_CLASS,DISTRICT,ACCLOC,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,INITDIR,VEHTYPE,MANOEUVER,DRIVACT,DRIVCOND,DIVISION
0,5,3,2,0,0,0,8,1,1,11,10,1,0,1,2,1,7,15
1,5,3,2,0,0,0,8,1,1,11,2,3,0,1,2,1,7,15
2,5,3,2,0,0,0,8,1,1,2,11,3,1,1,2,1,7,15
3,5,3,2,0,0,0,8,1,1,11,3,3,0,1,2,1,7,15
4,5,3,2,0,0,0,8,1,1,11,2,3,0,1,2,1,7,15


In [6]:
# Target is the encoded INJURY column; every other encoded column is a feature.
y_label = label_encoded['INJURY']
X_label = label_encoded.drop(columns='INJURY')

In [7]:
# Hold out 30% of the label-encoded rows for testing (fixed seed for repeatability).
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_label,
    y_label,
    random_state=42,
    test_size=0.3,
)

In [8]:
# Hyperparameter tuning
# Same search space as the one-hot and ordinal runs further down, so the three
# encodings are compared on an identical grid. The previous grid differed from
# them: min_samples_split was [2, 5, 100] (100 looks like a typo for 10) and
# n_estimators had an extra value of 500.
param_grid = {
    'n_estimators': [100, 150, 200, 300], # number of trees
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [9]:
# Exhaustive 3-fold grid search over `param_grid`, scored by accuracy,
# using all available cores. The base forest is seeded for repeatability.
rf = RandomForestClassifier(random_state=42)
grid_label = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
grid_label.fit(X_train_l, y_train_l)

In [10]:
# Predict the held-out split with the best estimator found by the grid search.
y_pred_label = grid_label.best_estimator_.predict(X_test_l)
best_label_model = grid_label.best_estimator_

In [11]:
# Report the winning hyperparameters and the test-set accuracy as a percentage.
label_accuracy_pct = round(accuracy_score(y_test_l, y_pred_label) * 100, 2)
print("Best Params:", grid_label.best_params_)
print("Accuracy:", label_accuracy_pct, "%")

Best Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Accuracy: 84.97 %


### ONE-HOT ENCODING AND HYPERPARAMETER TUNING USING GRID SEARCH

In [12]:
# Start again from the raw frame: INJURY is the target, the rest are features.
y = data['INJURY']
X = data.drop(columns='INJURY')

In [13]:
# One-hot encode the features, dropping the first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)

In [14]:
# Preview the first five rows of the one-hot encoded feature matrix.
X.head(n=5)

Unnamed: 0,ROAD_CLASS_Expressway,ROAD_CLASS_Expressway Ramp,ROAD_CLASS_Laneway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Major Arterial.1,ROAD_CLASS_Major Shoreline,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,ROAD_CLASS_Pending,...,DIVISION_D32,DIVISION_D33,DIVISION_D41,DIVISION_D42,DIVISION_D43,DIVISION_D51,DIVISION_D52,DIVISION_D53,DIVISION_D55,DIVISION_NSA
0,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [15]:
# Hold out 30% of the one-hot encoded rows for testing (fixed seed for repeatability).
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [16]:
# Search space for the random forest trained on one-hot encoded features.
param_grid = dict(
    n_estimators=[100, 150, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [17]:
# Exhaustive 3-fold grid search on the one-hot features, scored by accuracy,
# using all available cores. The base forest is seeded for repeatability.
rf = RandomForestClassifier(random_state=42)
grid_ohe = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
grid_ohe.fit(X_train_o, y_train_o)

In [18]:
# Predict the held-out split with the best estimator found by the grid search.
y_pred_ohe = grid_ohe.best_estimator_.predict(X_test_o)
best_ohe_model = grid_ohe.best_estimator_

In [19]:
# Report the winning hyperparameters and the test-set accuracy as a percentage.
ohe_accuracy_pct = round(accuracy_score(y_test_o, y_pred_ohe) * 100, 2)
print("Best Params:", grid_ohe.best_params_)
print("Accuracy:", ohe_accuracy_pct, "%")

Best Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Accuracy: 84.67 %


### ORDINAL ENCODING AND HYPERPARAMETER TUNING USING GRID SEARCH

In [20]:
# Rebuild X/y from the raw frame; this replaces the one-hot encoded `X`
# created in the previous section.
y = data['INJURY']
X = data.drop(columns='INJURY')

In [21]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the object-dtype feature columns on a copy of X, so the raw
# feature frame itself is left untouched. Non-object columns pass through as-is.
X_ordinal = X.copy()
ordinal_features = X_ordinal.select_dtypes(include=['object']).columns
oe = OrdinalEncoder()
encoded_values = oe.fit_transform(X_ordinal[ordinal_features])
X_ordinal[ordinal_features] = encoded_values

In [22]:
# Hold out 30% of the ordinal-encoded rows for testing (fixed seed for repeatability).
# NOTE: these are the same variable names the one-hot section used for its
# split, so the one-hot train/test frames are overwritten here.
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X_ordinal,
    y,
    random_state=42,
    test_size=0.3,
)

In [23]:
# Search space for the random forest trained on ordinal-encoded features.
param_grid = {}
param_grid['n_estimators'] = [100, 150, 200, 300]
param_grid['max_depth'] = [None, 10, 20, 30]
param_grid['min_samples_split'] = [2, 5, 10]
param_grid['min_samples_leaf'] = [1, 2, 4]

In [24]:
# Exhaustive 3-fold grid search on the ordinal-encoded features, scored by
# accuracy, using all available cores.
# NOTE: `grid_ohe` is the name the one-hot section used for its search object;
# assigning it here replaces that object with the ordinal-encoding search.
rf = RandomForestClassifier(random_state=42)
grid_ohe = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
grid_ohe.fit(X_train_o, y_train_o)

In [25]:
# Predict the held-out split with the best estimator from the ordinal-encoding
# search (the variable names are carried over from the one-hot section).
y_pred_ohe = grid_ohe.best_estimator_.predict(X_test_o)
best_ohe_model = grid_ohe.best_estimator_

In [26]:
# Report the winning hyperparameters and the test-set accuracy as a percentage.
ordinal_accuracy_pct = round(accuracy_score(y_test_o, y_pred_ohe) * 100, 2)
print("Best Params:", grid_ohe.best_params_)
print("Accuracy:", ordinal_accuracy_pct, "%")

Best Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Accuracy: 84.97 %


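**SUMMARY:**

| Encoding         | Best Parameters | Accuracy |
|------------------|------------------|----------|
| Label Encoding   | 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150                | 84.97%      |
| One-Hot Encoding | 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300                | 84.67%      |
| Ordinal Encoding | 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150                | 84.97%      |

Accuracy is measured on the 30% hold-out split. The Label and Ordinal rows are identical because both encoders map each column's categories to integer codes in sorted order, so the two runs train on the same feature values; they are effectively one experiment, not two independent results.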
