In [2]:
## Importing foundation libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## Loading dataset

df_road = pd.read_csv('/content/RTADataset.csv')

In [4]:
## Checking initial 5 rows of the data

df_road.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [5]:
## Checking columns in the dataframe

df_road.columns

Index(['Time', 'Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle',
       'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Number_of_vehicles_involved',
       'Number_of_casualties', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement',
       'Cause_of_accident', 'Accident_severity'],
      dtype='object')

In [6]:
## Lets start EDA
## Lets separate hour of the accident out of Time column

import datetime as dt

df_road['hour'] = pd.to_datetime(df_road['Time']).dt.hour

In [7]:
## Dropping time column which is now converted into Hour

df_road.drop(columns=['Time'],inplace=True)

In [8]:
### Create a function to find out missing values and parcentages in comparison to length of data frame

def missing_values_table(df):
        # Total missing values

        ## Taking all sorts of NA values
        missing_values = ['N/A', 'na', 'NA', 'NaN', 'None', 'null']
        df.replace(missing_values, np.nan, inplace=True)

        mis_val = df.isnull().sum()

        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)

        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})

        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")

        # Return the dataframe with missing information
        return mis_val_table_ren_columns



In [9]:
## Calling function to show missing values

missing_val_table = missing_values_table(df_road)

Your selected dataframe has 32 columns.
There are 20 columns that have missing values.


In [10]:
drop_columns = missing_val_table[missing_val_table['% of Total Values']>20].index

In [11]:
### Lets drop the columns which have more than 20% of missing values

def drop_missing_columns(df,threshold=20):

  drop_cols = missing_val_table[missing_val_table['% of Total Values']>threshold].index
  new_df = df.drop(drop_cols,axis=1)
  return new_df


In [12]:
df_road = drop_missing_columns(df_road)

In [13]:
## Now lets replace the missing values
## So if top category is more than 50% then we should replace missing values with that
## else we should consider randomly applying missing values with first 2 categories


import random

def replace_null_with_top_category(df):
    for column in df.columns:
        top_category = df[column].mode()[0]  # Get the mode (most frequent category)
        top_2_categories = df[column].value_counts().index[:2].to_list()
        missing_indices = df[df[column].isnull()].index
        top_category_percentage = df[column].value_counts(normalize=True).max()  # Calculate the percentage of the top category

        if top_category_percentage > 0.5:
            df[column].fillna(top_category, inplace=True)

        else:
            random.shuffle(top_2_categories)
            half_count = len(missing_indices) // 2
            for i in range(half_count):
                df.at[missing_indices[i], column] = top_2_categories[0]
            for i in range(half_count, len(missing_indices)):
                df.at[missing_indices[i], column] = top_2_categories[1]
    return df



In [14]:
### df_road_v2 is the data frame where missing values are replaced with top categories

df_road = replace_null_with_top_category(df_road)

In [15]:
df_road.head()

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,...,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Pedestrian_movement,Cause_of_accident,Accident_severity,hour
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Residential areas,Two-way (divided with broken lines road marking),...,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,Not a Pedestrian,Moving Backward,Slight Injury,17
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,Office areas,Undivided Two way,...,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Overtaking,Slight Injury,17
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Recreational areas,other,...,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Not a Pedestrian,Changing lane to the left,Serious Injury,17
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Office areas,other,...,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Changing lane to the right,Slight Injury,1
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,Industrial areas,other,...,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Overtaking,Slight Injury,1


In [16]:
categorical_columns = df_road.select_dtypes(include=['object']).columns

In [17]:
categorical_columns

Index(['Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Area_accident_occured',
       'Lanes_or_Medians', 'Road_allignment', 'Types_of_Junction',
       'Road_surface_type', 'Road_surface_conditions', 'Light_conditions',
       'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
       'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity'],
      dtype='object')

In [18]:
numeric_columns = df_road.select_dtypes(include=['number']).columns

In [19]:
numeric_columns

Index(['Number_of_vehicles_involved', 'Number_of_casualties', 'hour'], dtype='object')

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()

In [22]:
column_to_encode = categorical_columns

In [23]:
for col in column_to_encode:
  df_road[col + '_encoded'] = le.fit_transform(df_road[col])

In [24]:
df_road.head()

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,...,Types_of_Junction_encoded,Road_surface_type_encoded,Road_surface_conditions_encoded,Light_conditions_encoded,Weather_conditions_encoded,Type_of_collision_encoded,Vehicle_movement_encoded,Pedestrian_movement_encoded,Cause_of_accident_encoded,Accident_severity_encoded
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Residential areas,Two-way (divided with broken lines road marking),...,1,0,0,3,2,3,2,5,9,2
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,Office areas,Undivided Two way,...,1,0,0,3,2,8,2,5,16,2
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Recreational areas,other,...,1,0,0,3,2,2,2,5,0,1
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Office areas,other,...,7,2,0,0,2,8,2,5,1,2
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,Industrial areas,other,...,7,0,0,0,2,8,2,5,16,2


In [25]:
for col in df_road.columns:
  if '_encoded' not in col and col not in numeric_columns:
    df_road = df_road.drop(col,axis=1)

In [26]:
df_road.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,hour,Day_of_week_encoded,Age_band_of_driver_encoded,Sex_of_driver_encoded,Educational_level_encoded,Vehicle_driver_relation_encoded,Driving_experience_encoded,Type_of_vehicle_encoded,...,Types_of_Junction_encoded,Road_surface_type_encoded,Road_surface_conditions_encoded,Light_conditions_encoded,Weather_conditions_encoded,Type_of_collision_encoded,Vehicle_movement_encoded,Pedestrian_movement_encoded,Cause_of_accident_encoded,Accident_severity_encoded
0,2,2,17,1,0,1,0,0,0,0,...,1,0,0,3,2,3,2,5,9,2
1,2,2,17,1,1,1,4,0,3,11,...,1,0,0,3,2,8,2,5,16,2
2,2,2,17,1,0,1,4,0,0,5,...,1,0,0,3,2,2,2,5,0,1
3,2,2,1,3,0,1,4,0,2,11,...,7,2,0,0,2,8,2,5,1,2
4,2,2,1,3,0,1,4,0,1,0,...,7,0,0,0,2,8,2,5,16,2


In [27]:
X = df_road.drop('Accident_severity_encoded',axis=1)
y= df_road['Accident_severity_encoded']

In [28]:
#from imblearn.over_sampling import SMOTE

from imblearn.combine import SMOTETomek
smt = SMOTETomek()

#smote = SMOTE(sampling_strategy='auto',random_state=42)
X_resampled, y_resampled = smt.fit_resample(X,y)

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.25,random_state=42)

In [30]:
from sklearn.feature_selection import SelectKBest, chi2
chi2_selector = SelectKBest(chi2, k=5)
X_kbest = chi2_selector.fit_transform(X_train, y_train)

# Get the indices of the selected features
selected_indices = chi2_selector.get_support(indices=True)

# Print the indices of the selected features
print(X_train.columns[selected_indices])

Index(['Number_of_vehicles_involved', 'Number_of_casualties',
       'Age_band_of_driver_encoded', 'Driving_experience_encoded',
       'Light_conditions_encoded'],
      dtype='object')


In [31]:
from sklearn.feature_selection import f_classif

# Create an SelectKBest object to select features with two best ANOVA F-Values
fvalue_selector = SelectKBest(f_classif, k=5)

# Apply the SelectKBest object to the features and target
X_kbest = fvalue_selector.fit_transform(X_train, y_train)
selected_indices = chi2_selector.get_support(indices=True)
print(X_train.columns[selected_indices])

Index(['Number_of_vehicles_involved', 'Number_of_casualties',
       'Age_band_of_driver_encoded', 'Driving_experience_encoded',
       'Light_conditions_encoded'],
      dtype='object')


In [32]:
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer, f1_score

f1_scorer = make_scorer(f1_score, average='weighted')
sfs1 = SFS(RandomForestClassifier(),
           k_features=7,
           forward=True,
           floating=False,
           verbose=2,
           scoring=f1_scorer,
           cv=3)

sfs1 = sfs1.fit(np.array(X_train), y_train)
sfs1.k_feature_idx_
X_train.columns[list(sfs1.k_feature_idx_)]


[2023-09-23 10:38:48] Features: 1/7 -- score: 0.43057246680252503
[2023-09-23 10:39:24] Features: 2/7 -- score: 0.5006005806743752
[2023-09-23 10:40:01] Features: 3/7 -- score: 0.5863365509429533
[2023-09-23 10:41:03] Features: 4/7 -- score: 0.6643624267909652
[2023-09-23 10:42:12] Features: 5/7 -- score: 0.7346908642565259
[2023-09-23 10:43:33] Features: 6/7 -- score: 0.8016799164628362
[2023-09-23 10:44:52] Features: 7/7 -- score: 0.8384179610650758

Index(['Number_of_vehicles_involved', 'hour', 'Day_of_week_encoded',
       'Type_of_vehicle_encoded', 'Lanes_or_Medians_encoded',
       'Types_of_Junction_encoded', 'Cause_of_accident_encoded'],
      dtype='object')

In [33]:
X_train_new = X_train[['Number_of_vehicles_involved', 'hour', 'Day_of_week_encoded',
       'Type_of_vehicle_encoded', 'Area_accident_occured_encoded',
       'Types_of_Junction_encoded', 'Cause_of_accident_encoded']]

X_test_new = X_test[['Number_of_vehicles_involved', 'hour', 'Day_of_week_encoded',
       'Type_of_vehicle_encoded', 'Area_accident_occured_encoded',
       'Types_of_Junction_encoded', 'Cause_of_accident_encoded']]

In [34]:
pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [35]:
### Random Forest is giving the best F1 Score. Trying to do Hyper Param tuning using optuna

import optuna

def objective(trial):
  n_estimators = trial.suggest_int("n_estimators",50,200)
  max_depth = trial.suggest_int("max_depth",1,30)
  min_samples_split = trial.suggest_int("min_samples_split",2,10)
  min_samples_leaf = trial.suggest_int("min_samples_leaf",2,10)
  max_features = trial.suggest_categorical("max_features",["auto"])
  bootstrap = trial.suggest_categorical("bootstrap",[True,False])

  rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
    )

  rf.fit(X_train_new,y_train)
  y_pred = rf.predict(X_test_new)

  f1 = f1_score(y_test,y_pred,average='weighted')

  return f1

if __name__ == "__main__":
  study = optuna.create_study(direction="maximize")
  study.optimize(objective,n_trials=20)

  best_params = study.best_params
  best_f1 = study.best_value

  print(f"Best Paramters are : {best_params}")
  print(f"Best F1 score is:{best_f1}")


[I 2023-09-23 10:45:00,315] A new study created in memory with name: no-name-b105cf0d-bdca-43d6-b168-0b51affb42ca
  warn(
[I 2023-09-23 10:45:03,586] Trial 0 finished with value: 0.8157033222152222 and parameters: {'n_estimators': 178, 'max_depth': 28, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'auto', 'bootstrap': True}. Best is trial 0 with value: 0.8157033222152222.
  warn(
[I 2023-09-23 10:45:05,914] Trial 1 finished with value: 0.8382550670240736 and parameters: {'n_estimators': 104, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'bootstrap': False}. Best is trial 1 with value: 0.8382550670240736.
  warn(
[I 2023-09-23 10:45:13,951] Trial 2 finished with value: 0.8414753762153526 and parameters: {'n_estimators': 164, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'bootstrap': False}. Best is trial 2 with value: 0.8414753762153526.
  warn(
[I 2023-09-23 10:45:15,746] Trial 3 finish

Best Paramters are : {'n_estimators': 151, 'max_depth': 22, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'bootstrap': False}
Best F1 score is:0.8584495723433081


In [38]:
# ###

# Best Paramters are : {'n_estimators': 163, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'auto', 'bootstrap': True}
# Best F1 score is:0.8426067449898177
# Top Features - 'Number_of_vehicles_involved', 'hour', 'Day_of_week_encoded',
# 'Type_of_vehicle_encoded', 'Lanes_or_Medians_encoded',
# 'Types_of_Junction_encoded', 'Cause_of_accident_encoded'



In [39]:
model = RandomForestClassifier(
        n_estimators=163,
        max_depth=19,
        min_samples_split=8,
        min_samples_leaf=2,
        max_features='auto',
        bootstrap=True,
        random_state=42,
    )


In [40]:
import joblib

# Specify the file path where you want to save the model
file_path = "random_forest.joblib"

# Save the model to the specified file path
joblib.dump(model, file_path)

['random_forest.joblib']