In [16]:
## Importing foundation libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
## Loading dataset

df_road = pd.read_csv('/content/RTADataset.csv')

In [18]:
## Lets separate hour of the accident out of Time column

import datetime as dt

df_road['hour'] = pd.to_datetime(df_road['Time']).dt.hour

In [19]:
## Dropping time column which is now converted into Hour

df_road.drop(columns=['Time'],inplace=True)

In [20]:
### Create a function to find out missing values and parcentages in comparison to length of data frame

def missing_values_table(df):
        # Total missing values

        ## Taking all sorts of NA values
        missing_values = ['N/A', 'na', 'NA', 'NaN', 'None', 'null']
        df.replace(missing_values, np.nan, inplace=True)

        mis_val = df.isnull().sum()

        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)

        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})

        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")

        # Return the dataframe with missing information
        return mis_val_table_ren_columns



In [21]:
## Calling function to show missing values

missing_val_table = missing_values_table(df_road)

Your selected dataframe has 32 columns.
There are 20 columns that have missing values.


In [22]:
## Find out the columns with more than 20% null values

drop_columns = missing_val_table[missing_val_table['% of Total Values']>20].index

In [23]:
### Lets drop the columns which have more than 20% of missing values

def drop_missing_columns(df,threshold=20):

  drop_cols = missing_val_table[missing_val_table['% of Total Values']>threshold].index
  new_df = df.drop(drop_cols,axis=1)
  return new_df


In [24]:
## df_road is now reduced to 24 columns

df_road = drop_missing_columns(df_road)

In [25]:
df_road.shape

(12316, 24)

In [26]:
## Now lets replace the missing values
## So if top category is more than 50% then we should replace missing values with that
## else we should consider randomly applying missing values with first 2 categories


import random

def replace_null_with_top_category(df):
    for column in df.columns:
        top_category = df[column].mode()[0]  # Get the mode (most frequent category)
        top_2_categories = df[column].value_counts().index[:2].to_list()
        missing_indices = df[df[column].isnull()].index
        top_category_percentage = df[column].value_counts(normalize=True).max()  # Calculate the percentage of the top category

        if top_category_percentage > 0.5:
            df[column].fillna(top_category, inplace=True)

        else:
            random.shuffle(top_2_categories)
            half_count = len(missing_indices) // 2
            for i in range(half_count):
                df.at[missing_indices[i], column] = top_2_categories[0]
            for i in range(half_count, len(missing_indices)):
                df.at[missing_indices[i], column] = top_2_categories[1]
    return df



In [27]:
### df_road is the data frame where missing values are replaced with top categories

df_road = replace_null_with_top_category(df_road)

In [28]:
df_road.head()

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,...,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Pedestrian_movement,Cause_of_accident,Accident_severity,hour
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Residential areas,Two-way (divided with broken lines road marking),...,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,Not a Pedestrian,Moving Backward,Slight Injury,17
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,Office areas,Undivided Two way,...,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Overtaking,Slight Injury,17
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Recreational areas,other,...,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Not a Pedestrian,Changing lane to the left,Serious Injury,17
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Office areas,other,...,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Changing lane to the right,Slight Injury,1
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Lorry (41?100Q),Owner,Industrial areas,other,...,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Not a Pedestrian,Overtaking,Slight Injury,1


In [29]:
#Installing libraries and finding out versions

!pip install scikit-learn
!pip install joblib
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.27.0-py2.py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting validators<1,>=0.2 (from streamlit)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.37-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.0/190.0 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:

In [30]:
### Versions of each library

import sklearn
import joblib
import numpy
import pandas
import streamlit

print(sklearn.__version__)
print(joblib.__version__)
print(numpy.__version__)
print(pandas.__version__)
print(streamlit.__version__)

1.2.2
1.3.2
1.23.5
1.5.3
1.27.0


In [31]:
## From pre processing of data frame we found that below columns are the most important one
## Random Forest is the giving highest results from all other models

from sklearn.metrics import f1_score

X = df_road[['Number_of_vehicles_involved', 'hour', 'Day_of_week',
       'Type_of_vehicle', 'Lanes_or_Medians',
       'Types_of_Junction', 'Cause_of_accident']]

y= df_road['Accident_severity']

### Idea is to apply one hot encoding to columns which are categorical and then apply random forest classfier using pipeline

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

categorical_cols = X.select_dtypes(include=['object']).columns

# Create the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)
    ])

# Create the full pipeline including the model (RandomForestClassifier in this case)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())])

# Train test split the data frame

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred,average='micro')

print("F1 Score:", f1)

import pickle

import joblib

# Assuming 'pipeline' is your scikit-learn pipeline
# Save the pipeline to a file
joblib.dump(pipeline, 'pipeline_v1.pkl')

# Loading the pipeline to check if results are returned successfully
loaded_pipeline = joblib.load('pipeline_v1.pkl')


## Passing manual input
input_array = np.array([[2,15,'Sunday','Automobile','Double carriageway (median)','No junction','Moving Backward']])

# Passing column names
input_columns = ['Number_of_vehicles_involved', 'hour', 'Day_of_week', 'Type_of_vehicle',
       'Lanes_or_Medians', 'Types_of_Junction', 'Cause_of_accident']

# Created a data frame
input_df = pd.DataFrame(input_array,columns=input_columns)

# Predicting results
loaded_pipeline.predict(input_df)

F1 Score: 0.812905844155844


array(['Fatal injury'], dtype=object)