# Accident severity Prediction with ANN

## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Input, Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.optimizers import Adam

In [2]:
# Load data
data = pd.read_csv('../data/merged_information_clean.csv')

In [3]:
# Only Car accidents

data = data[data['Vehicle_Type'] == 'Car']

data.shape[0]

1528540

## Drop NaN

In [4]:
df = data.dropna()

In [5]:
df

Unnamed: 0.1,Unnamed: 0,Accident_Index,Accident_Severity,Date,Latitude,Longitude,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Speed_limit,...,Age_of_Vehicle,make,model,Sex_of_Driver,Vehicle_Type,Hit_Object_in_Carriageway,Hit_Object_off_Carriageway,Vehicle_Manoeuvre,Driver_Home_Area_Type,X1st_Point_of_Impact
2,2,200501BS00004,Slight,2005-01-07,51.482442,-0.173862,Daylight,1,1,30.0,...,4.0,NISSAN,ALMERA SE AUTO,Female,Car,No data,No data,Going ahead other,Urban,Front
4,4,200501BS00006,Slight,2005-01-11,51.515540,-0.203238,Daylight,1,2,30.0,...,1.0,AUDI,A4 SPORT CABRIOLET AUTO,Male,Car,No data,No data,Moving off,Urban,Did not impact
7,7,200501BS00007,Slight,2005-01-13,51.512695,-0.211277,Darkness - lights lit,1,2,30.0,...,4.0,MERCEDES,ML 430 AUTO,Male,Car,No data,No data,Parked,Urban,Back
8,8,200501BS00009,Slight,2005-01-14,51.502260,-0.187623,Daylight,2,1,30.0,...,16.0,JAGUAR,XJ-S CONVERTIBLE AUTO,Male,Car,No data,No data,Going ahead other,Urban,Front
9,9,200501BS00012,Slight,2005-01-16,51.494902,-0.182872,Darkness - lights lit,1,1,30.0,...,6.0,FORD,MONDEO GHIA X AUTO,Male,Car,No data,No data,Going ahead other,Urban,Front
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2058276,2058401,2016984130916,Slight,2016-10-28,55.058998,-3.265390,Darkness - lights lit,1,2,30.0,...,8.0,SEAT,IBIZA ECOMOTIVE TDI,Female,Car,Parked vehicle,No data,Going ahead other,Rural,Front
2058277,2058402,2016984130916,Slight,2016-10-28,55.058998,-3.265390,Darkness - lights lit,1,2,30.0,...,11.0,RENAULT,MEGANE DYNAMIQUE 16V,Not known,Car,No data,Other permanent object,Parked,No data,Front
2058278,2058403,2016984131116,Slight,2016-11-01,55.005033,-3.312631,Daylight,2,2,60.0,...,14.0,MINI,MINI COOPER,Female,Car,No data,No data,Going ahead other,Urban,Front
2058281,2058406,2016984131316,Slight,2016-10-29,54.989597,-3.272584,Darkness - lights lit,3,1,40.0,...,13.0,VAUXHALL,VECTRA SXI 16V,Male,Car,No data,Tree,Going ahead other,Urban,Front


## Is the number of each outcomes large enough?

In [6]:
severity_counts = df['Accident_Severity'].value_counts()

print(severity_counts)

Accident_Severity
Slight     1159828
Serious     146639
Fatal        14446
Name: count, dtype: int64


In [13]:
# Select relevant features
features = [
    #"Time",
    "Light_Conditions",
    "Weather_Conditions",
    "Speed_limit",
    "Road_Type",
    "Road_Surface_Conditions",
    "Urban_or_Rural_Area",
    "Age_Band_of_Driver",
    "Sex_of_Driver",
    "Age_of_Vehicle",
    "Vehicle_Manoeuvre"
]


target = 'Accident_Severity'

X = df[features]
y = df[target]

In [14]:
X.isna().sum()

Light_Conditions           0
Weather_Conditions         0
Speed_limit                0
Road_Type                  0
Road_Surface_Conditions    0
Urban_or_Rural_Area        0
Age_Band_of_Driver         0
Sex_of_Driver              0
Age_of_Vehicle             0
Vehicle_Manoeuvre          0
dtype: int64

In [15]:
X

Unnamed: 0,Light_Conditions,Weather_Conditions,Speed_limit,Road_Type,Road_Surface_Conditions,Urban_or_Rural_Area,Age_Band_of_Driver,Sex_of_Driver,Age_of_Vehicle,Vehicle_Manoeuvre
2,Daylight,Fine no high winds,30.0,Single carriageway,Dry,Urban,46 - 55,Female,4.0,Going ahead other
4,Daylight,Raining no high winds,30.0,Single carriageway,Wet or damp,Urban,46 - 55,Male,1.0,Moving off
7,Darkness - lights lit,Fine no high winds,30.0,Single carriageway,Dry,Urban,36 - 45,Male,4.0,Parked
8,Daylight,Fine no high winds,30.0,Dual carriageway,Dry,Urban,66 - 75,Male,16.0,Going ahead other
9,Darkness - lights lit,Fine no high winds,30.0,Single carriageway,Dry,Urban,26 - 35,Male,6.0,Going ahead other
...,...,...,...,...,...,...,...,...,...,...
2058276,Darkness - lights lit,Fine no high winds,30.0,Single carriageway,Wet or damp,Rural,46 - 55,Female,8.0,Going ahead other
2058277,Darkness - lights lit,Fine no high winds,30.0,Single carriageway,Wet or damp,Rural,No data,Not known,11.0,Parked
2058278,Daylight,Fine no high winds,60.0,Single carriageway,Dry,Rural,21 - 25,Female,14.0,Going ahead other
2058281,Darkness - lights lit,Fine no high winds,40.0,Single carriageway,Dry,Rural,16 - 20,Male,13.0,Going ahead other


In [16]:
# Convert target to numerical categories
severity_mapping = {'Slight': 0, 'Serious': 1, 'Fatal': 2}
y = y.map(severity_mapping)

# Data preprocessing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X_processed = preprocessor.fit_transform(X)

In [18]:
# Convert the sparse matrix to a dense array
X_processed_dense = X_processed.toarray()

# Convert target to categorical
y_categorical = tf.keras.utils.to_categorical(y)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X_processed_dense, y_categorical, test_size=0.2, random_state=42)

In [19]:
# Building the ANN model
model = Sequential()
model.add(Dense(64, input_dim=X_processed_dense.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Number of epochs
epochs = 10

# Learning rate
learning_rate = 0.001

# Compile the model
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
model.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1ms/step - accuracy: 0.8772 - loss: 0.3932 - val_accuracy: 0.8785 - val_loss: 0.3863
Epoch 2/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.8771 - loss: 0.3892 - val_accuracy: 0.8785 - val_loss: 0.3858
Epoch 3/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.8776 - loss: 0.3877 - val_accuracy: 0.8785 - val_loss: 0.3854
Epoch 4/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.8782 - loss: 0.3851 - val_accuracy: 0.8785 - val_loss: 0.3853
Epoch 5/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 1ms/step - accuracy: 0.8778 - loss: 0.3872 - val_accuracy: 0.8785 - val_loss: 0.3851
Epoch 6/10
[1m26419/26419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 1ms/step - accuracy: 0.8775 - loss: 0.3869 - val_accuracy: 0.8785 - val_loss: 0.385

<keras.src.callbacks.history.History at 0x1ed366ed5e0>

In [22]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Predictions
y_pred = model.predict(X_test)

[1m8256/8256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 933us/step - accuracy: 0.8791 - loss: 0.3863
Test Accuracy: 0.8788340091705322
[1m8256/8256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 941us/step


## Area Under the Curve (AUC) score

In [23]:
if len(y_pred[0]) > 2:  # Multi-class classification
    from sklearn.metrics import roc_auc_score

    # Calculate AUC score
    auc_score = roc_auc_score(y_test, y_pred, multi_class='ovr')
    print(f"AUC Score: {auc_score}")


AUC Score: 0.705817223827323


Interpretation of AUC Score:

An AUC score ranges from 0 to 1.
A score of 0.5 suggests no discrimination (i.e., random guessing).
A score above 0.7 is generally considered acceptable and indicates that the model has some ability to distinguish between the positive and negative classes.
A score above 0.8 is considered good, while a score above 0.9 is considered excellent.