In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
import setuptools._distutils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

tf.keras.backend.clear_session()





In [2]:
# import data
csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # print(csv_data)

# clean up typo in data
# # print()
csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# extracting out rows that we want to look at
# # print()
working_data = pd.DataFrame(csv_data[[
    'Day_of_Week',
    'Light_Conditions', 
    'Accident_Severity',  
    'Road_Surface_Conditions', 
    'Speed_limit',
    'Weather_Conditions',
    'Vehicle_Type'
]])

# # print(working_data)

# Show that  data where 'Accident_Severity' is have an unewven distribution where slight is 85% of all data 
# # print()
col = 'Accident_Severity'
unique_values = working_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {working_data[col].value_counts()}")

# seperate data with 'Accident_Severity' Slight and Serious and Fatal
# # print()
Serious_data = working_data[working_data['Accident_Severity'] == "Serious"]
slight_data = working_data[working_data['Accident_Severity'] == "Slight"]
fatal_data = working_data[working_data['Accident_Severity'] == "Fatal"]

col = 'Accident_Severity'
unique_values = Serious_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {Serious_data[col].value_counts()}")
# # print()

unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")
# # print()

unique_values = fatal_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {fatal_data[col].value_counts()}")

# limit all accident_severty type to 4000 columns
# # print()
slight_data = slight_data.sample(4000)
Serious_data = Serious_data.sample(4000)

col = 'Accident_Severity'
unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")
# # print()

unique_values = Serious_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {Serious_data[col].value_counts()}")

# combine the dataframe
# # print()
combined_df = pd.concat([Serious_data, slight_data, fatal_data], ignore_index=True)
combined_df = combined_df.sample(frac=1)

combined_df.head()

# show the end result
# # print()
col = 'Accident_Severity'
unique_values = combined_df[col].unique()
# print(f"Unique values in '{col}': {unique_values}")
# print(f"No. of val: {combined_df[col].value_counts()}")


In [3]:
# Assuming 'working_data' is your DataFrame with the required columns
data = combined_df.copy()  # Make a copy of the original DataFrame

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
for col in ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

In [5]:
print("==========Train data==========")
y_train_pred = rf_classifier.predict(X_train)

# Evaluate model performance on training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred))

# Confusion Matrix for training data
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
print("Train Confusion Matrix:")
print(train_conf_matrix)


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Train Accuracy: 0.56
Train Classification Report:
              precision    recall  f1-score   support

       Fatal       0.59      0.62      0.60      3187
     Serious       0.54      0.44      0.49      3188
      Slight       0.54      0.60      0.57      3187

    accuracy                           0.56      9562
   macro avg       0.55      0.56      0.55      9562
weighted avg       0.55      0.56      0.55      9562

Train Confusion Matrix:
[[1979  514  694]
 [ 825 1415  948]
 [ 578  694 1915]]

Accuracy: 0.40
              precision    recall  f1-score   support

       Fatal       0.44      0.49      0.46       766
     Serious       0.37      0.31      0.33       812
      Slight       0.40      0.42      0.41       813

    accuracy                           0.40      2391
   macro avg       0.40      0.41      0.40      2391
weighted avg       0.40      0.40      0.40      2391

Confusion Matrix:
[[379 181 206]
 [269 250 293]
 [222 253 338]]


# Implement Neural Networks

In [6]:
# Create a copy of the DataFrame to avoid modifying the original data
encoded_data = combined_df.copy()

# List of categorical columns to encode
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Accident_Severity', 
                        'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']

# Initialize LabelEncoder for each categorical column
label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    encoded_data[col] = label_encoders[col].fit_transform(encoded_data[col])

# Display the encoded DataFrame
print("Encoded DataFrame:")
print(encoded_data)


Encoded DataFrame:
       Day_of_Week  Light_Conditions  Accident_Severity  \
1663             5                 4                  1   
4784             3                 1                  2   
10303            5                 4                  0   
7791             2                 4                  2   
5579             0                 4                  2   
...            ...               ...                ...   
11575            1                 1                  0   
8772             2                 3                  0   
7536             1                 4                  2   
10056            6                 1                  0   
4773             2                 4                  2   

       Road_Surface_Conditions  Speed_limit  Weather_Conditions  Vehicle_Type  
1663                         4           60                   5             2  
4784                         2           30                   3             2  
10303                        0  

In [7]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.40
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.49      0.46       766
           1       0.37      0.31      0.34       812
           2       0.41      0.42      0.41       813

    accuracy                           0.40      2391
   macro avg       0.40      0.41      0.40      2391
weighted avg       0.40      0.40      0.40      2391

Confusion Matrix:
[[379 183 204]
 [271 251 290]
 [223 252 338]]


In [8]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type', 'Accident_Severity']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Convert DataFrame to float dtype (after encoding all categorical variables)
data = data.astype(float)

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensure target variable y is numeric
y_train = y_train.astype(int)  # Ensure y_train is of integer dtype
y_test = y_test.astype(int)    # Ensure y_test is of integer dtype

# Initialize Neural Network model
model = Sequential()

# Add input layer and hidden layers
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Add output layer
model.add(Dense(3, activation='softmax'))  # 3 output classes (Fatal, Serious, Slight)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Make predictions on test data
y_pred_prob = model.predict(X_test_scaled)  # Get the raw probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Determine the class with the highest probability

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))  # Set zero_division=0 to handle division by zero

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.3422 - loss: 1.1479 - val_accuracy: 0.4514 - val_loss: 1.0755
Epoch 2/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3925 - loss: 1.0850 - val_accuracy: 0.4566 - val_loss: 1.0685
Epoch 3/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3822 - loss: 1.0836 - val_accuracy: 0.4525 - val_loss: 1.0662
Epoch 4/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3968 - loss: 1.0832 - val_accuracy: 0.4681 - val_loss: 1.0617
Epoch 5/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4142 - loss: 1.0741 - val_accuracy: 0.4629 - val_loss: 1.0647
Epoch 6/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4044 - loss: 1.0780 - val_accuracy: 0.4639 - val_loss: 1.0631
Epoch 7/50
[1m269/269

In [9]:
print("==========Train data==========")
# Make predictions on training data
y_train_pred_prob = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred_prob, axis=1)

train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))

print("Training Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

[1m 34/299[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step  

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Training Accuracy: 0.44
Training Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.58      0.52      3187
           1       0.35      0.13      0.19      3188
           2       0.43      0.61      0.51      3187

    accuracy                           0.44      9562
   macro avg       0.42      0.44      0.41      9562
weighted avg       0.42      0.44      0.41      9562

Training Confusion Matrix:
[[1836  382  969]
 [1171  410 1607]
 [ 860  369 1958]]

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy: 0.43
Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.59      0.52       766
           1       0.34      0.12      0.17       812
           2       0.42      0.58      0.49       813

    accuracy                           0.43      2391
   macro avg       0.4

In [10]:
df = pd.DataFrame(data)

df = pd.DataFrame(data)

# Filter the DataFrame to select rows where Accident_Severity is 1.0
filtered_df = df[df['Accident_Severity'] == 2.0]

# Sample 3000 rows randomly from the filtered DataFrame
random_sample_df = filtered_df.sample(n=30, random_state=42)

print("Random Sample where Accident_Severity is 1.0:")
# print(random_sample_df)


for col in data:
    unique_values = data[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {data[col].value_counts()}")
    print()



Random Sample where Accident_Severity is 1.0:
Unique values in 'Day_of_Week': [5. 3. 2. 0. 1. 4. 6.]
No. of val: Day_of_Week
0.0    1937
2.0    1869
5.0    1675
4.0    1652
6.0    1649
1.0    1605
3.0    1566
Name: count, dtype: int64

Unique values in 'Light_Conditions': [4. 1. 3. 2. 0.]
No. of val: Light_Conditions
4.0    8199
1.0    2477
3.0    1131
0.0      99
2.0      47
Name: count, dtype: int64

Unique values in 'Accident_Severity': [1. 2. 0.]
No. of val: Accident_Severity
1.0    4000
2.0    4000
0.0    3953
Name: count, dtype: int64

Unique values in 'Road_Surface_Conditions': [4. 2. 0. 3. 1. 5.]
No. of val: Road_Surface_Conditions
0.0    8214
4.0    3163
2.0     397
3.0     152
1.0      20
5.0       7
Name: count, dtype: int64

Unique values in 'Speed_limit': [60. 30. 40. 70. 50. 20.]
No. of val: Speed_limit
30.0    6560
60.0    2757
40.0    1034
70.0    1025
50.0     494
20.0      83
Name: count, dtype: int64

Unique values in 'Weather_Conditions': [5. 3. 1. 8. 4. 7. 0. 2. 6.