In [11]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
import setuptools._distutils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

tf.keras.backend.clear_session()


In [12]:
# import data
csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # print(csv_data)

# clean up typo in data
# # print()
csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# extracting out rows that we want to look at
# # print()
working_data = pd.DataFrame(csv_data[[
    'Day_of_Week',
    'Light_Conditions', 
    'Accident_Severity',  
    'Road_Surface_Conditions', 
    'Speed_limit',
    'Weather_Conditions',
    'Vehicle_Type'
]])

# # print(working_data)

# Show that  data where 'Accident_Severity' is have an unewven distribution where slight is 85% of all data 
# # print()
col = 'Accident_Severity'
unique_values = working_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {working_data[col].value_counts()}")

# seperate data with 'Accident_Severity' Slight and Serious and Fatal
# # print()
Serious_data = working_data[working_data['Accident_Severity'] == "Serious"]
slight_data = working_data[working_data['Accident_Severity'] == "Slight"]
fatal_data = working_data[working_data['Accident_Severity'] == "Fatal"]

col = 'Accident_Severity'
unique_values = Serious_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {Serious_data[col].value_counts()}")
# # print()

unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")
# # print()

unique_values = fatal_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {fatal_data[col].value_counts()}")

# limit all accident_severty type to 4000 columns
# # print()
slight_data = slight_data.sample(4000)
Serious_data = Serious_data.sample(4000)

col = 'Accident_Severity'
unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")
# # print()

unique_values = Serious_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {Serious_data[col].value_counts()}")

# combine the dataframe
# # print()
combined_df = pd.concat([Serious_data, slight_data, fatal_data], ignore_index=True)
combined_df = combined_df.sample(frac=1)

combined_df.head()

# show the end result
# # print()
col = 'Accident_Severity'
unique_values = combined_df[col].unique()
# print(f"Unique values in '{col}': {unique_values}")
# print(f"No. of val: {combined_df[col].value_counts()}")


In [13]:
# Assuming 'working_data' is your DataFrame with the required columns
data = combined_df.copy()  # Make a copy of the original DataFrame

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
for col in ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

In [15]:
print("==========Train data==========")
y_train_pred = rf_classifier.predict(X_train)

# Evaluate model performance on training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred))

# Confusion Matrix for training data
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
print("Train Confusion Matrix:")
print(train_conf_matrix)


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Train Accuracy: 0.55
Train Classification Report:
              precision    recall  f1-score   support

       Fatal       0.58      0.64      0.61      3164
     Serious       0.61      0.33      0.43      3210
      Slight       0.51      0.70      0.59      3188

    accuracy                           0.55      9562
   macro avg       0.57      0.55      0.54      9562
weighted avg       0.57      0.55      0.54      9562

Train Confusion Matrix:
[[2020  322  822]
 [ 873 1051 1286]
 [ 619  350 2219]]

Accuracy: 0.41
              precision    recall  f1-score   support

       Fatal       0.45      0.50      0.47       789
     Serious       0.32      0.18      0.23       790
      Slight       0.42      0.55      0.48       812

    accuracy                           0.41      2391
   macro avg       0.40      0.41      0.39      2391
weighted avg       0.40      0.41      0.39      2391

Confusion Matrix:
[[398 146 245]
 [284 141 365]
 [211 152 449]]


# Implement Neural Networks

In [16]:
# Create a copy of the DataFrame to avoid modifying the original data
encoded_data = combined_df.copy()

# List of categorical columns to encode
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Accident_Severity', 
                        'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']

# Initialize LabelEncoder for each categorical column
label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    encoded_data[col] = label_encoders[col].fit_transform(encoded_data[col])

# Display the encoded DataFrame
print("Encoded DataFrame:")
print(encoded_data)


Encoded DataFrame:
       Day_of_Week  Light_Conditions  Accident_Severity  \
7955             5                 4                  2   
9528             3                 1                  0   
2134             0                 4                  1   
2568             2                 4                  1   
9888             0                 3                  0   
...            ...               ...                ...   
3979             3                 1                  1   
1288             3                 4                  1   
10740            3                 3                  0   
514              0                 1                  1   
1368             6                 4                  1   

       Road_Surface_Conditions  Speed_limit  Weather_Conditions  Vehicle_Type  
7955                         4           60                   5             2  
9528                         4           30                   1             3  
2134                         0  

In [17]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.41
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.50      0.47       789
           1       0.32      0.18      0.23       790
           2       0.42      0.55      0.48       812

    accuracy                           0.41      2391
   macro avg       0.40      0.41      0.39      2391
weighted avg       0.40      0.41      0.39      2391

Confusion Matrix:
[[398 145 246]
 [287 140 363]
 [211 152 449]]


In [18]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type', 'Accident_Severity']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Convert DataFrame to float dtype (after encoding all categorical variables)
data = data.astype(float)

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensure target variable y is numeric
y_train = y_train.astype(int)  # Ensure y_train is of integer dtype
y_test = y_test.astype(int)    # Ensure y_test is of integer dtype

# Initialize Neural Network model
model = Sequential()

# Add input layer and hidden layers
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Add output layer
model.add(Dense(3, activation='softmax'))  # 3 output classes (Fatal, Serious, Slight)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Make predictions on test data
y_pred_prob = model.predict(X_test_scaled)  # Get the raw probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Determine the class with the highest probability

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))  # Set zero_division=0 to handle division by zero

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3617 - loss: 1.1287 - val_accuracy: 0.4159 - val_loss: 1.0787
Epoch 2/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3878 - loss: 1.0893 - val_accuracy: 0.4211 - val_loss: 1.0762
Epoch 3/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4130 - loss: 1.0708 - val_accuracy: 0.4295 - val_loss: 1.0717
Epoch 4/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4109 - loss: 1.0742 - val_accuracy: 0.4389 - val_loss: 1.0699
Epoch 5/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4176 - loss: 1.0728 - val_accuracy: 0.4378 - val_loss: 1.0699
Epoch 6/50
[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4162 - loss: 1.0704 - val_accuracy: 0.4347 - val_loss: 1.0704
Epoch 7/50
[1m269/269

In [19]:
print("==========Train data==========")
# Make predictions on training data
y_train_pred_prob = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred_prob, axis=1)

train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))

print("Training Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step
Training Accuracy: 0.44
Training Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.62      0.53      3164
           1       0.38      0.15      0.21      3210
           2       0.44      0.56      0.49      3188

    accuracy                           0.44      9562
   macro avg       0.42      0.44      0.41      9562
weighted avg       0.42      0.44      0.41      9562

Training Confusion Matrix:
[[1968  395  801]
 [1320  468 1422]
 [1037  381 1770]]

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step
Accuracy: 0.43
Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.62      0.53       789
           1       0.31      0.12      0.17       790
           2       0.44      0.55      0.49       812

    accuracy                           0.43      2391
   macro avg      

In [20]:
df = pd.DataFrame(data)

df = pd.DataFrame(data)

# Filter the DataFrame to select rows where Accident_Severity is 1.0
filtered_df = df[df['Accident_Severity'] == 2.0]

# Sample 3000 rows randomly from the filtered DataFrame
random_sample_df = filtered_df.sample(n=30, random_state=42)

print("Random Sample where Accident_Severity is 1.0:")
# print(random_sample_df)


for col in data:
    unique_values = data[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {data[col].value_counts()}")
    print()



Random Sample where Accident_Severity is 1.0:
Unique values in 'Day_of_Week': [5. 3. 0. 2. 4. 6. 1.]
No. of val: Day_of_Week
0.0    1876
2.0    1858
4.0    1708
1.0    1657
6.0    1655
5.0    1646
3.0    1553
Name: count, dtype: int64

Unique values in 'Light_Conditions': [4. 1. 3. 0. 2.]
No. of val: Light_Conditions
4.0    8212
1.0    2427
3.0    1162
0.0     105
2.0      47
Name: count, dtype: int64

Unique values in 'Accident_Severity': [2. 0. 1.]
No. of val: Accident_Severity
2.0    4000
1.0    4000
0.0    3953
Name: count, dtype: int64

Unique values in 'Road_Surface_Conditions': [4. 0. 2. 3. 1. 5.]
No. of val: Road_Surface_Conditions
0.0    8207
4.0    3152
2.0     418
3.0     147
1.0      24
5.0       5
Name: count, dtype: int64

Unique values in 'Speed_limit': [60. 30. 40. 70. 50. 20.]
No. of val: Speed_limit
30.0    6548
60.0    2778
70.0    1045
40.0    1005
50.0     502
20.0      75
Name: count, dtype: int64

Unique values in 'Weather_Conditions': [5. 1. 8. 3. 4. 0. 7. 2. 6.