In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
import setuptools._distutils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

tf.keras.backend.clear_session()





In [2]:
# # # # 40000 COLUMNS
# import data
csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # print(csv_data)

# clean up typo in data
# # print()
csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# extracting out rows that we want to look at
# # print()
working_data = pd.DataFrame(csv_data[[
    'Day_of_Week',
    'Light_Conditions', 
    'Accident_Severity',  
    'Road_Surface_Conditions', 
    'Speed_limit',
    'Weather_Conditions',
    'Vehicle_Type'
]])

# # print(working_data)

# Show that  data where 'Accident_Severity' is have an unewven distribution where slight is 85% of all data 
# # print()
col = 'Accident_Severity'
unique_values = working_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {working_data[col].value_counts()}")

# seperate data with 'Accident_Severity' Slight from non 'Slight'
# # print()
non_slight_data = working_data[working_data['Accident_Severity'] != "Slight"]
slight_data = working_data[working_data['Accident_Severity'] == "Slight"]

col = 'Accident_Severity'
unique_values = non_slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {non_slight_data[col].value_counts()}")
# # print()


unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")

# taking out random 40000 records which data 'Accident_Severity' is 'Slight'
# # print()
slight_data = slight_data.sample(40000)

col = 'Accident_Severity'
unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")

# combine the dataframe
# # print()
combined_df = pd.concat([non_slight_data, slight_data], ignore_index=True)
combined_df = combined_df.sample(frac=1)

# # print(combined_df)

# show the end result
# # print()
col = 'Accident_Severity'
unique_values = combined_df[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {combined_df[col].value_counts()}")


In [3]:
# Assuming 'working_data' is your DataFrame with the required columns
data = combined_df.copy()  # Make a copy of the original DataFrame

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
for col in ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

In [5]:
print("==========Train data==========")
y_train_pred = rf_classifier.predict(X_train)

# Evaluate model performance on training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred))

# Confusion Matrix for training data
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
print("Train Confusion Matrix:")
print(train_conf_matrix)


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Train Accuracy: 0.57
Train Classification Report:
              precision    recall  f1-score   support

       Fatal       0.75      0.07      0.13      3153
     Serious       0.58      0.55      0.56     32635
      Slight       0.57      0.65      0.61     31966

    accuracy                           0.57     67754
   macro avg       0.63      0.42      0.43     67754
weighted avg       0.58      0.57      0.56     67754

Train Confusion Matrix:
[[  221  1789  1143]
 [   42 17814 14779]
 [   30 11076 20860]]

Accuracy: 0.51
              precision    recall  f1-score   support

       Fatal       0.02      0.00      0.00       800
     Serious       0.51      0.49      0.50      8105
      Slight       0.52      0.59      0.55      8034

    accuracy                           0.51     16939
   macro avg       0.35      0.36      0.35     16939
weighted avg       0.49      0.51      0.50     16939

Confusion Matrix:
[[   1  506  293]
 [  21 3939 4145]
 [  20 3294 4720]]


# Implement Neural Networks

In [6]:
# Create a copy of the DataFrame to avoid modifying the original data
encoded_data = combined_df.copy()

# List of categorical columns to encode
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Accident_Severity', 
                        'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']

# Initialize LabelEncoder for each categorical column
label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    encoded_data[col] = label_encoders[col].fit_transform(encoded_data[col])

# Display the encoded DataFrame
print("Encoded DataFrame:")
print(encoded_data)


Encoded DataFrame:
       Day_of_Week  Light_Conditions  Accident_Severity  \
42750            6                 1                  1   
61841            6                 4                  2   
5793             5                 4                  1   
38515            2                 1                  1   
54833            4                 4                  2   
...            ...               ...                ...   
12925            6                 1                  0   
5760             4                 4                  1   
58994            3                 4                  2   
64636            0                 4                  2   
61326            1                 4                  2   

       Road_Surface_Conditions  Speed_limit  Weather_Conditions  Vehicle_Type  
42750                        0           30                   1             7  
61841                        0           70                   3             2  
5793                         0  

In [7]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.51
Classification Report:
              precision    recall  f1-score   support

           0       0.02      0.00      0.00       800
           1       0.51      0.49      0.50      8105
           2       0.52      0.59      0.55      8034

    accuracy                           0.51     16939
   macro avg       0.35      0.36      0.35     16939
weighted avg       0.49      0.51      0.50     16939

Confusion Matrix:
[[   1  508  291]
 [  22 3933 4150]
 [  22 3287 4725]]


In [8]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type', 'Accident_Severity']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Convert DataFrame to float dtype (after encoding all categorical variables)
data = data.astype(float)

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensure target variable y is numeric
y_train = y_train.astype(int)  # Ensure y_train is of integer dtype
y_test = y_test.astype(int)    # Ensure y_test is of integer dtype

# Initialize Neural Network model
model = Sequential()

# Add input layer and hidden layers
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Add output layer
model.add(Dense(3, activation='softmax'))  # 3 output classes (Fatal, Serious, Slight)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Make predictions on test data
y_pred_prob = model.predict(X_test_scaled)  # Get the raw probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Determine the class with the highest probability

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))  # Set zero_division=0 to handle division by zero

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.4916 - loss: 0.8857 - val_accuracy: 0.5223 - val_loss: 0.8395
Epoch 2/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5114 - loss: 0.8480 - val_accuracy: 0.5261 - val_loss: 0.8394
Epoch 3/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5110 - loss: 0.8416 - val_accuracy: 0.5279 - val_loss: 0.8388
Epoch 4/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5143 - loss: 0.8431 - val_accuracy: 0.5266 - val_loss: 0.8366
Epoch 5/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5149 - loss: 0.8388 - val_accuracy: 0.5233 - val_loss: 0.8375
Epoch 6/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5151 - loss: 0.8393 - val_accuracy: 0.5258 - val_loss: 0.8371
Epoch 7/50


In [9]:
print("==========Train data==========")
# Make predictions on training data
y_train_pred_prob = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred_prob, axis=1)

train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))

print("Training Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

[1m2118/2118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
Training Accuracy: 0.53
Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3153
           1       0.53      0.47      0.50     32635
           2       0.53      0.64      0.58     31966

    accuracy                           0.53     67754
   macro avg       0.69      0.37      0.36     67754
weighted avg       0.55      0.53      0.51     67754

Training Confusion Matrix:
[[    0  2115  1038]
 [    0 15441 17194]
 [    0 11663 20303]]

[1m530/530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.53
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       800
           1       0.52      0.47      0.50      8105
           2       0.53      0.63      0.58      8034

    accuracy                           0.53     16939
   macro 

In [10]:
df = pd.DataFrame(data)

df = pd.DataFrame(data)

# Filter the DataFrame to select rows where Accident_Severity is 1.0
filtered_df = df[df['Accident_Severity'] == 2.0]

# Sample 3000 rows randomly from the filtered DataFrame
random_sample_df = filtered_df.sample(n=30, random_state=42)

print("Random Sample where Accident_Severity is 1.0:")
# print(random_sample_df)


for col in data:
    unique_values = data[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {data[col].value_counts()}")
    print()



Random Sample where Accident_Severity is 1.0:
Unique values in 'Day_of_Week': [6. 5. 2. 4. 3. 1. 0.]
No. of val: Day_of_Week
0.0    13597
4.0    12357
5.0    12329
6.0    12246
2.0    12154
1.0    11800
3.0    10210
Name: count, dtype: int64

Unique values in 'Light_Conditions': [1. 4. 3. 0. 2.]
No. of val: Light_Conditions
4.0    60831
1.0    17106
3.0     5684
0.0      758
2.0      314
Name: count, dtype: int64

Unique values in 'Accident_Severity': [1. 2. 0.]
No. of val: Accident_Severity
1.0    40740
2.0    40000
0.0     3953
Name: count, dtype: int64

Unique values in 'Road_Surface_Conditions': [0. 4. 2. 3. 1. 5.]
No. of val: Road_Surface_Conditions
0.0    58317
4.0    22095
2.0     2983
3.0     1126
1.0      115
5.0       57
Name: count, dtype: int64

Unique values in 'Speed_limit': [30. 70. 60. 40. 20. 50. 10.]
No. of val: Speed_limit
30.0    52156
60.0    15685
40.0     6995
70.0     6102
50.0     2983
20.0      771
10.0        1
Name: count, dtype: int64

Unique values in 'Wea