In [46]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
import setuptools._distutils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

tf.keras.backend.clear_session()


In [47]:
# # # # 40000 COLUMNS
# import data
csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # print(csv_data)

# clean up typo in data
# # print()
csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# extracting out rows that we want to look at
# # print()
working_data = pd.DataFrame(csv_data[[
    'Day_of_Week',
    'Light_Conditions', 
    'Accident_Severity',  
    'Road_Surface_Conditions', 
    'Speed_limit',
    'Weather_Conditions',
    'Vehicle_Type'
]])

# # print(working_data)

# Show that  data where 'Accident_Severity' is have an unewven distribution where slight is 85% of all data 
# # print()
col = 'Accident_Severity'
unique_values = working_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {working_data[col].value_counts()}")

# seperate data with 'Accident_Severity' Slight from non 'Slight'
# # print()
non_slight_data = working_data[working_data['Accident_Severity'] != "Slight"]
slight_data = working_data[working_data['Accident_Severity'] == "Slight"]

col = 'Accident_Severity'
unique_values = non_slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {non_slight_data[col].value_counts()}")
# # print()


unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")

# taking out random 40000 records which data 'Accident_Severity' is 'Slight'
# # print()
slight_data = slight_data.sample(40000)

col = 'Accident_Severity'
unique_values = slight_data[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {slight_data[col].value_counts()}")

# combine the dataframe
# # print()
combined_df = pd.concat([non_slight_data, slight_data], ignore_index=True)
combined_df = combined_df.sample(frac=1)

# # print(combined_df)

# show the end result
# # print()
col = 'Accident_Severity'
unique_values = combined_df[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {combined_df[col].value_counts()}")


In [48]:
# Assuming 'working_data' is your DataFrame with the required columns
data = combined_df.copy()  # Make a copy of the original DataFrame

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
for col in ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

In [50]:
print("==========Train data==========")
y_train_pred = rf_classifier.predict(X_train)

# Evaluate model performance on training data
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Train Classification Report:")
print(classification_report(y_train, y_train_pred))

# Confusion Matrix for training data
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
print("Train Confusion Matrix:")
print(train_conf_matrix)


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Train Accuracy: 0.57
Train Classification Report:
              precision    recall  f1-score   support

       Fatal       0.80      0.07      0.13      3107
     Serious       0.58      0.56      0.57     32576
      Slight       0.57      0.63      0.60     32071

    accuracy                           0.57     67754
   macro avg       0.65      0.42      0.43     67754
weighted avg       0.58      0.57      0.56     67754

Train Confusion Matrix:
[[  216  1806  1085]
 [   26 18287 14263]
 [   27 11696 20348]]

Accuracy: 0.52
              precision    recall  f1-score   support

       Fatal       0.08      0.00      0.01       846
     Serious       0.52      0.51      0.51      8164
      Slight       0.52      0.58      0.55      7929

    accuracy                           0.52     16939
   macro avg       0.37      0.36      0.36     16939
weighted avg       0.50      0.52      0.50     16939

Confusion Matrix:
[[   4  527  315]
 [  33 4145 3986]
 [  15 3329 4585]]


# Implement Neural Networks

In [51]:
# Create a copy of the DataFrame to avoid modifying the original data
encoded_data = combined_df.copy()

# List of categorical columns to encode
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Accident_Severity', 
                        'Road_Surface_Conditions', 'Weather_Conditions', 'Vehicle_Type']

# Initialize LabelEncoder for each categorical column
label_encoders = {}

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    encoded_data[col] = label_encoders[col].fit_transform(encoded_data[col])

# Display the encoded DataFrame
print("Encoded DataFrame:")
print(encoded_data)


Encoded DataFrame:
       Day_of_Week  Light_Conditions  Accident_Severity  \
79786            2                 4                  2   
36200            4                 4                  1   
48828            6                 2                  2   
84168            6                 4                  2   
55350            6                 4                  2   
...            ...               ...                ...   
23727            5                 1                  1   
56140            1                 4                  2   
45681            4                 4                  2   
83134            3                 4                  2   
19047            5                 4                  1   

       Road_Surface_Conditions  Speed_limit  Weather_Conditions  Vehicle_Type  
79786                        4           30                   5             2  
36200                        4           60                   1             2  
48828                        2  

In [52]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on test set
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.52
Classification Report:
              precision    recall  f1-score   support

           0       0.08      0.00      0.01       846
           1       0.52      0.51      0.51      8164
           2       0.52      0.58      0.55      7929

    accuracy                           0.52     16939
   macro avg       0.37      0.36      0.36     16939
weighted avg       0.50      0.52      0.50     16939

Confusion Matrix:
[[   4  525  317]
 [  33 4141 3990]
 [  15 3325 4589]]


In [53]:
# Assuming 'working_data' is your DataFrame with the required columns
data = encoded_data.copy()

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Day_of_Week', 'Light_Conditions', 'Road_Surface_Conditions', 
                       'Weather_Conditions', 'Vehicle_Type', 'Accident_Severity']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Convert DataFrame to float dtype (after encoding all categorical variables)
data = data.astype(float)

# Split data into features (X) and target variable (y)
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensure target variable y is numeric
y_train = y_train.astype(int)  # Ensure y_train is of integer dtype
y_test = y_test.astype(int)    # Ensure y_test is of integer dtype

# Initialize Neural Network model
model = Sequential()

# Add input layer and hidden layers
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

# Add output layer
model.add(Dense(3, activation='softmax'))  # 3 output classes (Fatal, Serious, Slight)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Make predictions on test data
y_pred_prob = model.predict(X_test_scaled)  # Get the raw probabilities for each class
y_pred = np.argmax(y_pred_prob, axis=1)  # Determine the class with the highest probability

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))  # Set zero_division=0 to handle division by zero

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.4806 - loss: 0.8981 - val_accuracy: 0.4922 - val_loss: 0.8485
Epoch 2/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5024 - loss: 0.8435 - val_accuracy: 0.5071 - val_loss: 0.8476
Epoch 3/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.5153 - loss: 0.8410 - val_accuracy: 0.4903 - val_loss: 0.8477
Epoch 4/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5079 - loss: 0.8422 - val_accuracy: 0.5100 - val_loss: 0.8467
Epoch 5/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5148 - loss: 0.8370 - val_accuracy: 0.5131 - val_loss: 0.8462
Epoch 6/50
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5123 - loss: 0.8394 - val_accuracy: 0.5111 - val_loss: 0.8464
Epoch 7/50


In [54]:
print("==========Train data==========")
# Make predictions on training data
y_train_pred_prob = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred_prob, axis=1)

train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Generate classification report and confusion matrix for training data
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred, zero_division=1))

print("Training Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))


print()
print("==========Test data==========")
# Make predictions on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



[1m2118/2118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
Training Accuracy: 0.52
Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3107
           1       0.52      0.46      0.49     32576
           2       0.52      0.64      0.58     32071

    accuracy                           0.52     67754
   macro avg       0.68      0.37      0.36     67754
weighted avg       0.55      0.52      0.51     67754

Training Confusion Matrix:
[[    0  2001  1106]
 [    0 14987 17589]
 [    0 11561 20510]]

[1m530/530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Accuracy: 0.52
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       846
           1       0.53      0.47      0.49      8164
           2       0.52      0.64      0.57      7929

    accuracy                           0.52     16939
   macro 

In [55]:
df = pd.DataFrame(data)

df = pd.DataFrame(data)

# Filter the DataFrame to select rows where Accident_Severity is 1.0
filtered_df = df[df['Accident_Severity'] == 2.0]

# Sample 3000 rows randomly from the filtered DataFrame
random_sample_df = filtered_df.sample(n=30, random_state=42)

print("Random Sample where Accident_Severity is 1.0:")
# print(random_sample_df)


for col in data:
    unique_values = data[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {data[col].value_counts()}")
    print()



Random Sample where Accident_Severity is 1.0:
Unique values in 'Day_of_Week': [2. 4. 6. 3. 5. 0. 1.]
No. of val: Day_of_Week
0.0    13701
5.0    12370
6.0    12364
4.0    12284
2.0    12073
1.0    11741
3.0    10160
Name: count, dtype: int64

Unique values in 'Light_Conditions': [4. 2. 1. 3. 0.]
No. of val: Light_Conditions
4.0    60898
1.0    17029
3.0     5688
0.0      776
2.0      302
Name: count, dtype: int64

Unique values in 'Accident_Severity': [2. 1. 0.]
No. of val: Accident_Severity
1.0    40740
2.0    40000
0.0     3953
Name: count, dtype: int64

Unique values in 'Road_Surface_Conditions': [4. 2. 0. 3. 5. 1.]
No. of val: Road_Surface_Conditions
0.0    58441
4.0    21894
2.0     3048
3.0     1133
1.0      111
5.0       66
Name: count, dtype: int64

Unique values in 'Speed_limit': [30. 60. 40. 50. 70. 20. 15.]
No. of val: Speed_limit
30.0    51938
60.0    15826
40.0     6941
70.0     6172
50.0     3039
20.0      776
15.0        1
Name: count, dtype: int64

Unique values in 'Wea