In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from glob import glob

In [2]:
# Define resource path: filesystem location of the crash-data CSV.
# Despite the name `url`, this holds a local file path rather than a web address.
url="/content/2021_DATA_SA_Crash.csv"

In [5]:
# Read the crash CSV located at `url` into a DataFrame.
crash_df = pd.read_csv(url)


In [6]:
# Replace every missing value, in every column, with 0; `crash_df` is rebound to the filled frame.
# NOTE(review): the fill is not limited to numeric columns - presumably this is why
# 'DUI Involved' shows both 0 and 'Y' in the preview; confirm that is intended.
crash_df = crash_df.fillna(0)
# Preview the first rows of the filled frame
crash_df.head()

Unnamed: 0,REPORT_ID,Stats Area,Suburb,Postcode,LGA Name,Total Units,Total Cas,Total Fats,Total SI,Total MI,...,Crash Type,Unit Resp,Entity Code,CSEF Severity,Traffic Ctrls,DUI Involved,Drugs Involved,ACCLOC_X,ACCLOC_Y,UNIQUE_LOC
0,2021-1-6/07/2022,2 Metropolitan,VIRGINIA,5120,CITY OF PLAYFORD.,2,0,0,0,0,...,Side Swipe,2,Driver Rider,1: PDO,No Control,0,0,1326227.35,1696803.21,13262270000000.0
1,2021-2-6/07/2022,2 Metropolitan,MELROSE PARK,5039,CC MITCHAM.,2,1,1,0,0,...,Hit Pedestrian,2,Pedestrian,4: Fatal,No Control,0,0,1325945.79,1664421.93,13259460000000.0
2,2021-3-6/07/2022,2 Metropolitan,SEMAPHORE PARK,5019,CITY OF CHARLES STURT,2,0,0,0,0,...,Hit Fixed Object,1,Driver Rider,1: PDO,Roundabout,0,0,1317767.25,1679377.81,13177670000000.0
3,2021-4-6/07/2022,2 Metropolitan,MARDEN,5070,"CC OF NORWOOD,PAYNEHAM & ST PETERS",2,0,0,0,0,...,Hit Fixed Object,1,Driver Rider,1: PDO,No Control,Y,0,1332264.42,1673848.29,13322640000000.0
4,2021-5-6/07/2022,2 Metropolitan,PARAFIELD GARDENS,5107,CITY OF SALISBURY,2,0,0,0,0,...,Hit Fixed Object,1,Driver Rider,1: PDO,No Control,0,0,1329432.2,1688030.5,13294320000000.0


In [7]:
# Keep only the columns used for modelling (the target 'CSEF Severity' plus the
# candidate features); every other column of crash_df is left out.
# .copy() makes this an independent frame: later cells overwrite 'Time' and add
# 'date_formated' on it, which should never act on a slice of crash_df.
crash_df_filtered_1 = crash_df[['Stats Area', 'Month', 'Day', 'Time', 'Area Speed', 'Position Type',
       'Horizontal Align', 'Vertical Align', 'Other Feat', 'Road Surface',
       'Moisture Cond', 'Weather Cond', 'DayNight', 'Crash Type', 'Unit Resp',
       'Entity Code', 'CSEF Severity', 'Traffic Ctrls', 'DUI Involved',
       'Drugs Involved']].copy()

In [8]:
# List the distinct labels present in the target column 'CSEF Severity'
crash_df_filtered_1['CSEF Severity'].unique()

array(['1: PDO', '4: Fatal', '3: SI', '2: MI'], dtype=object)

In [9]:
# Swap each severity label for an integer code.
# The codes are plain identifiers, not a ranking: 'Fatal' gets 0 while PDO/MI/SI get 1/2/3.
CSEF_Severity_dict = {
    '4: Fatal': 0,
    '1: PDO': 1,
    '2: MI': 2,
    '3: SI': 3,
}
crash_df_filtered_1 = crash_df_filtered_1.replace(
    to_replace={'CSEF Severity': CSEF_Severity_dict}
)

In [10]:
# Parse the 'Time' column into pandas datetime values, overwriting the column in place.
# No explicit format is passed, so pandas has to infer it from the values.
crash_df_filtered_1 ['Time'] = pd.to_datetime(crash_df_filtered_1['Time'])

In [11]:
# Extract the hour of day from 'Time' as a zero-padded 24-hour string ('%H', e.g. '07', '13')
# and store it in a new column 'date_formated' (despite the name, it holds only the hour).
crash_df_filtered_1['date_formated']=crash_df_filtered_1['Time'].dt.strftime('%H').astype('str')

In [12]:
# List the distinct hour strings present in the 'date_formated' column
crash_df_filtered_1 ['date_formated'].unique()

array(['12', '11', '01', '03', '05', '07', '13', '15', '14', '06', '17',
       '22', '10', '18', '16', '19', '21', '20', '23', '00', '09', '08',
       '02', '04'], dtype=object)

In [13]:
# 'Time' has served its purpose (the hour now lives in 'date_formated'), so remove it.
crash_df_filtered_1 = crash_df_filtered_1.drop(columns=['Time'])

In [14]:
# One-hot encode every categorical column, including the target 'CSEF Severity'.
# 'Area Speed' and 'Unit Resp' are not listed, so they pass through unchanged.
ohe_columns = ['Stats Area', 'Month', 'Day', 'date_formated', 'Position Type',
       'Horizontal Align', 'Vertical Align', 'Other Feat', 'Road Surface',
       'Moisture Cond', 'Weather Cond', 'DayNight', 'Crash Type',
       'Entity Code','CSEF Severity', 'Traffic Ctrls', 'DUI Involved',
       'Drugs Involved']
# dtype=int keeps the indicator columns numeric 0/1 on every pandas version
# (from pandas 2.0 the get_dummies default is bool rather than an integer type).
ohe_df = pd.get_dummies(crash_df_filtered_1, columns=ohe_columns, dtype=int)

In [15]:
# How many columns does the one-hot encoded frame have?
ohe_df.shape[1]

128

In [16]:
# Show the column labels of the one-hot encoded frame
ohe_df.columns

Index(['Area Speed', 'Unit Resp', 'Stats Area_1 City',
       'Stats Area_2 Metropolitan', 'Stats Area_3 Country', 'Month_April',
       'Month_August', 'Month_December', 'Month_February', 'Month_January',
       ...
       'Traffic Ctrls_Rail Xing - Flashing',
       'Traffic Ctrls_Rail Xing - No Control',
       'Traffic Ctrls_Rail Xing-Traffic Signals', 'Traffic Ctrls_Roundabout',
       'Traffic Ctrls_Stop Sign', 'Traffic Ctrls_Traffic Signals',
       'DUI Involved_0', 'DUI Involved_Y', 'Drugs Involved_0',
       'Drugs Involved_Y'],
      dtype='object', length=128)

In [17]:
# Split our preprocessed data into our features and target arrays.
# The target is the block of one-hot "CSEF Severity_*" columns; every other column is a feature.
target_columns = [col for col in ohe_df.columns if col.startswith("CSEF Severity_")]
# Fail early with a clear message instead of training against an empty target.
assert target_columns, "no 'CSEF Severity_' columns found - was the target one-hot encoded?"

y = ohe_df[target_columns].values

# Features: everything except the target columns
X = ohe_df.drop(columns=target_columns).values
X

array([[90,  2,  0, ...,  0,  1,  0],
       [10,  2,  0, ...,  0,  1,  0],
       [60,  1,  0, ...,  0,  1,  0],
       ...,
       [80,  1,  0, ...,  0,  1,  0],
       [60,  2,  0, ...,  0,  1,  0],
       [60,  2,  0, ...,  0,  1,  0]])

In [18]:
# Hold out a quarter of the rows for testing (test_size=0.25 spells out the
# scikit-learn default); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [19]:
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [20]:

# Define the model - a small feed-forward neural net for multi-class classification.

# Seed Python, NumPy and TensorFlow so weight initialisation and training are
# repeatable across Restart & Run All; 42 matches the train/test split seed.
tf.keras.utils.set_random_seed(42)

# Layer widths at both ends are derived from the data so the model follows the
# preprocessing: one input per scaled feature column, one output unit per
# one-hot target column (instead of a hard-coded 4).
number_input_features = X_train_scaled.shape[1]
number_output_classes = y_train.shape[1]
hidden_nodes_layer1=8
hidden_nodes_layer2=16
# NOTE(review): hidden_nodes_layer3 is defined but no layer below uses it;
# kept so that any later reference to the name still resolves.
hidden_nodes_layer3=16
nn = tf.keras.models.Sequential()

# Explicit input layer - used in place of the `input_dim` argument on the first Dense layer
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Output layer: softmax over the target classes
nn.add(tf.keras.layers.Dense(units=number_output_classes, activation='softmax'))

# Check the structure of the model
nn.summary()
     

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 1000      
                                                                 
 dense_1 (Dense)             (None, 16)                144       
                                                                 
 dense_2 (Dense)             (None, 4)                 68        
                                                                 
Total params: 1,212
Trainable params: 1,212
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Configure training: categorical cross-entropy against the one-hot target,
# Adam at a learning rate of 0.001, and accuracy reported as the metric.
nn.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Train for 10 epochs; the held-out test split doubles as the validation set here.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Score the trained network on the test split; evaluate() yields the loss
# followed by the accuracy metric requested at compile time.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

103/103 - 0s - loss: 0.7744 - accuracy: 0.6648 - 134ms/epoch - 1ms/step
Loss: 0.7744494080543518, Accuracy: 0.6648418307304382
