In [1]:
import sys
import os

# Resolve project folders from the notebook's working directory.
# NOTE(review): assumes the kernel is started in <project>/src/notebooks — confirm.
parent_path = os.getcwd()

# Dataset folder: replace the 'src<sep>notebooks' portion of the path with 'data'.
# The segment is built with os.path.join so it uses the platform's separator;
# a hard-coded backslash only ever matched on Windows and left the path
# unchanged everywhere else.
data_path = parent_path.replace(os.path.join('src', 'notebooks'), 'data')

# Models folder: same path with 'notebooks' swapped for 'models'; put it at the
# front of sys.path so modules in that folder can be imported by later cells.
models_path = parent_path.replace('notebooks', 'models')
sys.path.insert(0, models_path)


In [2]:
# import libraries
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Load the cleaned crash dataset from <data_path>/processed/clean_ARD.csv.
# os.path.join is used instead of concatenating '\\'-separated pieces so the
# path is valid on every platform (it is unchanged on Windows).
clean_ARD_data = pd.read_csv(os.path.join(data_path, 'processed', 'clean_ARD.csv'))

# read_csv already returns a DataFrame; ARD_df is the name the later cells use.
ARD_df = pd.DataFrame(clean_ARD_data)
ARD_df

Unnamed: 0,Crash ID,State,YYYYMM,Year,Month,Day of week,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,...,Speed,Driving experience,National Remoteness Areas,SA4 Name 2016,National LGA Name 2017,National Road Type,Christmas Period,Easter Period,Age Group,Time of day
0,20213034,Qld,2021-09,2021,9,Saturday,4:00:00,Multiple,No,No,...,41,3,Major Cities of Australia,Brisbane - South,Brisbane (C),Busway,No,No,17_to_25,Night
1,20213026,Qld,2021-09,2021,9,Wednesday,23:00:00,Multiple,No,No,...,20,3,Major Cities of Australia,Ipswich,Ipswich (C),National or State Highway,No,No,0_to_16,Night
2,20213092,Qld,2021-09,2021,9,Saturday,2:00:00,Single,No,No,...,53,12,Major Cities of Australia,Logan - Beaudesert,Logan (C),Local Road,No,No,40_to_64,Night
3,20214053,SA,2021-09,2021,9,Thursday,21:00:00,Single,No,No,...,140,7,Inner Regional Australia,Adelaide - Central and Hills,Adelaide Hills (DC),Sub-Arterial Road,No,No,17_to_25,Night
4,20213178,Qld,2021-09,2021,9,Sunday,21:00:00,Multiple,No,No,...,71,29,Major Cities of Australia,Gold Coast,Gold Coast (C),Local Road,No,No,40_to_64,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6817,20144083,SA,2014-01,2014,1,Friday,11:10:00,Multiple,No,Yes,...,20,14,Outer Regional Australia,South Australia - South East,The Coorong (DC),National or State Highway,No,No,40_to_64,Day
6818,20145108,WA,2014-01,2014,1,Wednesday,11:47:00,Single,No,No,...,142,15,Major Cities of Australia,Perth - South East,Belmont (C),National or State Highway,Yes,No,40_to_64,Day
6819,20144022,SA,2014-01,2014,1,Monday,9:35:00,Single,No,No,...,0,45,Major Cities of Australia,Adelaide - North,Tea Tree Gully (C),Local Road,No,No,75_or_older,Day
6820,20145072,WA,2014-01,2014,1,Tuesday,21:30:00,Single,No,No,...,74,43,Remote Australia,Western Australia - Outback (South),Esperance (S),National or State Highway,No,No,75_or_older,Night


In [4]:
# Number of crashes for every (Year, Month, Road User) combination,
# ordered from most to fewest crashes (value_counts' default ordering).
# The car-driver subset is extracted from this table in a later cell.

# The count column is named directly via reset_index(name=...). Renaming a
# column labelled 0 only works on pandas < 2.0: from 2.0 on the value_counts
# result is named "count", so that rename silently did nothing and the column
# kept the wrong header.
number_of_crashes_df = (
    ARD_df[["Year", "Month", "Road User"]]
    .value_counts()
    .reset_index(name="Number of crash")
)
number_of_crashes_df


Unnamed: 0,Year,Month,Road User,Number of crash
0,2019,1,Car driver,55
1,2019,12,Car driver,54
2,2018,3,Car driver,53
3,2017,8,Car driver,52
4,2017,12,Car driver,52
...,...,...,...,...
491,2016,4,Pedal cyclist,1
492,2020,3,Other vehicle driver,1
493,2016,5,Pedal cyclist,1
494,2020,3,Motorcycle pillion Car passenger,1


In [5]:
# Crash counts per year and month for car drivers only, in chronological order.
car_crashes = (
    number_of_crashes_df[number_of_crashes_df["Road User"] == "Car driver"]
    .sort_values(by=["Year", "Month"])
    .reset_index(drop=True)
)

# Write to <data_path>/final/car_crashes.csv. os.path.join replaces the
# hard-coded '\\' separators, which only formed a valid path on Windows.
car_crashes.to_csv(os.path.join(data_path, 'final', 'car_crashes.csv'), index=False)


In [6]:
# Average speed per National Road Type, restricted to car drivers.

# Keep only the columns needed and the car-driver rows; built as one
# expression so re-running the cell never mutates an earlier result in place.
speed_against_road_type = (
    ARD_df.loc[
        ARD_df["Road User"] == "Car driver",
        ["National Road Type", "Speed", "Road User"],
    ]
    .reset_index(drop=True)
)

road_type_group = speed_against_road_type.groupby(["National Road Type"])

# mean() is the idiomatic form of sum() / count(): both ignore missing speeds.
average_speed = road_type_group["Speed"].mean().to_frame(name="Car Driver Average Speed")

# The road type stays as the index and is written as the first CSV column.
# os.path.join replaces the Windows-only '\\' separators.
average_speed.to_csv(os.path.join(data_path, 'final', 'car_average_speed_against_roads.csv'))

In [8]:
# Relationship between the number of crashes and driving experience.
# Only road users who were operating a vehicle are considered.
list_of_drivers = ["Car driver", "Motorcycle rider", "Pedal cyclist", "Other vehicle driver"]

# Count crashes per "Driving experience" value, ordered by experience.
# The count column is named through reset_index(name=...): renaming a column
# labelled 0 only works on pandas < 2.0 (newer versions call it "count"), which
# left the CSV with the wrong header.
crashes_experience = (
    ARD_df.loc[ARD_df["Road User"].isin(list_of_drivers), ["Driving experience"]]
    .value_counts()
    .sort_index()
    .reset_index(name="Number of crashes")
)

# os.path.join replaces the Windows-only '\\' separators.
crashes_experience.to_csv(os.path.join(data_path, 'final', 'crashes_against_experiences.csv'), index=False)
