In [1]:
import pandas as pd

In [2]:
crashes = pd.read_csv("crashes.csv")

In [3]:
crashes.columns

Index(['X', 'Y', 'OBJECTID', 'ACCIDENT_NO', 'ABS_CODE', 'ACCIDENT_STATUS',
       'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME', 'ACCIDENT_TYPE',
       'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG', 'LIGHT_CONDITION',
       'POLICE_ATTEND', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'NODE_ID', 'LONGITUDE', 'LATITUDE', 'NODE_TYPE',
       'LGA_NAME', 'REGION_NAME', 'VICGRID_X', 'VICGRID_Y', 'TOTAL_PERSONS',
       'INJ_OR_FATAL', 'FATALITY', 'SERIOUSINJURY', 'OTHERINJURY',
       'NONINJURED', 'MALES', 'FEMALES', 'BICYCLIST', 'PASSENGER', 'DRIVER',
       'PEDESTRIAN', 'PILLION', 'MOTORIST', 'UNKNOWN', 'PED_CYCLIST_5_12',
       'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER', 'YOUNG_DRIVER',
       'ALCOHOL_RELATED', 'UNLICENCSED', 'NO_OF_VEHICLES', 'HEAVYVEHICLE',
       'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE', 'DEG_URBAN_NAME',
       'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL', 'SRNS', 'SRNS_ALL',
       'RMA', 'RMA_ALL', 'DIVIDED', 'DIVIDE

## Find which columns we want to remove

In [4]:
remove_columns = ["OBJECTID","VICGRID_X","VICGRID_Y","SRNS","SRNS_ALL","DIVIDED_ALL","DIVIDED","REGION_NAME","LGA_NAME","DEG_URBAN_NAME","RMA_ALL","RMA","NODE_TYPE","NODE_ID","POLICE_ATTEND","ABS_CODE","ACCIDENT_STATUS","UNKNOWN","X","Y"]

In [5]:
crashes = crashes.drop(remove_columns,axis= 1)

In [6]:
crashes.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME',
       'ACCIDENT_TYPE', 'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG',
       'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
       'FATALITY', 'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES',
       'FEMALES', 'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION',
       'MOTORIST', 'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN',
       'OLD_DRIVER', 'YOUNG_DRIVER', 'ALCOHOL_RELATED', 'UNLICENCSED',
       'NO_OF_VEHICLES', 'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE',
       'PUBLICVEHICLE', 'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL',
       'STAT_DIV_NAME'],
      dtype='object')

## Check for NA and blank values

In [7]:
crashes.isna().sum()[crashes.isna().sum()>0]

DAY_OF_WEEK         2899
NO_OF_VEHICLES         8
HEAVYVEHICLE           8
PASSENGERVEHICLE       8
MOTORCYCLE             8
PUBLICVEHICLE          8
REGION_NAME_ALL        1
STAT_DIV_NAME          2
dtype: int64

## Remove NA values with small counts
## Find the day of the week the accident occurred on from the date and replace the NA values

In [8]:
crashes[crashes==" "].sum()

ACCIDENT_NO            0
ACCIDENT_DATE          0
ACCIDENT_TIME          0
ALCOHOLTIME            0
ACCIDENT_TYPE          0
DAY_OF_WEEK            0
DCA_CODE               0
HIT_RUN_FLAG           0
LIGHT_CONDITION        0
ROAD_GEOMETRY          0
SEVERITY               0
SPEED_ZONE             0
RUN_OFFROAD            0
LONGITUDE            0.0
LATITUDE             0.0
TOTAL_PERSONS        0.0
INJ_OR_FATAL         0.0
FATALITY             0.0
SERIOUSINJURY        0.0
OTHERINJURY          0.0
NONINJURED           0.0
MALES                0.0
FEMALES              0.0
BICYCLIST            0.0
PASSENGER            0.0
DRIVER               0.0
PEDESTRIAN           0.0
PILLION              0.0
MOTORIST             0.0
PED_CYCLIST_5_12     0.0
PED_CYCLIST_13_18    0.0
OLD_PEDESTRIAN       0.0
OLD_DRIVER           0.0
YOUNG_DRIVER         0.0
ALCOHOL_RELATED        0
UNLICENCSED          0.0
NO_OF_VEHICLES       0.0
HEAVYVEHICLE         0.0
PASSENGERVEHICLE     0.0
MOTORCYCLE           0.0


## Check for duplicates

In [9]:
a = crashes["ACCIDENT_NO"].value_counts()
crashes["ACCIDENT_NO"].value_counts()

T20150013821    1
T20190010572    1
T20190008283    1
T20190008282    1
T20190008277    1
               ..
T20170006132    1
T20170006131    1
T20170006112    1
T20170006096    1
T20210020248    1
Name: ACCIDENT_NO, Length: 75320, dtype: int64

## No duplicates

In [10]:
## Accidents, Injury counts, location, driver info, vehicle type

In [11]:
fatal = pd.read_csv("TYPICAL_HOURLY_VOLUME_DATA.csv")

In [12]:
fatal = pd.read_csv("Fatal_Crashes_-_Lives_Lost%2C_Last_5_Years_to_Date.csv")
fatal.columns

Index(['X', 'Y', 'OBJECTID', 'ACCIDENT_N', 'ACCIDENT_D', 'ACC_TIME',
       'NO_VEHICLE', 'ACCIDENT_T', 'ACC_T_DESC', 'ABS_CODE', 'PERSON_ID',
       'AGE', 'RD_USER_T', 'RD_U_DESC', 'SEX', 'LOCATION_D', 'NODE_ID',
       'LOCATION_T', 'NODE_TYPE', 'N_TYPE_DES', 'ROAD_ROUTE', 'RD_NAME_1',
       'RD_TYPE_1', 'RD_NAME_2', 'RD_TYPE_2', 'DEC_RD_DES', 'LGA_NAME',
       'REGION_NAM', 'STAT_DIV_N', 'AMG_X', 'AMG_Y'],
      dtype='object')

In [13]:
remove_columns2 = ["X","Y","RD_NAME_1","OBJECTID","ACCIDENT_T"]

In [14]:
fatal["ACC_T_DESC"].value_counts()

Collision with vehicle               589
Collision with a fixed object        402
Struck Pedestrian                    173
Vehicle overturned (no collision)     37
Collision with some other object      25
Fall from or in moving vehicle        17
No collision and no object struck     13
Struck animal                          6
Other accident                         2
Name: ACC_T_DESC, dtype: int64

In [15]:
crashes["ACCIDENT_DATE"].value_counts()

2015/12/09 00:00:00+00    76
2016/04/29 00:00:00+00    75
2016/03/10 00:00:00+00    75
2015/12/04 00:00:00+00    73
2017/04/21 00:00:00+00    72
                          ..
2020/04/18 00:00:00+00    13
2020/04/13 00:00:00+00    12
2020/04/26 00:00:00+00    11
2020/04/14 00:00:00+00    11
2020/04/17 00:00:00+00    10
Name: ACCIDENT_DATE, Length: 1827, dtype: int64

In [16]:
# crashes[crashes["ACCIDENT_DATE" == "01/01/2019"]]

In [17]:
crashes.sort_values(by = "ACCIDENT_DATE",ascending=False)

Unnamed: 0,ACCIDENT_NO,ACCIDENT_DATE,ACCIDENT_TIME,ALCOHOLTIME,ACCIDENT_TYPE,DAY_OF_WEEK,DCA_CODE,HIT_RUN_FLAG,LIGHT_CONDITION,ROAD_GEOMETRY,...,UNLICENCSED,NO_OF_VEHICLES,HEAVYVEHICLE,PASSENGERVEHICLE,MOTORCYCLE,PUBLICVEHICLE,DEG_URBAN_ALL,LGA_NAME_ALL,REGION_NAME_ALL,STAT_DIV_NAME
62389,T20200012654,2020/06/30 00:00:00+00,1899/12/30 19:35:00+00,Yes,Struck Pedestrian,Tuesday,PED PLAYING/LYING/WORKING/STANDING ON CARRIAGE...,No,Dark No street lights,Not at intersection,...,0,1.0,0.0,1.0,0.0,0.0,RURAL_VICTORIA,GEELONG,SOUTH WESTERN REGION,Country
62437,T20200013030,2020/06/30 00:00:00+00,1899/12/30 14:30:00+00,No,Collision with vehicle,Monday,LEFT REAR,No,Day,Cross intersection,...,0,2.0,0.0,2.0,0.0,0.0,MELB_URBAN,"DANDENONG,MONASH",METROPOLITAN SOUTH EAST REGION,Metro
75272,T20200013131,2020/06/30 00:00:00+00,1899/12/30 07:30:00+00,No,collision with some other object,Tuesday,LEFT OFF CARRIAGEWAY INTO OBJECT/PARKED VEHICLE,No,Day,Not at intersection,...,0,2.0,0.0,2.0,0.0,0.0,MELB_URBAN,KNOX,METROPOLITAN SOUTH EAST REGION,Metro
75271,T20200012988,2020/06/30 00:00:00+00,1899/12/30 04:45:00+00,Yes,Collision with a fixed object,Monday,OFF LEFT BEND INTO OBJECT/PARKED VEHICLE,No,Dark No street lights,Not at intersection,...,0,1.0,0.0,1.0,0.0,0.0,RURAL_VICTORIA,COLAC OTWAY,SOUTH WESTERN REGION,Country
65611,T20200012922,2020/06/30 00:00:00+00,1899/12/30 17:15:00+00,No,Collision with vehicle,Tuesday,RIGHT NEAR (INTERSECTIONS ONLY),No,Dark Street lights on,T intersection,...,0,2.0,0.0,2.0,0.0,0.0,MELB_URBAN,MONASH,METROPOLITAN SOUTH EAST REGION,Metro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,T20150013885,2015/07/01 00:00:00+00,1899/12/30 06:40:00+00,No,Collision with vehicle,Wednesday,REAR END(VEHICLES IN SAME LANE),Yes,Dark No street lights,Not at intersection,...,0,3.0,1.0,1.0,0.0,0.0,RURAL_VICTORIA,ARARAT,WESTERN REGION,Country
33,T20150013865,2015/07/01 00:00:00+00,1899/12/30 18:35:00+00,Yes,Collision with vehicle,Tuesday,VEHICLE STRIKES ANOTHER VEH WHILE EMERGING FRO...,No,Dark Street lights on,T intersection,...,0,2.0,0.0,2.0,0.0,0.0,MELB_URBAN,WHITEHORSE,METROPOLITAN SOUTH EAST REGION,Metro
32,T20150013854,2015/07/01 00:00:00+00,1899/12/30 13:40:00+00,No,Collision with vehicle,Tuesday,CROSS TRAFFIC(INTERSECTIONS ONLY),No,Day,Cross intersection,...,0,2.0,0.0,2.0,0.0,0.0,MELB_URBAN,MARIBYRNONG,METROPOLITAN NORTH WEST REGION,Metro
31,T20150013844,2015/07/01 00:00:00+00,1899/12/30 11:30:00+00,No,Collision with a fixed object,Wednesday,LEFT OFF CARRIAGEWAY INTO OBJECT/PARKED VEHICLE,No,Day,Cross intersection,...,0,1.0,0.0,0.0,1.0,0.0,MELB_URBAN,WHITTLESEA,METROPOLITAN NORTH WEST REGION,Metro


In [29]:
crashes.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME',
       'ACCIDENT_TYPE', 'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG',
       'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'INJ_OR_FATAL',
       'FATALITY', 'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES',
       'FEMALES', 'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION',
       'MOTORIST', 'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN',
       'OLD_DRIVER', 'YOUNG_DRIVER', 'ALCOHOL_RELATED', 'UNLICENCSED',
       'NO_OF_VEHICLES', 'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE',
       'PUBLICVEHICLE', 'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL',
       'STAT_DIV_NAME'],
      dtype='object')

In [18]:
lga = pd.read_csv("lga.csv")

In [19]:
lga = lga.rename(columns= {"Value": "Population Size"})
lga_2018 = lga[lga["Time"]==2018]
lga_2018 = lga_2018.iloc[2:,]
lga_2018 = lga_2018.iloc[:82-3,]
lga_2 = lga_2018[["Region","Population Size"]]

In [27]:
lga_2[['LGA','letter','value']] = lga_2.Region.str.split(" ",expand=True,)
final_lga = lga_2.drop(['Region','letter','value'],axis=1)
final_lga = final_lga.reset_index()
final_lga= final_lga.drop("index",axis=1)
final_lga

Unnamed: 0,Population Size,LGA
0,12730,Alpine
1,11795,Ararat
2,107325,Ballarat
3,130237,Banyule
4,35327,Bass
...,...,...
74,41429,Wodonga
75,255322,Wyndham
76,98521,Yarra
77,158173,Yarra
