In [1]:
# import packages
import pandas as pd
import datetime as dt
from datetime import date

In [2]:
crashes = pd.read_csv("crashes.csv")

In [3]:
crashes.columns

Index(['X', 'Y', 'OBJECTID', 'ACCIDENT_NO', 'ABS_CODE', 'ACCIDENT_STATUS',
       'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME', 'ACCIDENT_TYPE',
       'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG', 'LIGHT_CONDITION',
       'POLICE_ATTEND', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'NODE_ID', 'LONGITUDE', 'LATITUDE', 'NODE_TYPE',
       'LGA_NAME', 'REGION_NAME', 'VICGRID_X', 'VICGRID_Y', 'TOTAL_PERSONS',
       'INJ_OR_FATAL', 'FATALITY', 'SERIOUSINJURY', 'OTHERINJURY',
       'NONINJURED', 'MALES', 'FEMALES', 'BICYCLIST', 'PASSENGER', 'DRIVER',
       'PEDESTRIAN', 'PILLION', 'MOTORIST', 'UNKNOWN', 'PED_CYCLIST_5_12',
       'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER', 'YOUNG_DRIVER',
       'ALCOHOL_RELATED', 'UNLICENCSED', 'NO_OF_VEHICLES', 'HEAVYVEHICLE',
       'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE', 'DEG_URBAN_NAME',
       'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL', 'SRNS', 'SRNS_ALL',
       'RMA', 'RMA_ALL', 'DIVIDED', 'DIVIDE

In [4]:
# add columns we want to remove to a list
remove_columns = ["OBJECTID","VICGRID_X","VICGRID_Y","SRNS","SRNS_ALL","DIVIDED_ALL","DIVIDED","REGION_NAME","LGA_NAME","DEG_URBAN_NAME","RMA_ALL","RMA","NODE_TYPE","NODE_ID","POLICE_ATTEND","ABS_CODE","ACCIDENT_STATUS","UNKNOWN","INJ_OR_FATAL","X","Y"]

In [5]:
# remove unwanted columns
crashes = crashes.drop(remove_columns,axis= 1)

In [6]:
# check if columns have been removed
crashes.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME',
       'ACCIDENT_TYPE', 'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG',
       'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'FATALITY',
       'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES', 'FEMALES',
       'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION', 'MOTORIST',
       'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER',
       'YOUNG_DRIVER', 'ALCOHOL_RELATED', 'UNLICENCSED', 'NO_OF_VEHICLES',
       'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE',
       'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL', 'STAT_DIV_NAME'],
      dtype='object')

In [7]:
# check for NA values
crashes.isna().sum()[crashes.isna().sum()>0]

DAY_OF_WEEK         2899
NO_OF_VEHICLES         8
HEAVYVEHICLE           8
PASSENGERVEHICLE       8
MOTORCYCLE             8
PUBLICVEHICLE          8
REGION_NAME_ALL        1
STAT_DIV_NAME          2
dtype: int64

In [8]:
# check for blank data
crashes[crashes==" "].sum()

ACCIDENT_NO            0
ACCIDENT_DATE          0
ACCIDENT_TIME          0
ALCOHOLTIME            0
ACCIDENT_TYPE          0
DAY_OF_WEEK            0
DCA_CODE               0
HIT_RUN_FLAG           0
LIGHT_CONDITION        0
ROAD_GEOMETRY          0
SEVERITY               0
SPEED_ZONE             0
RUN_OFFROAD            0
LONGITUDE            0.0
LATITUDE             0.0
TOTAL_PERSONS        0.0
FATALITY             0.0
SERIOUSINJURY        0.0
OTHERINJURY          0.0
NONINJURED           0.0
MALES                0.0
FEMALES              0.0
BICYCLIST            0.0
PASSENGER            0.0
DRIVER               0.0
PEDESTRIAN           0.0
PILLION              0.0
MOTORIST             0.0
PED_CYCLIST_5_12     0.0
PED_CYCLIST_13_18    0.0
OLD_PEDESTRIAN       0.0
OLD_DRIVER           0.0
YOUNG_DRIVER         0.0
ALCOHOL_RELATED        0
UNLICENCSED          0.0
NO_OF_VEHICLES       0.0
HEAVYVEHICLE         0.0
PASSENGERVEHICLE     0.0
MOTORCYCLE           0.0
PUBLICVEHICLE        0.0


In [9]:
# check for duplicates
crashes["ACCIDENT_NO"].value_counts()

T20150013821    1
T20190010572    1
T20190008283    1
T20190008282    1
T20190008277    1
               ..
T20170006132    1
T20170006131    1
T20170006112    1
T20170006096    1
T20210020248    1
Name: ACCIDENT_NO, Length: 75320, dtype: int64

In [10]:
crashes["ACCIDENT_DATE"].value_counts()

2015/12/09 00:00:00+00    76
2016/04/29 00:00:00+00    75
2016/03/10 00:00:00+00    75
2015/12/04 00:00:00+00    73
2017/04/21 00:00:00+00    72
                          ..
2020/04/18 00:00:00+00    13
2020/04/13 00:00:00+00    12
2020/04/26 00:00:00+00    11
2020/04/14 00:00:00+00    11
2020/04/17 00:00:00+00    10
Name: ACCIDENT_DATE, Length: 1827, dtype: int64

In [11]:
crashes.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME',
       'ACCIDENT_TYPE', 'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG',
       'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'FATALITY',
       'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES', 'FEMALES',
       'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION', 'MOTORIST',
       'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER',
       'YOUNG_DRIVER', 'ALCOHOL_RELATED', 'UNLICENCSED', 'NO_OF_VEHICLES',
       'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE',
       'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL', 'STAT_DIV_NAME'],
      dtype='object')

In [12]:
#drop all NA's
crashes = crashes.dropna(axis = 0)

In [13]:
crashes.isna().sum()

ACCIDENT_NO          0
ACCIDENT_DATE        0
ACCIDENT_TIME        0
ALCOHOLTIME          0
ACCIDENT_TYPE        0
DAY_OF_WEEK          0
DCA_CODE             0
HIT_RUN_FLAG         0
LIGHT_CONDITION      0
ROAD_GEOMETRY        0
SEVERITY             0
SPEED_ZONE           0
RUN_OFFROAD          0
LONGITUDE            0
LATITUDE             0
TOTAL_PERSONS        0
FATALITY             0
SERIOUSINJURY        0
OTHERINJURY          0
NONINJURED           0
MALES                0
FEMALES              0
BICYCLIST            0
PASSENGER            0
DRIVER               0
PEDESTRIAN           0
PILLION              0
MOTORIST             0
PED_CYCLIST_5_12     0
PED_CYCLIST_13_18    0
OLD_PEDESTRIAN       0
OLD_DRIVER           0
YOUNG_DRIVER         0
ALCOHOL_RELATED      0
UNLICENCSED          0
NO_OF_VEHICLES       0
HEAVYVEHICLE         0
PASSENGERVEHICLE     0
MOTORCYCLE           0
PUBLICVEHICLE        0
DEG_URBAN_ALL        0
LGA_NAME_ALL         0
REGION_NAME_ALL      0
STAT_DIV_NA

In [14]:
# load data
lga = pd.read_csv("lga.csv")

In [15]:
# find most recent population, and create dataframe with only the columns we need
lga = lga.rename(columns= {"Value": "Population Size"})
lga_2018 = lga[lga["Time"]==2018]
lga_2018 = lga_2018.iloc[2:,]
lga_2018 = lga_2018.iloc[:82-3,]
lga_2 = lga_2018[["Region","Population Size"]]

In [16]:
lga_2[['LGA','letter','value']] = lga_2.Region.str.split(" ",expand=True,)
final_lga = lga_2.drop(['Region','letter','value'],axis=1)
final_lga = final_lga.reset_index()
final_lga= final_lga.drop("index",axis=1)
final_lga

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Population Size,LGA
0,12730,Alpine
1,11795,Ararat
2,107325,Ballarat
3,130237,Banyule
4,35327,Bass
...,...,...
74,41429,Wodonga
75,255322,Wyndham
76,98521,Yarra
77,158173,Yarra


In [17]:
# accident table
# create accident type id column
# crashes["ACCIDENT_TYPE_ID"] = (crashes["ACCIDENT_TYPE"].astype("category").cat.codes+1)

In [18]:
# create accident light condition id column
crashes["LIGHT_CONDITION_ID"] = (crashes["LIGHT_CONDITION"].astype("category").cat.codes+1)

In [19]:
# create SEVERITY_ID column
crashes["SEVERITY_ID"] = (crashes["SEVERITY"].astype("category").cat.codes+1)

In [20]:
# create ROAD_ID column
crashes["ROAD_ID"] = (crashes["ROAD_GEOMETRY"].astype("category").cat.codes+1)

In [21]:
# create ROAD_ID column
crashes["LGA_ID"] = (crashes["LGA_NAME_ALL"].astype("category").cat.codes+1)

In [22]:
#Road Table
road = pd.DataFrame(crashes["ROAD_GEOMETRY"].unique())
road = road.rename(columns = {0:"ROAD_GEOMETRY"})
road["ROAD_ID"] = road["ROAD_GEOMETRY"].astype("category").cat.codes+1

In [23]:
# ROAD SPEED Table
road_speed = pd.DataFrame(crashes["SPEED_ZONE"].unique())
road_speed = road_speed.rename(columns = {0:"ROAD_SPEED"})
road_speed["ROAD_ID"] = crashes["ROAD_ID"]

In [24]:
# Accident light condition table
accident_light_cond = pd.DataFrame(crashes["LIGHT_CONDITION"].unique())
accident_light_cond = accident_light_cond.rename(columns = {0:"LIGHT_CONDITION"})
accident_light_cond["LIGHT_CONDITION_ID"] = accident_light_cond["LIGHT_CONDITION"].astype("category").cat.codes+1

In [25]:
# LGA table
LGA = pd.DataFrame(final_lga["LGA"].unique())
LGA = LGA.rename(columns = {0:"LGA"})
LGA["LGA_ID"] = LGA["LGA"].astype("category").cat.codes+1


In [26]:
# Population table
population = pd.DataFrame(final_lga)
population["LGA_ID"] = population["LGA"].astype("category").cat.codes+1
population

Unnamed: 0,Population Size,LGA,LGA_ID
0,12730,Alpine,1
1,11795,Ararat,2
2,107325,Ballarat,3
3,130237,Banyule,4
4,35327,Bass,5
...,...,...,...
74,41429,Wodonga,72
75,255322,Wyndham,73
76,98521,Yarra,74
77,158173,Yarra,74


In [27]:
# Region table
region = pd.DataFrame(crashes["REGION_NAME_ALL"].unique())
region = region.rename(columns = {0:"REGION_NAME"})
region["REGION_NAME_ID"] = region["REGION_NAME"].astype("category").cat.codes+1
region["DEGREE_URBAN_ID"] = crashes["DEG_URBAN_ALL"].astype("category").cat.codes+1

In [28]:
# Degree_urban
urban = pd.DataFrame(crashes["DEG_URBAN_ALL"].unique())
urban = urban.rename(columns = {0:"DEG_URBAN_DESCRIPTION"})
urban["DEG_URBAN_ID"] = urban["DEG_URBAN_DESCRIPTION"].astype("category").cat.codes+1
urban["STATE_DIV_ID"] = crashes["STAT_DIV_NAME"].astype("category").cat.codes+1

In [29]:
# state div
state_div = pd.DataFrame(crashes["STAT_DIV_NAME"].unique())
state_div = state_div.rename(columns = {0:"STAT_DIV_NAME"})
state_div["STATE_DIV_ID"] = state_div["STAT_DIV_NAME"].astype("category").cat.codes+1

In [44]:
# accident type
accident_type = pd.DataFrame(crashes["ACCIDENT_TYPE"].unique())
accident_type = accident_type.rename(columns = {0:"ACCIDENT_TYPE"})
accident_type["ACCIDENT_TYPE_ID"] = accident_type["ACCIDENT_TYPE"].astype("category").cat.codes+1

In [None]:
# accident severity
wanted_columns = ["FATALITY","SERIOUSINJURY",""]
people_involved =crashes[]

In [32]:
crashes.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'ALCOHOLTIME',
       'ACCIDENT_TYPE', 'DAY_OF_WEEK', 'DCA_CODE', 'HIT_RUN_FLAG',
       'LIGHT_CONDITION', 'ROAD_GEOMETRY', 'SEVERITY', 'SPEED_ZONE',
       'RUN_OFFROAD', 'LONGITUDE', 'LATITUDE', 'TOTAL_PERSONS', 'FATALITY',
       'SERIOUSINJURY', 'OTHERINJURY', 'NONINJURED', 'MALES', 'FEMALES',
       'BICYCLIST', 'PASSENGER', 'DRIVER', 'PEDESTRIAN', 'PILLION', 'MOTORIST',
       'PED_CYCLIST_5_12', 'PED_CYCLIST_13_18', 'OLD_PEDESTRIAN', 'OLD_DRIVER',
       'YOUNG_DRIVER', 'ALCOHOL_RELATED', 'UNLICENCSED', 'NO_OF_VEHICLES',
       'HEAVYVEHICLE', 'PASSENGERVEHICLE', 'MOTORCYCLE', 'PUBLICVEHICLE',
       'DEG_URBAN_ALL', 'LGA_NAME_ALL', 'REGION_NAME_ALL', 'STAT_DIV_NAME',
       'LIGHT_CONDITION_ID', 'SEVERITY_ID', 'ROAD_ID', 'LGA_ID'],
      dtype='object')

In [33]:
final_lga.columns

Index(['Population Size', 'LGA', 'LGA_ID'], dtype='object')