In [2]:
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
from sqlalchemy import create_engine

# Load CSV files into dataframes

In [3]:
# Read the raw ACCIDENT extract into its own data frame and preview the first rows.
# low_memory=False tells pandas not to parse the file in chunks.
accident_csv_path = "resources/ACCIDENT.csv"
accident_df = pd.read_csv(accident_csv_path, low_memory=False)
accident_df.head()

Unnamed: 0,ACCIDENT_NO,ACCIDENTDATE,ACCIDENTTIME,ACCIDENT_TYPE,Accident Type Desc,DAY_OF_WEEK,Day Week Description,DCA_CODE,DCA Description,DIRECTORY,...,NO_PERSONS,NO_PERSONS_INJ_2,NO_PERSONS_INJ_3,NO_PERSONS_KILLED,NO_PERSONS_NOT_INJ,POLICE_ATTEND,ROAD_GEOMETRY,Road Geometry Desc,SEVERITY,SPEED_ZONE
0,T20060000010,13/01/2006,12:42:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,6,0,1,0,5,1,1,Cross intersection,3,60
1,T20060000018,13/01/2006,19:10:00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,...,4,0,1,0,3,1,2,T intersection,3,70
2,T20060000022,14/01/2006,12:10:00,7,Fall from or in moving vehicle,7,Saturday,190,FELL IN/FROM VEHICLE,MEL,...,2,1,0,0,1,1,5,Not at intersection,2,100
3,T20060000023,14/01/2006,11:49:00,1,Collision with vehicle,7,Saturday,130,REAR END(VEHICLES IN SAME LANE),MEL,...,2,1,0,0,1,1,2,T intersection,2,80
4,T20060000026,14/01/2006,10:45:00,1,Collision with vehicle,7,Saturday,121,RIGHT THROUGH,MEL,...,3,0,3,0,0,1,5,Not at intersection,3,50


In [4]:
# Load the ACCIDENT_EVENT CSV into its own data frame and preview the first rows.
# low_memory=False (as already used for ACCIDENT.csv) makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk, so code columns
# that mix letters and digits (e.g. EVENT_TYPE holds both "C" and "2") end up
# with one consistent type instead of a per-chunk mixture.
accident_event_df = pd.read_csv("resources/ACCIDENT_EVENT.csv", low_memory=False)
accident_event_df.head()

Unnamed: 0,ACCIDENT_NO,EVENT_SEQ_NO,EVENT_TYPE,Event Type Desc,VEHICLE_1_ID,VEHICLE_1_COLL_PT,Vehicle 1 Coll Pt Desc,VEHICLE_2_ID,VEHICLE_2_COLL_PT,Vehicle 2 Coll Pt Desc,PERSON_ID,OBJECT_TYPE,Object Type Desc
0,T20060000010,1.0,C,Collision,B,2,Right side (forwards),A,F,Front,,99.0,Not Applicable
1,T20060000018,1.0,C,Collision,B,F,Front,A,9,Not known or Not Applicable,,99.0,Not Applicable
2,T20060000022,1.0,2,Fell from vehicle,A,9,Not known or Not Applicable,,,,1.0,99.0,Not Applicable
3,T20060000023,1.0,C,Collision,A,F,Front,B,R,Rear,,99.0,Not Applicable
4,T20060000026,1.0,C,Collision,A,9,Not known or Not Applicable,B,9,Not known or Not Applicable,,99.0,Not Applicable


In [5]:
# Read the raw ACCIDENT_LOCATION extract into its own data frame and preview it
accident_location_csv_path = "resources/ACCIDENT_LOCATION.csv"
accident_location_df = pd.read_csv(accident_location_csv_path)
accident_location_df.head()

Unnamed: 0,ACCIDENT_NO,NODE_ID,ROAD_ROUTE_1,ROAD_NAME,ROAD_TYPE,ROAD_NAME_INT,ROAD_TYPE_INT,DISTANCE_LOCATION,DIRECTION_LOCATION,NEAREST_KM_POST,OFF_ROAD_LOCATION
0,T20060000010,43078,2090.0,FOSTER,STREET,MCCRAE,STREET,0.0,SW,,
1,T20060000018,29720,5057.0,HALLAM,ROAD,BELGRAVE-HALLAM,ROAD,70.0,S,,
2,T20060000022,203074,9999.0,BROWNS,ROAD,TRUEMANS,ROAD,210.0,W,,
3,T20060000023,55462,2400.0,SPRINGVALE,ROAD,KEYSBOROUGH,AVENUE,0.0,N,,
4,T20060000026,202988,9999.0,ELIZABETH,AVENUE,GREENHOOD,CRESCENT,20.0,N,,


# Filter and rename columns

In [6]:
# Keep only the ACCIDENT columns needed downstream, then rename the upper-case
# ones to the spaced, mixed-case style already used by the "... Desc" columns.
accident_keep_cols = [
    "ACCIDENT_NO",
    "ACCIDENTDATE",
    "ACCIDENTTIME",
    "Accident Type Desc",
    "Day Week Description",
    "DCA_CODE",
    "DCA Description",
    "NODE_ID",
    "Road Geometry Desc",
    "SPEED_ZONE",
]
accident_renames = {
    "ACCIDENT_NO": "Accident No",
    "ACCIDENTDATE": "Accident Date",
    "ACCIDENTTIME": "Accident Time",
    "Day Week Description": "Day Week Desc",
    "DCA_CODE": "DCA Code",
    "DCA Description": "DCA Desc",
    "NODE_ID": "Node ID",
    "SPEED_ZONE": "Speed Zone",
}

accident_filtered_df = accident_df[accident_keep_cols].rename(columns=accident_renames)

accident_filtered_df.head()

Unnamed: 0,Accident No,Accident Date,Accident Time,Accident Type Desc,Day Week Desc,DCA Code,DCA Desc,Node ID,Road Geometry Desc,Speed Zone
0,T20060000010,13/01/2006,12:42:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),43078,Cross intersection,60
1,T20060000018,13/01/2006,19:10:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),29720,T intersection,70
2,T20060000022,14/01/2006,12:10:00,Fall from or in moving vehicle,Saturday,190,FELL IN/FROM VEHICLE,203074,Not at intersection,100
3,T20060000023,14/01/2006,11:49:00,Collision with vehicle,Saturday,130,REAR END(VEHICLES IN SAME LANE),55462,T intersection,80
4,T20060000026,14/01/2006,10:45:00,Collision with vehicle,Saturday,121,RIGHT THROUGH,202988,Not at intersection,50


In [7]:
# Keep only the ACCIDENT_EVENT columns needed downstream and rename the
# upper-case ones to match the naming used in the filtered ACCIDENT frame.
event_keep_cols = [
    "ACCIDENT_NO",
    "EVENT_TYPE",
    "Event Type Desc",
    "Vehicle 1 Coll Pt Desc",
    "Vehicle 2 Coll Pt Desc",
]
event_renames = {
    "ACCIDENT_NO": "Accident No",
    "EVENT_TYPE": "Event Type",
}

accident_event_filtered_df = accident_event_df[event_keep_cols].rename(columns=event_renames)

accident_event_filtered_df.head()

Unnamed: 0,Accident No,Event Type,Event Type Desc,Vehicle 1 Coll Pt Desc,Vehicle 2 Coll Pt Desc
0,T20060000010,C,Collision,Right side (forwards),Front
1,T20060000018,C,Collision,Front,Not known or Not Applicable
2,T20060000022,2,Fell from vehicle,Not known or Not Applicable,
3,T20060000023,C,Collision,Front,Rear
4,T20060000026,C,Collision,Not known or Not Applicable,Not known or Not Applicable


In [8]:
# Keep the accident number plus the road / intersecting-road name and type
# columns, renamed to the same spaced, mixed-case style as the other frames.
location_keep_cols = [
    "ACCIDENT_NO",
    "ROAD_NAME",
    "ROAD_TYPE",
    "ROAD_NAME_INT",
    "ROAD_TYPE_INT",
]
location_renames = {
    "ACCIDENT_NO": "Accident No",
    "ROAD_NAME": "Road Name",
    "ROAD_TYPE": "Road Type",
    "ROAD_NAME_INT": "Road Name Int",
    "ROAD_TYPE_INT": "Road Type Int",
}

accident_location_filtered_df = accident_location_df[location_keep_cols].rename(columns=location_renames)

accident_location_filtered_df.head()

Unnamed: 0,Accident No,Road Name,Road Type,Road Name Int,Road Type Int
0,T20060000010,FOSTER,STREET,MCCRAE,STREET
1,T20060000018,HALLAM,ROAD,BELGRAVE-HALLAM,ROAD
2,T20060000022,BROWNS,ROAD,TRUEMANS,ROAD
3,T20060000023,SPRINGVALE,ROAD,KEYSBOROUGH,AVENUE
4,T20060000026,ELIZABETH,AVENUE,GREENHOOD,CRESCENT


# Combine dataframes

In [9]:
# Outer-join the filtered ACCIDENT and ACCIDENT_EVENT frames on the shared
# accident number, so rows present in only one of the two frames are kept.
combined_accident_event_df = accident_filtered_df.merge(
    accident_event_filtered_df,
    on="Accident No",
    how="outer",
)
combined_accident_event_df.head()

Unnamed: 0,Accident No,Accident Date,Accident Time,Accident Type Desc,Day Week Desc,DCA Code,DCA Desc,Node ID,Road Geometry Desc,Speed Zone,Event Type,Event Type Desc,Vehicle 1 Coll Pt Desc,Vehicle 2 Coll Pt Desc
0,T20060000010,13/01/2006,12:42:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),43078,Cross intersection,60,C,Collision,Right side (forwards),Front
1,T20060000018,13/01/2006,19:10:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),29720,T intersection,70,C,Collision,Front,Not known or Not Applicable
2,T20060000022,14/01/2006,12:10:00,Fall from or in moving vehicle,Saturday,190,FELL IN/FROM VEHICLE,203074,Not at intersection,100,2,Fell from vehicle,Not known or Not Applicable,
3,T20060000023,14/01/2006,11:49:00,Collision with vehicle,Saturday,130,REAR END(VEHICLES IN SAME LANE),55462,T intersection,80,C,Collision,Front,Rear
4,T20060000026,14/01/2006,10:45:00,Collision with vehicle,Saturday,121,RIGHT THROUGH,202988,Not at intersection,50,C,Collision,Not known or Not Applicable,Not known or Not Applicable


In [10]:
# Outer-join the accident + event frame with the filtered ACCIDENT_LOCATION
# frame, again keyed on the accident number.
combined_accident_df = combined_accident_event_df.merge(
    accident_location_filtered_df,
    on="Accident No",
    how="outer",
)
combined_accident_df.head()

Unnamed: 0,Accident No,Accident Date,Accident Time,Accident Type Desc,Day Week Desc,DCA Code,DCA Desc,Node ID,Road Geometry Desc,Speed Zone,Event Type,Event Type Desc,Vehicle 1 Coll Pt Desc,Vehicle 2 Coll Pt Desc,Road Name,Road Type,Road Name Int,Road Type Int
0,T20060000010,13/01/2006,12:42:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),43078,Cross intersection,60,C,Collision,Right side (forwards),Front,FOSTER,STREET,MCCRAE,STREET
1,T20060000018,13/01/2006,19:10:00,Collision with vehicle,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),29720,T intersection,70,C,Collision,Front,Not known or Not Applicable,HALLAM,ROAD,BELGRAVE-HALLAM,ROAD
2,T20060000022,14/01/2006,12:10:00,Fall from or in moving vehicle,Saturday,190,FELL IN/FROM VEHICLE,203074,Not at intersection,100,2,Fell from vehicle,Not known or Not Applicable,,BROWNS,ROAD,TRUEMANS,ROAD
3,T20060000023,14/01/2006,11:49:00,Collision with vehicle,Saturday,130,REAR END(VEHICLES IN SAME LANE),55462,T intersection,80,C,Collision,Front,Rear,SPRINGVALE,ROAD,KEYSBOROUGH,AVENUE
4,T20060000026,14/01/2006,10:45:00,Collision with vehicle,Saturday,121,RIGHT THROUGH,202988,Not at intersection,50,C,Collision,Not known or Not Applicable,Not known or Not Applicable,ELIZABETH,AVENUE,GREENHOOD,CRESCENT


# Convert merged dataframe into CSV file output

In [None]:
# Write the combined accident / event / location frame to a CSV file.
# The frame's index is written too, because to_csv is called with its defaults.
output_csv_path = "Victorian_Accident_Data_2006-2020.csv"
combined_accident_df.to_csv(output_csv_path)

# Load to SQL

In [28]:
# Postgres connection setup
# Build the database URL from the standard libpq environment variables instead
# of hard-coding credentials in the notebook. The non-secret parts default to
# the values this notebook used before; the password has no default on purpose.
pg_user = os.environ.get("PGUSER", "postgres")
pg_password = os.environ.get("PGPASSWORD")
pg_host = os.environ.get("PGHOST", "localhost")
pg_port = os.environ.get("PGPORT", "5432")
pg_database = os.environ.get("PGDATABASE", "accidents_db")

if not pg_password:
    # Fail here with a clear message rather than later with an authentication error
    raise RuntimeError("Set the PGPASSWORD environment variable before creating the Postgres engine.")

# URL-encode user and password so characters such as '@' or ':' cannot break the URL
connection_string = f"{quote_plus(pg_user)}:{quote_plus(pg_password)}@{pg_host}:{pg_port}/{pg_database}"
engine = create_engine(f'postgresql://{connection_string}')

In [29]:
# Append the filtered ACCIDENT frame, including its index, to the 'premise' table.
# NOTE(review): the table name 'premise' does not obviously relate to accident
# data, and if_exists='append' adds the rows again on every re-run — confirm both.
accident_filtered_df.to_sql(
    name="premise",
    con=engine,
    if_exists="append",
    index=True,
)

OperationalError: (psycopg2.OperationalError) FATAL:  password authentication failed for user "postgres"

(Background on this error at: http://sqlalche.me/e/13/e3q8)

In [14]:
# Postgres: confirm that the expected tables exist

# engine.table_names()

In [None]:
# Load DataFrame into database
# accident_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

In [None]:
# Mongo setup
# The default port used by MongoDB is 27017
#conn = 'mongodb://localhost:27017'
#client = pymongo.MongoClient(conn)

# Define the database in Mongo
#db = client.accident_transformed