# Data Arrangement

In [1]:
%matplotlib inline

# Filter warnings
# NOTE(review): "ignore" with no category or module filter silences every
# warning for the rest of the session, so any pandas parsing or deprecation
# warnings raised by later cells are hidden too — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Use a base font size of 15 for every matplotlib figure in this notebook
plt.rcParams['font.size'] = 15

# Load data

In [2]:
# One collision extract per year: df1 is 2013, df2 is 2014, ... df7 is 2019
yearly_csv_paths = [
    '../data/2013_Tabular_Transportation_Collision_Data.csv',
    '../data/2014_Tabular_Transportation_Collision_Data.csv',
    '../data/2015_Tabular_Transportation_Collision_Data.csv',
    '../data/2016_Tabular_Transportation_Collision_Data.csv',
    '../data/2017_Tabular_Transportation_Collision_Data.csv',
    '../data/2018_Tabular_Transportation_Collision_Data.csv',
    '../data/2019_Tabular_Transportation_Collision_Data.csv',
]
# Read the files in order and bind each frame to its own name
df1, df2, df3, df4, df5, df6, df7 = map(pd.read_csv, yearly_csv_paths)

# Concatenate DataFrames 1 to 4

In [3]:
# Stack the 2013-2016 frames row-wise, keeping columns in first-seen order.
# Row labels are not reset here, so they repeat across the four source frames.
dfa = pd.concat([df1, df2, df3, df4], sort=False)
print("Data size", dfa.shape)
# Normalise the column names to lower case
dfa.columns = dfa.columns.str.lower()
dfa.head()  # Display

Data size (59107, 17)


Unnamed: 0,date,time,collision_id,location,x,y,longitude,latitude,environment,light,surface_condition,traffic_control,traffic_control_condition,collision_classification,impact_type,no_of_pedestrians,fid
0,2013/01/18 05:00:00+00,1899/12/31 23:13:00+00,13-704,ROBERTSON RD btwn MOODIE DR & VANIER RD,357144.875,5020503.5,-75.832298,45.322533,01 - Clear,07 - Dark,01 - Dry,10 - No control,,02 - Non-fatal injury,02 - Angle,0,1
1,2013/01/20 05:00:00+00,1899/12/31 18:45:00+00,13-806,FALLOWFIELD RD btwn EAGLESON RD & RICHMOND RD,356860.1875,5013034.5,-75.836708,45.255344,05 - Drifting Snow,01 - Daylight,06 - Ice,10 - No control,,03 - P.D. only,01 - Approaching,0,2
2,2013/01/03 05:00:00+00,1899/12/31 21:22:00+00,13-101,FIFTH AVE @ RUPERT ST,368589.71875,5029516.5,-75.685158,45.402679,03 - Snow,01 - Daylight,06 - Ice,02 - Stop sign,01 - Functioning,03 - P.D. only,02 - Angle,0,3
3,2013/01/12 05:00:00+00,1899/12/31 21:10:00+00,13-502,BIRCH AVE @ FARNHAM CRES,370292.90625,5035187.0,-75.662651,45.453541,01 - Clear,01 - Daylight,02 - Wet,02 - Stop sign,01 - Functioning,03 - P.D. only,02 - Angle,0,4
4,2013/01/11 05:00:00+00,1900/01/01 04:00:00+00,13-503,ALESTHER ST btwn QUEEN MARY ST & PRINCE ALBERT ST,372133.1875,5032130.5,-75.639542,45.425865,02 - Rain,07 - Dark,02 - Wet,10 - No control,,03 - P.D. only,06 - SMV unattended vehicle,0,5


# Dataframe 5

In [4]:
# Positional rename of the 2017 columns onto the names used by the 2013-2016 frame.
# NOTE(review): assumes the CSV columns arrive in exactly this order — confirm
# against the file header, since a reordered file would be mislabelled silently.
df5_column_names = [
    'collision_id', 'location', 'x', 'y', 'longitude',
    'latitude', 'year', 'date', 'time', 'environment',
    'surface_condition', 'traffic_control',
    'collision_location', 'light',
    'collision_classification', 'impact_type', 'fid',
]
df5.columns = df5_column_names
# Discard the two columns that have no counterpart in the 2013-2016 frame
df5 = df5.drop(columns=['collision_location', 'year'])

In [5]:
# Add the two columns the 2017 frame lacks so it lines up with the other years:
# traffic_control_condition is all-NaN, no_of_pedestrians is all 0.0.
# NOTE(review): the zeros are a placeholder, not observed counts — confirm that
# 0 (rather than NaN) is the intended value for a year without this field.
df5 = df5.assign(
    traffic_control_condition=np.full(len(df5), np.nan),
    no_of_pedestrians=np.zeros(len(df5)),
)

# Dataframe 6

In [6]:
# Positional rename of the 2018 columns onto the names used by the 2013-2016 frame.
# NOTE(review): assumes the CSV columns arrive in exactly this order — confirm
# against the file header, since a reordered file would be mislabelled silently.
df6_column_names = [
    'date', 'collision_id', 'year', 'time', 'location',
    'geo_id', 'accident_location', 'collision_classification',
    'impact_type', 'environment', 'light',
    'surface_condition', 'traffic_control', 'traffic_control_condition',
    'no_of_pedestrians', 'x', 'y', 'longitude',
    'latitude', 'fid',
]
df6.columns = df6_column_names
# Discard the columns that have no counterpart in the 2013-2016 frame
df6 = df6.drop(columns=['geo_id', 'year', 'accident_location'])

# Dataframe 7

In [7]:
# Positional rename of the 2019 columns onto the names used by the 2013-2016 frame.
# NOTE(review): assumes the CSV columns arrive in exactly this order — confirm
# against the file header, since a reordered file would be mislabelled silently.
df7_column_names = [
    'collision_id', 'date', 'time', 'location',
    'geo_id', 'accident_location', 'collision_classification',
    'impact_type', 'environment', 'light',
    'surface_condition', 'traffic_control', 'traffic_control_condition',
    'x', 'y', 'longitude', 'latitude', 'fid',
]
df7.columns = df7_column_names
# Discard the columns that have no counterpart in the 2013-2016 frame
df7 = df7.drop(columns=['geo_id', 'accident_location'])

In [8]:
# The 2019 frame has no pedestrian count after the rename; add one filled with 0.0.
# NOTE(review): the zeros are a placeholder, not observed counts — confirm that
# 0 (rather than NaN) is the intended value for a year without this field.
df7 = df7.assign(no_of_pedestrians=np.zeros(len(df7)))

# Concatenate All DataFrames

In [9]:
# Stack the 2013-2016 frame with the harmonised 2017, 2018 and 2019 frames.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it every
# source frame keeps its own labels, so labels repeat and label-based lookups
# such as df.loc[0] would return several rows.
df = pd.concat([dfa, df5, df6, df7], sort=False, ignore_index=True)

# NOTE(review): in the sample rows shown for the 2013-2016 frame, `date` always
# carries the time part 05:00:00+00 while `time` holds the varying clock time.
# Dropping `time` here may discard the only real time-of-day information — confirm.
df = df.drop(columns=['time', 'collision_id'])
print("Data size", df.shape)
df.head()

Data size (104384, 15)


Unnamed: 0,date,location,x,y,longitude,latitude,environment,light,surface_condition,traffic_control,traffic_control_condition,collision_classification,impact_type,no_of_pedestrians,fid
0,2013/01/18 05:00:00+00,ROBERTSON RD btwn MOODIE DR & VANIER RD,357144.875,5020503.5,-75.832298,45.322533,01 - Clear,07 - Dark,01 - Dry,10 - No control,,02 - Non-fatal injury,02 - Angle,0.0,1
1,2013/01/20 05:00:00+00,FALLOWFIELD RD btwn EAGLESON RD & RICHMOND RD,356860.1875,5013034.5,-75.836708,45.255344,05 - Drifting Snow,01 - Daylight,06 - Ice,10 - No control,,03 - P.D. only,01 - Approaching,0.0,2
2,2013/01/03 05:00:00+00,FIFTH AVE @ RUPERT ST,368589.71875,5029516.5,-75.685158,45.402679,03 - Snow,01 - Daylight,06 - Ice,02 - Stop sign,01 - Functioning,03 - P.D. only,02 - Angle,0.0,3
3,2013/01/12 05:00:00+00,BIRCH AVE @ FARNHAM CRES,370292.90625,5035187.0,-75.662651,45.453541,01 - Clear,01 - Daylight,02 - Wet,02 - Stop sign,01 - Functioning,03 - P.D. only,02 - Angle,0.0,4
4,2013/01/11 05:00:00+00,ALESTHER ST btwn QUEEN MARY ST & PRINCE ALBERT ST,372133.1875,5032130.5,-75.639542,45.425865,02 - Rain,07 - Dark,02 - Wet,10 - No control,,03 - P.D. only,06 - SMV unattended vehicle,0.0,5


# Features From Date Variable

In [11]:
# Parse the date strings once, then derive the calendar features from the result
collision_ts = pd.to_datetime(df['date'])
# NOTE(review): the sample `date` values displayed earlier in the notebook all
# have the time part 05:00:00+00, so collision_hour may be a near-constant
# rather than the real hour of the collision — confirm before using it.
df = df.assign(
    date=collision_ts,
    collision_year=collision_ts.dt.year,
    collision_month=collision_ts.dt.month_name(),  # full month name, e.g. "January"
    collision_day=collision_ts.dt.day_name(),  # weekday name, e.g. "Friday"
    collision_hour=collision_ts.dt.hour,
)

In [12]:
# Keep the first occurrence of each fully identical row and discard the repeats
df = df.drop_duplicates()

## Save data

In [13]:
# Write the cleaned frame to disk; the row index is left out of the file
clean_csv_path = '../data/clean_data.csv'
df.to_csv(clean_csv_path, index=False)