
# Construction of Time Series Data

The data for the motor vehicle collisions on public roads in Canada can be found in [open.canada.ca](https://open.canada.ca/data/en/dataset/1eb9eba7-71d1-4b30-9fb1-30cbdab7e63a).   We have extracted a univariate time series data for total monthly fatality from 1999 to 2017.

In [1]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Load raw data

In [20]:
base_path = ''
df = pd.read_csv(base_path + '/canada-collision/data/clean_data.csv')

In [21]:
print("Data size:", df.shape)
df.columns = [x.lower() for x in df.columns]
df.head()

Data size: (6771768, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Other trucks and vans,1990.0,1.0,M,41.0,Driver,No Injury,,Motor Vehicle Driver,752
1,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,1.0,M,19.0,Driver,No Injury,,Motor Vehicle Driver,752
2,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,2.0,F,20.0,"Front row, right outboard",Injury,Safety device used,Motor Vehicle Passenger,752
3,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,Light Duty Vehicle,1986.0,1.0,M,46.0,Driver,No Injury,,Motor Vehicle Driver,753
4,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,,,1.0,M,5.0,Pedestrian,Injury,,Pedestrian,753


# Create a time series data
We are interested in univariate time series of the monthly fatality on public roads in Canada. The first step in time series analysis is to put the data in a time series format. Below we use pandas pivot_table function to calculate the sum of fatality indexed by the year of collision with the columns as the month of collision. We then stack the result and form a time series data set

In [23]:
# Change month to integer
month_dict = {'January':1, 'February':2, 'March':3, 'April':4,  'May':5, 'June':6,  'July':7,  
              'August':8, 'September':9,'October':10, 'November':11, 'December':12
}

wday_dict = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5,  'Saturday':5,   'Sunday':6
}

df['c_mnth'].replace(month_dict, inplace = True)
df['c_wday'].replace(wday_dict, inplace = True)

In [24]:
# Create date attribute
df['date'] = df['c_year'].astype(str) + '-' + df['c_mnth'].astype(str)
df['date']  = pd.to_datetime(df.date)

# Create features

In [25]:
# Head-on collision indicator
df['conf_ind'] = np.where(df.c_conf == 'Head-on collision', 'head-on', 'not-head-on')

In [26]:
df.insert(1, 'year', df['c_year'])  # Insert year
df.insert(2, 'month', df['c_mnth'])  # Insert month

# Insert summer indicator
df.insert(3, 'summer_ind', np.where(df.c_mnth == 6, 1,
                        np.where(df.c_mnth ==7, 1,
                        np.where(df.c_mnth ==8, 1,0)
                        )))

In [None]:
# Medical treatment of injury type
df_inj = df[df['class'] == 1][['date', 'p_isev', 'conf_ind']]

inj_ty = df_inj.drop('conf_ind', axis =1).copy()
inj_ty = pd.get_dummies(inj_ty, columns= ['p_isev'])
inj_ty.columns = ['date'] + list(df_inj.p_isev.unique())

inj_ty = inj_ty.groupby('date').sum()

inj_ty.rename(columns= {'Fatality': 'med_fatal', 'Injury':'med_injury', 
                        'No Injury': 'med_no_injury'}, inplace=True)

inj_ty.insert(0, 'date', inj_ty.index)
inj_ty.reset_index(drop=True, inplace=True)

inj_ty.head()

In [None]:
# Type of collision
df_conf = df_inj.drop('p_isev', axis =1).copy()

df_conf = pd.get_dummies(df_conf, columns= ['conf_ind'])
df_conf.columns = ['date'] + list(df_inj.conf_ind.unique()) 

df_conf = df_conf.groupby('date').sum()

df_conf.insert(0, 'date', df_conf.index)
df_conf.reset_index(drop=True, inplace=True)

df_conf.head()

In [None]:
# Fatality rate
pvt = pd.pivot_table(df, values= 'class', index=['date'], aggfunc = 'mean')
pvt.insert(0, 'date', pvt.index)
pvt.reset_index(drop=True, inplace=True)
pvt.rename(columns={'class':'fatality_rate'}, inplace=True)
pvt.head()

# Consolidated Data

In [16]:
df_sel = df.copy()[['date', 'year', 'month',, 'class']]

In [17]:
df_ts = pvt.merge(inj_ty, how = 'inner', on = 'date')\
    .merge(df_conf, how = 'inner', on = 'date')\
        .merge(df_sel.drop('class', axis =1).drop_duplicates(), how = 'inner', on = 'date')
df_ts

Unnamed: 0,date,fatality_rate,med_fatal,med_injury,med_no_injury,not-head-on,head-on,year,month,summer_ind
0,1999-01-01,0.013067,85.0,99.0,48.0,108.0,124.0,1999,1,0
1,1999-02-01,0.017570,101.0,108.0,40.0,93.0,156.0,1999,2,0
2,1999-03-01,0.021727,105.0,170.0,48.0,96.0,227.0,1999,3,0
3,1999-04-01,0.013606,99.0,84.0,32.0,44.0,171.0,1999,4,0
4,1999-05-01,0.018343,150.0,165.0,53.0,79.0,289.0,1999,5,0
...,...,...,...,...,...,...,...,...,...,...
223,2017-08-01,0.016928,106.0,89.0,62.0,59.0,198.0,2017,8,1
224,2017-09-01,0.015841,94.0,85.0,60.0,67.0,172.0,2017,9,0
225,2017-10-01,0.015182,93.0,67.0,65.0,52.0,173.0,2017,10,0
226,2017-11-01,0.011240,66.0,55.0,39.0,57.0,103.0,2017,11,0


In [18]:
# save data
df_ts.to_csv('../data/collision_ts_month.csv', index = False)