
# Construction of Time Series Data

The data on motor vehicle collisions on public roads in Canada can be found at [open.canada.ca](https://open.canada.ca/data/en/dataset/1eb9eba7-71d1-4b30-9fb1-30cbdab7e63a). We have extracted a univariate time series of total monthly fatalities from 1999 to 2017.

In [31]:
# Silence warnings for the whole session. NOTE: with no category argument,
# filterwarnings("ignore") suppresses every warning category, not only
# deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Load raw data

In [84]:
# Root folder of the project and the cleaned collision records beneath it
base_path = '/python/ml-projects/imbalanced-learning'
clean_data_csv = base_path + '/canada-collision/data/clean_data.csv'

# Read the full collision table into memory
df = pd.read_csv(clean_data_csv)

In [85]:
# Report the shape as loaded, i.e. before any rows are removed
print("Data size:", df.shape)

# Work with lower-case column names throughout
df.columns = df.columns.str.lower()

# A record without a collision month cannot be placed on the monthly time axis
df = df.dropna(subset=['c_mnth'])
df.head()

Data size: (6771768, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Other trucks and vans,1990.0,1.0,M,41.0,Driver,No Injury,,Motor Vehicle Driver,752
1,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,1.0,M,19.0,Driver,No Injury,,Motor Vehicle Driver,752
2,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,2.0,F,20.0,"Front row, right outboard",Injury,Safety device used,Motor Vehicle Passenger,752
3,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,Light Duty Vehicle,1986.0,1.0,M,46.0,Driver,No Injury,,Motor Vehicle Driver,753
4,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,,,1.0,M,5.0,Pedestrian,Injury,,Pedestrian,753


# Create a time series data set
We are interested in a univariate time series of monthly fatalities on public roads in Canada. The first step in time series analysis is to put the data into a time series format. Below we build a monthly `date` key from the year and month of each collision, use the pandas `pivot_table` and `groupby` functions to aggregate the records by that date, and then merge the monthly aggregates into a single time series data set.

In [86]:
# Lookup tables that turn month / weekday names into integers.
# Both are generated from ordered name lists so that every name receives a
# distinct, consecutive number (the hand-written weekday table previously
# mapped both Friday and Saturday to 5 and Sunday to 6).
month_dict = {
    name: number
    for number, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

# Monday = 1 ... Sunday = 7
wday_dict = {
    name: number
    for number, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
         'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

# Convert month and weekday names to their integer codes.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on the Series returned by df[...] is a chained
# in-place operation that is not guaranteed to reach df (and never does under
# pandas Copy-on-Write), which would leave the names unconverted.
df['c_mnth'] = df['c_mnth'].replace(month_dict)
df['c_wday'] = df['c_wday'].replace(wday_dict)

In [87]:
# Build a "<year>-<month>" label for every record and parse it into a
# timestamp that serves as the monthly time-series key
year_month = df['c_year'].astype(str) + '-' + df['c_mnth'].astype(str)
df['date'] = pd.to_datetime(year_month)

# Create features

In [88]:
# Two-level collision-configuration label: 'head-on' when c_conf is exactly
# 'Head-on collision', 'not-head-on' for everything else
is_head_on = df['c_conf'] == 'Head-on collision'
df['conf_ind'] = np.where(is_head_on, 'head-on', 'not-head-on')

In [89]:
# Copy the collision year and month into 'year' / 'month' columns placed at
# positions 1 and 2
df.insert(1, 'year', df['c_year'])
df.insert(2, 'month', df['c_mnth'])

# Summer indicator at position 3: 1 for months 6, 7 and 8, otherwise 0
summer_months = [6, 7, 8]
df.insert(3, 'summer_ind', np.where(df['c_mnth'].isin(summer_months), 1, 0))

In [90]:
# Monthly counts of each injury-severity level (p_isev), restricted to the
# records whose `class` equals 1.
# df_inj is kept as a separate frame because it carries 'conf_ind' as well.
df_inj = df.loc[df['class'] == 1, ['date', 'p_isev', 'conf_ind']]

# One indicator column per severity level; get_dummies names them
# 'p_isev_<level>' (e.g. 'p_isev_Fatality').
inj_ty = pd.get_dummies(df_inj[['date', 'p_isev']], columns=['p_isev'])

# Summing the indicators per date gives the monthly count of each level;
# reset_index() turns the 'date' index back into the first column.
inj_ty = inj_ty.groupby('date').sum().reset_index()

# NOTE(review): an earlier rename to 'med_fatal' / 'med_injury' /
# 'med_no_injury' keyed on the unprefixed level names, so it never matched
# the 'p_isev_*' columns and had no effect. It has been removed; the
# 'p_isev_*' names are the ones that reach the merged output.
inj_ty.head()

Unnamed: 0,date,p_isev_Fatality,p_isev_Injury,p_isev_No Injury
0,1999-01-01,163.0,164.0,89.0
1,1999-02-01,196.0,176.0,97.0
2,1999-03-01,219.0,250.0,122.0
3,1999-04-01,190.0,154.0,83.0
4,1999-05-01,266.0,248.0,99.0


In [91]:
# Monthly counts of head-on vs. not-head-on records among the rows of df_inj
conf_dummies = pd.get_dummies(df_inj.drop(columns='p_isev'), columns=['conf_ind'])

# Sum the indicator columns per date, then restore 'date' as the first column
df_conf = conf_dummies.groupby('date').sum().reset_index()
df_conf.head()

Unnamed: 0,date,conf_ind_head-on,conf_ind_not-head-on
0,1999-01-01,152.0,281.0
1,1999-02-01,139.0,343.0
2,1999-03-01,149.0,469.0
3,1999-04-01,82.0,356.0
4,1999-05-01,114.0,523.0


In [92]:
# Mean of c_vehs over all records in each month, with 'date' as a regular
# first column instead of the index
df_vehs = df.pivot_table(values='c_vehs', index=['date'], aggfunc='mean').reset_index()
df_vehs.head()

Unnamed: 0,date,c_vehs
0,1999-01-01,2.042258
1,1999-02-01,2.006859
2,1999-03-01,2.028966
3,1999-04-01,2.041576
4,1999-05-01,2.037239


In [96]:
# Mean of c_hour over all records in each month, with 'date' as a regular
# first column instead of the index
df_hr = df.pivot_table(values='c_hour', index=['date'], aggfunc='mean').reset_index()
df_hr.head()

Unnamed: 0,date,c_hour
0,1999-01-01,13.511338
1,1999-02-01,13.499924
2,1999-03-01,13.507759
3,1999-04-01,13.89034
4,1999-05-01,13.851641


In [97]:
# Mean of p_age over all records in each month, with 'date' as a regular
# first column instead of the index
df_age = df.pivot_table(values='p_age', index=['date'], aggfunc='mean').reset_index()
df_age.head()

Unnamed: 0,date,p_age
0,1999-01-01,35.239035
1,1999-02-01,35.102478
2,1999-03-01,34.71783
3,1999-04-01,34.662779
4,1999-05-01,34.548825


In [95]:
# Monthly fatality rate: the mean of `class` over all records in each month
# (presumably `class` is a 0/1 fatality flag - confirm against the data
# dictionary). 'date' becomes a regular first column.
pvt = (
    df.pivot_table(values='class', index=['date'], aggfunc='mean')
    .reset_index()
    .rename(columns={'class': 'fatality_rate'})
)
pvt.head()

Unnamed: 0,date,fatality_rate
0,1999-01-01,0.012911
1,1999-02-01,0.018163
2,1999-03-01,0.020982
3,1999-04-01,0.015418
4,1999-05-01,0.017953


# Consolidated Data

In [98]:
# Calendar attributes (plus `class`) for each record. Select the five columns
# first and copy only those, rather than deep-copying the entire frame and
# then discarding most of it.
df_sel = df[['date', 'year', 'month', 'summer_ind', 'class']].copy()

In [99]:
# One row of calendar attributes (year, month, summer_ind) per date
calendar_cols = df_sel.drop('class', axis=1).drop_duplicates()

# Start from the fatality rate and inner-join every monthly table on 'date',
# in the order the columns should appear in the result
df_ts = pvt
for monthly_frame in (inj_ty, df_conf, df_vehs, df_hr, df_age, calendar_cols):
    df_ts = df_ts.merge(monthly_frame, how='inner', on='date')
df_ts

Unnamed: 0,date,fatality_rate,p_isev_Fatality,p_isev_Injury,p_isev_No Injury,conf_ind_head-on,conf_ind_not-head-on,c_vehs,c_hour,p_age,year,month,summer_ind
0,1999-01-01,0.012911,163.0,164.0,89.0,152.0,281.0,2.042258,13.511338,35.239035,1999,1,0
1,1999-02-01,0.018163,196.0,176.0,97.0,139.0,343.0,2.006859,13.499924,35.102478,1999,2,0
2,1999-03-01,0.020982,219.0,250.0,122.0,149.0,469.0,2.028966,13.507759,34.717830,1999,3,0
3,1999-04-01,0.015418,190.0,154.0,83.0,82.0,356.0,2.041576,13.890340,34.662779,1999,4,0
4,1999-05-01,0.017953,266.0,248.0,99.0,114.0,523.0,2.037239,13.851641,34.548825,1999,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,2017-08-01,0.019307,201.0,160.0,112.0,80.0,428.0,2.053252,14.084537,38.690179,2017,8,1
224,2017-09-01,0.017195,184.0,115.0,120.0,95.0,351.0,2.050087,13.687103,39.149918,2017,9,0
225,2017-10-01,0.018010,199.0,124.0,118.0,94.0,370.0,2.065520,13.689260,39.213049,2017,10,0
226,2017-11-01,0.013402,152.0,95.0,85.0,82.0,264.0,1.993687,13.757082,39.217431,2017,11,0


In [100]:
# Persist the consolidated monthly series without the row index
output_csv = '../data/collision_ts_month.csv'
df_ts.to_csv(output_csv, index=False)