In [1]:
# Imports

import pandas as pd
import numpy as np
from sklearn import metrics
import math
import matplotlib.pyplot as plt

In [2]:
# Read the preprocessed accidents dataset.
# "ANSI" is only a Windows alias of the locale-dependent "mbcs" codec and raises
# LookupError on other platforms, so the Western-European Windows code page is
# named explicitly (it covers the German characters mentioned for this data).

df = pd.read_csv(r"dataset/preprocessed_accidents.csv", encoding = "cp1252")
# Drop the first column (presumably a row index saved with the file -- TODO confirm).
df = df.iloc[:, 1:]
df

Unnamed: 0,Category,AccidentType,Year,Month,Value
0,Traffic Accidents,injured and killed,2000,200012,515.0
1,Traffic Accidents,injured and killed,2000,200011,578.0
2,Traffic Accidents,injured and killed,2000,200010,615.0
3,Traffic Accidents,injured and killed,2000,200009,675.0
4,Traffic Accidents,injured and killed,2000,200008,647.0
...,...,...,...,...,...
1759,Alcohol Accidents,subtotal,2020,202005,40.0
1760,Alcohol Accidents,subtotal,2020,202004,26.0
1761,Alcohol Accidents,subtotal,2020,202003,27.0
1762,Alcohol Accidents,subtotal,2020,202002,40.0


In [3]:
# Replace the YYYYMM 'Month' column with a 'YYYY-MM' string index named 'Date'
# and drop 'Year'. Formatting is done with the vectorised .dt accessor instead
# of a per-row Python lambda.
df['Date'] = pd.to_datetime(df['Month'], format = '%Y%m').dt.strftime('%Y-%m')
df = df.drop(columns = ['Year', 'Month']).set_index('Date')
df

Unnamed: 0_level_0,Category,AccidentType,Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-12,Traffic Accidents,injured and killed,515.0
2000-11,Traffic Accidents,injured and killed,578.0
2000-10,Traffic Accidents,injured and killed,615.0
2000-09,Traffic Accidents,injured and killed,675.0
2000-08,Traffic Accidents,injured and killed,647.0
...,...,...,...
2020-05,Alcohol Accidents,subtotal,40.0
2020-04,Alcohol Accidents,subtotal,26.0
2020-03,Alcohol Accidents,subtotal,27.0
2020-02,Alcohol Accidents,subtotal,40.0


In [4]:
# One-hot encode the two categorical columns so every column is numeric.
# dtype is pinned to uint8: pandas >= 2.0 produces bool dummies by default, and a
# frame mixing float and bool columns is not a plain numeric array for VAR.

one_hot_encoded_data = pd.get_dummies(df, columns = ['Category', 'AccidentType'], dtype = 'uint8')
# Drop the last dummy column by position, then the 'Traffic Accidents' category
# dummy (presumably one reference level per variable -- TODO confirm intent).
one_hot_encoded_data = one_hot_encoded_data.iloc[:, :-1]
one_hot_encoded_data = one_hot_encoded_data.drop(['Category_Traffic Accidents'], axis = 1)
one_hot_encoded_data

Unnamed: 0_level_0,Value,Category_Alcohol Accidents,Category_Escape Accidents,AccidentType_injured and killed,AccidentType_subtotal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-12,515.0,0,0,1,0
2000-11,578.0,0,0,1,0
2000-10,615.0,0,0,1,0
2000-09,675.0,0,0,1,0
2000-08,647.0,0,0,1,0
...,...,...,...,...,...
2020-05,40.0,1,0,0,1
2020-04,26.0,1,0,0,1
2020-03,27.0,1,0,0,1
2020-02,40.0,1,0,0,1


In [5]:
# Missing values: carry the previous row's value forward.
# A comparison with None never matches NaN, so an element-wise loop of that
# form leaves gaps untouched; ffill() does the intended replacement.

cols = one_hot_encoded_data.columns  # reused later when labelling the forecasts
# bfill() only matters for gaps at the very top, which have no previous row.
one_hot_encoded_data = one_hot_encoded_data.ffill().bfill()

In [6]:
# Write the encoded frame to CSV.
one_hot_encoded_data.to_csv(path_or_buf = './dataset/oneHotEncodedAccidents.csv')

In [7]:
# Positional 80/20 split: the leading 80% of rows form the training set and the
# remaining rows the validation set.
split_at = int(0.8 * len(one_hot_encoded_data))
train = one_hot_encoded_data.iloc[:split_at]
valid = one_hot_encoded_data.iloc[split_at:]

In [8]:
# VAR (vector autoregression) model class used to fit the model below
from statsmodels.tsa.vector_ar.var_model import VAR

In [9]:
# Summarise column dtypes and non-null counts of the encoded frame.
one_hot_encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1764 entries, 2000-12 to 2020-01
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Value                            1764 non-null   float64
 1   Category_Alcohol Accidents       1764 non-null   uint8  
 2   Category_Escape Accidents        1764 non-null   uint8  
 3   AccidentType_injured and killed  1764 non-null   uint8  
 4   AccidentType_subtotal            1764 non-null   uint8  
dtypes: float64(1), uint8(4)
memory usage: 34.5+ KB


In [None]:
# Fit a VAR on the training rows only; fit() is called with its defaults.
model = VAR(train)
model_fit = model.fit()

In [None]:
# Forecast one step per validation row.
# forecast() is seeded with the last k_ar fitted observations; `endog` is used
# because the legacy `y` alias is not available on current statsmodels results.
prediction = model_fit.forecast(model_fit.endog[-model_fit.k_ar:], steps = len(valid))

In [None]:
# Convert the forecast array to a DataFrame labelled with the model's columns.
# Building the frame in one step populates every column, keeps the column
# index flat (a list-wrapped Index would create a MultiIndex) and avoids
# writing values through chained indexing.

pred = pd.DataFrame(prediction, columns = cols)
pred

In [None]:
# Report the validation RMSE of each column.
for column in cols:
    mse = metrics.mean_squared_error(pred[column], valid[column])
    print('rmse value for', column, 'is : ', math.sqrt(mse))

In [None]:
# Refit on the full dataset and forecast the next two steps.
model = VAR(endog=one_hot_encoded_data)
model_fit = model.fit()
# Seed the forecast with the last k_ar observations; `endog` is used because the
# legacy `y` alias is not available on current statsmodels results.
yhat = model_fit.forecast(model_fit.endog[-model_fit.k_ar:], steps=2)
print(yhat)