In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf 
import plotly.express as px
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model

from forecasting_func import timeSeriesMultivariate, timeSeriesEvaluationMetrics

In [2]:
# Load the preprocessed accident records; the file contains German text.
# "ANSI" is only accepted as a codec name on Windows (there it is an alias of
# "mbcs"); on other platforms the lookup fails. "cp1252" is the Western-European
# Windows code page and is available on every platform.
# NOTE(review): assumes the file was saved on a Western-European Windows locale — confirm.

df = pd.read_csv(r"dataset/preprocessed_accidents.csv", encoding="cp1252")
# Drop the first column (presumably a row index that was saved with the CSV — verify).
df = df.iloc[:, 1:]
df

Unnamed: 0,Category,AccidentType,Year,Month,Value
0,Traffic Accidents,injured and killed,2000,200012,515.0
1,Traffic Accidents,injured and killed,2000,200011,578.0
2,Traffic Accidents,injured and killed,2000,200010,615.0
3,Traffic Accidents,injured and killed,2000,200009,675.0
4,Traffic Accidents,injured and killed,2000,200008,647.0
...,...,...,...,...,...
1759,Alcohol Accidents,subtotal,2020,202005,40.0
1760,Alcohol Accidents,subtotal,2020,202004,26.0
1761,Alcohol Accidents,subtotal,2020,202003,27.0
1762,Alcohol Accidents,subtotal,2020,202002,40.0


In [3]:
# Number of distinct values in the 'Month' column (a missing value, if any, counts as one).
df['Month'].nunique(dropna=False)

252

This means there are 252 unique dates in this dataset. So we have to iterate over every 252 records and mark their respective category and types of deaths.

In [4]:
# Group the 'Value' column by 'Month': each month key maps to the list of its
# values, in the order the rows appear in df. Later cells address these list
# entries by position, so the row order of df matters here.
# One pass over the two columns replaces the two DataFrame.iterrows() loops,
# which walked the whole frame twice and built a Series object for every row.
date_vals = {}
for month, value in zip(df['Month'], df['Value']):
    date_vals.setdefault(month, []).append(value)

In [5]:
# Disabled experiment, kept for reference. If enabled, it would overwrite the
# list entries at positions 2, 4 and 6 of every month with a remainder:
# entry 2 minus entries 1 and 0, entry 4 minus entry 3, entry 6 minus entry 5.
# for date, vals in date_vals.items():
#     date_vals[date][2] = vals[2] - vals[1] - vals[0]
#     date_vals[date][4] = vals[4] - vals[3]
#     date_vals[date][6] = vals[6] - vals[5]
    
# Display the month -> list-of-values mapping.
date_vals

{200012: [515.0, 403.0, 3789.0, 45.0, 950.0, 31.0, 85.0],
 200011: [578.0, 461.0, 3931.0, 56.0, 1017.0, 24.0, 71.0],
 200010: [615.0, 493.0, 4117.0, 47.0, 1078.0, 31.0, 83.0],
 200009: [675.0, 557.0, 3807.0, 68.0, 978.0, 45.0, 84.0],
 200008: [647.0, 525.0, 3387.0, 74.0, 881.0, 32.0, 77.0],
 200007: [744.0, 610.0, 4282.0, 67.0, 1102.0, 37.0, 99.0],
 200006: [685.0, 555.0, 3329.0, 58.0, 842.0, 32.0, 57.0],
 200005: [675.0, 568.0, 4123.0, 69.0, 1086.0, 44.0, 96.0],
 200004: [557.0, 441.0, 3317.0, 57.0, 834.0, 26.0, 78.0],
 200003: [509.0, 414.0, 3783.0, 48.0, 974.0, 18.0, 73.0],
 200002: [419.0, 329.0, 3181.0, 35.0, 890.0, 19.0, 53.0],
 200001: [434.0, 321.0, 3448.0, 34.0, 835.0, 37.0, 78.0],
 200112: [507.0, 377.0, 3989.0, 39.0, 941.0, 23.0, 66.0],
 200111: [568.0, 436.0, 4022.0, 32.0, 1001.0, 25.0, 68.0],
 200110: [717.0, 581.0, 4250.0, 85.0, 1104.0, 37.0, 77.0],
 200109: [643.0, 497.0, 3939.0, 40.0, 953.0, 48.0, 107.0],
 200108: [600.0, 510.0, 3351.0, 57.0, 814.0, 21.0, 60.0],
 200107

In [6]:
# Turn the month -> values mapping into a table: every dictionary key becomes
# a row label and every list position becomes a column.
df = pd.DataFrame.from_dict(data=date_vals, orient="index")
df

Unnamed: 0,0,1,2,3,4,5,6
200012,515.0,403.0,3789.0,45.0,950.0,31.0,85.0
200011,578.0,461.0,3931.0,56.0,1017.0,24.0,71.0
200010,615.0,493.0,4117.0,47.0,1078.0,31.0,83.0
200009,675.0,557.0,3807.0,68.0,978.0,45.0,84.0
200008,647.0,525.0,3387.0,74.0,881.0,32.0,77.0
...,...,...,...,...,...,...,...
202005,464.0,416.0,3193.0,34.0,859.0,26.0,40.0
202004,363.0,314.0,2251.0,34.0,607.0,15.0,26.0
202003,334.0,270.0,2825.0,33.0,744.0,13.0,27.0
202002,355.0,306.0,3271.0,34.0,870.0,19.0,40.0


In [7]:
# Give the seven positional columns descriptive names, grouped by accident
# category: traffic (3 columns), escape (2 columns), alcohol (2 columns).
# An earlier draft labelled positions 2, 4 and 6 'traffic_other',
# 'escape_other' and 'alcohol_other' instead of the '*_subtotal' names.
df.columns = (
    ["traffic_injured_and_killed", "traffic_with_people", "traffic_subtotal"]
    + ["escape_injured_and_killed", "escape_subtotal"]
    + ["alcohol_injured_and_killed", "alcohol_subtotal"]
)

In [8]:
# Display the frame to check the column labels.
df

Unnamed: 0,traffic_injured_and_killed,traffic_with_people,traffic_subtotal,escape_injured_and_killed,escape_subtotal,alcohol_injured_and_killed,alcohol_subtotal
200012,515.0,403.0,3789.0,45.0,950.0,31.0,85.0
200011,578.0,461.0,3931.0,56.0,1017.0,24.0,71.0
200010,615.0,493.0,4117.0,47.0,1078.0,31.0,83.0
200009,675.0,557.0,3807.0,68.0,978.0,45.0,84.0
200008,647.0,525.0,3387.0,74.0,881.0,32.0,77.0
...,...,...,...,...,...,...,...
202005,464.0,416.0,3193.0,34.0,859.0,26.0,40.0
202004,363.0,314.0,2251.0,34.0,607.0,15.0,26.0
202003,334.0,270.0,2825.0,33.0,744.0,13.0,27.0
202002,355.0,306.0,3271.0,34.0,870.0,19.0,40.0


In [10]:
# Persist the per-month table as CSV; the row labels are written as the first column.
df.to_csv(path_or_buf="./dataset/accident_by_date_insgesamt.csv")

In [None]:
# Total of the 'traffic_subtotal' column. The previous lookup used
# 'traffic_other', a label that is never assigned to df's columns, so the cell
# raised KeyError. Series.sum() skips missing values, unlike the built-in sum().
df['traffic_subtotal'].sum()