In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf 
import plotly.express as px
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model

from forecasting_func import timeSeriesMultivariate, timeSeriesEvaluationMetrics

IndentationError: unindent does not match any outer indentation level (forecasting_func.py, line 16)

In [None]:
# Load the raw accident records. The file contains German text and is decoded
# with the "ANSI" codec name.
# NOTE(review): "ANSI" appears to resolve only on Windows Python builds — confirm before running elsewhere.

df = pd.read_csv(
    r"dataset/accidents.csv",
    encoding="ANSI",
)
df.head()

In [None]:
# Keep only the first five columns, then discard every row with a missing value in them

df = df.iloc[:, :5].dropna()
df.head()

In [None]:
# Map the German column headers to English names so the rest of the notebook reads uniformly

german_to_english = {
    'MONATSZAHL': 'Category',
    'AUSPRAEGUNG': 'AccidentType',
    'JAHR': 'Year',
    'MONAT': 'Month',
    'WERT': 'Value',
}
df = df.rename(columns=german_to_english)
df.head()

In [None]:
# Basic preprocessing: drop the yearly summary records ("Summe") and translate the
# German categorical labels to English for ease of comprehension.
#
# The translation is done with a per-column lookup instead of row-wise chained
# assignment (df['col'][index] = ...): chained assignment raises
# SettingWithCopyWarning and does not write through to the frame when pandas
# copy-on-write is active.
# NOTE(review): the "?" inside the German keys looks like a decoding artefact of
# reading the CSV as "ANSI"; the keys are kept byte-for-byte — confirm against the data.

# Boolean filter + copy gives an independent frame without in-place mutation
df = df[df['Month'] != "Summe"].copy()

category_translation = {
    "Alkoholunf?le": "Alcohol Accidents",
    "Fluchtunf?le": "Escape Accidents",
    "Verkehrsunf?le": "Traffic Accidents",
}
accident_type_translation = {
    "insgesamt": "subtotal",
    "Verletzte und Get?ete": "injured and killed",
    "mit Personensch?en": "with people",
}

# Values not listed in a mapping are left untouched
df['Category'] = df['Category'].replace(category_translation)
df['AccidentType'] = df['AccidentType'].replace(accident_type_translation)

df.head()

In [None]:
# Total the "subtotal" values per accident category by hand, because the frame on
# its own does not show this directly: the subtotal type and the injured/death
# statistics are not mutually exclusive, so only subtotal rows are summed.

accident_nos = {
    "Alcohol Accidents": 0,
    "Escape Accidents": 0,
    "Traffic Accidents": 0,
}

for index, row in df.iterrows():
    # Skip yearly summary rows and every accident type other than the subtotal
    if row['Month'] == "Summe" or row["AccidentType"] != "subtotal":
        continue
    if row["Category"] in accident_nos:
        accident_nos[row["Category"]] += row["Value"]

# Individual counters, exposed under their original names
ctr_alcohol = accident_nos["Alcohol Accidents"]
ctr_escape = accident_nos["Escape Accidents"]
ctr_traffic = accident_nos["Traffic Accidents"]

# The dictionary itself is what the plotting cell consumes
print(accident_nos)

In [None]:
# Bar chart of the summed subtotal value per accident category

fig, ax = plt.subplots(figsize=(16, 9))
ax.bar(accident_nos.keys(), accident_nos.values(), width=0.25)

In [None]:
# Alternative: label encoding (left disabled; one-hot encoding is used instead)

# for i in df.select_dtypes('object').columns:
#     le = LabelEncoder().fit(df[i])
#     df[i] = le.transform(df[i]) 

In [None]:
# One-hot encode the two categorical columns, then drop the last column of the
# encoded frame by position (presumably the final AccidentType indicator — confirm)
# and the 'Category_Traffic Accidents' indicator

one_hot_encoded_data = (
    pd.get_dummies(df, columns=['Category', 'AccidentType'])
    .iloc[:, :-1]
    .drop(columns=['Category_Traffic Accidents'])
)
one_hot_encoded_data

In [None]:
# Reduce the 'Month' column to the month number and cast 'Year' to an integer.
#
# The earlier row-wise loop wrote the two-character month suffix and then
# immediately overwrote it with int(row['Month']), where `row` still held the
# full original value, so the suffix was discarded. Here the last two characters
# are taken first and only then converted, for the whole column at once.
# assumes 'Month' values end in a two-digit month (e.g. a YYYYMM string) — TODO confirm

# astype(str) keeps the cell re-runnable after the column has become integer
one_hot_encoded_data['Month'] = (
    one_hot_encoded_data['Month'].astype(str).str[-2:].astype(int)
)
one_hot_encoded_data['Year'] = one_hot_encoded_data['Year'].astype(int)

one_hot_encoded_data.head()

In [None]:
# Min-max scale the multivariate inputs and the target with separate scalers,
# so each one stays available on its own afterwards

feature_columns = [
    'Category_Alcohol Accidents',
    'Category_Escape Accidents',
    'AccidentType_injured and killed',
    'AccidentType_subtotal',
    'Year',
    'Month',
    'Value',
]
target_columns = ['Value']

X_scaler, Y_scaler = MinMaxScaler(), MinMaxScaler()
X_df = X_scaler.fit_transform(one_hot_encoded_data[feature_columns])
Y_df = Y_scaler.fit_transform(one_hot_encoded_data[target_columns])

In [None]:
# Windowing hyperparameters for the time-series helper

hist_window, horizon = 48, 10
TRAIN_SPLIT = 1400

# First call covers 0..TRAIN_SPLIT, second call TRAIN_SPLIT..None
# NOTE(review): argument meaning is inferred from the variable names — confirm against timeSeriesMultivariate
window_args = (hist_window, horizon)
x_train, y_train = timeSeriesMultivariate(X_df, Y_df, 0, TRAIN_SPLIT, *window_args)
x_vali, y_vali = timeSeriesMultivariate(X_df, Y_df, TRAIN_SPLIT, None, *window_args)

In [None]:
# Batch and shuffle-buffer sizes for the tf.data input pipelines

batch_size = buffer_size = 256

# Training pipeline: cache, shuffle, batch, then repeat
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

# Validation pipeline: batch and repeat only, no shuffling
val_data = (
    tf.data.Dataset.from_tensor_slices((x_vali, y_vali))
    .batch(batch_size)
    .repeat()
)

In [None]:
# Two bidirectional LSTM layers with dense tanh layers in between, dropout, and a
# final dense layer with one unit per forecast step (`horizon`)

keras_layers = tf.keras.layers

layer_stack = [
    keras_layers.Bidirectional(
        keras_layers.LSTM(128, return_sequences=True),
        input_shape=x_train.shape[-2:],
    ),
    keras_layers.Dense(32, activation='tanh'),
    keras_layers.Bidirectional(keras_layers.LSTM(128)),
    keras_layers.Dense(32, activation='tanh'),
    keras_layers.Dense(32, activation='tanh'),
    keras_layers.Dropout(0.20),
    keras_layers.Dense(units=horizon),
]

lstm_model = tf.keras.models.Sequential(layer_stack)
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.summary()

In [None]:
# Callbacks available for training: early stopping on validation loss and a
# best-only model checkpoint.
#
# `model_path` was referenced here without being defined in any earlier cell,
# which raises NameError on a fresh kernel; it is now set explicitly.
# NOTE(review): the checkpoint file name below is a chosen placeholder — adjust as needed.

model_path = './models/GermanAccidentsCheckpoint.h5'

early_stopings = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='min'
)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=0
)
callbacks = [early_stopings, checkpoint]

In [None]:
# NOTE(review): placeholder only — `history` is reassigned by lstm_model.fit(...) in the
# training cell, so this empty Model instance appears to be unused; confirm and consider removing.
history = Model()

In [None]:
# Fit the LSTM on the prepared pipelines; the callbacks list (early stopping and
# checkpointing) is deliberately left disabled to attain better yield

fit_settings = dict(
    epochs=40,
    steps_per_epoch=100,
    validation_data=val_data,
    validation_steps=50,
    verbose=1,
    # callbacks=callbacks
)
history = lstm_model.fit(train_data, **fit_settings)

In [None]:
# Plot training and validation loss per epoch, save the figure, then display it

fig, ax = plt.subplots(figsize=(16, 9))
for curve_key in ('loss', 'val_loss'):
    ax.plot(history.history[curve_key])
ax.set_title('Accident LSTM Model loss plot')
ax.set_ylabel('loss axis --->')
ax.set_xlabel('epochs --->')
ax.legend(['train set loss', 'validation set loss'])
fig.savefig("./plots/loss_curve")
plt.show()

In [None]:
# Persist the trained model to both target files
# NOTE(review): both writes go through .save(), so the "Weights" file also holds the
# full model rather than weights only — confirm this is intended.

trained_model = history.model
for target_path in ('./models/GermanAccidents.h5', './models/GermanAccidentsWeights.h5'):
    trained_model.save(target_path)

In [None]:
# Restore the saved model from disk

saved_model_path = './models/GermanAccidents.h5'
loadedModel = load_model(saved_model_path)