In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf 
import plotly.express as px
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, MinMaxScaler

In [2]:
# Reading CSV dataset with ANSI encoding as it includes German Tokens

df = pd.read_csv(r"dataset/accidents.csv", encoding = "ANSI")
df.head()

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT,VORJAHRESWERT,VERAEND_VORMONAT_PROZENT,VERAEND_VORJAHRESMONAT_PROZENT,ZWOELF_MONATE_MITTELWERT
0,Alkoholunf?le,insgesamt,2021,202101,,28.0,,,
1,Alkoholunf?le,insgesamt,2021,202102,,40.0,,,
2,Alkoholunf?le,insgesamt,2021,202103,,27.0,,,
3,Alkoholunf?le,insgesamt,2021,202104,,26.0,,,
4,Alkoholunf?le,insgesamt,2021,202105,,40.0,,,


In [3]:
# Recognizing first 5 columns for further analysis

df = df.iloc[:, :5]
df = df.dropna()
df.head()

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT
12,Alkoholunf?le,insgesamt,2020,Summe,430.0
13,Alkoholunf?le,insgesamt,2020,202001,28.0
14,Alkoholunf?le,insgesamt,2020,202002,40.0
15,Alkoholunf?le,insgesamt,2020,202003,27.0
16,Alkoholunf?le,insgesamt,2020,202004,26.0


In [4]:
# Renaming Columns to English Counterparts for universal comprehension

df = df.rename(
    columns = {
        'MONATSZAHL':'Category',
        'AUSPRAEGUNG':'AccidentType',
        'JAHR':'Year',
        'MONAT':'Month',
        'WERT':'Value',        
    }
)
df.head()

Unnamed: 0,Category,AccidentType,Year,Month,Value
12,Alkoholunf?le,insgesamt,2020,Summe,430.0
13,Alkoholunf?le,insgesamt,2020,202001,28.0
14,Alkoholunf?le,insgesamt,2020,202002,40.0
15,Alkoholunf?le,insgesamt,2020,202003,27.0
16,Alkoholunf?le,insgesamt,2020,202004,26.0


In [5]:
# Basic Preprocessing to drop yearly summary value records
# More translation of Categorical Values for ease of comprehension

df.drop(df[df['Month'] == "Summe"].index, inplace = True)

for index, row in df.iterrows():
    if row['Category'] == "Alkoholunf?le":
        df['Category'][index] = "Alcohol Accidents"
    if row['Category'] == "Fluchtunf?le":
        df['Category'][index] = "Escape Accidents"
    if row['Category'] == "Verkehrsunf?le":
        df['Category'][index] = "Traffic Accidents"
    if row['AccidentType'] == "insgesamt":
            df['AccidentType'][index] = "subtotal"
    if row['AccidentType'] == "Verletzte und Get?ete":
            df['AccidentType'][index] = "injured and killed"
    if row['AccidentType'] == "mit Personensch?en":
            df['AccidentType'][index] = "with people"
        
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'][index] = "Alcohol Accidents"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AccidentType'][index] = "subtotal"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AccidentType'][index] = "injured and killed"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'][index] = "Escape Acci

Unnamed: 0,Category,AccidentType,Year,Month,Value
13,Alcohol Accidents,subtotal,2020,202001,28.0
14,Alcohol Accidents,subtotal,2020,202002,40.0
15,Alcohol Accidents,subtotal,2020,202003,27.0
16,Alcohol Accidents,subtotal,2020,202004,26.0
17,Alcohol Accidents,subtotal,2020,202005,40.0


In [None]:
# Counters for each category of accidents is counted manually as the dataframe on itself cannot fully portray this information
# As the subtotal type and injured, death statistics are not mutually exclusive

ctr_alcohol = 0
ctr_escape = 0
ctr_traffic = 0

for index, row in df.iterrows():
    if row['Month'] != "Summe":
        if row["Category"] == "Alcohol Accidents" and row["AccidentType"] == "subtotal":
            ctr_alcohol += row["Value"]
        if row["Category"] == "Escape Accidents" and row["AccidentType"] == "subtotal":
            ctr_escape += row["Value"]
        if row["Category"] == "Traffic Accidents" and row["AccidentType"] == "subtotal":
            ctr_traffic += row["Value"]

# Generating dictionary for plotting
accident_nos = {
    "Alcohol Accidents": ctr_alcohol, 
    "Escape Accidents": ctr_escape, 
    "Traffic Accidents": ctr_traffic
}
print(accident_nos)

In [None]:
# Histogram Visualization

plt.figure(figsize = (16, 9))
plt.bar(accident_nos.keys(), accident_nos.values(), width=0.25)

In [None]:
# Label Encoding Path.....

# for i in df.select_dtypes('object').columns:
#     le = LabelEncoder().fit(df[i])
#     df[i] = le.transform(df[i]) 

In [None]:
# One Hot Encoding for better results

one_hot_encoded_data = pd.get_dummies(df, columns = ['Category', 'AccidentType'])
one_hot_encoded_data = one_hot_encoded_data.iloc[:, :-1]
one_hot_encoded_data = one_hot_encoded_data.drop(['Category_Traffic Accidents'], axis = 1)
one_hot_encoded_data