In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the accident records CSV and preview its first five rows.
ACCIDENTS_CSV = "../input/uk-road-safety-accidents-and-vehicles/Accident_Information.csv"
accidents_dataset = pd.read_csv(ACCIDENTS_CSV)
accidents_dataset.head(5)

# **Finding no. of null values in the dataset**

In [3]:
# Count the missing values in each column.
accidents_dataset.isnull().sum()

In [4]:
# Parse the "Date" column into datetimes using an explicit YYYY-MM-DD format.
parsed_dates = pd.to_datetime(accidents_dataset["Date"], format="%Y-%m-%d")
accidents_dataset["Date"] = parsed_dates

# **Convert hours to categorical data**

In [5]:
# Derive a numeric "Hour" column from the first two characters of "Time"
# (presumably an "HH:MM" string -- TODO confirm against the raw data).
#
# The three steps are chained so that each one yields a new frame: assigning a
# column into the frame returned by dropna() triggers pandas'
# SettingWithCopyWarning, and writing "Hour" into the loaded frame in place made
# the raw data unrecoverable without re-reading the CSV.
accidents_dataset = (
    accidents_dataset
    .assign(Hour=accidents_dataset['Time'].str[0:2])
    # Rows without a usable Time value have no hour and are discarded.
    .dropna(subset=['Hour'])
    # Convert only after dropping NaN so the column keeps the dtype it had before.
    .assign(Hour=lambda frame: pd.to_numeric(frame['Hour']))
)

In [6]:
def converthrstocategory(hrs):
    """Map an hour value to a coarse time-of-day label.

    Parameters
    ----------
    hrs : number
        Hour value to classify.

    Returns
    -------
    str
        'Morning' when ``hrs`` is below 12, 'Afternoon' when it is at least 12
        and below 17, and 'Night' for everything else (including values such
        as NaN that compare false against both thresholds).
    """
    if hrs < 12:
        return 'Morning'
    # Past this point hrs is not below 12, so only the upper bound needs checking.
    if hrs < 17:
        return 'Afternoon'
    return 'Night'

In [7]:
# Label every row with its time-of-day bucket, then display the new column.
hour_labels = accidents_dataset['Hour'].apply(converthrstocategory)
accidents_dataset['Hrs_category'] = hour_labels
accidents_dataset['Hrs_category']

# **Proportion of null values in the dataframe**

In [8]:
# Share of missing cells in the whole frame: total NaN cells divided by the
# total number of cells (rows x columns). Dividing by the row count instead
# gives the average number of nulls per row, which is not a proportion and can
# exceed 1.
null_cells = accidents_dataset.isna().sum(axis=0).sum()
print(null_cells / accidents_dataset.size)

In [9]:
import glob

import matplotlib.pyplot as plt
import seaborn as sns

# **No. of accidents decreased over time (monthly and yearly plots illustrate this; 3 plots)**

In [10]:
# Line chart of the number of accident records falling in each calendar month.
sns.set_style('white')
monthly_totals = accidents_dataset.set_index('Date').resample('M').size()

fig, ax = plt.subplots(figsize=(20,8))
monthly_totals.plot(label="Total Per Month", color="black", ax=ax)
ax.set_title('Accidents per Month', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Count\n')
ax.set_xlabel('Years')
# Keep only the bottom spine; the trailing semicolon suppresses the cell's repr output.
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);

In [11]:
# Line chart of the number of accident records falling in each year.
sns.set_style('white')
yearly_totals = accidents_dataset.set_index('Date').resample('Y').size()

fig, ax = plt.subplots(figsize=(20,8))
yearly_totals.plot(label="Total Per Year", color="orange", linewidth="4", ax=ax)
ax.set_title('Accidents every Year', fontsize=12, fontweight='bold')
ax.set(ylabel='Total Count\n', xlabel='Years')

In [12]:
# Yearly accident counts drawn as gray bars with an orange line overlaid.
sns.set_style('white')
yearly_count = (
    accidents_dataset['Date']
    .dt.year
    .value_counts()
    .sort_index(ascending=False)  # index is the year, ordered newest first
)

fig, ax = plt.subplots(figsize=(18,10))
ax.bar(yearly_count.index, yearly_count.values, color='gray')
ax.plot(yearly_count, color='orange', linewidth="2")
ax.set_title('Accidents per Year', fontsize=12, fontweight='bold')
ax.set(ylabel='Total Counts', xlabel="Years")

# Number of accidents throughout the day

In [13]:
# Show the time-of-day labels as a single-column DataFrame.
accidents_dataset[['Hrs_category']]

In [14]:
# Keep a handle on the time-of-day label column; the next cell counts and plots it.
time_of_day = accidents_dataset['Hrs_category']

In [15]:
# Count accidents in each time-of-day bucket and draw the counts as a bar chart.
# The discarded `plotMat.to_frame()` call was removed: its result was neither
# stored nor displayed, so it had no effect.
plotMat = time_of_day.value_counts()
ax = plotMat.plot(kind='bar')
# Title and axis labels let the figure stand alone; the trailing semicolon
# suppresses the repr of the list returned by ax.set().
ax.set(title='Accidents by Time of Day', xlabel='Time of day', ylabel='Number of accidents');

# Correlation analysis

In [16]:
def histogram_intersection(a, b):
    """Return the histogram intersection of two array-likes.

    The element-wise minimum of ``a`` and ``b`` is summed and the total is
    rounded to one decimal place.

    Parameters
    ----------
    a, b : array-like
        Inputs accepted by ``np.minimum``.

    Returns
    -------
    numpy scalar
        The rounded sum of the element-wise minima.
    """
    overlap = np.minimum(a, b)
    total = overlap.sum()
    return total.round(decimals=1)
# Pairwise histogram-intersection matrix over the numeric columns.
# The frame also holds text and datetime columns, and a callable `method` is
# fed pairs of 1-D float arrays. Older pandas silently dropped non-numeric
# columns, but pandas >= 2.0 tries to convert them and raises. Selecting the
# numeric/bool columns explicitly gives the same result on every version.
accidents_dataset.select_dtypes(include=['number', 'bool']).corr(method=histogram_intersection)

In [17]:
# Short alias for the accidents frame. This binds a second name to the same
# object (no copy is made), so changes made through either name affect both.
df = accidents_dataset

# Number of fatal accidents throughout the day

In [18]:
# Fraction of accident records whose severity is 'Fatal'.
severity_counts = accidents_dataset['Accident_Severity'].value_counts()
severity_counts['Fatal'] / severity_counts.sum()

In [19]:
# Bar chart of fatal accidents per time-of-day bucket.
fatal_mask = df['Accident_Severity'] == 'Fatal'
df.loc[fatal_mask, 'Hrs_category'].value_counts().plot(kind='bar')

In [20]:

# Load the vehicle records CSV; the file is read with Latin-1 encoding.
VEHICLES_CSV = "../input/uk-road-safety-accidents-and-vehicles/Vehicle_Information.csv"
vehicle_info = pd.read_csv(VEHICLES_CSV, encoding='latin1')

# Age of drivers in accidents

In [21]:
# Number of vehicle records in each driver age band.
driver_age_bands = vehicle_info['Age_Band_of_Driver']
driver_age_bands.value_counts()

# Weather Conditions vs no. of accidents

In [22]:
# Bar chart of the number of accident records per weather condition.
weather_counts = accidents_dataset['Weather_Conditions'].value_counts()
weather_counts.plot.bar()

In [23]:
# Fraction of accident records (with a recorded weather value) labelled 'Fine no high winds'.
weather_totals = accidents_dataset['Weather_Conditions'].value_counts()
weather_totals['Fine no high winds'] / weather_totals.sum()

# **Plotting accident severity**

In [24]:
# Pie chart of accident records per severity level.
severity_totals = accidents_dataset['Accident_Severity'].value_counts()
severity_totals.plot.pie()