<h1>UK ACCIDENT REPORT DATA ANALYSIS</h1>

<h3>ANALYST: Abdul Barry A. Adam</h3>

<h1>IMPORTING LIBRARIES NEEDED</h1>

In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

<h1>IMPORTING DATASETS, LOCALLY, TO DATAFRAME</h1>

In [98]:
accident = pd.read_csv("./datasets/accident_data.csv")

In [99]:
accident

Unnamed: 0,Index,Accident_Severity,Accident_Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


<h1>INFORMATION</h1>

In [100]:
accident.describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles
count,660654.0,660653.0,660679.0,660679.0
mean,52.553866,-1.43121,1.35704,1.831255
std,1.406922,1.38333,0.824847,0.715269
min,49.91443,-7.516225,1.0,1.0
25%,51.49069,-2.332291,1.0,1.0
50%,52.315641,-1.411667,1.0,2.0
75%,53.453452,-0.232869,1.0,2.0
max,60.757544,1.76201,68.0,32.0


In [101]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Index                    660679 non-null  object 
 1   Accident_Severity        660679 non-null  object 
 2   Accident_Date            660679 non-null  object 
 3   Latitude                 660654 non-null  float64
 4   Light_Conditions         660679 non-null  object 
 5   District Area            660679 non-null  object 
 6   Longitude                660653 non-null  float64
 7   Number_of_Casualties     660679 non-null  int64  
 8   Number_of_Vehicles       660679 non-null  int64  
 9   Road_Surface_Conditions  659953 non-null  object 
 10  Road_Type                656159 non-null  object 
 11  Urban_or_Rural_Area      660664 non-null  object 
 12  Weather_Conditions       646551 non-null  object 
 13  Vehicle_Type             660679 non-null  object 
dtypes: f

In [102]:
accident.dtypes

Index                       object
Accident_Severity           object
Accident_Date               object
Latitude                   float64
Light_Conditions            object
District Area               object
Longitude                  float64
Number_of_Casualties         int64
Number_of_Vehicles           int64
Road_Surface_Conditions     object
Road_Type                   object
Urban_or_Rural_Area         object
Weather_Conditions          object
Vehicle_Type                object
dtype: object

<h1>CONVERT COLUMNS TO ITS APPROPRIATE DATA TYPE</h1>

In [103]:
accident['Index'] = accident['Index'].astype('category')
accident['Accident_Severity'] = accident['Accident_Severity'].astype('category')
accident['Accident_Date'] = pd.to_datetime(accident['Accident_Date'], dayfirst = True, errors = 'coerce') 
accident['Light_Conditions'] = accident['Light_Conditions'].astype('category')
accident['District Area'] = accident['District Area'].astype('category')
accident['Road_Surface_Conditions'] = accident['Road_Surface_Conditions'].astype('category')
accident['Road_Type'] = accident['Road_Type'].astype('category')
accident['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].astype('category')
accident['Weather_Conditions'] = accident['Weather_Conditions'].astype('category')
accident['Vehicle_Type'] = accident['Vehicle_Type'].astype('category')

<h1>DETERMINING NULL VALUES AND FILLNG THEM</h1>

<h4>DETERMING NULLS</h4>

In [104]:
accident.isnull().sum()

Index                           0
Accident_Severity               0
Accident_Date              395672
Latitude                       25
Light_Conditions                0
District Area                   0
Longitude                      26
Number_of_Casualties            0
Number_of_Vehicles              0
Road_Surface_Conditions       726
Road_Type                    4520
Urban_or_Rural_Area            15
Weather_Conditions          14128
Vehicle_Type                    0
dtype: int64

In [105]:
FreqDate = accident['Accident_Date'].mode()[0]
AveLat = accident['Latitude'].mean()
AveLot = accident['Longitude'].mean()
FreqSurConditions = accident['Road_Surface_Conditions'].mode()[0]
FreqgRoadType = accident['Road_Type'].mode()[0]
FreqArea = accident['Urban_or_Rural_Area'].mode()[0]


<h4>FILLING NULLS</h4>

In [106]:
accident['Accident_Date'] = accident['Accident_Date'].fillna(FreqDate)
accident['Latitude'] = accident['Latitude'].fillna(AveLat)
accident['Longitude'] = accident['Longitude'].fillna(AveLot)
accident['Road_Surface_Conditions'] = accident['Road_Surface_Conditions'].fillna(FreqSurConditions)
accident['Road_Type'] = accident['Road_Type'].fillna(FreqgRoadType)
accident['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].fillna(FreqArea)


<h4>LAST CHECKUP</h4>

In [107]:
accident.isnull().sum()

Index                          0
Accident_Severity              0
Accident_Date                  0
Latitude                       0
Light_Conditions               0
District Area                  0
Longitude                      0
Number_of_Casualties           0
Number_of_Vehicles             0
Road_Surface_Conditions        0
Road_Type                      0
Urban_or_Rural_Area            0
Weather_Conditions         14128
Vehicle_Type                   0
dtype: int64

In [108]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Index                    660679 non-null  category      
 1   Accident_Severity        660679 non-null  category      
 2   Accident_Date            660679 non-null  datetime64[ns]
 3   Latitude                 660679 non-null  float64       
 4   Light_Conditions         660679 non-null  category      
 5   District Area            660679 non-null  category      
 6   Longitude                660679 non-null  float64       
 7   Number_of_Casualties     660679 non-null  int64         
 8   Number_of_Vehicles       660679 non-null  int64         
 9   Road_Surface_Conditions  660679 non-null  category      
 10  Road_Type                660679 non-null  category      
 11  Urban_or_Rural_Area      660679 non-null  category      
 12  Weather_Conditio

In [109]:
accident['Year'] = accident['Accident_Date'].dt.year
accident['dayOfWeek'] = accident['Accident_Date'].dt.dayofweek
accident['Month'] = accident['Accident_Date'].dt.month

# accident['Year']  = accident['Year'].astype('category')
# accident['dayOfWeek'] = accident['dayOfWeek'].astype('category')
# accident['Month'] = accident['Month'].astype('category')

In [110]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Index                    660679 non-null  category      
 1   Accident_Severity        660679 non-null  category      
 2   Accident_Date            660679 non-null  datetime64[ns]
 3   Latitude                 660679 non-null  float64       
 4   Light_Conditions         660679 non-null  category      
 5   District Area            660679 non-null  category      
 6   Longitude                660679 non-null  float64       
 7   Number_of_Casualties     660679 non-null  int64         
 8   Number_of_Vehicles       660679 non-null  int64         
 9   Road_Surface_Conditions  660679 non-null  category      
 10  Road_Type                660679 non-null  category      
 11  Urban_or_Rural_Area      660679 non-null  category      
 12  Weather_Conditio

<h1>DATA ANALYTICS</h1>
<h2>UniVariate</h2>
<h3>Vehicle's type that is frequently on accident</h3>
<h4>Vehicle Type</h4>
<hr>

In [111]:
types = accident['Vehicle_Type'].value_counts()
types

Vehicle_Type
Car                                      497992
Van / Goods 3.5 tonnes mgw or under       34160
Bus or coach (17 or more pass seats)      25878
Motorcycle over 500cc                     25657
Goods 7.5 tonnes mgw and over             17307
Motorcycle 125cc and under                15269
Taxi/Private hire car                     13294
Motorcycle over 125cc and up to 500cc      7656
Motorcycle 50cc and under                  7603
Goods over 3.5t. and under 7.5t            6096
Other vehicle                              5637
Minibus (8 - 16 passenger seats)           1976
Agricultural vehicle                       1947
Pedal cycle                                 197
Data missing or out of range                  6
Ridden horse                                  4
Name: count, dtype: int64

<h1>INSIGHT #1</h1>
<h2>Car is the most frequent type of vehicle involved in an accidents</h2>

<h3>Road's Type that has the most accidents</h3>

In [112]:
roadAccidents = accident['Road_Type'].value_counts()

In [113]:
roadAccidents

Road_Type
Single carriageway    496663
Dual carriageway       99424
Roundabout             43992
One way street         13559
Slip road               7041
Name: count, dtype: int64

<p>A single carriageway is a road with one or more lanes for traffic traveling in each direction, with no physical separation between the two directions</p>

<h1>INSIGHT #2</h1>
<h2>Single Carriageway road has the most recorded accidents in UK</h2>
<h1>INSIGHT #3</h1>
<h2>According to one way street, there are 2.05% Rear-End Collisions accident</h2>

In [114]:
po = accident[accident['Road_Type'] == 'Single carriageway']

In [115]:
po['Accident_Severity'].value_counts()

Accident_Severity
Slight     419563
Serious     70540
Fatal        6560
Name: count, dtype: int64

In [116]:
accident['Accident_Severity'].value_counts()

Accident_Severity
Slight     563801
Serious     88217
Fatal        8661
Name: count, dtype: int64

In [117]:
(8661 / 660679) * 100

1.3109240644851736

<h3>Percentile Distribution of accident severity category</h3>

In [118]:
allAccidents = accident['Accident_Severity'].value_counts()

In [119]:
allAccidents

Accident_Severity
Slight     563801
Serious     88217
Fatal        8661
Name: count, dtype: int64

<h1>INSIGHT #4</h1>
<h2>Only 1.31% of 660,607 accidents took a life and 85.3% of recorded accidents only caused a slight severity</h2>

<h3>Annual Accidents</h3>

In [120]:
accident['Year'].value_counts()

Year
2021    461844
2019     71867
2020     70163
2022     56805
Name: count, dtype: int64

<h1>INSIGHT #5</h1>
<h2>The average annual accidents rised to 59% in 2021 and dropped by approximiately 61% by 2022</h2>

In [121]:
accident['District Area'].mode()[0]

'Birmingham'

In [122]:
BirminghamAccidents = accident[accident['District Area'] == 'Birmingham']

In [123]:
BirminghamAccidents['Year'].value_counts()

Year
2021    9354
2019    1554
2020    1438
2022    1145
Name: count, dtype: int64

In [124]:
BirminghamAccidents['Accident_Severity'].value_counts()

Accident_Severity
Slight     11912
Serious     1474
Fatal        105
Name: count, dtype: int64

<h1>INSIGHT #6</h1>
<h2>Birmingham district area has the most recorded accidents but only 0.7% were fatal</h2>

In [125]:
accident['Number_of_Casualties'].max()

68

In [126]:
accident[accident['Number_of_Casualties'] == accident['Number_of_Casualties'].max()]

Unnamed: 0,Index,Accident_Severity,Accident_Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type,Year,dayOfWeek,Month
117980,200743N002017,Fatal,2019-01-03,51.497547,Darkness - lights lit,South Bucks,-0.496697,68,1,Wet or damp,Slip road,Rural,Raining no high winds,Car,2019,3,1


<h1>INSIGHT #7</h1>
<h2>In the Night time of January 2019, the highest recorded number of casualties is in South Bucks Area, with the number of casualties of 68</h2>

In [127]:
averageCasualties = np.round(accident['Number_of_Casualties'].mean(), 0)

In [128]:
averageCasualties

1.0

<h1>INSIGHT #8</h1>
<h2>The average number of casualties is 1</h2>

In [129]:
averageVehicleInvolved = np.round(accident['Number_of_Vehicles'].mean(), 0)
averageVehicleInvolved

2.0

<h1>INSIGHT #9</h1>
<h2>The average number of vehicles involved are 2</h2>

In [130]:
accident[accident['Number_of_Vehicles'] == 1].count()[0]

  accident[accident['Number_of_Vehicles'] == 1].count()[0]


200787

<h1>INSIGHT #10</h1>
<h2>About 30% of accidents, only one vehicles are involved, this 30% may mean human-errors or environment factors </h2>

<h1>BiVariate</h1>

<h3>CASUALTIES = VEHICLES INVOLVED</h3>

In [131]:
NumCasVec = accident['Number_of_Casualties'].corr(accident['Number_of_Vehicles'])

In [132]:
NumCasVec

0.2288888612692756

<h1>INSIGHT #11</h1>
<h2>The p-value is less than 0.5, therefore, the Number of Vehicles does not necesarrily cause more casualties</h2>

In [133]:
seveCasualties = accident.groupby(['Light_Conditions'])['Number_of_Vehicles'].sum()

  seveCasualties = accident.groupby(['Light_Conditions'])['Number_of_Vehicles'].sum()


In [134]:
seveCasualties

Light_Conditions
Darkness - lighting unknown     11402
Darkness - lights lit          229634
Darkness - lights unlit          4457
Darkness - no lighting          59891
Daylight                       904488
Name: Number_of_Vehicles, dtype: int64

<h1>INSIGHT #12</h1>
<h2>Contrary to the beliefs, Daylight accidents is more prominient compared to night accidents</h2>

In [135]:
roadSeverity = accident.groupby(['Accident_Severity', 'Road_Type']).size().unstack()
roadSeverity

  roadSeverity = accident.groupby(['Accident_Severity', 'Road_Type']).size().unstack()


Road_Type,Dual carriageway,One way street,Roundabout,Single carriageway,Slip road
Accident_Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fatal,1815,95,142,6560,49
Serious,11746,1655,3665,70540,611
Slight,85863,11809,40185,419563,6381


<h1>INSIGHT #13</h1>
<h2>Single carriageway has the most recorded accidents according to insight #2, but only 1.3% of it were fatal</h2>

In [136]:
accident.groupby(['Accident_Severity','Year']).size().unstack()

  accident.groupby(['Accident_Severity','Year']).size().unstack()


Year,2019,2020,2021,2022
Accident_Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fatal,1116,942,5980,623
Serious,9823,9346,61782,7266
Slight,60928,59875,394082,48916


<h1>INSIGHT #14</h1>
<h2>The year of 2019 has the highest fatality rate of 1.5% compared to other </h2>
<p>2020 has 1.3%, 2021 has 1.2% and lastly, 2022 has 1.1% fatality rate</p>

In [137]:
winterAccidents = accident[(accident['Month'] >= 12) | (accident['Month'] <= 2)]
springAccidents = accident[(accident['Month'] >= 3) | (accident['Month'] <= 5)]
summerAccidents = accident[(accident['Month'] >= 6) | (accident['Month'] <= 8)]
autumnAccidents = accident[(accident['Month'] >= 9) | (accident['Month'] <= 11)]


In [138]:
accident[(accident['Number_of_Vehicles'] >= 2) & (accident['Road_Type'] == 'Slip road')]

Unnamed: 0,Index,Accident_Severity,Accident_Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type,Year,dayOfWeek,Month
125,200701BS70120,Slight,2021-02-11,51.515641,Daylight,Hammersmith and Fulham,-0.221683,1,2,Dry,Slip road,Urban,Fine no high winds,Motorcycle 125cc and under,2021,3,2
545,200701BS70576,Slight,2021-02-11,51.504841,Daylight,Kensington and Chelsea,-0.215187,1,2,Dry,Slip road,Urban,Fine no high winds,Car,2021,3,2
795,200701CP00112,Slight,2021-02-11,51.511470,Daylight,City of London,-0.104678,1,3,Dry,Slip road,Urban,Fine no high winds,Taxi/Private hire car,2021,3,2
1265,200701CW10285,Slight,2021-02-11,51.519959,Darkness - lights lit,Westminster,-0.174956,1,2,Dry,Slip road,Urban,Fine no high winds,Taxi/Private hire car,2021,3,2
2265,200701CW64696,Serious,2021-02-11,51.519554,Daylight,Westminster,-0.183621,1,3,Dry,Slip road,Urban,Fine no high winds,Car,2021,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658710,2.01E+12,Slight,2022-12-06,51.624709,Daylight,Neath Port Talbot,-3.854608,2,2,Wet or damp,Slip road,Rural,Fine no high winds,Car,2022,1,12
658795,2.01E+12,Slight,2022-12-06,51.647478,Daylight,Neath Port Talbot,-3.847732,1,2,Frost or ice,Slip road,Rural,Fine no high winds,Car,2022,1,12
658899,2.01E+12,Slight,2022-12-05,51.624980,Darkness - lighting unknown,Swansea,-3.937983,1,2,Dry,Slip road,Urban,,Car,2022,0,12
658968,201063AC03610,Slight,2022-02-05,51.688746,Darkness - lights lit,Carmarthenshire,-4.094649,2,2,Dry,Slip road,Rural,Fine no high winds,Van / Goods 3.5 tonnes mgw or under,2022,5,2


<