<h1>UK ACCIDENT REPORT DATA ANALYSIS</h1>

<h3>Analyst: Abdul Barry A. Adam</h3>

<h1>IMPORTING LIBRARIES NEEDED</h1>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

<h1>IMPORTING DATASETS, LOCALLY, TO DATAFRAME</h1>

In [2]:
accident = pd.read_csv("./datasets/accident_data.csv")

In [3]:
accident

Unnamed: 0,Index,Accident_Severity,Accident_Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


<h1>INFORMATION</h1>

In [4]:
accident.describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles
count,660654.0,660653.0,660679.0,660679.0
mean,52.553866,-1.43121,1.35704,1.831255
std,1.406922,1.38333,0.824847,0.715269
min,49.91443,-7.516225,1.0,1.0
25%,51.49069,-2.332291,1.0,1.0
50%,52.315641,-1.411667,1.0,2.0
75%,53.453452,-0.232869,1.0,2.0
max,60.757544,1.76201,68.0,32.0


In [5]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Index                    660679 non-null  object 
 1   Accident_Severity        660679 non-null  object 
 2   Accident_Date            660679 non-null  object 
 3   Latitude                 660654 non-null  float64
 4   Light_Conditions         660679 non-null  object 
 5   District Area            660679 non-null  object 
 6   Longitude                660653 non-null  float64
 7   Number_of_Casualties     660679 non-null  int64  
 8   Number_of_Vehicles       660679 non-null  int64  
 9   Road_Surface_Conditions  659953 non-null  object 
 10  Road_Type                656159 non-null  object 
 11  Urban_or_Rural_Area      660664 non-null  object 
 12  Weather_Conditions       646551 non-null  object 
 13  Vehicle_Type             660679 non-null  object 
dtypes: f

In [6]:
accident.dtypes

Index                       object
Accident_Severity           object
Accident_Date               object
Latitude                   float64
Light_Conditions            object
District Area               object
Longitude                  float64
Number_of_Casualties         int64
Number_of_Vehicles           int64
Road_Surface_Conditions     object
Road_Type                   object
Urban_or_Rural_Area         object
Weather_Conditions          object
Vehicle_Type                object
dtype: object

<h1>CONVERT COLUMNS TO ITS APPROPRIATE DATA TYPE</h1>

In [7]:
accident['Index'] = accident['Index'].astype('category')
accident['Accident_Severity'] = accident['Accident_Severity'].astype('category')
accident['Accident_Date'] = pd.to_datetime(accident['Accident_Date'], dayfirst = True, errors = 'coerce') 
accident['Light_Conditions'] = accident['Light_Conditions'].astype('category')
accident['District Area'] = accident['District Area'].astype('category')
accident['Road_Surface_Conditions'] = accident['Road_Surface_Conditions'].astype('category')
accident['Road_Type'] = accident['Road_Type'].astype('category')
accident['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].astype('category')
accident['Weather_Conditions'] = accident['Weather_Conditions'].astype('category')
accident['Vehicle_Type'] = accident['Vehicle_Type'].astype('category')

<h1>DETERMINING NULL VALUES AND FILLNG THEM</h1>

<h4>DETERMING NULLS</h4>

In [8]:
accident.isnull().sum()

Index                           0
Accident_Severity               0
Accident_Date              395672
Latitude                       25
Light_Conditions                0
District Area                   0
Longitude                      26
Number_of_Casualties            0
Number_of_Vehicles              0
Road_Surface_Conditions       726
Road_Type                    4520
Urban_or_Rural_Area            15
Weather_Conditions          14128
Vehicle_Type                    0
dtype: int64

In [9]:
FreqDate = accident['Accident_Date'].mode()[0]
AveLat = accident['Latitude'].mean()
AveLot = accident['Longitude'].mean()
FreqSurConditions = accident['Road_Surface_Conditions'].mode()[0]
FreqgRoadType = accident['Road_Type'].mode()[0]
FreqArea = accident['Urban_or_Rural_Area'].mode()[0]


<h4>FILLING NULLS</h4>

In [10]:
accident['Accident_Date'] = accident['Accident_Date'].fillna(FreqDate)
accident['Latitude'] = accident['Latitude'].fillna(AveLat)
accident['Longitude'] = accident['Longitude'].fillna(AveLot)
accident['Road_Surface_Conditions'] = accident['Road_Surface_Conditions'].fillna(FreqSurConditions)
accident['Road_Type'] = accident['Road_Type'].fillna(FreqgRoadType)
accident['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].fillna(FreqArea)


<h4>LAST CHECKUP</h4>

In [11]:
accident.isnull().sum()

Index                          0
Accident_Severity              0
Accident_Date                  0
Latitude                       0
Light_Conditions               0
District Area                  0
Longitude                      0
Number_of_Casualties           0
Number_of_Vehicles             0
Road_Surface_Conditions        0
Road_Type                      0
Urban_or_Rural_Area            0
Weather_Conditions         14128
Vehicle_Type                   0
dtype: int64

<h1>CASUALTIES PER VEHICLE TYPE</h1>

In [12]:
casualtiesVehicle = accident.groupby(['Vehicle_Type'])['Number_of_Casualties'].sum()

  casualtiesVehicle = accident.groupby(['Vehicle_Type'])['Number_of_Casualties'].sum()


In [13]:
casualtiesVehicle

Vehicle_Type
Agricultural vehicle                       2613
Bus or coach (17 or more pass seats)      34915
Car                                      676692
Data missing or out of range                  9
Goods 7.5 tonnes mgw and over             23397
Goods over 3.5t. and under 7.5t            8308
Minibus (8 - 16 passenger seats)           2659
Motorcycle 125cc and under                20348
Motorcycle 50cc and under                 10167
Motorcycle over 125cc and up to 500cc     10286
Motorcycle over 500cc                     34879
Other vehicle                              7554
Pedal cycle                                 270
Ridden horse                                  5
Taxi/Private hire car                     18195
Van / Goods 3.5 tonnes mgw or under       46271
Name: Number_of_Casualties, dtype: int64

<h1>INSIGHT #1</h1>
<h2>CAR IS THE MOST TYPE OF VEHICLE WITH CASUALTIES</h2>

<h1>WHAT AREA HAS THE MOST NUMBER OF CASUALTIES</h1>

In [14]:
urbanCasualties = accident[accident['Urban_or_Rural_Area'] == 'Urban']['Number_of_Casualties'].sum()
ruralCasualties = accident[accident['Urban_or_Rural_Area'] == 'Rural']['Number_of_Casualties'].sum()

In [15]:
urbanCasualties

543040

In [16]:
ruralCasualties

353515

<h1>INSIGHT #2</h1>
<h2>Urban area has the highest average number of casualties</h2>
<hr>

<h1>WHAT ROAD TYPE HAS THE MOST NUMBER OF CASUALTIES</h1>

In [17]:
a = accident['Road_Type'].value_counts()

In [18]:
a

Road_Type
Single carriageway    496663
Dual carriageway       99424
Roundabout             43992
One way street         13559
Slip road               7041
Name: count, dtype: int64

In [19]:
single = accident[accident['Road_Type'] == 'Single carriageway']['Number_of_Casualties'].mean()
dual = accident[accident['Road_Type'] == 'Dual carriageway']['Number_of_Casualties'].mean()
roundabout = accident[accident['Road_Type'] == 'Roundabout']['Number_of_Casualties'].mean()
oneWay = accident[accident['Road_Type'] == 'One way street']['Number_of_Casualties'].mean()
slippy = accident[accident['Road_Type'] == 'Slip road']['Number_of_Casualties'].mean()

<h3>AVERAGE NUMBER OF CASUALTIES PER ROAD TYPE</h3>

In [20]:
single

1.3437884440757617

In [21]:
dual

1.4772791277759898

In [22]:
roundabout

1.2748908892525914

In [23]:
oneWay

1.1927133269415149

In [24]:
slippy

1.4236614117312882

<h1>INSIGHT #3</h1>
<H2>DUAL OR TWO WAY ROAD TYPE HAS THE MOST NUMBER OF CASUALTIES</H2>
<hr>

<h1>DOES THE NUMBER OF VEHICLES IMPACT THE NUMBER OF CASUALTIES</h1>

In [25]:
vehicleCasualties = accident['Number_of_Vehicles'].corr(accident['Number_of_Casualties'])

In [26]:
vehicleCasualties

0.22888886126927557

<h1>INSIGHT #4</h1>
<h2>THE NUMBER OF VEHICLES INVOLVED DOES NOT IMPACT OR AFFECT THE NUMBER OF CASUALTIES</h2>
<hr>

<h1>IN AREAS, WHAT TYPE OF VEHICLE HAS THE MOST NUMBER OF CASUALTIES TIED ON IT</h1>

In [27]:
areaTypeCasualties = np.round(accident.groupby(['Urban_or_Rural_Area', 'Vehicle_Type'])['Number_of_Casualties'].mean(), 1)

  areaTypeCasualties = np.round(accident.groupby(['Urban_or_Rural_Area', 'Vehicle_Type'])['Number_of_Casualties'].mean(), 1)


In [28]:
areaTypeCasualties.unstack()

Vehicle_Type,Agricultural vehicle,Bus or coach (17 or more pass seats),Car,Data missing or out of range,Goods 7.5 tonnes mgw and over,Goods over 3.5t. and under 7.5t,Minibus (8 - 16 passenger seats),Motorcycle 125cc and under,Motorcycle 50cc and under,Motorcycle over 125cc and up to 500cc,Motorcycle over 500cc,Other vehicle,Pedal cycle,Ridden horse,Taxi/Private hire car,Van / Goods 3.5 tonnes mgw or under
Urban_or_Rural_Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Rural,1.4,1.5,1.5,,1.5,1.5,1.4,1.5,1.5,1.5,1.5,1.4,1.5,2.0,1.5,1.5
Unallocated,,1.0,1.2,,,,,,,,,,,,,1.0
Urban,1.3,1.3,1.3,1.5,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.0,1.3,1.3


<h1>INSIGHT# 5</h1>
<h2>In Rural Area, Ridden Horse has the most number of casualties with the average of 2.0</h2>

<h1>Under which lighting conditions were the most severe incidents recorded on all Roads?</h1>

In [29]:
roadVehicleSever = accident.groupby(['Road_Type','Accident_Severity', 'Light_Conditions'])['Number_of_Casualties'].sum()

  roadVehicleSever = accident.groupby(['Road_Type','Accident_Severity', 'Light_Conditions'])['Number_of_Casualties'].sum()


In [30]:
roadVehicleSever.unstack()

Unnamed: 0_level_0,Light_Conditions,Darkness - lighting unknown,Darkness - lights lit,Darkness - lights unlit,Darkness - no lighting,Daylight
Road_Type,Accident_Severity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dual carriageway,Fatal,11,878,37,819,1853
Dual carriageway,Serious,149,4527,92,2248,12179
Dual carriageway,Slight,1101,24842,540,8712,88889
One way street,Fatal,1,30,0,2,88
One way street,Serious,15,583,7,17,1406
One way street,Slight,132,3459,61,133,10238
Roundabout,Fatal,0,74,0,2,118
Roundabout,Serious,44,1359,37,41,3193
Roundabout,Slight,546,12553,205,250,37663
Single carriageway,Fatal,101,2355,39,2529,7399


<h1>INSIGHT #6</h1>
<h2>Slip Road is the most fatal in Darkness - lights lit compared to all road in this lighting ocnditions</h2>

In [31]:
accident

Unnamed: 0,Index,Accident_Severity,Accident_Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,2019-06-05,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2019-07-02,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,2021-02-11,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,Single carriageway,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,2021-02-11,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,2019-09-03,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,Single carriageway,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,2021-02-11,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,2021-02-11,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,2021-02-11,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,2021-02-11,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


In [32]:
dryCas = accident[accident['Road_Surface_Conditions'] == 'Dry']['Number_of_Casualties']
wetCas = accident[accident['Road_Surface_Conditions'] == 'Wet or damp']['Number_of_Casualties']


f_stats, p_value = f_oneway(dryCas, wetCas)

print(p_value)

5.097050919646033e-249


<h1>INSIGHT #7</h1>
<h2>THERE IS A CORRELATION BETWEEN ROAD SURFACE CONDITIONS AND NUMBER OF CASUALTIES</h2>

In [34]:
def perform_oneway_anova(data, group_col, value_col):
    """
    Performs one-way ANOVA on a pandas DataFrame.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        group_col (str): The name of the column containing the group labels.
        value_col (str): The name of the column containing the values to compare.

    Returns:
        tuple: A tuple containing the F-statistic and p-value, or None if an error occurs.
    """
    try:
        groups = [data[value_col][data[group_col] == group] for group in data[group_col].unique()]
        f_statistic, p_value = stats.f_oneway(*groups)
        return f_statistic, p_value
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example Usage:
data = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'C', 'C', 'A', 'B', 'C'],
    'value': [10, 12, 15, 18, 20, 22, 11, 16, 21]
})

result = perform_oneway_anova(data, 'group', 'value')

if result:
    f_statistic, p_value = result
    print(f"F-statistic: {f_statistic}")
    print(f"P-value: {p_value}")

    if p_value < 0.05:
        print("There are statistically significant differences between the groups.")
    else:
        print("There are no statistically significant differences between the groups.")

An error occurred: name 'stats' is not defined


In [35]:
haha = perform_oneway_anova(accident, accident['Road_Surface_Conditions'], accident['Number_of_Casualties'])

An error occurred: "None of [CategoricalIndex(['Dry', 'Wet or damp', 'Dry', 'Dry', 'Dry', 'Dry', 'Dry',\n                  'Dry', 'Dry', 'Dry',\n                  ...\n                  'Wet or damp', 'Dry', 'Frost or ice', 'Snow', 'Wet or damp',\n                  'Dry', 'Frost or ice', 'Frost or ice', 'Wet or damp',\n                  'Wet or damp'],\n                 categories=['Dry', 'Flood over 3cm. deep', 'Frost or ice', 'Snow', 'Wet or damp'], ordered=False, dtype='category', length=660679)] are in the [columns]"
