# Accident Data
### Analyst: Ryann Kim Sesgundo

#### Import dependencies

In [1]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from scipy.stats import f_oneway

In [2]:
data = pd.read_csv("datasets/accident_data.csv")

In [3]:
data.dtypes

Index                       object
Accident_Severity           object
Accident Date               object
Latitude                   float64
Light_Conditions            object
District Area               object
Longitude                  float64
Number_of_Casualties         int64
Number_of_Vehicles           int64
Road_Surface_Conditions     object
Road_Type                   object
Urban_or_Rural_Area         object
Weather_Conditions          object
Vehicle_Type                object
dtype: object

In [4]:
data.isnull().sum()

Index                          0
Accident_Severity              0
Accident Date                  0
Latitude                      25
Light_Conditions               0
District Area                  0
Longitude                     26
Number_of_Casualties           0
Number_of_Vehicles             0
Road_Surface_Conditions      726
Road_Type                   4520
Urban_or_Rural_Area           15
Weather_Conditions         14128
Vehicle_Type                   0
dtype: int64

In [5]:
data['Latitude'] = data['Latitude'].astype('category')
data['Longitude'] = data['Longitude'].astype('category')

data['Latitude'] = data['Latitude'].fillna(data['Latitude'].mode()[0])
data['Longitude'] = data['Longitude'].fillna(data['Longitude'].mode()[0])
data['Road_Surface_Conditions'] = data['Road_Surface_Conditions'].fillna(data['Road_Surface_Conditions'].mode()[0])
data['Urban_or_Rural_Area'] = data['Urban_or_Rural_Area'].fillna(data['Urban_or_Rural_Area'].mode()[0])
data['Road_Type'] = data['Road_Type'].fillna('Unknown Road Type')
data['Weather_Conditions'] = data['Weather_Conditions'].fillna('Unknown Weather Conditions')

In [6]:
# If ever, convert first to string then convert back to Date Time
data['Accident Date'] = data['Accident Date'].astype("str")
data['Accident Date'] = data['Accident Date'].str.strip()
data['Accident Date'] = data['Accident Date'].str.replace('/', '-')

In [7]:
data['Accident Date'] = pd.to_datetime(data['Accident Date'], dayfirst=True, errors='coerce')

In [8]:
data['Accident_Severity'] = data['Accident_Severity'].astype('category')
data['Light_Conditions'] = data['Light_Conditions'].astype('category')
data['District Area'] = data['District Area'].astype('category')
data['Road_Surface_Conditions'] = data['Road_Surface_Conditions'].astype('category')
data['Road_Type'] = data['Road_Type'].astype('category')
data['Urban_or_Rural_Area'] = data['Urban_or_Rural_Area'].astype('category')
data['Weather_Conditions'] = data['Weather_Conditions'].astype('category')
data['Vehicle_Type'] = data['Vehicle_Type'].astype('category')

In [9]:
data.dtypes

Index                              object
Accident_Severity                category
Accident Date              datetime64[ns]
Latitude                         category
Light_Conditions                 category
District Area                    category
Longitude                        category
Number_of_Casualties                int64
Number_of_Vehicles                  int64
Road_Surface_Conditions          category
Road_Type                        category
Urban_or_Rural_Area              category
Weather_Conditions               category
Vehicle_Type                     category
dtype: object

In [10]:
data.isnull().sum()

Index                      0
Accident_Severity          0
Accident Date              0
Latitude                   0
Light_Conditions           0
District Area              0
Longitude                  0
Number_of_Casualties       0
Number_of_Vehicles         0
Road_Surface_Conditions    0
Road_Type                  0
Urban_or_Rural_Area        0
Weather_Conditions         0
Vehicle_Type               0
dtype: int64

### Adding more fields

In [11]:
data['Year'] = data['Accident Date'].dt.year
data['Month'] = data['Accident Date'].dt.month
data['DayOfWeek'] = data['Accident Date'].dt.dayofweek
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660679 entries, 0 to 660678
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Index                    660679 non-null  object        
 1   Accident_Severity        660679 non-null  category      
 2   Accident Date            660679 non-null  datetime64[ns]
 3   Latitude                 660679 non-null  category      
 4   Light_Conditions         660679 non-null  category      
 5   District Area            660679 non-null  category      
 6   Longitude                660679 non-null  category      
 7   Number_of_Casualties     660679 non-null  int64         
 8   Number_of_Vehicles       660679 non-null  int64         
 9   Road_Surface_Conditions  660679 non-null  category      
 10  Road_Type                660679 non-null  category      
 11  Urban_or_Rural_Area      660679 non-null  category      
 12  Weather_Conditio

### 1. Common Weather Conditions

In [12]:
data['Weather_Conditions'].mode()[0]

'Fine no high winds'

### 2. Common Vehicle

In [13]:
data['Vehicle_Type'].mode()[0]

'Car'

### 3. Area with highest accident rate
#### Rural or Urban

In [14]:
data['Urban_or_Rural_Area'].mode()[0]

'Urban'

### 4. Date with highest accident rate

In [15]:
accident_date = data['Accident Date'].mode()[0]
accident_date

Timestamp('2019-11-30 00:00:00')

### 5. Accident rate based on result from #4

In [16]:
### Using the earlier data, where it gives the data how many are the accidents recorded with this date
data[data['Accident Date'] == accident_date].count()['Index']

np.int64(704)

### 6. Common District Area Involve

In [17]:
district_data = data['District Area'].mode()[0]
district_data

'Birmingham'

### 7. Common weather where the accidents in Common District Area Involve

In [18]:
weather_x_district = data[data['District Area'] == district_data]['Weather_Conditions'].mode()[0]
weather_x_district

'Fine no high winds'

### 8. Most Common Vehicle Involve

In [19]:
vehicles_x_weather_x_district = data[(data['District Area'] == district_data) & (data['Weather_Conditions'] == weather_x_district)]
vehicles_x_weather_x_district['Vehicle_Type'].mode()[0]

'Car'

### 9. Average Accident rate in Urban and Rural Area

In [20]:
data[data['Urban_or_Rural_Area'] == 'Rural'].count()

Index                      238990
Accident_Severity          238990
Accident Date              238990
Latitude                   238990
Light_Conditions           238990
District Area              238990
Longitude                  238990
Number_of_Casualties       238990
Number_of_Vehicles         238990
Road_Surface_Conditions    238990
Road_Type                  238990
Urban_or_Rural_Area        238990
Weather_Conditions         238990
Vehicle_Type               238990
Year                       238990
Month                      238990
DayOfWeek                  238990
dtype: int64

In [21]:
data[data['Urban_or_Rural_Area'] == 'Urban'].count()

Index                      421678
Accident_Severity          421678
Accident Date              421678
Latitude                   421678
Light_Conditions           421678
District Area              421678
Longitude                  421678
Number_of_Casualties       421678
Number_of_Vehicles         421678
Road_Surface_Conditions    421678
Road_Type                  421678
Urban_or_Rural_Area        421678
Weather_Conditions         421678
Vehicle_Type               421678
Year                       421678
Month                      421678
DayOfWeek                  421678
dtype: int64

### 10. Common Day of Week Accident

In [22]:
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day = data['DayOfWeek'].mode()
days[day[0]]

'Saturday'

### 11. Year with Highest Accident Rate

In [23]:
data['Year'].mode()[0]

np.int32(2019)

### 12. Month with Highest accident rate in Year 2020

In [24]:
months = [
    "January", "Febuary", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]
month_2020 = data[data['Year'] == 2020]['Month'].mode()[0]
months[month_2020]

'November'

### 13. Month with highest accident rate in Year 2021

In [25]:
month_2021 = data[data['Year'] == 2021]['Month'].mode()[0]
months[month_2021]

'December'

### 14. Month with highest accident rate in year 2022

In [26]:
month_2022 = data[data['Year'] == 2022]['Month'].mode()[0]
months[month_2022]

'December'

### 15. Highest casualty in an accident

In [27]:
data['Number_of_Casualties'].max()

np.int64(68)

### 16. Road type with highest accident rate

In [28]:
data['Road_Type'].mode()[0]

'Single carriageway'

### 17. Is there's any relationship between the Road Type and the Casualty

In [None]:
f_stat, p_value = f_oneway(data['Road_Type'].mode()[0], data['Number_of_Casualties'].max())
f_stat

In [None]:
p_value