# Accident Data
### Analyst: Ryann Kim Sesgundo

#### Import dependencies

In [None]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from scipy.stats import f_oneway
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the accident records from the CSV file into a DataFrame.
data = pd.read_csv("datasets/accident_data.csv")

In [None]:
# Show the dtype of every column as loaded.
data.dtypes

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Coordinates are converted to categoricals before any imputation happens.
for column in ('Latitude', 'Longitude'):
    data[column] = data[column].astype('category')

# These columns have their gaps filled with the column's most frequent value.
for column in ('Latitude', 'Longitude', 'Road_Surface_Conditions', 'Urban_or_Rural_Area'):
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)

# These columns get an explicit placeholder label instead.
placeholders = {
    'Road_Type': 'Unknown Road Type',
    'Weather_Conditions': 'Unknown Weather Conditions',
}
for column, label in placeholders.items():
    data[column] = data[column].fillna(label)

In [None]:
# Normalise the raw date text before it is parsed: cast to string, trim
# surrounding whitespace, and use '-' as the only separator.
data['Accident Date'] = (
    data['Accident Date']
    .astype("str")
    .str.strip()
    .str.replace('/', '-')
)

In [None]:
# Parse the cleaned text as day-first dates; because errors='coerce' is set,
# values that cannot be parsed become NaT instead of raising.
data['Accident Date'] = pd.to_datetime(data['Accident Date'], dayfirst=True, errors='coerce')

In [None]:
# Columns that are stored as pandas categoricals.
categorical_columns = [
    'Accident_Severity',
    'Light_Conditions',
    'District Area',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Vehicle_Type',
]
for column_name in categorical_columns:
    data[column_name] = data[column_name].astype('category')


In [None]:
# Re-check the dtypes after the conversions above.
data.dtypes

In [None]:
# Re-count the missing values per column after cleaning.
data.isnull().sum()

### Adding more fields

In [None]:
# Derive calendar fields from the accident date and store each one as a
# categorical column (added in the order Year, Month, DayOfWeek).
date_parts = data['Accident Date'].dt
derived_fields = {
    'Year': date_parts.year,
    'Month': date_parts.month,
    'DayOfWeek': date_parts.dayofweek,
}
for field_name, field_values in derived_fields.items():
    data[field_name] = field_values.astype('category')

data.info()

# Analysis Starts Here

In [None]:
def percent(value, total = 0):
    """Format ``value`` as a percentage of ``total``, rounded to two decimals.

    Parameters
    ----------
    value : number
        The partial count.
    total : number, optional
        The whole that ``value`` is measured against. When left at ``0``,
        the number of rows in the module-level ``data`` frame is used.

    Returns
    -------
    str
        The rounded percentage followed by ``%``, e.g. ``"25.0%"``.
    """
    if total == 0:
        # len() counts every row. DataFrame.value_counts() would group on all
        # columns and leave out rows that contain a missing value.
        total = len(data)
    return f"{np.round(((value / total) * 100), 2)}%"

# Month number -> English month name (1 = January ... 12 = December).
months_dict = {
    1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June",
    7: "July", 8: "August", 9: "September", 10: "October", 11: "November", 12: "December"
}

# The same names as a list indexed by month number; index 0 is an empty
# placeholder so that months[1] == "January". Built from the dict so the two
# lookups cannot drift apart.
months = ["", *months_dict.values()]


# Weekday number -> name, using the 0 = Monday ... 6 = Sunday numbering.
days_dict = {
    0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"
}

# The same weekday names as a list indexed by weekday number.
days = list(days_dict.values())

### 1. Common Weather Conditions

In [None]:
# Tally accidents per weather condition, chart the tally, then display it.
weather_column = data['Weather_Conditions']
weather_condition = weather_column.value_counts()

weather_condition.plot.bar()
plt.show()

weather_condition

In [None]:
# Number of accidents recorded under the most frequent weather condition.
# Summing the boolean mask counts the matching rows directly;
# DataFrame.value_counts() would group on every column and leave out rows
# that contain a missing value.
top_weather = data['Weather_Conditions'].mode()[0]
weather = (data['Weather_Conditions'] == top_weather).sum()
weather

In [None]:
# Express that count as a percentage; `total` is left at its default.
percent(value=weather)

### The weather condition with the highest accident count is `Fine no high winds`, with `660679` or `78.84%` of the entire record.

### 2. Common Vehicle

In [None]:
# Tally accidents per vehicle type, chart the tally, then display it.
vehicle_column = data['Vehicle_Type']
vehicle_type = vehicle_column.value_counts()

vehicle_type.plot.bar()
plt.show()

vehicle_type

In [None]:
# Number of accidents involving the most frequent vehicle type.
# Summing the boolean mask counts the matching rows directly;
# DataFrame.value_counts() would group on every column and leave out rows
# that contain a missing value.
top_vehicle = data['Vehicle_Type'].mode()[0]
vehicle_count = (data['Vehicle_Type'] == top_vehicle).sum()
vehicle_count

In [None]:
# Express the vehicle count as a percentage; `total` is left at its default.
percent(vehicle_count)

### The vehicle type most often involved in recorded accidents is `Car`, with a count of `497992` or `75.38%` of the entire record.


### 3. Is there a relationship between Weather Conditions and Vehicle Type in the number of accidents?


In [None]:

# Accident counts per (vehicle type, weather condition) pair, pivoted so that
# vehicle types are rows and weather conditions are columns.
pair_counts = data.groupby(['Vehicle_Type', 'Weather_Conditions']).size()
vehicle_ = pair_counts.unstack()

# Stacked bars: one bar per vehicle type, one segment per weather condition.
vehicle_.plot.bar(stacked=True)
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), fontsize=10)
plt.show()
vehicle_

### 4. Area with highest accident rate
#### Rural or Urban

In [None]:
# Tally accidents per area type, draw the split as a pie chart, then
# display the tally.
area_column = data['Urban_or_Rural_Area']
u_r = area_column.value_counts()

u_r.plot.pie()
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), fontsize=10)
plt.show()

u_r

In [None]:
# Number of accidents in the most frequent area type.
# Summing the boolean mask counts the matching rows directly;
# DataFrame.value_counts() would group on every column and leave out rows
# that contain a missing value.
top_area = data['Urban_or_Rural_Area'].mode()[0]
u_r_c = (data['Urban_or_Rural_Area'] == top_area).sum()
u_r_c

In [None]:
# Express the area count as a percentage; `total` is left at its default.
percent(u_r_c)

### Most accidents happened in `Urban Areas`, with `421678` recorded or `63.82%` of the entire record.

### 5. Date with highest accident rate

In [None]:
# Most frequently occurring accident date (the first one if several tie).
accident_date = data['Accident Date'].mode()[0]
accident_date

### 6. Accident rate based on result from #5

In [None]:
# Number of accidents recorded on `accident_date`, the most frequent date.
# Summing the boolean mask counts the matching rows directly;
# DataFrame.value_counts() would group on every column and leave out rows
# that contain a missing value.
(data['Accident Date'] == accident_date).sum()

### 7. Most Common District Areas Involved (Top 10)

In [None]:
# value_counts() already orders districts from most to fewest accidents, so
# the first ten entries are the top ten; no separate sort is needed.
district_data = data['District Area'].value_counts().head(10)

district_data.plot(kind="bar")
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), fontsize=10)
plt.show()

district_data

### 8. Most common weather for accidents in the most common District Area

In [None]:
# Within the district that has the most accidents, find the weather
# condition that occurs most often.
busiest_district = data['District Area'].mode()[0]
in_busiest_district = data['District Area'] == busiest_district
weather_x_district = data.loc[in_busiest_district, 'Weather_Conditions'].mode()[0]
weather_x_district

In [None]:

# Weather-condition tally for the district with the most accidents. The
# district is computed rather than hard-coded so the cell stays correct if
# the dataset changes.
top_district = data['District Area'].mode()[0]
wxd = data.loc[data['District Area'] == top_district, 'Weather_Conditions'].value_counts()

wxd.plot(kind="bar")
plt.show()
wxd

### 9. Most Common Vehicle Involved

In [None]:
# Restrict to accidents in the busiest district that happened under that
# district's most common weather condition.
busiest_district = data['District Area'].mode()[0]
in_busiest_district = data['District Area'] == busiest_district
in_common_weather = data['Weather_Conditions'] == weather_x_district
vehicles_x_weather_x_district = data[in_busiest_district & in_common_weather]

# NOTE(review): this value is computed but not displayed, because it is not
# the last expression of the cell.
vehicles_x_weather_x_district['Vehicle_Type'].mode()[0]

# Chart how often each vehicle type appears in that subset.
vehicle_tally = vehicles_x_weather_x_district['Vehicle_Type'].value_counts()
vehicle_tally.plot.bar()
plt.show()

### 10. Average Accident rate in Urban and Rural Area

In [None]:
# Number of accidents labelled 'Rural'. Summing the boolean mask counts the
# matching rows directly; DataFrame.value_counts() would group on every
# column and leave out rows that contain a missing value.
rural_data = (data['Urban_or_Rural_Area'] == 'Rural').sum()
rural_data

In [None]:
# Number of accidents labelled 'Urban'. Summing the boolean mask counts the
# matching rows directly; DataFrame.value_counts() would group on every
# column and leave out rows that contain a missing value.
urban_data = (data['Urban_or_Rural_Area'] == 'Urban').sum()
urban_data

In [None]:
# Share of accidents per area type, drawn as a pie chart.
area_column = data['Urban_or_Rural_Area']
rulban = area_column.value_counts()
rulban.plot.pie()
plt.show()

In [None]:
# Rural count as a percentage; `total` is left at its default.
percent(rural_data)

In [None]:
# Urban count as a percentage; `total` is left at its default.
percent(urban_data)

### There are more accidents in `Urban areas`, which have a count of `421678` or `63.82%` of the entire record, than in `Rural areas`, which have `238990` or `36.17%` of the entire record.

### 11. Most Common Day of the Week for Accidents

In [None]:
# Most frequent weekday number, translated to its name. The dict lookup is
# used so the result does not depend on what `days` is bound to.
day = data['DayOfWeek'].mode()
most_common_day = days_dict[day[0]]

# Work on a copy: assigning through a plain alias of `data` would overwrite
# the numeric DayOfWeek column with names, and re-running the cell would then
# map those names to NaN.
datannn = data.copy()
datannn['DayOfWeek'] = datannn['DayOfWeek'].map(days_dict)
datannn['DayOfWeek'].value_counts().plot(kind="bar")
plt.show()

In [None]:
# NOTE(review): this rebinds `days`, which previously held the list of
# weekday names, to the DayOfWeek column — confirm the list is no longer needed.
days = data['DayOfWeek']


### 12. Most common weather for accidents in each month

In [None]:
# Accident counts per weather condition (rows) and month (columns).
data.groupby(['Weather_Conditions', 'Month']).size().unstack()

In [None]:
# Take an independent copy: a plain assignment would only alias `data`, so
# any column overwritten through `data__` would also change `data`.
data__ = data.copy()

In [None]:
# Label months by name for this chart without overwriting the 'Month'
# column: `data__` may be the same object as `data`, and other cells look
# months up by number.
month_names = data__['Month'].map(months_dict)

__ = data__.groupby(['Weather_Conditions', month_names]).size().unstack()
__.plot(kind="bar", stacked=True)
plt.show()

### 13. Year with Highest Accident Rate

In [None]:
# Accidents per year, drawn as a bar chart.
year_column = data['Year']
h_year = year_column.value_counts()
h_year.plot.bar()
plt.show()

### 14. Month with Highest accident rate in Year 2020

In [None]:
# Most frequent accident month in 2020. 'Month' may hold month numbers or
# month names at this point, so translate numbers and pass names through.
month_2020 = data[data['Year'] == 2020]['Month'].mode()[0]
month_2020 = months_dict.get(month_2020, month_2020)
month_2020

In [None]:
# Accident counts per month, pivoted so that each year is its own column
# (and therefore its own line on the chart).
monthly_counts = data.groupby(['Month', 'Year']).size()
_2020 = monthly_counts.unstack()

_2020.plot.line(marker="o")
plt.legend(loc='upper right', fontsize=10)
plt.xlabel("Months")
plt.title("Monthly Data Representation")

# Enlarge the current figure before it is shown.
current_figure = plt.gcf()
current_figure.set_size_inches(13, 9)
plt.show()

### 15. Month with highest accident rate in Year 2021

In [None]:
# Most frequent accident month in 2021, shown by name. A dict lookup with a
# fallback is used because indexing the `months` list raises TypeError when
# 'Month' already holds month names instead of numbers.
month_2021 = data[data['Year'] == 2021]['Month'].mode()[0]
months_dict.get(month_2021, month_2021)

In [None]:
# Accidents per month in 2021.
month_2021 = data[data['Year'] == 2021]['Month'].value_counts()

# Translate month numbers to names and leave existing names untouched;
# mapping with the bare dict would turn names into NaN labels.
month_2021.index = month_2021.index.map(lambda month: months_dict.get(month, month))
month_2021.plot(kind='bar')
plt.xlabel("Months")
plt.show()


### 16. Month with highest accident rate in year 2022

In [None]:
# Most frequent accident month in 2022, shown by name. A dict lookup with a
# fallback is used because indexing the `months` list raises TypeError when
# 'Month' already holds month names instead of numbers.
month_2022 = data[data['Year'] == 2022]['Month'].mode()[0]
months_dict.get(month_2022, month_2022)

In [None]:
# Accidents per month in 2022.
month_2022 = data[data['Year'] == 2022]['Month'].value_counts()

# Translate month numbers to names and leave existing names untouched;
# mapping with the bare dict would turn names into NaN labels.
month_2022.index = month_2022.index.map(lambda month: months_dict.get(month, month))
month_2022.plot(kind='bar')
plt.xlabel("Months")
plt.show()


### 17. Highest casualty in an accident

In [None]:
# Largest value in the Number_of_Casualties column.
data['Number_of_Casualties'].max()

### 18. Road type with highest accident rate

In [None]:
# Most frequent road type (the first one if several tie).
rt = data['Road_Type'].mode()[0]
rt

In [None]:
# Accidents per road type, drawn as a bar chart.
road_type_column = data['Road_Type']
rtype = road_type_column.value_counts()

rtype.plot.bar()
plt.show()

In [None]:
# Number of accidents on the road type held in `rt`. Summing the boolean
# mask counts the matching rows directly; DataFrame.value_counts() would
# group on every column and leave out rows that contain a missing value.
rtc = (data['Road_Type'] == rt).sum()
rtc

In [None]:
# Road-type count as a percentage; `total` is left at its default.
percent(rtc)

### The most common `Road Type` is `Single Carriageway` which has `492143` or `74.49%` of the entire data.

### 19. Is there any relationship between the Road Type and the number of Casualties?

In [None]:
# Accident counts per number of casualties (rows) and road type (columns).
data.groupby(['Number_of_Casualties', 'Road_Type']).size().unstack()

In [None]:
# One group of bars per casualty count and one bar per road type.
# size() gives a single row count per (casualties, road type) pair, whereas
# count() returns a non-null count for every remaining column and would draw
# a redundant bar for each of them.
casualites = data.groupby(['Number_of_Casualties', 'Road_Type']).size().unstack()
casualites.plot(kind="bar")
plt.show()

### 20. Months with highest accidents in year 2019

In [None]:
# Accidents per month in 2019.
month_2019 = data[data['Year'] == 2019]['Month'].value_counts()

# Translate month numbers to names and leave existing names untouched;
# mapping with the bare dict would turn names into NaN labels.
month_2019.index = month_2019.index.map(lambda month: months_dict.get(month, month))
month_2019.plot(kind='bar')
plt.xlabel("Months")
plt.show()
