In [92]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set_theme()  # apply the default Seaborn theme; sb.set() is only an alias of this call

In [93]:
# Load the raw road-accident records from the CSV next to this notebook.
# low_memory=False stops pandas from parsing the file in chunks, which avoids
# mixed-type inference within a column.
data_path = './Road Accident Data.csv'
csv_data = pd.read_csv(data_path, low_memory=False)
csv_data.head()

Unnamed: 0,Accident_Index,Accident Date,Day_of_Week,Junction_Control,Junction_Detail,Accident_Severity,Latitude,Light_Conditions,Local_Authority_(District),Carriageway_Hazards,...,Number_of_Casualties,Number_of_Vehicles,Police_Force,Road_Surface_Conditions,Road_Type,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200901BS70001,1/1/2021,Thursday,Give way or uncontrolled,T or staggered junction,Serious,51.512273,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,One way street,30,15:11,Urban,Fine no high winds,Car
1,200901BS70002,1/5/2021,Monday,Give way or uncontrolled,Crossroads,Serious,51.514399,Daylight,Kensington and Chelsea,,...,11,2,Metropolitan Police,Wet or damp,Single carriageway,30,10:59,Urban,Fine no high winds,Taxi/Private hire car
2,200901BS70003,1/4/2021,Sunday,Give way or uncontrolled,T or staggered junction,Slight,51.486668,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,14:19,Urban,Fine no high winds,Taxi/Private hire car
3,200901BS70004,1/5/2021,Monday,Auto traffic signal,T or staggered junction,Serious,51.507804,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,8:10,Urban,Other,Motorcycle over 500cc
4,200901BS70005,1/6/2021,Tuesday,Auto traffic signal,Crossroads,Serious,51.482076,Darkness - lights lit,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,17:25,Urban,Fine no high winds,Car


In [94]:
# The raw file spells one severity label as 'Fetal'; map it to 'Fatal' so the
# column holds a single consistent label for that class.
print()
severity_fixed = csv_data['Accident_Severity'].replace({'Fetal': 'Fatal'})
csv_data['Accident_Severity'] = severity_fixed




In [95]:
# Keep only the columns examined in the rest of the notebook.
selected_columns = [
    'Day_of_Week',
    'Light_Conditions',
    'Accident_Severity',
    'Road_Surface_Conditions',
    'Speed_limit',
    'Weather_Conditions',
    'Vehicle_Type',
]

# Selecting with a list already yields a DataFrame, so no pd.DataFrame(...)
# wrapper is needed. The explicit .copy() makes working_data independent of
# csv_data, so later edits to one can never leak into the other.
working_data = csv_data[selected_columns].copy()

working_data.head()




Unnamed: 0,Day_of_Week,Light_Conditions,Accident_Severity,Road_Surface_Conditions,Speed_limit,Weather_Conditions,Vehicle_Type
0,Thursday,Daylight,Serious,Dry,30,Fine no high winds,Car
1,Monday,Daylight,Serious,Wet or damp,30,Fine no high winds,Taxi/Private hire car
2,Sunday,Daylight,Slight,Dry,30,Fine no high winds,Taxi/Private hire car
3,Monday,Daylight,Serious,Frost or ice,30,Other,Motorcycle over 500cc
4,Tuesday,Darkness - lights lit,Serious,Dry,30,Fine no high winds,Car


In [96]:
# Inspect the class balance of 'Accident_Severity'. In the recorded output
# 'Slight' dominates: 263,280 of 307,973 rows, i.e. about 85% of the data.
print()
col = 'Accident_Severity'
severity_labels = working_data[col]
unique_values = severity_labels.unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {severity_labels.value_counts()}")


Unique values in 'Accident_Severity': ['Serious' 'Slight' 'Fatal']
No. of val: Accident_Severity
Slight     263280
Serious     40740
Fatal        3953
Name: count, dtype: int64


In [97]:
# Separate the 'Slight' rows from all other severity classes.
print()
col = 'Accident_Severity'
is_slight = working_data[col] == "Slight"
non_slight_data = working_data[~is_slight]
slight_data = working_data[is_slight]

# Report the distinct labels and row counts of each split, with one blank
# line between the two reports.
for position, split in enumerate((non_slight_data, slight_data)):
    if position:
        print()
    unique_values = split[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {split[col].value_counts()}")


Unique values in 'Accident_Severity': ['Serious' 'Fatal']
No. of val: Accident_Severity
Serious    40740
Fatal       3953
Name: count, dtype: int64

Unique values in 'Accident_Severity': ['Slight']
No. of val: Accident_Severity
Slight    263280
Name: count, dtype: int64


In [98]:
# Downsample the 'Slight' class to 40,000 randomly chosen rows.
col = 'Accident_Severity'

# Re-derive the 'Slight' rows from working_data instead of sampling
# slight_data in place: the cell then gives the same result no matter which
# other cells have already overwritten slight_data (sampling 40,000 rows from
# an already reduced frame would raise a ValueError).
# random_state pins the draw so the subset is reproducible after a kernel restart.
slight_data = working_data[working_data[col] == "Slight"].sample(n=40000, random_state=42)

unique_values = slight_data[col].unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {slight_data[col].value_counts()}")


Unique values in 'Accident_Severity': ['Slight']
No. of val: Accident_Severity
Slight    40000
Name: count, dtype: int64


In [99]:
# Stack the non-'Slight' rows and the downsampled 'Slight' rows, then shuffle.
# ignore_index=True renumbers the stacked rows 0..n-1; the shuffle keeps those
# labels, so the index shown below is each row's position before shuffling.
# random_state makes the shuffled order reproducible across runs.
combined_df = (
    pd.concat([non_slight_data, slight_data], ignore_index=True)
    .sample(frac=1, random_state=42)
)

combined_df.head()




Unnamed: 0,Day_of_Week,Light_Conditions,Accident_Severity,Road_Surface_Conditions,Speed_limit,Weather_Conditions,Vehicle_Type
71244,Friday,Daylight,Slight,Wet or damp,30,Fine no high winds,Car
56317,Wednesday,Darkness - lights lit,Slight,Wet or damp,50,Fine no high winds,Van / Goods 3.5 tonnes mgw or under
81865,Sunday,Daylight,Slight,Dry,30,Fine no high winds,Car
51343,Friday,Daylight,Slight,Dry,60,Fine no high winds,Car
80591,Wednesday,Daylight,Slight,Dry,40,Fine no high winds,Car


In [100]:
# Check the class balance of the recombined frame.
print()
col = 'Accident_Severity'
combined_labels = combined_df[col]
unique_values = combined_labels.unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {combined_labels.value_counts()}")


Unique values in 'Accident_Severity': ['Slight' 'Serious' 'Fatal']
No. of val: Accident_Severity
Serious    40740
Slight     40000
Fatal       3953
Name: count, dtype: int64


## Limit every Accident_Severity class to at most 4000 records

In [102]:
# Split working_data into one frame per severity class: Serious, Slight, Fatal.
print()
col = 'Accident_Severity'
severity_labels = working_data[col]
Serious_data = working_data[severity_labels == "Serious"]
slight_data = working_data[severity_labels == "Slight"]
fatal_data = working_data[severity_labels == "Fatal"]

# Report the distinct labels and row count of every split, with one blank
# line between consecutive reports.
for position, split in enumerate((Serious_data, slight_data, fatal_data)):
    if position:
        print()
    unique_values = split[col].unique()
    print(f"Unique values in '{col}': {unique_values}")
    print(f"No. of val: {split[col].value_counts()}")


Unique values in 'Accident_Severity': ['Serious']
No. of val: Accident_Severity
Serious    40740
Name: count, dtype: int64

Unique values in 'Accident_Severity': ['Slight']
No. of val: Accident_Severity
Slight    263280
Name: count, dtype: int64

Unique values in 'Accident_Severity': ['Fatal']
No. of val: Accident_Severity
Fatal    3953
Name: count, dtype: int64


In [104]:
# Limit the 'Slight' and 'Serious' classes to 4,000 randomly chosen rows each.
# The 'Fatal' class is left untouched; the recorded counts show it has only
# 3,953 rows, which is already below the limit.
col = 'Accident_Severity'

# Re-derive each class from working_data rather than sampling the existing
# frames in place, so re-running the cell always yields the same rows.
# random_state pins both draws for reproducibility across kernel restarts.
slight_data = working_data[working_data[col] == "Slight"].sample(n=4000, random_state=42)
Serious_data = working_data[working_data[col] == "Serious"].sample(n=4000, random_state=42)

unique_values = slight_data[col].unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {slight_data[col].value_counts()}")
print()

unique_values = Serious_data[col].unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {Serious_data[col].value_counts()}")


Unique values in 'Accident_Severity': ['Slight']
No. of val: Accident_Severity
Slight    4000
Name: count, dtype: int64

Unique values in 'Accident_Severity': ['Serious']
No. of val: Accident_Severity
Serious    4000
Name: count, dtype: int64


In [107]:
# Stack the three per-class frames and shuffle the rows.
# ignore_index=True renumbers the stacked rows 0..n-1; the shuffle keeps those
# labels, so the index shown below is each row's position before shuffling.
# random_state makes the shuffled order reproducible across runs.
combined_df = (
    pd.concat([Serious_data, slight_data, fatal_data], ignore_index=True)
    .sample(frac=1, random_state=42)
)

combined_df.head()




Unnamed: 0,Day_of_Week,Light_Conditions,Accident_Severity,Road_Surface_Conditions,Speed_limit,Weather_Conditions,Vehicle_Type
10683,Friday,Daylight,Fatal,Dry,30,Fine no high winds,Car
4998,Sunday,Daylight,Slight,Dry,60,Fine no high winds,Car
7557,Wednesday,Daylight,Slight,Wet or damp,30,Raining no high winds,Goods 7.5 tonnes mgw and over
2766,Tuesday,Daylight,Serious,Wet or damp,30,Raining no high winds,Goods over 3.5t. and under 7.5t
4679,Sunday,Daylight,Slight,Wet or damp,30,Fine no high winds,Other vehicle


In [108]:
# Check the class balance of the three-class combined frame.
print()
col = 'Accident_Severity'
balanced_labels = combined_df[col]
unique_values = balanced_labels.unique()
print(f"Unique values in '{col}': {unique_values}")
print(f"No. of val: {balanced_labels.value_counts()}")


Unique values in 'Accident_Severity': ['Fatal' 'Slight' 'Serious']
No. of val: Accident_Severity
Slight     4000
Serious    4000
Fatal      3953
Name: count, dtype: int64


In [101]:
# # # # # 40000 ROWS
# # import data
# csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # # print(csv_data)

# # clean up typo in data
# # # print()
# csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# # extracting the columns that we want to look at
# # # print()
# working_data = pd.DataFrame(csv_data[[
#     'Day_of_Week',
#     'Light_Conditions', 
#     'Accident_Severity',  
#     'Road_Surface_Conditions', 
#     'Speed_limit',
#     'Weather_Conditions',
#     'Vehicle_Type'
# ]])

# # # print(working_data)

# # Show that 'Accident_Severity' has an uneven distribution where 'Slight' is about 85% of all data
# # # print()
# col = 'Accident_Severity'
# unique_values = working_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {working_data[col].value_counts()}")

# # separate data with 'Accident_Severity' Slight from non 'Slight'
# # # print()
# non_slight_data = working_data[working_data['Accident_Severity'] != "Slight"]
# slight_data = working_data[working_data['Accident_Severity'] == "Slight"]

# col = 'Accident_Severity'
# unique_values = non_slight_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {non_slight_data[col].value_counts()}")
# # # print()


# unique_values = slight_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {slight_data[col].value_counts()}")

# # taking out random 40000 records which data 'Accident_Severity' is 'Slight'
# # # print()
# slight_data = slight_data.sample(40000)

# col = 'Accident_Severity'
# unique_values = slight_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {slight_data[col].value_counts()}")

# # combine the dataframe
# # # print()
# combined_df = pd.concat([non_slight_data, slight_data], ignore_index=True)
# combined_df = combined_df.sample(frac=1)

# # # print(combined_df)

# # show the end result
# # # print()
# col = 'Accident_Severity'
# unique_values = combined_df[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {combined_df[col].value_counts()}")


In [110]:
# # # # # 4000 ROWS
# # import data
# csv_data = pd.read_csv('./Road Accident Data.csv', low_memory=False)
# # # print(csv_data)

# # clean up typo in data
# # # print()
# csv_data['Accident_Severity'] = csv_data['Accident_Severity'].replace(['Fetal'], ['Fatal'])

# # extracting the columns that we want to look at
# # # print()
# working_data = pd.DataFrame(csv_data[[
#     'Day_of_Week',
#     'Light_Conditions', 
#     'Accident_Severity',  
#     'Road_Surface_Conditions', 
#     'Speed_limit',
#     'Weather_Conditions',
#     'Vehicle_Type'
# ]])

# # # print(working_data)

# # Show that 'Accident_Severity' has an uneven distribution where 'Slight' is about 85% of all data
# # # print()
# col = 'Accident_Severity'
# unique_values = working_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {working_data[col].value_counts()}")

# # separate data with 'Accident_Severity' Slight and Serious and Fatal
# # # print()
# Serious_data = working_data[working_data['Accident_Severity'] == "Serious"]
# slight_data = working_data[working_data['Accident_Severity'] == "Slight"]
# fatal_data = working_data[working_data['Accident_Severity'] == "Fatal"]

# col = 'Accident_Severity'
# unique_values = Serious_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {Serious_data[col].value_counts()}")
# # # print()

# unique_values = slight_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {slight_data[col].value_counts()}")
# # # print()

# unique_values = fatal_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {fatal_data[col].value_counts()}")

# # limit the Slight and Serious accident_severity classes to 4000 rows each
# # # print()
# slight_data = slight_data.sample(4000)
# Serious_data = Serious_data.sample(4000)

# col = 'Accident_Severity'
# unique_values = slight_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {slight_data[col].value_counts()}")
# # # print()

# unique_values = Serious_data[col].unique()
# # # print(f"Unique values in '{col}': {unique_values}")
# # # print(f"No. of val: {Serious_data[col].value_counts()}")

# # combine the dataframe
# # # print()
# combined_df = pd.concat([Serious_data, slight_data, fatal_data], ignore_index=True)
# combined_df = combined_df.sample(frac=1)

# combined_df.head()

# # show the end result
# # # print()
# col = 'Accident_Severity'
# unique_values = combined_df[col].unique()
# # print(f"Unique values in '{col}': {unique_values}")
# # print(f"No. of val: {combined_df[col].value_counts()}")
