# UK Gov

**Source of original dataset:** https://data.gov.uk/dataset/cb7ae6f0-4be6-4935-9277-47e5ce24a11f/road-safety-data

**Location of accidents:** Latitude, Longitude

**Date of accidents:** Date

**Outcome of accidents:** Fatality, Serious Injury, Slight Injury, PDO (property damage only)

In [None]:
import pandas as pd
# Display every column and the full content of each cell when showing frames.
# Fully-qualified option names are used because the short 'max_columns'
# pattern can match more than one option in newer pandas releases.
pd.set_option('display.max_columns', None)
try:
    # None means "no truncation" in current pandas.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts an int here and uses -1 for "no truncation".
    pd.set_option('display.max_colwidth', -1)
import numpy as np
from plotly import graph_objects as go
import plotly.express as px
from itertools import chain
import matplotlib.pyplot as plt
import pyproj
import glob
import pygeodesy as geo

Set up input files

In [None]:
# Directory that holds the raw Stats19 / DfT road-safety CSV extracts.
data_dir = "../data/uk_gov/"

# Accident-level files; the order pairs one-to-one with veh_files below.
acc_files = [data_dir + file_name for file_name in (
    'Stats19_Data_Accidents7904.csv',
    'Stats19_Data_Accidents0514.csv',
    'DfTRoadSafety_Accidents_2015.csv',
    'DftRoadSafety_Accidents_2016.csv',
    'DfTRoadSafety_Accidents_2017.csv',
    'DftRoadSafety_Accidents_2018.csv',
)]

# Vehicle-level files, in the same order as acc_files.
veh_files = [data_dir + file_name for file_name in (
    'Stats19_Data_Vehicles7904.csv',
    'Stats19_Data_Vehicles0514.csv',
    'DftRoadSafety_Vehicles_2015.csv',
    'DftRoadSafety_Vehicles_2016.csv',
    'DftRoadSafety_Vehicles_2017.csv',
    'DftRoadSafety_Vehicles_2018.csv',
)]
                         

Read original data

In [None]:
# For every (accidents, vehicles) file pair, build a frame holding only the
# accidents in which at least one bicycle was involved, and collect the
# per-file frames in data_aux.
data_aux = []
for accidents_data, vehicles_data in zip(acc_files, veh_files):
    print(accidents_data)  # progress indicator: which file is being read

    data_acc = pd.read_csv(accidents_data)
    data_veh = pd.read_csv(vehicles_data)

    # Harmonise the accident key (presumably some extracts call it
    # 'Acc_Index' -- verify against the raw files). rename() ignores labels
    # that are not present, so no exception handling is required.
    column_aliases = {'Acc_Index': 'Accident_Index'}
    data_veh = data_veh.rename(columns=column_aliases)
    data_acc = data_acc.rename(columns=column_aliases)

    # Relabel vehicle-type code 1 as 'bike', then stringify every code so the
    # codes of one accident can be joined into a single text field.
    # NOTE(review): code 1 is assumed to mean "pedal cycle" -- confirm against
    # the dataset's code list.
    data_veh['Vehicle_Type'] = data_veh['Vehicle_Type'].replace(1, 'bike')
    data_veh['Vehicle_Type'] = data_veh['Vehicle_Type'].astype(str)

    # One row per accident with all of its vehicle types, e.g. 'bike; 9'.
    data_veh = data_veh.groupby('Accident_Index').agg({
        'Vehicle_Type': '; '.join}).reset_index()
    data = pd.merge(data_acc, data_veh, on="Accident_Index", how="left")

    # Filter accident where a bicycle is involved.
    # na=False: accidents without any matching vehicle row (NaN after the
    # left merge) count as "no bicycle", replacing the former in-place fillna
    # on a column selection.
    bicycle_related_codes = ['bike']
    data['Bicycles'] = (
        data['Vehicle_Type']
        .str.contains('|'.join(bicycle_related_codes), na=False)
        .astype('int')
    )
    data = data[data['Bicycles'] > 0]

    data_aux.append(data)

In [None]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
# pd.concat aligns columns by name (columns missing from a file become NaN)
# and avoids converting every row to a Python dict and back, which is slow,
# memory-hungry and forces pandas to re-infer every column's dtype.
list_of_dfs = data_aux
data = pd.concat(list_of_dfs, ignore_index=True, sort=False)

Create Datetime column

In [None]:
# Parse the 'Date' strings into datetimes.
# dayfirst=True: the source files are UK data and are expected to use
# DD/MM/YYYY; without it an ambiguous value such as 04/01/2005 is read
# month-first (April 1st instead of January 4th).
# NOTE(review): confirm the date format against the raw CSV files.
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)

Set up outcome variables

In [None]:
# Indicator columns (0/1) derived from 'Accident_Severity':
# code 1 -> 'fatal', code 2 -> 'serious', code 3 -> 'slight'.
severity_flags = [('fatal', 1), ('serious', 2), ('slight', 3)]
for flag_name, severity_code in severity_flags:
    data[flag_name] = 0
    data.loc[data['Accident_Severity'] == severity_code, flag_name] = 1

# Keep only the rows where both OSGR coordinates are present.
has_easting = data['Location_Easting_OSGR'].notna()
has_northing = data['Location_Northing_OSGR'].notna()
data = data[has_easting & has_northing]
data.shape

Set up Longitude & Latitude

In [None]:
# Cast both OSGR coordinate columns to float.
osgr_columns = ('Location_Easting_OSGR', 'Location_Northing_OSGR')
for osgr_column in osgr_columns:
    data[osgr_column] = data[osgr_column].astype(float)

# Drop the rows in which either coordinate is exactly zero.
easting_nonzero = data['Location_Easting_OSGR'] != 0
northing_nonzero = data['Location_Northing_OSGR'] != 0
data = data[easting_nonzero & northing_nonzero]

In [None]:
def f(easting, northing):
    """Convert an OSGR easting/northing pair to latitude and longitude.

    The two values are stringified, joined as "easting,northing", parsed with
    pygeodesy's ``parseOSGR`` and converted with ``toLatLon()``.

    Args:
        easting: OSGR easting; converted with ``str()`` before parsing.
        northing: OSGR northing; converted with ``str()`` before parsing.

    Returns:
        The first two items of the ``toLatLon()`` result (presumably
        latitude and longitude, in that order -- confirm against the
        pygeodesy documentation), or ``[None, None]`` when the pair cannot
        be parsed or converted.
    """
    try:
        return geo.parseOSGR(str(easting) + ',' + str(northing)).toLatLon()[0:2]
    except Exception:
        # Best effort: a pair that cannot be converted yields missing values
        # instead of aborting the whole run. Catching Exception rather than
        # using a bare except keeps KeyboardInterrupt/SystemExit working, so
        # a long row-by-row conversion can still be interrupted.
        return [None, None]

In [None]:
def _osgr_row_to_latlon(row):
    """Return f() applied to the OSGR easting/northing of one data row."""
    return f(row['Location_Easting_OSGR'], row['Location_Northing_OSGR'])


# Convert row by row; result_type='expand' spreads the returned pair over the
# two target columns.
data[['Latitude', 'Longitude']] = data.apply(
    _osgr_row_to_latlon, axis=1, result_type='expand')

In [None]:
# Keep only the rows for which both converted coordinates are available.
has_latitude = data['Latitude'].notna()
has_longitude = data['Longitude'].notna()
data = data[has_latitude & has_longitude]

Some key statistics

In [None]:
# Date range covered by the remaining records.
first_date = data['Date'].min()
last_date = data['Date'].max()
print(f'Accidents between {first_date} and {last_date}')

# Number of rows, i.e. bicycle accidents kept after all filters.
total_accidents = len(data)
print(f"There are a total of {total_accidents} bicycle accidents.")

# Column sums of the 0/1 severity indicator columns.
fatalities = data['fatal'].sum()
print(f"There are a total of {fatalities} fatalities.")

serious_injuries = data['serious'].sum()
print(f"There are a total of {serious_injuries} seriously injured.")

injuries = data['slight'].sum()
print(f"There are a total of {injuries} slightly injured.")

Keep all bicycle accidents (the data were already restricted to bicycle-involved accidents when the files were read)

In [None]:
# Bind a second name to the same frame: this is plain assignment, so no copy
# is made and no rows are filtered here.
data_bicycles = data

In [None]:
# Preview the first rows of the frame.
data_bicycles.head()

Save to file

In [None]:
# Report the final shape, then write the frame (index column included) to a
# CSV file in the current working directory.
output_path = 'cycling_safety_uk_gov.csv'
print(data_bicycles.shape)
data_bicycles.to_csv(output_path)
print('Wrote file to: ' + output_path)