# France

**Source of original dataset:** https://www.data.gouv.fr/en/datasets/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2019/

**Location of accidents:** Latitude, Longitude

**Date of accidents:** Date

**Outcome of accidents:** Fatality, Hospitalised Injury, Light Injury, PDO

In [None]:
import pandas as pd
# Show every column and never truncate cell contents when a frame is displayed.
# The fully qualified option key is used because the bare 'max_columns' pattern
# can match more than one option on newer pandas, and None (rather than the
# deprecated -1) is the supported way to disable the column-width limit.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
from plotly import graph_objects as go
import plotly.express as px
from itertools import chain
import matplotlib.pyplot as plt
import pyproj

Setup input files

In [None]:
# Folder that holds the yearly CSV extracts.
data_dir = "../data/france/"

# One file per year and per table. Files for 2005-2016 are named
# "<table>_<year>.csv"; the 2017 and 2018 files use a hyphen instead of
# an underscore.
characteristics_files = (
    ["caracteristiques_{}.csv".format(year) for year in range(2005, 2017)]
    + ["caracteristiques-{}.csv".format(year) for year in (2017, 2018)]
)
users_files = (
    ["usagers_{}.csv".format(year) for year in range(2005, 2017)]
    + ["usagers-{}.csv".format(year) for year in (2017, 2018)]
)
vehicles_files = (
    ["vehicules_{}.csv".format(year) for year in range(2005, 2017)]
    + ["vehicules-{}.csv".format(year) for year in (2017, 2018)]
)

# Prefix every file name with the data folder.
characteristics_data_files = [data_dir + name for name in characteristics_files]
users_data_files = [data_dir + name for name in users_files]
vehicles_data_files = [data_dir + name for name in vehicles_files]

Read original data

In [None]:
# ParserError is imported from its public location; it is not used in this
# cell but is kept available for the rest of the notebook.
from pandas.errors import ParserError

# "ANSI" is only a valid codec name on Windows, so the original call failed on
# other platforms. "latin-1" decodes every byte and works everywhere.
# NOTE(review): assumes the files are Western single-byte encoded - confirm
# against the dataset documentation if text columns matter downstream.
file_encoding = "latin-1"

characteristics_aux = []
users_aux = []
vehicles_aux = []

yearly_files = zip(characteristics_data_files, users_data_files, vehicles_data_files)
for year, (characteristics_data, users_data, vehicles_data) in enumerate(yearly_files, start=2005):
    # The 2009 characteristics file is read as tab-separated; every other
    # file is read as comma-separated.
    sep = '\t' if year == 2009 else ','

    data_characteristics = pd.read_csv(characteristics_data, encoding=file_encoding, sep=sep)
    data_users = pd.read_csv(users_data, encoding=file_encoding, sep=',')
    data_vehicles = pd.read_csv(vehicles_data, encoding=file_encoding, sep=',')

    # Quick sanity check of the size of each table for this year.
    print(year, data_characteristics.shape, data_users.shape, data_vehicles.shape)
    characteristics_aux.append(data_characteristics)
    users_aux.append(data_users)
    vehicles_aux.append(data_vehicles)

Prepare outcome data from users files

In [None]:
# Value of the `grav` column -> name of the per-accident counter it feeds.
severity_columns = [
    (1, 'indemme'),
    (2, 'tues'),
    (3, 'blesse_hosp'),
    (4, 'blesse_leg'),
]
outcome_names = [column for _, column in severity_columns]

# Replace each yearly users table with one row per accident (`Num_Acc`)
# holding the number of users in each severity class.
for i, users in enumerate(users_aux):
    # Already aggregated (cell re-run): the raw `grav` column is gone, so
    # there is nothing left to do for this year.
    if 'grav' not in users.columns:
        continue

    # One 0/1 indicator column per severity class.
    for code, column in severity_columns:
        users[column] = (users['grav'] == code).astype(int)

    # Summing the indicators per accident gives the per-class head count.
    # The built-in groupby sum is used instead of passing np.sum to agg(),
    # which newer pandas versions warn about.
    users_aux[i] = users.groupby('Num_Acc')[outcome_names].sum()

Prepare bicycles data from vehicles files

In [None]:
# Replace each yearly vehicles table with one row per accident (`Num_Acc`)
# holding the number of vehicles whose category `catv` equals 1, stored in
# the `bicycles` column.
for i, vehicles in enumerate(vehicles_aux):
    # Already aggregated (cell re-run): the raw `catv` column is gone, so
    # there is nothing left to do for this year.
    if 'catv' not in vehicles.columns:
        continue

    # 0/1 indicator: 1 when the vehicle category code is 1.
    vehicles['bicycles'] = (vehicles['catv'] == 1).astype(int)

    # The built-in groupby sum is used instead of passing np.sum to agg(),
    # which newer pandas versions warn about.
    vehicles_aux[i] = vehicles.groupby('Num_Acc')[['bicycles']].sum()

Aggregate data

In [None]:
# For every year, attach the per-accident outcome counts and bicycle counts
# to the characteristics table. Left joins keep every accident even when it
# has no matching users/vehicles rows.
data_aux = []
for characteristics, users, vehicles in zip(characteristics_aux, users_aux, vehicles_aux):
    merged = pd.merge(characteristics, users, on="Num_Acc", how="left")
    merged = pd.merge(merged, vehicles, on="Num_Acc", how="left")
    data_aux.append(merged)

# Stack all years into a single frame with a fresh 0..n-1 index. Columns
# missing from some years are filled with NaN. pd.concat replaces the former
# round trip through per-row Python dicts, which was far slower and used far
# more memory.
data = pd.concat(data_aux, ignore_index=True, sort=False)

Remove rows with invalid lat/long

In [None]:
# Coordinates arrive as a mix of numbers, missing values and placeholders
# such as '-'. Anything that cannot be parsed as a number is treated like a
# missing value and set to 0, which marks the row as invalid below.
for column in ('lat', 'long'):
    data[column] = pd.to_numeric(data[column], errors='coerce').fillna(0).astype('float')

# Restore the sign of coordinates flagged by the `gps` code: latitude is
# negated for codes Y and R, longitude for codes G and A.
# NOTE(review): presumably these codes identify overseas territories in the
# southern / western hemisphere - confirm against the dataset documentation.
negate_lat = data['gps'].isin(['Y', 'R'])
negate_long = data['gps'].isin(['G', 'A'])
data.loc[negate_lat, 'lat'] = -data.loc[negate_lat, 'lat']
data.loc[negate_long, 'long'] = -data.loc[negate_long, 'long']

# Keep only rows where both coordinates are non-zero. The explicit copy makes
# the result an independent frame so the column assignments that follow do
# not trigger SettingWithCopyWarning.
data = data[(data.lat != 0.0) & (data.long != 0.0)].copy()

# Rescale the raw values by 1e-5.
# NOTE(review): assumes the source stores degrees multiplied by 100000 - confirm.
data['lat'] = 0.00001 * data['lat']
data['long'] = 0.00001 * data['long']

Create Datetime column

In [None]:
# `an` is offset by 2000 to obtain the four-digit year.
data['an'] = 2000 + data['an'].astype('int')

# `hrmn` is left-padded with zeros to four characters, then split into its
# first two characters (hour) and last two characters (minute).
hhmm = data['hrmn'].astype('str').str.zfill(4)
data['hrmn'] = hhmm
data['hour'] = hhmm.str.slice(0, 2)
data['minute'] = hhmm.str.slice(2, 4)

# Assemble "year/month/day hour:minute" text and let pandas parse it.
date_text = data['an'].astype('str') + '/' + data['mois'].astype('str') + '/' + data['jour'].astype('str')
time_text = data['hour'].astype('str') + ':' + data['minute'].astype('str')
data['Date'] = pd.to_datetime(date_text + ' ' + time_text)

Setup latitude & longitude column

In [None]:
# Expose the cleaned coordinates under the output column names.
for target, source in (('Longitude', 'long'), ('Latitude', 'lat')):
    data[target] = data[source]

Some key statistics

In [None]:
# Time span covered by the accidents that survived the cleaning steps.
first_accident = data['Date'].min()
last_accident = data['Date'].max()
print('Accidents between {!s} and {!s}'.format(first_accident, last_accident))

# One row per accident, so the row count is the number of accidents.
total_accidents = len(data.index)
print("There are a total of {!s} accidents.".format(total_accidents))

# Column totals of the per-accident counters.
fatalities = data["tues"].sum()
print("There are a total of {!s} fatalities.".format(fatalities))

serious_injuries = data["blesse_hosp"].sum()
print("There are a total of {!s} seriously injured.".format(serious_injuries))

light_injuries = data["blesse_leg"].sum()
print("There are a total of {!s} injured.".format(light_injuries))

bicycles = data["bicycles"].sum()
print("There are a total of {!s} bicycles involved in all the accidents.".format(bicycles))

Slice all bicycle accidents

In [None]:
# Keep only the accidents in which at least one bicycle was counted.
involves_bicycle = data['bicycles'].gt(0)
data_bicycles = data.loc[involves_bicycle]

In [None]:
# Preview the first five bicycle accidents.
data_bicycles.head(n=5)

Save to file

In [None]:
# Write the bicycle accidents to a CSV file in the working directory.
output_path = 'cycling_safety_france.csv'
print(data_bicycles.shape)
data_bicycles.to_csv(output_path)
print('Wrote file to: ' + output_path)