# Barcelona, Spain

**Source of original dataset:** https://opendata-ajuntament.barcelona.cat/data/en/dataset?q=accident&sort=fecha_publicacion+desc

**Location of accidents:** Latitude, Longitude

**Date of accidents:** Date

**Outcome of accidents:** Numero_morts, Numero_lesionats_greus, Numero_lesionats_lleus (Fatalities, Serious Injury, Light Injury)

In [None]:
import pandas as pd
# Show every column and the full content of each cell when a frame is displayed.
# The option is addressed by its fully qualified name because the bare
# 'max_columns' pattern matches more than one option in recent pandas, and
# None (rather than the removed -1 sentinel) means "no limit".
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
from plotly import graph_objects as go
import plotly.express as px
from itertools import chain
import matplotlib.pyplot as plt
import pyproj

Set up input files

In [None]:
# Directory that holds the yearly CSV exports (relative to this notebook).
data_dir = "../data/barcelona/"

# One accidents file per year, 2010-2018. The file naming changes from 2015 on.
accident_files = (
    [f"{year}_ACCIDENTS_GU_BCN_{year}.csv" for year in range(2010, 2015)]
    + [f"{year}_accidents_gu_bcn.csv" for year in range(2015, 2019)]
)

# One people file per year, 2010-2018. The naming changes in 2016 and the
# 2017/2018 names carry a trailing underscore before the extension.
people_files = (
    [f"{year}_ACCIDENTS_PERSONES_GU_BCN_{year}.csv" for year in range(2010, 2016)]
    + ["2016_accidents_persones_gu_bcn.csv"]
    + [f"{year}_accidents_persones_gu_bcn_.csv" for year in range(2017, 2019)]
)

# Full paths; both lists are in year order so they can be paired positionally.
people_data_files = [f"{data_dir}{file_name}" for file_name in people_files]
accidents_data_files = [f"{data_dir}{file_name}" for file_name in accident_files]

Read original data

In [None]:
# ParserError is part of the public pandas.errors namespace.
from pandas.errors import ParserError

# NOTE(review): Python documents "ansi" as a Windows-only alias of the "mbcs"
# codec, so these reads presumably fail with LookupError on other platforms.
# An explicit code page (e.g. "cp1252") may be needed there - confirm against
# the actual files before changing it.
CSV_ENCODING = "ANSI"

# Header spellings found in the yearly exports, mapped to one canonical name
# so every year can be grouped and merged on the same columns.
canonical_columns = {
    'Número d expedient': "Número d'expedient",
    'Numero_expedient': "Número d'expedient",
    "N£mero d'expedient": "Número d'expedient",
    'Descripci¢ causa vianant': 'Descripció causa vianant',
    'Descripcio_causa_vianant': 'Descripció causa vianant',
    'Descripci¢ victimitzaci¢': 'Descripció victimització',
    'Descripcio_victimitzacio': 'Descripció victimització',
    'Desc_Tipus_vehicle_implicat': 'Desc. Tipus vehicle implicat',
}


def _read_csv_with_fallback(path):
    """Read *path* as comma-separated CSV, retrying with ';' on ParserError."""
    try:
        return pd.read_csv(path, encoding=CSV_ENCODING)
    except ParserError:
        return pd.read_csv(path, encoding=CSV_ENCODING, sep=';')


# One merged (accidents + people) frame per year, in the order of the file lists.
data_aux = []

for accidents_data, people_data in zip(accidents_data_files, people_data_files):
    # rename() silently skips labels that are not present, so a single mapping
    # covers every header variant without any error handling.
    data_acc = _read_csv_with_fallback(accidents_data).rename(columns=canonical_columns)
    data_people = _read_csv_with_fallback(people_data).rename(columns=canonical_columns)

    # Strip trailing whitespace from the case number *before* grouping so that
    # IDs differing only in padding fall into the same group.
    data_people["Número d'expedient"] = data_people["Número d'expedient"].str.rstrip()

    # Collapse the per-person rows into one row per accident, joining the
    # text descriptions of everyone involved with '; '.
    data_people = data_people.groupby("Número d'expedient").agg({
        'Desc. Tipus vehicle implicat': '; '.join,
        'Descripció victimització': '; '.join,
        'Descripció causa vianant': '; '.join,
    }).reset_index()

    # reset_index() is called without drop=True, so the pre-sort row position
    # is kept as an extra 'index' column (preserved from the original code).
    data_acc = data_acc.sort_values(by="Número d'expedient").reset_index()
    data_acc["Número d'expedient"] = data_acc["Número d'expedient"].str.rstrip()

    # Left merge: keep every accident, attach the people summary when present.
    data = pd.merge(data_acc, data_people, on="Número d'expedient", how="left")

    data = data.sort_values(by="Número d'expedient")
    data_aux.append(data)

Fixing column names

In [None]:
print('Fixing column names')

# Older header spelling -> unified spelling. Labels that already have their
# final spelling need no entry.
column_renames = {
    'Any': 'NK Any',
    'Codi barri': 'Codi_barri',
    'Codi carrer': 'Codi_carrer',
    'Codi districte': 'Codi_districte',
    'Coordenada UTM (X)': 'Coordenada_UTM_X',
    'Coordenada UTM (Y)': 'Coordenada_UTM_Y',
    'Descripció dia setmana': 'Descripcio_dia_setmana',
    'Descripció tipus dia': 'Descripcio_tipus_dia',
    'Descripció torn': 'Descripcio_torn',
    'Descripció causa vianant_x': 'Descripció causa vianant_y',
    'Dia de mes': 'Dia_mes',
    'Dia de setmana': 'Dia_setmana',
    'Dia setmana': 'Dia_setmana',
    'Hora de dia': 'Hora_dia',
    'Mes de any': 'Mes_any',
    'Nom barri': 'Nom_barri',
    'Nom carrer': 'Nom_carrer',
    'Nom districte': 'Nom_districte',
    'Nom mes': 'Nom_mes',
    'Num postal caption': 'Num_postal',
    'Num_postal ': 'Num_postal',
    'Número de lesionats greus': 'Numero_lesionats_greus',
    'Número de lesionats lleus': 'Numero_lesionats_lleus',
    'Número de morts': 'Numero_morts',
    'Número de vehicles implicats': 'Numero_vehicles_implicats',
    'Número de víctimes': 'Numero_victimes',
}

for i, frame in enumerate(data_aux):
    # rename() ignores labels that are absent, so the same mapping is applied
    # to every year without any error handling.
    frame = frame.rename(columns=column_renames)

    # Make sure every year has the geographic columns, filled with NaN when
    # the export did not provide them.
    for geo_column in ('Latitud', 'Longitud'):
        if geo_column not in frame:
            frame[geo_column] = np.nan

    # Several old spellings map to the same new label, so a frame can end up
    # with repeated column labels. Keep only the last occurrence of each label
    # (the value the former dict-based merge kept) so the frames can be
    # aligned by label below.
    data_aux[i] = frame.loc[:, ~frame.columns.duplicated(keep='last')]

# Stack all years into one frame with a fresh 0..n-1 index; columns missing
# from a given year are filled with NaN.
data = pd.concat(data_aux, ignore_index=True, sort=False)

print(data.shape)




Create Datetime column

In [None]:
# Forward-fill gaps in the date parts (a missing value takes the previous
# row's value) so that each part can be cast to int.
for date_part in ('Dia_mes', 'Mes_any', 'NK Any', 'Hora_dia'):
    data[date_part] = data[date_part].ffill().astype('int')

# Assemble "day/month/year hour:00:00" ...
data['Date'] = (
    data['Dia_mes'].astype('str') + '/'
    + data['Mes_any'].astype('str') + '/'
    + data['NK Any'].astype('str') + ' '
    + data['Hora_dia'].astype('str') + ':00:00'
)
# ... and parse it with an explicit day-first format. Without it, ambiguous
# strings such as "5/3/2010" can be read month-first.
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y %H:%M:%S')

data.head()

Set up bicycles column

In [None]:
# Vehicle descriptions that count as a bicycle; joined with '|' they form the
# search pattern used below.
bicycle_related_codes = ['Bicicleta']

# 1 when the joined vehicle description mentions a bicycle, else 0.
# na=False makes rows without a description count as "no bicycle" directly,
# instead of patching NaN afterwards with an in-place fillna on a column
# selection (chained assignment, which pandas warns about).
data['Bicycles'] = (
    data['Desc. Tipus vehicle implicat']
    .str.contains('|'.join(bicycle_related_codes), na=False)
    .astype('int')
)

Set up latitude & longitude columns

In [None]:
# Normalise the decimal separator of the UTM coordinates: each value is cast
# to text and any comma is replaced by a dot. The columns stay as strings;
# no numeric conversion is done here.
for utm_column in ('Coordenada_UTM_X', 'Coordenada_UTM_Y'):
    utm_as_text = data[utm_column].astype('str')
    data[utm_column] = utm_as_text.str.replace(',', '.')

In [None]:
# Cache of pyproj projections, keyed by UTM zone, so each zone is built once.
_projections = {}


def unproject(z, l, x, y):
    """Convert UTM coordinates to a (longitude, latitude) pair.

    z -- UTM zone passed to pyproj.Proj (WGS84 ellipsoid).
    l -- band letter; for letters sorting before 'N', 10,000,000 is
         subtracted from y before the inverse projection (presumably the
         southern-hemisphere false northing - confirm).
    x, y -- UTM easting and northing.
    """
    try:
        projection = _projections[z]
    except KeyError:
        projection = pyproj.Proj(proj='utm', zone=z, ellps='WGS84')
        _projections[z] = projection
    if l < 'N':
        y -= 10000000
    lng, lat = projection(x, y, inverse=True)
    return (lng, lat)

In [None]:
def rule(row):
    """Return a Series with the Latitude and Longitude of one accident row.

    Reads the row's 'Coordenada_UTM_X' / 'Coordenada_UTM_Y' values and
    inverse-projects them through unproject() with zone '31' and band 'T'.
    """
    easting = row['Coordenada_UTM_X']
    northing = row['Coordenada_UTM_Y']
    lon, lat = unproject('31', 'T', easting, northing)
    return pd.Series({"Latitude": lat, "Longitude": lon})


# Compute the two new columns row by row, then attach them to the main frame
# by matching on the row index.
lat_lon_columns = data.apply(rule, axis=1)
data = data.merge(lat_lon_columns, left_index=True, right_index=True)

Some key statistics

In [None]:
# Time span covered by the combined dataset.
first_date = data['Date'].min()
last_date = data['Date'].max()
print(f"Accidents between {first_date} and {last_date}")

# One row per accident, so the row count is the number of accidents.
total_accidents = len(data)
print(f"There are a total of {total_accidents} accidents.")

# Column totals for each outcome.
fatalities = data["Numero_morts"].sum()
print(f"There are a total of {fatalities} fatalities.")

serious_injuries = data["Numero_lesionats_greus"].sum()
print(f"There are a total of {serious_injuries} seriously injured.")

light_injuries = data["Numero_lesionats_lleus"].sum()
print(f"There are a total of {light_injuries} injured.")

# Sum of the 0/1 bicycle flag.
bicycles = data["Bicycles"].sum()
print(f"There are a total of {bicycles} bicycles involved in all the accidents.")

Slice all bicycle accidents

In [None]:
# Keep only the rows whose bicycle flag is set.
involves_bicycle = data['Bicycles'].gt(0)
data_bicycles = data[involves_bicycle]

In [None]:
# Preview the first five rows of the bicycle subset.
data_bicycles.head(n=5)

Save to file

In [None]:
# Write the bicycle subset to a CSV file in the working directory and report
# its shape and location.
output_file = 'cycling_safety_barcelona.csv'
print(data_bicycles.shape)
data_bicycles.to_csv(output_file)
print('Wrote file to: ' + output_file)