In [1]:
import pandas as pd
import eurostat
import numpy as np

# Lift every pandas display limit so frames are rendered without truncation
# (rows, columns, total width and per-cell width).
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

In [2]:
# Download the Eurostat table 'tran_r_acci' and make every column label a string
# so the year columns can be addressed as '2012', '2013', ...
df = eurostat.get_data_df('tran_r_acci')
df.columns = df.columns.astype(str)
# Drop all years before 2012, keep only with unit measure Per Million Inhabitants and accident type deadly.
# The year columns are picked by parsing their labels instead of slicing from
# '2011' to the last column: the slice only removes the old years while the
# columns happen to be sorted newest -> oldest.
years_before_2012 = [
    label for label in df.columns
    if label.isdigit() and int(label) < 2012
]
df = df.drop(columns=years_before_2012)
df = df[(df.unit == 'P_MHAB') & (df.victim == 'KIL')]




In [3]:
# Rename to avoid problems using \
# The new label keeps its leading space (' NUTS 2') because the later merge
# joins on exactly that key.
df = df.rename(columns={'geo\\time': ' NUTS 2'})

# Insert missing column for 2019 in front of the other year columns
# (position 3, right after victim / unit / NUTS 2), filled with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and the
# guard keeps the cell re-runnable: DataFrame.insert raises if the column
# already exists.
if "2019" not in df.columns:
    df.insert(3, "2019", np.nan)
df.head()

Unnamed: 0,victim,unit,NUTS 2,2019,2018,2017,2016,2015,2014,2013,2012
1410,KIL,P_MHAB,AT,,46.0,47.0,50.0,56.0,51.0,54.0,63.0
1411,KIL,P_MHAB,AT1,,35.0,36.0,40.0,45.0,45.0,40.0,55.0
1412,KIL,P_MHAB,AT11,,44.0,86.0,65.0,83.0,77.0,59.0,105.0
1413,KIL,P_MHAB,AT12,,62.0,56.0,68.0,80.0,74.0,69.0,90.0
1414,KIL,P_MHAB,AT13,,10.0,11.0,10.0,7.0,12.0,10.0,14.0


In [4]:
# Merge on all entries which are also in the target variable cities to extract only the interesting cities.
# An inner merge on the NUTS 2 code keeps only the regions listed in the cities file.
target_cities = pd.read_csv("Cities_with_codes.csv")
road_accidents = pd.merge(target_cities, df, how='inner', on=[' NUTS 2'])

# Check for missing cities: target cities whose NUTS 2 code found no match in
# the Eurostat table. `~` is the boolean-mask negation operator; unary minus on
# a boolean mask only works through a pandas special case.
has_match = target_cities[' NUTS 2'].isin(road_accidents[' NUTS 2'])
missing_cities = target_cities[~has_match]


# Column 0 of the cities file holds the city name.
print("Missing cities: ", missing_cities.values[:,0])

# Data from Icelandic ministry of transportation
#https://www.samgongustofa.is/umferd/tolfraedi/slysatolur/arsskyrslur-slysaskraningar/
# Reported as deaths per 10,000
# NOTE(review): the row is tagged P_MHAB (per million inhabitants) like the
# Eurostat rows - confirm the values below were converted from the per-10,000
# source figures.
# Values are ordered like the year columns: 2019, 2018, ..., 2012.
Reykjavik  = [0.0, 30.0, 20.0, 10.0, 20.0, 0.0, 10.0, 10.0]
Reykjavik = ['Reykjavik', 'IS001C1','IS00','IS','KIL','P_MHAB']+Reykjavik
# Append after the last existing row. A hard-coded row label would silently
# overwrite a merged city as soon as the merge returns a different number of
# rows. Relies on the default 0..n-1 index produced by pd.merge.
road_accidents.loc[len(road_accidents)] = Reykjavik

# Table 1-1 in : https://www.abs.gov.rs/admin/upload/documents/20181016102533-statistical_report_2016_english.pdf
# 20% of the population is in Belgrade so we take 20% of the accidents as occurring there
# and divide by 1.7 (million inhabitants)
belgrade_population_share = 0.2
belgrade_population_millions = 1.7
# Values are ordered like the year columns: 2019, 2018, ..., 2012; the first
# three years are not available. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
Belgrade = [np.nan, np.nan, np.nan, 619, 594, 476, 548, 551]
Belgrade = np.multiply(Belgrade, (belgrade_population_share / belgrade_population_millions))
Belgrade = np.around(Belgrade, 1)
Belgrade = ['Belgrade', '-','RS11','RS','KIL','P_MHAB']+Belgrade.tolist()
# Append after the last existing row instead of using a hard-coded row label,
# which would overwrite a merged city if the merge size ever changes.
# Relies on the default 0..n-1 index produced by pd.merge.
road_accidents.loc[len(road_accidents)] = Belgrade

# Traffic death data from Scotland
# https://statistics.gov.scot/data/road-safety
# Select by region

# Divide by 0.5 (million inhabitants)
edinburgh_population_millions = 0.5
# Values are ordered like the year columns: 2019, 2018, ..., 2012; 2019 is not
# available. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
Edinburgh = [np.nan, 5, 6, 9, 3, 11, 8, 13]
Edinburgh = np.divide(Edinburgh, edinburgh_population_millions)
Edinburgh = np.around(Edinburgh, 1)
Edinburgh = ['Edinburgh', 'UK007C1','UKM7','UK','KIL','P_MHAB']+Edinburgh.tolist()
# Append after the last existing row instead of using a hard-coded row label,
# which would overwrite a merged city if the merge size ever changes.
# Relies on the default 0..n-1 index produced by pd.merge.
road_accidents.loc[len(road_accidents)] = Edinburgh

# Divide by 0.6 (million inhabitants)
glasgow_population_millions = 0.6
# Values are ordered like the year columns: 2019, 2018, ..., 2012; 2019 is not
# available. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
Glasgow = [np.nan, 10, 7, 8, 15, 18, 4, 7]
Glasgow = np.divide(Glasgow, glasgow_population_millions)
Glasgow = np.around(Glasgow, 1)
Glasgow = ['Glasgow', 'UK004C1','UKM3','UK','KIL','P_MHAB']+Glasgow.tolist()
# Append after the last existing row instead of using a hard-coded row label,
# which would overwrite a merged city if the merge size ever changes.
# Relies on the default 0..n-1 index produced by pd.merge.
road_accidents.loc[len(road_accidents)] = Glasgow

road_accidents

Missing cities:  ['Belgrade' 'Edinburgh' 'Glasgow' 'Reykjavik']


Unnamed: 0,City,City Code,NUTS 2,Country,victim,unit,2019,2018,2017,2016,2015,2014,2013,2012
0,Amsterdam,NL002C1,NL32,NL,KIL,P_MHAB,,32.0,29.0,31.0,28.0,26.0,26.0,33.0
1,Ankara,TR001C1,TR51,TR,KIL,P_MHAB,,72.0,73.0,73.0,75.0,30.0,32.0,42.0
2,Antwerp,BE002C1,BE2,BE,KIL,P_MHAB,,47.0,46.0,52.0,61.0,63.0,60.0,60.0
3,Athens,EL001C1,EL30,EL,KIL,P_MHAB,,48.0,43.0,53.0,54.0,47.0,54.0,59.0
4,Barcelona,ES002C1,ES51,ES,KIL,P_MHAB,,44.0,38.0,38.0,39.0,37.0,36.0,45.0
5,Valencia,ES003C1,ES51,ES,KIL,P_MHAB,,44.0,38.0,38.0,39.0,37.0,36.0,45.0
6,Belfast,UK012C1,UKN0,UK,KIL,P_MHAB,,29.0,34.0,37.0,40.0,43.0,31.0,26.0
7,Bergen,NO002C1,NO05,NO,KIL,P_MHAB,,23.0,26.0,18.0,16.0,22.0,46.0,29.0
8,Berlin,DE001C1,DE30,DE,KIL,P_MHAB,,12.0,10.0,16.0,14.0,15.0,11.0,13.0
9,Birmingham,UK002C1,UKG3,UK,KIL,P_MHAB,,20.0,20.0,21.0,20.0,20.0,22.0,21.0


In [5]:
# Report the share of missing values in every column
print('Missing values for columns:')
def NaN_percent(df, column_name):
    """Return the percentage (0-100) of missing entries in ``df[column_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    column_name : str
        Label of the column to inspect.

    Returns
    -------
    float
        ``100 * missing / total`` for the selected column.
    """
    column = df[column_name]
    missing = column.isna().sum()
    return (100.0 * missing) / len(column)
# One "<column>: <percent>%" line per column of road_accidents.
for column_label in road_accidents.columns:
    missing_share = NaN_percent(road_accidents, column_label)
    print("%s: %.2f%%" % (column_label, missing_share))


Missing values for columns:
City: 0.00%
 City Code: 0.00%
 NUTS 2: 0.00%
 Country: 0.00%
victim: 0.00%
unit: 0.00%
2019: 98.77%
2018: 1.23%
2017: 1.23%
2016: 1.23%
2015: 1.23%
2014: 1.23%
2013: 8.64%
2012: 8.64%


In [6]:
# Rows of road_accidents in which at least one column is missing.
row_has_gap = road_accidents.isna().any(axis=1)
has_nan = road_accidents.loc[row_has_gap]
has_nan

Unnamed: 0,City,City Code,NUTS 2,Country,victim,unit,2019,2018,2017,2016,2015,2014,2013,2012
0,Amsterdam,NL002C1,NL32,NL,KIL,P_MHAB,,32.0,29.0,31.0,28.0,26.0,26.0,33.0
1,Ankara,TR001C1,TR51,TR,KIL,P_MHAB,,72.0,73.0,73.0,75.0,30.0,32.0,42.0
2,Antwerp,BE002C1,BE2,BE,KIL,P_MHAB,,47.0,46.0,52.0,61.0,63.0,60.0,60.0
3,Athens,EL001C1,EL30,EL,KIL,P_MHAB,,48.0,43.0,53.0,54.0,47.0,54.0,59.0
4,Barcelona,ES002C1,ES51,ES,KIL,P_MHAB,,44.0,38.0,38.0,39.0,37.0,36.0,45.0
5,Valencia,ES003C1,ES51,ES,KIL,P_MHAB,,44.0,38.0,38.0,39.0,37.0,36.0,45.0
6,Belfast,UK012C1,UKN0,UK,KIL,P_MHAB,,29.0,34.0,37.0,40.0,43.0,31.0,26.0
7,Bergen,NO002C1,NO05,NO,KIL,P_MHAB,,23.0,26.0,18.0,16.0,22.0,46.0,29.0
8,Berlin,DE001C1,DE30,DE,KIL,P_MHAB,,12.0,10.0,16.0,14.0,15.0,11.0,13.0
9,Birmingham,UK002C1,UKG3,UK,KIL,P_MHAB,,20.0,20.0,21.0,20.0,20.0,22.0,21.0


In [14]:
# Impute using padding along each row.
# The year columns run newest -> oldest ('2019' ... '2012'), so the backward
# fill copies the next older year into a gap (2019 takes the 2018 value) and the
# forward fill afterwards covers gaps at the old end, which have no older year
# to copy from.
# The result is written back to the full '2019':'2012' range. Writing to
# '2019':'2013' only leaves column '2012' un-imputed, because the assignment
# aligns on column labels and drops the extra column.
# bfill()/ffill() replace fillna(method=...), which is deprecated in recent pandas.
filled_years = road_accidents.loc[:, '2019':'2012'].bfill(axis=1).ffill(axis=1)
road_accidents.loc[:, '2019':'2012'] = filled_years
road_accidents

Unnamed: 0,City,City Code,NUTS 2,Country,victim,unit,2019,2018,2017,2016,2015,2014,2013,2012
0,Amsterdam,NL002C1,NL32,NL,KIL,P_MHAB,32.0,32.0,29.0,31.0,28.0,26.0,26.0,33.0
1,Ankara,TR001C1,TR51,TR,KIL,P_MHAB,72.0,72.0,73.0,73.0,75.0,30.0,32.0,42.0
2,Antwerp,BE002C1,BE2,BE,KIL,P_MHAB,47.0,47.0,46.0,52.0,61.0,63.0,60.0,60.0
3,Athens,EL001C1,EL30,EL,KIL,P_MHAB,48.0,48.0,43.0,53.0,54.0,47.0,54.0,59.0
4,Barcelona,ES002C1,ES51,ES,KIL,P_MHAB,44.0,44.0,38.0,38.0,39.0,37.0,36.0,45.0
5,Valencia,ES003C1,ES51,ES,KIL,P_MHAB,44.0,44.0,38.0,38.0,39.0,37.0,36.0,45.0
6,Belfast,UK012C1,UKN0,UK,KIL,P_MHAB,29.0,29.0,34.0,37.0,40.0,43.0,31.0,26.0
7,Bergen,NO002C1,NO05,NO,KIL,P_MHAB,23.0,23.0,26.0,18.0,16.0,22.0,46.0,29.0
8,Berlin,DE001C1,DE30,DE,KIL,P_MHAB,12.0,12.0,10.0,16.0,14.0,15.0,11.0,13.0
9,Birmingham,UK002C1,UKG3,UK,KIL,P_MHAB,20.0,20.0,20.0,21.0,20.0,20.0,22.0,21.0


In [8]:
#To be able to merge with the other data
#Transform the columns of each year to a variable year
# Result: one row per (city, year) with the columns
# City, City Code, NUTS 2, Country, Year, Deaths_in_road_accidents.
# The per-year frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration.
# NOTE(review): range(2012, 2019) stops at 2018, so the inserted and imputed
# '2019' column is not exported - confirm this is intended.
city_columns = list(road_accidents.loc[:, 'City':' Country'])
yearly_frames = []
for year in range(2012,2019):
    # rename() returns a new frame, so the insert below never touches road_accidents
    yearly_data = road_accidents[city_columns + [f"{year}"]].rename(
        columns={f"{year}": "Deaths_in_road_accidents"}
    )
    yearly_data.insert(4, "Year", year)
    yearly_frames.append(yearly_data)
deaths_in_road_accidents = pd.concat(yearly_frames).reset_index(drop=True)

In [9]:
# Write the long-format table to disk; the row index is left out of the file.
deaths_in_road_accidents.to_csv('deaths_in_road_accidents.csv', index=False)

In [10]:
# Preview only the first rows: display.max_rows is set to None above, so a bare
# frame would render every (city, year) row.
deaths_in_road_accidents.head(10)

Unnamed: 0,City,City Code,NUTS 2,Country,Year,Deaths_in_road_accidents
0,Amsterdam,NL002C1,NL32,NL,2012,33.0
1,Ankara,TR001C1,TR51,TR,2012,42.0
2,Antwerp,BE002C1,BE2,BE,2012,60.0
3,Athens,EL001C1,EL30,EL,2012,59.0
4,Barcelona,ES002C1,ES51,ES,2012,45.0
5,Valencia,ES003C1,ES51,ES,2012,45.0
6,Belfast,UK012C1,UKN0,UK,2012,26.0
7,Bergen,NO002C1,NO05,NO,2012,29.0
8,Berlin,DE001C1,DE30,DE,2012,13.0
9,Birmingham,UK002C1,UKG3,UK,2012,21.0


In [11]:
# Preview of the filtered Eurostat table. Limited to the first rows because
# display.max_rows is set to None above and the table covers every region.
df.head(10)

Unnamed: 0,victim,unit,NUTS 2,2019,2018,2017,2016,2015,2014,2013,2012
1410,KIL,P_MHAB,AT,,46.0,47.0,50.0,56.0,51.0,54.0,63.0
1411,KIL,P_MHAB,AT1,,35.0,36.0,40.0,45.0,45.0,40.0,55.0
1412,KIL,P_MHAB,AT11,,44.0,86.0,65.0,83.0,77.0,59.0,105.0
1413,KIL,P_MHAB,AT12,,62.0,56.0,68.0,80.0,74.0,69.0,90.0
1414,KIL,P_MHAB,AT13,,10.0,11.0,10.0,7.0,12.0,10.0,14.0
1415,KIL,P_MHAB,AT2,,55.0,59.0,59.0,66.0,63.0,65.0,72.0
1416,KIL,P_MHAB,AT21,,53.0,53.0,59.0,72.0,47.0,74.0,83.0
1417,KIL,P_MHAB,AT22,,56.0,61.0,58.0,63.0,71.0,60.0,67.0
1418,KIL,P_MHAB,AT3,,56.0,54.0,57.0,63.0,50.0,64.0,68.0
1419,KIL,P_MHAB,AT31,,65.0,56.0,62.0,61.0,53.0,70.0,66.0
