In [6]:
%config IPCompleter.use_jedi=False
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from Utils import print_memory_usage
import calendar

from statsmodels.stats.proportion import proportion_confint

In [3]:
# Load the raw collisions export. The nested-list form of `parse_dates`
# (combining several columns into one timestamp inside read_csv) is deprecated
# in newer pandas releases, so the date and time columns are read as plain
# strings and combined explicitly instead.
data = pd.read_csv("Data/Motor_Vehicle_Collisions_-_Crashes.csv", low_memory=False)

# Join date and time with a space and parse them as one timestamp. The two
# source columns are removed and the combined column is placed first under the
# name "CRASH DATE_CRASH TIME", which is the layout read_csv used to produce
# and the name the following rename cell expects.
crash_timestamp = pd.to_datetime(data.pop("CRASH DATE") + " " + data.pop("CRASH TIME"))
data.insert(0, "CRASH DATE_CRASH TIME", crash_timestamp)

In [4]:
# Give the combined date/time column a short, convenient name.
data = data.rename(columns={"CRASH DATE_CRASH TIME": "TIME"})

In [5]:
# Convert string (object-dtype) columns to category.
is_object_col = data.dtypes == "object"
obj_cols = list(data.columns[is_object_col])
data[obj_cols] = data[obj_cols].astype("category")
data.dtypes

TIME                             datetime64[ns]
BOROUGH                                category
ZIP CODE                               category
LATITUDE                                float64
LONGITUDE                               float64
LOCATION                               category
ON STREET NAME                         category
CROSS STREET NAME                      category
OFF STREET NAME                        category
NUMBER OF PERSONS INJURED               float64
NUMBER OF PERSONS KILLED                float64
NUMBER OF PEDESTRIANS INJURED             int64
NUMBER OF PEDESTRIANS KILLED              int64
NUMBER OF CYCLIST INJURED                 int64
NUMBER OF CYCLIST KILLED                  int64
NUMBER OF MOTORIST INJURED                int64
NUMBER OF MOTORIST KILLED                 int64
CONTRIBUTING FACTOR VEHICLE 1          category
CONTRIBUTING FACTOR VEHICLE 2          category
CONTRIBUTING FACTOR VEHICLE 3          category
CONTRIBUTING FACTOR VEHICLE 4          c

In [6]:
# Save the typed frame as Parquet so it can be reloaded without re-parsing the CSV.
data.to_parquet(
    "Data/Collisions.parquet",
    engine="pyarrow",
    index=False,
)

In [7]:
# Reload the collisions table from the Parquet file written above.
data = pd.read_parquet(
    "Data/Collisions.parquet",
    engine="pyarrow",
)

The vehicle type codes are inconsistent: the same kind of vehicle is recorded under several different labels.

In [8]:
# Tally the first vehicle's type code and show only labels seen more than 1000 times.
counts = data["VEHICLE TYPE CODE 1"].value_counts()
counts.loc[counts.gt(1000)]

Sedan                                  511674
PASSENGER VEHICLE                      416206
Station Wagon/Sport Utility Vehicle    404304
SPORT UTILITY / STATION WAGON          180291
Taxi                                    47803
4 dr sedan                              40131
TAXI                                    31911
Pick-up Truck                           31488
VAN                                     25266
OTHER                                   22967
Box Truck                               22056
UNKNOWN                                 19935
Bus                                     18831
LARGE COM VEH(6 OR MORE TIRES)          14397
BUS                                     13993
SMALL COM VEH(4 TIRES)                  13216
Bike                                    12483
PICK-UP TRUCK                           11505
LIVERY VEHICLE                          10481
Tractor Truck Diesel                     9396
Van                                      8137
Motorcycle                        

The same vehicle type can appear in both uppercase and lowercase spellings (e.g. "Taxi" and "TAXI"). Let us combine these by lowercasing all vehicle type codes:

In [9]:
# Names of the five per-vehicle type columns.
codes = [f"VEHICLE TYPE CODE {i}" for i in range(1, 6)]

# Lowercase every vehicle type label so spellings that differ only by case
# collapse into one value. The vectorized string accessor replaces the
# element-wise `applymap` (deprecated in recent pandas) and avoids `np.NaN`
# (removed in NumPy 2.0); missing values stay missing. Casting to object first
# lets the accessor work on the categorical columns and on an all-missing column.
data[codes] = data[codes].apply(lambda col: col.astype(object).str.lower())

In [10]:
# Re-tally the first vehicle's type code after lowercasing; keep labels seen more than 1000 times.
counts = data["VEHICLE TYPE CODE 1"].value_counts()
counts.loc[counts.gt(1000)]

sedan                                  511684
passenger vehicle                      416206
station wagon/sport utility vehicle    404304
sport utility / station wagon          180291
taxi                                    79716
pick-up truck                           42993
4 dr sedan                              40131
van                                     33801
bus                                     32852
other                                   22972
box truck                               22086
unknown                                 19944
large com veh(6 or more tires)          14397
small com veh(4 tires)                  13216
bike                                    12483
motorcycle                              11098
livery vehicle                          10481
tractor truck diesel                     9396
ambulance                                6133
dump                                     3474
convertible                              3437
2 dr sedan                        

Let's look at taxis vs. bikes (including e-bikes).
More information about taxi trips can be found here: https://data.cityofnewyork.us/Transportation/2018-Yellow-Taxi-Trip-Data/t29m-gskq
From this, we see that there were about 112M taxi trips in 2018, and we know that there were about 17M CitiBike trips.
Total bike trips in 2016 were about 167M, according to:
https://www1.nyc.gov/html/dot/downloads/pdf/cycling-in-the-city-2018.pdf

In [11]:
# Approximate yearly trip totals taken from the sources cited above; they are
# used below as the number of trials when estimating per-trip rates.
taxi_trips_2018 = 112e6  # about 112M taxi trips in 2018
bike_trips_2016 = 167e6  # about 167M bike trips in 2016

In [12]:
# Keep a collision if any of its five vehicle type columns matches the vehicle of interest.
taxi_crashes = data[data[codes].eq("taxi").any(axis=1)]
bike_crashes = data[data[codes].isin(["bike", "e-bike"]).any(axis=1)]

In [13]:
# Number of collisions with at least one vehicle recorded as "taxi".
len(taxi_crashes)

131643

In [14]:
# Number of collisions with at least one vehicle recorded as "bike" or "e-bike".
len(bike_crashes)

44398

In [15]:
# Restrict to collisions in 2018 and total the injured cyclists for that year.
in_2018 = data["TIME"].dt.year == 2018
data_2018 = data.loc[in_2018]
data_2018["NUMBER OF CYCLIST INJURED"].sum()

4725

Estimate proportion of crashes per trip for taxis and bikes

In [20]:
# Count the bike collisions recorded in 2016, then estimate crashes per bike
# trip as a (lower, upper) confidence interval with proportion_confint's
# default settings.
bike_crash_years = bike_crashes["TIME"].dt.year
bike_crashcount_2016 = bike_crash_years.eq(2016).sum()
bike_crashes_per_trip = np.array(
    proportion_confint(count=bike_crashcount_2016, nobs=bike_trips_2016)
)
bike_crashes_per_trip

array([3.46466088e-05, 3.64551876e-05])

In [21]:
# Count the taxi collisions recorded in 2018, then estimate crashes per taxi
# trip as a (lower, upper) confidence interval with proportion_confint's
# default settings.
taxi_crash_years = taxi_crashes["TIME"].dt.year
taxi_crashcount_2018 = taxi_crash_years.eq(2018).sum()
taxi_crashes_per_trip = np.array(
    proportion_confint(count=taxi_crashcount_2018, nobs=taxi_trips_2018)
)
taxi_crashes_per_trip

array([0.00016905, 0.0001739 ])

In [22]:
# Bracket the taxi-to-bike crash-rate ratio: the smallest value pairs the taxi
# lower bound with the bike upper bound, the largest pairs the opposite ends.
taxi_low, taxi_high = taxi_crashes_per_trip
bike_low, bike_high = bike_crashes_per_trip
ratio = np.array([taxi_low / bike_high, taxi_high / bike_low])

In [23]:
# Show the (lowest, highest) taxi-to-bike ratio of crashes per trip.
ratio

array([4.63715293, 5.01919681])

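It looks like one is roughly 4.6 to 5 times more likely to be involved in a crash on a taxi trip than on a bike trip. Note that the taxi figures are for 2018 while the bike figures are for 2016.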

What about injuries and death though? (Compare with https://www1.nyc.gov/html/dot/html/bicyclists/bikestats.shtml#crashdata)

In [27]:
# All collisions recorded in 2016 (any vehicle type).
in_2016 = data["TIME"].dt.year == 2016
data_2016 = data.loc[in_2016]

In [29]:
# Cyclists injured plus cyclists killed across all 2016 collisions, expressed
# as a confidence interval on casualties per bike trip.
cyclist_casualty_cols = ["NUMBER OF CYCLIST INJURED", "NUMBER OF CYCLIST KILLED"]
bike_injurycount_2016 = data_2016[cyclist_casualty_cols].sum().sum()
bike_injuries_per_trip = np.array(
    proportion_confint(count=bike_injurycount_2016, nobs=bike_trips_2016)
)
bike_injuries_per_trip

array([2.90689146e-05, 3.07274926e-05])

In [32]:
# Motorists injured plus motorists killed in 2018 taxi collisions, expressed as
# a confidence interval on casualties per taxi trip.
# Maybe unfair because this includes also the opposing party, not just the taxi.
taxi_crashes_2018 = taxi_crashes.loc[taxi_crashes["TIME"].dt.year == 2018]
motorist_casualty_cols = ["NUMBER OF MOTORIST INJURED", "NUMBER OF MOTORIST KILLED"]
taxi_injurycount_2018 = taxi_crashes_2018[motorist_casualty_cols].sum().sum()
taxi_injuries_per_trip = np.array(
    proportion_confint(count=taxi_injurycount_2018, nobs=taxi_trips_2018)
)
taxi_injuries_per_trip

array([3.50031065e-05, 3.72290364e-05])

In terms of deaths and injuries per trip, cycling also appears slightly less dangerous than taking a taxi.

However, if an accident does occur, an injury is much more likely for a cyclist:

In [35]:
# Average motorist casualties (injured + killed) per taxi collision.
# Note that this also counts injuries of the other party.
taxi_casualties = taxi_crashes[["NUMBER OF MOTORIST INJURED", "NUMBER OF MOTORIST KILLED"]].sum().sum()
taxi_casualties / len(taxi_crashes)

0.2100605425278974

In [36]:
# Average cyclist casualties (injured + killed) per bike collision.
bike_casualties = bike_crashes[["NUMBER OF CYCLIST INJURED", "NUMBER OF CYCLIST KILLED"]].sum().sum()
bike_casualties / len(bike_crashes)

0.7284111896932294

Whose fault are accidents usually?

In [183]:
# Total number of bike collisions; this is the denominator for the shares computed below.
len(bike_crashes)

44398

In [12]:
# Contributing-factor column names for vehicle slots 1 through 5.
contrib = ["CONTRIBUTING FACTOR VEHICLE " + str(slot) for slot in range(1, 6)]

In [11]:
# Show the vehicle type columns; they are paired position by position with `contrib` below.
codes

['VEHICLE TYPE CODE 1',
 'VEHICLE TYPE CODE 2',
 'VEHICLE TYPE CODE 3',
 'VEHICLE TYPE CODE 4',
 'VEHICLE TYPE CODE 5']

In [13]:
# Show the contributing-factor columns; entry i belongs to the same vehicle slot as codes[i].
contrib

['CONTRIBUTING FACTOR VEHICLE 1',
 'CONTRIBUTING FACTOR VEHICLE 2',
 'CONTRIBUTING FACTOR VEHICLE 3',
 'CONTRIBUTING FACTOR VEHICLE 4',
 'CONTRIBUTING FACTOR VEHICLE 5']

In [187]:
# Share of bike crashes in which a bike itself was assigned a contributing
# factor. Each crash is flagged at most once: summing per-slot matches would
# count a crash with two bikes twice, while the denominator is the number of
# crashes, which would overstate the percentage.
bike_labels = ["bike", "e-bike"]
bike_had_factor = pd.Series(False, index=bike_crashes.index)
for v, c in zip(codes, contrib):
    # Vehicle slot v holds a bike ...
    is_bike = bike_crashes[v].isin(bike_labels)
    # ... and that same slot has a factor that is neither missing nor "Unspecified".
    factor_given = bike_crashes[c].notna() & (bike_crashes[c] != "Unspecified")
    bike_had_factor |= is_bike & factor_given
count = bike_had_factor.sum()
print(f"The bycicle had a contributing factor in {count/len(bike_crashes):.0%} of bike crashes.")

The bycicle had a contributing factor in 42% of bike crashes.


What are typical causes of crashes for bikes?

In [38]:
# Tally the contributing factors attributed to bikes: for each vehicle slot,
# keep the crashes whose vehicle in that slot is a bike and add that slot's
# factor counts to a running total (labels missing on one side count as 0).
total_crashes = pd.Series(dtype="Int64")
for type_col, factor_col in zip(codes, contrib):
    bike_in_slot = bike_crashes[type_col].isin(["bike", "e-bike"])
    crashes = bike_crashes[bike_in_slot]
    slot_counts = crashes[factor_col].value_counts()
    total_crashes = total_crashes.add(slot_counts, fill_value=0)
# Most frequent factors first.
total_crashes = total_crashes.sort_values(ascending=False)

In [39]:
# Temporarily lift pandas' row and column display limits so the whole tally prints.
show_everything = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*show_everything):
    print(total_crashes)

Unspecified                                              26256.0
Driver Inattention/Distraction                            6794.0
Pedestrian/Bicyclist/Other Pedestrian Error/Confusion     2670.0
Failure to Yield Right-of-Way                             1885.0
Passing or Lane Usage Improper                            1205.0
Traffic Control Disregarded                               1023.0
Following Too Closely                                      681.0
Unsafe Speed                                               621.0
Other Vehicular                                            609.0
Passing Too Closely                                        516.0
Driver Inexperience                                        470.0
Turning Improperly                                         272.0
Unsafe Lane Changing                                       226.0
View Obstructed/Limited                                    216.0
Reaction to Uninvolved Vehicle                             192.0
Alcohol Involvement      

I see 53 counts of defective brakes, 9 counts of defective tires, and 609 of "other vehicular".

In [40]:
# Counts read by hand from the factor tally printed above: "other vehicular",
# defective tires and defective brakes.
other_vehicular, defective_tires, defective_brakes = 609, 9, 53
count = other_vehicular + defective_tires + defective_brakes
print(f"The bycicle had a defect in {count/len(bike_crashes):.0%} of bike crashes.")

The bycicle had a defect in 2% of bike crashes.
