In [None]:
import os
from datetime import datetime, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go

References:
- https://www.kite.com/python/answers/how-to-select-rows-by-multiple-label-conditions-with-pandas-loc-in-python
- https://stackoverflow.com/questions/50375985/pandas-add-column-with-value-based-on-condition-based-on-other-columns
- https://stackoverflow.com/questions/19384532/get-statistics-for-each-group-such-as-count-mean-etc-using-pandas-groupby
- https://stackoverflow.com/questions/44111307/python-pandas-count-rows-based-on-column
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
- https://datatofish.com/line-chart-python-matplotlib/
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

In [None]:
# Placeholders for the two datasets. Both names are rebound further down by the
# call to load_collision_data(), which returns freshly loaded frames.
crashes = None
persons = None

In [None]:
# CSV file names of the crash-level and person-level collision datasets.
# They are relative paths, so they resolve against the current working directory.
NYC_collision_crashes_file = "Motor_Vehicle_Collisions_-_Crashes.csv"
NYC_collision_persons_file = "Motor_Vehicle_Collisions_-_Person.csv"

In [None]:
def load_collision_data(crashes, persons, crashes_file, persons_file):
    """Read the crash and person CSV files and tag every crash with its year.

    Parameters
    ----------
    crashes, persons
        Not read; both names are immediately rebound to the loaded frames.
    crashes_file, persons_file
        Locations of the two CSV files, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(crashes, persons)``. The crashes frame carries an extra integer
        ``CRASH_YEAR`` column: the text after the last ``/`` of ``CRASH DATE``.
    """
    def _year_of(date_text):
        # The year is whatever follows the final '/' in the date string.
        return int(date_text.split('/')[-1])

    crashes = pd.read_csv(crashes_file, low_memory=False)
    persons = pd.read_csv(persons_file, low_memory=False)

    date_text = crashes['CRASH DATE'].astype(np.str_)
    crashes['CRASH_YEAR'] = date_text.apply(_year_of)

    return crashes, persons

In [None]:
crashes, persons = load_collision_data(crashes, persons, NYC_collision_crashes_file, NYC_collision_persons_file)

In [None]:
crashes.columns

In [None]:
persons.columns

## Hypothesis:
#### Of all collisions occurring late at night (between 12 am and 5 am), the majority are caused by unsafe speed.

In [None]:
# Placeholder only; rebound to the filtered frame when get_night_crashes() is called below.
night_crash_data = None

In [None]:
def get_night_crashes(crashes):
    """Return the crashes whose ``CRASH TIME`` is earlier than 05:00.

    The input frame is left untouched, so the cell that calls this function can
    be re-run safely. (Previously ``CRASH TIME`` was overwritten in place, and a
    second call failed because ``strptime`` was handed ``time`` objects.)

    Parameters
    ----------
    crashes : pandas.DataFrame
        Must contain a ``CRASH TIME`` column holding ``"%H:%M"`` strings
        (values that are already ``datetime.time`` are accepted as-is).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with ``CRASH TIME`` converted
        to ``datetime.time`` objects.
    """
    def _as_time(value):
        # Tolerate frames whose times were already parsed by an earlier run.
        if isinstance(value, time):
            return value
        return datetime.strptime(value, "%H:%M").time()

    crash_times = crashes['CRASH TIME'].apply(_as_time)
    is_night = crash_times < time(5, 0, 0)

    # .copy() gives callers a frame they can add columns to without
    # chained-assignment warnings or accidental writes into `crashes`.
    night_crash_data = crashes.loc[is_night].copy()
    night_crash_data['CRASH TIME'] = crash_times[is_night]

    return night_crash_data

In [None]:
def check_for_unsafe_speed(night_crash_data):
    """Return a copy of the frame with a boolean ``hasUnsafeSpeed`` column.

    A row is flagged when any of the five ``CONTRIBUTING FACTOR VEHICLE n``
    columns equals ``'Unsafe Speed'``. The input frame is not modified.
    """
    factor_columns = [
        'CONTRIBUTING FACTOR VEHICLE 1',
        'CONTRIBUTING FACTOR VEHICLE 2',
        'CONTRIBUTING FACTOR VEHICLE 3',
        'CONTRIBUTING FACTOR VEHICLE 4',
        'CONTRIBUTING FACTOR VEHICLE 5',
    ]

    # OR the per-column matches together, one vehicle column at a time.
    speeding = night_crash_data[factor_columns[0]] == 'Unsafe Speed'
    for column in factor_columns[1:]:
        speeding = speeding | (night_crash_data[column] == 'Unsafe Speed')

    return night_crash_data.assign(hasUnsafeSpeed=speeding.to_numpy())

In [None]:
def calculate_percentage_of_speedy_collisions(night_crash_data):
    """Percentage of rows whose ``hasUnsafeSpeed`` flag is true.

    The flagged rows are counted directly. The previous implementation took the
    second entry of ``value_counts()``, which is ordered by frequency: it
    returned the share of whichever value was rarer and raised ``IndexError``
    when only one value was present.

    Parameters
    ----------
    night_crash_data : pandas.DataFrame
        Frame with a boolean ``hasUnsafeSpeed`` column.

    Returns
    -------
    pandas.Series
        A one-element Series (label ``'hasUnsafeSpeed'``) holding the
        percentage, kept as a Series so existing display cells still work.
        The value is NaN for an empty frame.
    """
    total_rows = night_crash_data.shape[0]
    flagged_rows = int(night_crash_data['hasUnsafeSpeed'].astype(bool).sum())

    # Guard the empty frame explicitly instead of dividing by zero.
    percentage = flagged_rows * 100 / total_rows if total_rows else float('nan')

    return pd.Series([percentage], index=['hasUnsafeSpeed'], name=True)

In [None]:
def calculate_invalid_collision_percentage(night_crash_data):
    """Percentage of rows with no usable contributing factor in any vehicle column.

    A row counts as invalid when each of the five
    ``CONTRIBUTING FACTOR VEHICLE n`` columns is either null or one of
    ``'1'``, ``'80'``, ``'Unspecified'``.

    The invalid rows are counted directly. The previous implementation took the
    second entry of ``value_counts()``, which is ordered by frequency: it
    returned the share of whichever value was rarer and raised ``IndexError``
    when only one value was present.

    Side effect: as before, a boolean ``isUnspecified`` column is written onto
    the frame that was passed in.

    Returns
    -------
    pandas.Series
        A one-element Series (label ``'isUnspecified'``) holding the
        percentage. The value is NaN for an empty frame.
    """
    unwanted_contributing_factors = ['1', '80', 'Unspecified']
    factor_columns = [
        'CONTRIBUTING FACTOR VEHICLE 1',
        'CONTRIBUTING FACTOR VEHICLE 2',
        'CONTRIBUTING FACTOR VEHICLE 3',
        'CONTRIBUTING FACTOR VEHICLE 4',
        'CONTRIBUTING FACTOR VEHICLE 5',
    ]

    factors = night_crash_data[factor_columns]
    # Invalid only when *every* vehicle column is missing or a placeholder code.
    is_unspecified = (
        factors.isin(unwanted_contributing_factors) | factors.isnull()
    ).all(axis=1)

    # Assign the raw array so the result lands row-by-row regardless of index.
    night_crash_data['isUnspecified'] = is_unspecified.to_numpy()

    total_rows = night_crash_data.shape[0]
    invalid_rows = int(is_unspecified.sum())
    percentage = invalid_rows * 100 / total_rows if total_rows else float('nan')

    return pd.Series([percentage], index=['isUnspecified'], name=True)

In [None]:
night_crash_data = get_night_crashes(crashes)

In [None]:
night_crash_unsafe_speed_data = check_for_unsafe_speed(night_crash_data)

In [None]:
percentage_unsafe_speed_collisions = calculate_percentage_of_speedy_collisions(night_crash_unsafe_speed_data)

In [None]:
percentage_unsafe_speed_collisions

In [None]:
percentage_invalid_collision_data = calculate_invalid_collision_percentage(night_crash_data)

In [None]:
percentage_invalid_collision_data

## Hypothesis:

#### Of all crashes, the majority are caused by persons between the ages of 16 and 25.

In [None]:
# Attach every person record to its crash; the inner join keeps only
# collisions that appear in both tables.
crashes_persons = crashes.merge(persons, on='COLLISION_ID', how='inner')
crashes_persons.shape

In [None]:
# Recompute CRASH_YEAR (as text) from the person table's CRASH_DATE.
# Plain column assignment replaces the column outright; the previous
# `.loc[:, 'CRASH_YEAR'] = ...` form tried to write strings into the integer
# CRASH_YEAR column that the merge carried over from `crashes`.
crashes_persons['CRASH_YEAR'] = (
    crashes_persons['CRASH_DATE'].astype(np.str_).apply(lambda x: x.split('/')[-1])
)

In [None]:
# Remove, one at a time, the person-table columns that are not used further down.
for unused_column in ('CRASH_DATE', 'CRASH_TIME', 'UNIQUE_ID'):
    del crashes_persons[unused_column]

In [None]:
crashes_persons.columns

In [None]:
crashes_persons['PERSON_TYPE'].unique()

In [None]:
crashes_persons[(crashes_persons['VEHICLE_ID'].isna())][['COLLISION_ID', 'PERSON_TYPE', 'POSITION_IN_VEHICLE']]

In [None]:
crashes_persons[(crashes_persons['VEHICLE_ID'].isna()) & 
                (crashes_persons['PERSON_TYPE'] == 'Bicyclist')][['COLLISION_ID', 'PERSON_TYPE', 'POSITION_IN_VEHICLE']]

In [None]:
# Discard person records that are not linked to any vehicle (VEHICLE_ID is null).
rows_without_vehicle = crashes_persons.index[crashes_persons['VEHICLE_ID'].isna()]
crashes_persons.drop(index=rows_without_vehicle, inplace=True)
crashes_persons.shape

In [None]:
# Drivers only, restricted to the columns needed for the age analysis.
# .copy() makes this an independent frame, so adding the 'age16-25' column
# later neither warns about chained assignment nor touches crashes_persons.
data_subset = crashes_persons.loc[
    crashes_persons['POSITION_IN_VEHICLE'] == 'Driver',
    ['COLLISION_ID', 'VEHICLE_ID', 'PERSON_TYPE', 'POSITION_IN_VEHICLE', 'PERSON_AGE'],
].copy()

In [None]:
data_subset

In [None]:
# Flag drivers whose age is strictly between 15 and 26; missing ages compare
# as False and therefore end up unflagged.
driver_age = data_subset['PERSON_AGE']
data_subset.loc[:, 'age16-25'] = ((driver_age > 15) & (driver_age < 26)).to_numpy()

In [None]:
data_subset['age16-25'].value_counts()

## Hypothesis

#### The number of collisions increased with an increase in population

Source - https://worldpopulationreview.com/us-cities/new-york-city-ny-population

In [None]:
def get_NYC_population_data():
    """Build the yearly NYC population table for 2012-2020.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Population`` and ``Population_Density``
        (population divided by the city area in square miles).
    """
    # Source - https://worldpopulationreview.com/us-cities/new-york-city-ny-population
    NYC_area = 300.4  # square miles, same source as the population figures

    years = [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
    residents = [8348030, 8398740, 8437390, 8468180, 8475980,
                 8438270, 8398750, 8361040, 8323340]

    NYC_Population = pd.DataFrame(
        {'Year': years, 'Population': residents},
        columns=['Year', 'Population'],
    )
    NYC_Population['Population_Density'] = NYC_Population['Population'] / NYC_area

    return NYC_Population

In [None]:
def get_total_crashes_per_year(crashes):
    """Count crashes per ``CRASH_YEAR`` and return the counts ordered by year.

    The grouping only reads from ``crashes``, so the defensive deep copy of the
    whole frame that used to precede it has been removed.

    Parameters
    ----------
    crashes : pandas.DataFrame
        Frame with a ``CRASH_YEAR`` column (integers or integer-like strings).

    Returns
    -------
    pandas.DataFrame
        Columns ``CRASH_YEAR`` (int64) and ``Total_Crashes``, sorted by year.
    """
    crashes_total = (
        crashes.groupby(['CRASH_YEAR'], sort=False)
        .size()
        .reset_index(name='Total_Crashes')
    )
    # Normalise the key to int64 so it sorts numerically and can be joined to
    # an integer year column.
    crashes_total['CRASH_YEAR'] = crashes_total['CRASH_YEAR'].astype('int64')

    return crashes_total.sort_values(by=['CRASH_YEAR'])

In [None]:
def calculate_crashes_per_capita(crashes_total, NYC_Population):
    """Join yearly crash totals to the population table and add a per-capita rate.

    Only years present in both inputs are kept (inner join of ``CRASH_YEAR``
    on ``Year``). The result gains a ``Crashes_per_capita`` column equal to
    ``Total_Crashes / Population``.
    """
    crashes_population = crashes_total.merge(
        NYC_Population,
        how='inner',
        left_on='CRASH_YEAR',
        right_on='Year',
    )

    per_capita = crashes_population['Total_Crashes'] / crashes_population['Population']
    crashes_population['Crashes_per_capita'] = per_capita

    return crashes_population

In [None]:
def plot_crashes_per_capita_vs_year(crashes_population):
    """Draw crashes per capita against year as a red line with circle markers.

    Expects ``Year`` and ``Crashes_per_capita`` columns; shows the figure and
    returns nothing.
    """
    years = crashes_population['Year']
    per_capita = crashes_population['Crashes_per_capita']

    plt.plot(years, per_capita, marker='o', color='red')
    plt.xlabel('Year')
    plt.ylabel('Crashes_per_capita')
    plt.title('Crashes_per_Capita Vs Year for NYC')
    plt.show()

In [None]:
def plot_crashes_per_capita_vs_population_density(crashes_population):
    """Draw crashes per capita (y) against population density (x).

    The data series now match the axis labels; previously the two columns were
    passed to ``plt.plot`` in the opposite order, so each axis showed the
    other quantity. The title now says "Population Density", which is what is
    actually plotted.

    Expects ``Population_Density`` and ``Crashes_per_capita`` columns; shows
    the figure and returns nothing.
    """
    plt.plot(crashes_population['Population_Density'], crashes_population['Crashes_per_capita'],
             color='red', marker='o')
    plt.title('Crashes_per_Capita Vs Population Density for NYC')
    plt.xlabel('Population Density')
    plt.ylabel('Crashes_per_capita')
    plt.show()

In [None]:
NYC_Population = get_NYC_population_data()

In [None]:
crashes_total = get_total_crashes_per_year(crashes)

In [None]:
crashes_population = calculate_crashes_per_capita(crashes_total, NYC_Population)

In [None]:
# `crashes_population_sorted` was never defined (NameError on a fresh kernel).
# `crashes_population` is already in year order: the yearly totals are sorted
# before the inner merge, which keeps the order of the left keys.
plot_crashes_per_capita_vs_year(crashes_population)

In [None]:
# `crashes_population_sorted` was never defined (NameError on a fresh kernel);
# plot the merged, year-ordered `crashes_population` frame instead.
plot_crashes_per_capita_vs_population_density(crashes_population)

In [None]:
# Drop the rows labelled 0 and 8 of the merged table.
# NOTE(review): presumably these are the first and last years (2012 and 2020),
# excluded as partial years -- confirm against the data.
# `index=` replaces the positional axis argument, which pandas 2.x rejects.
crashes_population_subset = crashes_population.drop(index=[0, 8])

In [None]:
plot_crashes_per_capita_vs_year(crashes_population_subset)

In [None]:
plot_crashes_per_capita_vs_population_density(crashes_population_subset)

## Hypothesis

#### Crash locations are not random. The collisions are bound to specific areas due to a badly planned network of roads/traffic signs.

In [None]:
# Read the Mapbox token from the environment instead of committing it to the
# notebook. The token that used to be hard-coded here should be treated as
# leaked and rotated in the Mapbox account.
mapbox_access_token = os.environ.get('MAPBOX_ACCESS_TOKEN')

# Filter once and reuse: crashes recorded in Staten Island during 2020.
staten_island_2020 = crashes[(crashes['BOROUGH'] == 'STATEN ISLAND') &
                             (crashes['CRASH_YEAR'] == 2020)]

fig = go.Figure(go.Scattermapbox(
    lat=staten_island_2020['LATITUDE'].tolist(),
    lon=staten_island_2020['LONGITUDE'].tolist(),
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=5
    ),
    # Hover text: the first vehicle's contributing factor.
    text=staten_island_2020['CONTRIBUTING FACTOR VEHICLE 1'],
))

mapbox_layout = dict(
    bearing=0,
    center=go.layout.mapbox.Center(
        lat=40.7,
        lon=-74
    ),
    pitch=0,
    zoom=8
)
if mapbox_access_token:
    mapbox_layout['accesstoken'] = mapbox_access_token
else:
    # Without a token, fall back to the OpenStreetMap tiles, which need none.
    mapbox_layout['style'] = 'open-street-map'

fig.update_layout(
    hovermode='closest',
    width=960,
    height=600,
    mapbox=mapbox_layout
)

fig.show()