## Loading data

In [1]:
import numpy as np
import pandas as pd
import geoplotlib as gp
from scipy import stats
from collections import Counter
from __future__ import division
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder as LE

In [2]:
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions_reduced_data.csv", low_memory=False)

In [3]:
# Drop NaN values from Lat and Lon
df = df.dropna(subset=['LATITUDE','LONGITUDE'])

In [4]:
df = df[(np.abs(stats.zscore(df[['LATITUDE','LONGITUDE']])) < 1).all(axis=1)]
df.shape

(802922, 37)

In [5]:
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,Maximum Humidity,Mean Temperature,Min Temperature,Minimum Humidity,Precipitation,Sea Level Pressure,Snow,Snow Depth,Visibility,Wind Speed
0,2017/03/28,0:00,BRONX,10466,40.8946,-73.861206,"(40.8946, -73.861206)",,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
1,2017/03/28,0:00,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
2,2017/03/28,0:00,BROOKLYN,11235,40.583847,-73.94059,"(40.583847, -73.94059)",,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
3,2017/03/28,0:00,MANHATTAN,10019,40.76277,-73.97559,"(40.76277, -73.97559)",,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
4,2017/03/28,0:00,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9


In [6]:
df.columns

Index(['Date', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'Average Humidity', 'Dew Point', 'Events',
       'Max Gust Speed', 'Max Temperature', 'Max Wind Speed',
       'Maximum Humidity', 'Mean Temperature', 'Min Temperature',
       'Minimum Humidity', 'Precipitation', 'Sea Level Pressure', 'Snow',
       'Snow Depth', 'Visibility', 'Wind Speed'],
      dtype='object')

## Preprocessing data for modelling

First, model time as an hour of the day.

In [7]:
df['TIME'] = df['TIME'].apply(lambda x: int(x.split(':')[0]))

In [8]:
np.unique(df['TIME'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

Next, create a new column for months only.

In [9]:
df['Month'] = df['Date'].apply(lambda x: int(x.split('/')[1]))

In [10]:
np.unique(df['Month'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

Next, we use label encoder to numerically encode boroughs in NYC.

In [11]:
Counter(df['BOROUGH'])

Counter({nan: 93407,
         'BROOKLYN': 217596,
         'BRONX': 92414,
         'MANHATTAN': 183018,
         'STATEN ISLAND': 33073,
         'QUEENS': 183414})

Since there are some missing values - we have to deal with them. In this case, the missing borough values are replaced with 'UNKNOWN'.

In [12]:
df['BOROUGH'] = df['BOROUGH'].fillna(value='UNKNOWN')

Now the boroughs can be encoded with one-hot encoding and added to the dataframe for modelling. 

In [13]:
df_BR_dummies = pd.get_dummies(df['BOROUGH'])
df_BR_dummies.head()

Unnamed: 0,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN ISLAND,UNKNOWN
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,0,1,0,0,0
4,0,0,0,1,0,0


Add the one-hot encoded columns to the dataframe.

In [14]:
df = df.join(df_BR_dummies)
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,Snow Depth,Visibility,Wind Speed,Month,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN ISLAND,UNKNOWN
0,2017/03/28,0,BRONX,10466,40.8946,-73.861206,"(40.8946, -73.861206)",,0,0,...,0.0,7.0,9,3,1,0,0,0,0,0
1,2017/03/28,0,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,0.0,7.0,9,3,0,1,0,0,0,0
2,2017/03/28,0,BROOKLYN,11235,40.583847,-73.94059,"(40.583847, -73.94059)",,0,0,...,0.0,7.0,9,3,0,1,0,0,0,0
3,2017/03/28,0,MANHATTAN,10019,40.76277,-73.97559,"(40.76277, -73.97559)",,0,0,...,0.0,7.0,9,3,0,0,1,0,0,0
4,2017/03/28,0,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,0.0,7.0,9,3,0,0,0,1,0,0


Next, vehicle types will be encoded with one hot as well.

In [15]:
Counter(df['VEHICLE TYPE CODE 1'])

Counter({nan: 3932,
         'VAN': 22869,
         'SMALL COM VEH(4 TIRES) ': 12352,
         'BICYCLE': 1537,
         'BUS': 12317,
         'PEDICAB': 25,
         'UNKNOWN': 17859,
         'LIVERY VEHICLE': 8304,
         'SCOOTER': 262,
         'LARGE COM VEH(6 OR MORE TIRES)': 11129,
         'PASSENGER VEHICLE': 456919,
         'MOTORCYCLE': 4406,
         'PICK-UP TRUCK': 12611,
         'FIRE TRUCK': 700,
         'SPORT UTILITY / STATION WAGON': 181379,
         'OTHER': 20714,
         'AMBULANCE': 2087,
         'TAXI': 33520})

We will NaN values with 'OTHER'.

In [16]:
df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']] = df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']].fillna('OTHER')

Before that is done, vehicles are grouped into 4 classes depending on their size:

In [17]:
name_dict = {
'two_wheeler': ['BICYCLE', 'PEDICAB', 'SCOOTER', 'MOTORCYCLE'],
'small': ['PASSENGER VEHICLE', 'TAXI'],
'medium': ['AMBULANCE', 'SPORT UTILITY / STATION WAGON', 'PICK-UP TRUCK', 'SMALL COM VEH(4 TIRES) ', 'LIVERY VEHICLE', 'VAN'],
'large': ['BUS', 'FIRE TRUCK', 'LARGE COM VEH(6 OR MORE TIRES)'],
'other': ['OTHER', 'UNKNOWN']}

And replace those values in the dictionary.

In [18]:
for new_name in name_dict:
    
    for old_name in name_dict[new_name]:
    
        df_repl = df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']].replace(to_replace=old_name, value=new_name)

Now we can apply one-hot endcoding to the vehicle type as well.

In [19]:
df_VT_dummies = pd.get_dummies(df_repl[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']], prefix=['VTC1', 'VTC2'])
df_VT_dummies.head()

Unnamed: 0,VTC1_AMBULANCE,VTC1_BICYCLE,VTC1_BUS,VTC1_FIRE TRUCK,VTC1_LARGE COM VEH(6 OR MORE TIRES),VTC1_LIVERY VEHICLE,VTC1_MOTORCYCLE,VTC1_OTHER,VTC1_PASSENGER VEHICLE,VTC1_PEDICAB,...,VTC2_OTHER,VTC2_PASSENGER VEHICLE,VTC2_PEDICAB,VTC2_PICK-UP TRUCK,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


And we join that to the dataframe as well.

In [20]:
df = df.join(df_VT_dummies)
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,VTC2_OTHER,VTC2_PASSENGER VEHICLE,VTC2_PEDICAB,VTC2_PICK-UP TRUCK,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium
0,2017/03/28,0,BRONX,10466,40.8946,-73.861206,"(40.8946, -73.861206)",,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2017/03/28,0,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2017/03/28,0,BROOKLYN,11235,40.583847,-73.94059,"(40.583847, -73.94059)",,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2017/03/28,0,MANHATTAN,10019,40.76277,-73.97559,"(40.76277, -73.97559)",,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2017/03/28,0,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,0,1,0,0,0,0,0,0,0,0


Lastly, we do one-hot encoding for events. First we start by replacing NaN with Other.

In [105]:
df['Events'] = df['Events'].fillna('Other')

In [114]:
np.unique(df['Events'])

array(['Fog', 'Fog\n\t,\nRain', 'Fog\n\t,\nRain\n\t,\nSnow',
       'Fog\n\t,\nSnow', 'Other', 'Rain', 'Rain\n\t,\nSnow', 'Snow'], dtype=object)

In [158]:
df_EV_dummies = pd.DataFrame(columns=['Rain', 'Snow', 'Fog', 'Other'], index=df.index)

In [159]:
for column in df_EV_dummies.columns:
    df_EV_dummies[column] = np.zeros(df.shape[0])

In [163]:
df_EV_dummies.head()

Unnamed: 0,Rain,Snow,Fog,Other
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0


In [161]:
# loop through all rows in the dataframe
for index, row in enumerate(df['Events']):
    
    for event in row.split(','):
        
        event = event.strip()
        
        for c_idx, column in enumerate(['Rain', 'Snow', 'Fog', 'Other']):
            
            if event == column:
                df_EV_dummies.iloc[index, c_idx] = 1

In [162]:
# checking if the code worked
df_EV_dummies[df['Events'] == 'Fog\n\t,\nRain\n\t,\nSnow'].head()

Unnamed: 0,Rain,Snow,Fog,Other
230844,1.0,1.0,1.0,0.0
230845,1.0,1.0,1.0,0.0
230846,1.0,1.0,1.0,0.0
230847,1.0,1.0,1.0,0.0
230848,1.0,1.0,1.0,0.0


Rename columns because of same names in the main df, and remove the Other column as it does not provide valuable information.

In [166]:
df_EV_dummies.columns = ['Rain_EV', 'Snow_EV', 'Fog_EV', 'Other_EV']
df_EV_dummies = df_EV_dummies.drop('Other_EV', axis=1)

In [167]:
df_EV_dummies.head()

Unnamed: 0,Rain_EV,Snow_EV,Fog_EV
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


Now we can join the even and main dataframes.

In [168]:
df = df.join(df_EV_dummies)

In [169]:
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium,Y,Rain_EV,Snow_EV,Fog_EV
0,2017/03/28,0,BRONX,10466,40.8946,-73.861206,"(40.8946, -73.861206)",,0,0,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
1,2017/03/28,0,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
2,2017/03/28,0,BROOKLYN,11235,40.583847,-73.94059,"(40.583847, -73.94059)",,0,0,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
3,2017/03/28,0,MANHATTAN,10019,40.76277,-73.97559,"(40.76277, -73.97559)",,0,0,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
4,2017/03/28,0,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0


Now we can select the relevant columns for modelling:

In [170]:
df.columns

Index(['Date', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'Average Humidity', 'Dew Point', 'Events',
       'Max Gust Speed', 'Max Temperature', 'Max Wind Speed',
       'Maximum Humidity', 'Mean Temperature', 'Min Temperature',
       'Minimum Humidity', 'Precipitation', 'Sea Level Pressure', 'Snow',
       'Snow Depth', 'Visibility', 'Wind Speed', 'Month', 'BRONX', 'BROOKLYN',
       'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'UNKNOWN', 'VTC1_AMBULANCE',
       'VTC1_BICYCLE', 'VTC1_BUS', 'VTC1_FIRE TRUCK',
       'VTC1_LARGE COM VEH(6 OR 

In [171]:
df_ml = df.drop(['Date', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'Events'], axis=1)

In [172]:
df_ml.head(10)

Unnamed: 0,TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,Average Humidity,...,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium,Y,Rain_EV,Snow_EV,Fog_EV
0,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
5,0,0,0,0,0,0,0,0,0,95,...,0,0,0,1,0,0,0.0,1.0,0.0,0.0
6,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
7,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
8,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
9,0,1,0,0,0,0,0,1,0,95,...,0,0,0,1,0,0,0.0,1.0,0.0,0.0


Now we need to add our y (any person injured or killed) to our dataframe.

In [173]:
df_ml['Y'] = np.zeros(df.shape[0])

In [174]:
def fill_Y(row):
    
    if np.any(row[1:9] > 0):
        return 1
    else:
        return 0

In [175]:
df_ml['Y'] = df_ml.apply(lambda x: fill_Y(x), axis=1)

In [176]:
df_ml.head()

Unnamed: 0,TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,Average Humidity,...,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium,Y,Rain_EV,Snow_EV,Fog_EV
0,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,1.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,1.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,1.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,1.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,1.0,0.0,0.0


In [177]:
df_ml.to_csv('NYPD_ML_df_wY.csv', index=False)

## Creating a balanced dataset

In [180]:
df_ml = pd.read_csv('NYPD_ML_df_wY.csv', low_memory=False)

In [181]:
Counter(df_ml['Y'])

Counter({0: 651474, 1: 151448})

First, we have to balance the dataset so there is the same amount of casualties and non-casualties.

In [182]:
pos_all = df_ml[df_ml['Y'] == 1]
neg_all = df_ml[df_ml['Y'] == 0]

In [183]:
no_pos = pos_all.shape[0]

neg_sub = neg_all.sample(no_pos)

In [184]:
neg_sub.head()

Unnamed: 0,TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,Average Humidity,...,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium,Y,Rain_EV,Snow_EV,Fog_EV
783071,13,0,0,0,0,0,0,0,0,63,...,0,0,0,0,0,0,0,0.0,0.0,0.0
342971,13,0,0,0,0,0,0,0,0,27,...,0,0,0,0,0,0,0,0.0,0.0,0.0
304519,14,0,0,0,0,0,0,0,0,74,...,0,0,1,0,0,0,0,0.0,0.0,0.0
709343,19,0,0,0,0,0,0,0,0,49,...,0,0,0,1,0,0,0,0.0,0.0,0.0
666716,11,0,0,0,0,0,0,0,0,32,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [185]:
df_comb = pd.concat([pos_all, neg_sub])

In [188]:
df_comb.head()

Unnamed: 0,TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,Average Humidity,...,VTC2_SCOOTER,VTC2_SMALL COM VEH(4 TIRES),VTC2_SPORT UTILITY / STATION WAGON,VTC2_TAXI,VTC2_UNKNOWN,VTC2_medium,Y,Rain_EV,Snow_EV,Fog_EV
9,0,1,0,0,0,0,0,1,0,95,...,0,0,0,1,0,0,1,1.0,0.0,0.0
16,10,1,0,1,0,0,0,0,0,95,...,0,0,0,0,0,0,1,1.0,0.0,0.0
20,10,1,0,0,0,0,0,1,0,95,...,0,0,1,0,0,0,1,1.0,0.0,0.0
30,10,1,0,0,0,0,0,1,0,95,...,0,0,1,0,0,0,1,1.0,0.0,0.0
31,10,2,0,0,0,0,0,2,0,95,...,0,0,1,0,0,0,1,1.0,0.0,0.0


In [187]:
df_comb.to_csv('NYPD_ML_df_wY_balanced.csv', index=False)

## Actual modelling

In [189]:
df_comb = pd.read_csv('NYPD_ML_df_wY_balanced.csv', low_memory=False)

In [195]:
df_comb.columns

Index(['TIME', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
       'Average Humidity', 'Dew Point', 'Max Gust Speed', 'Max Temperature',
       'Max Wind Speed', 'Maximum Humidity', 'Mean Temperature',
       'Min Temperature', 'Minimum Humidity', 'Precipitation',
       'Sea Level Pressure', 'Snow', 'Snow Depth', 'Visibility', 'Wind Speed',
       'Month', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND',
       'UNKNOWN', 'VTC1_AMBULANCE', 'VTC1_BICYCLE', 'VTC1_BUS',
       'VTC1_FIRE TRUCK', 'VTC1_LARGE COM VEH(6 OR MORE TIRES)',
       'VTC1_LIVERY VEHICLE', 'VTC1_MOTORCYCLE', 'VTC1_OTHER',
       'VTC1_PASSENGER VEHICLE', 'VTC1_PEDICAB', 'VTC1_PICK-UP TRUCK',
       'VTC1_SCOOTER', 'VTC1_SMALL COM VEH(4 TIRES) ',
       'VTC1_SPORT UTILITY / STATION WAGON', 'V

In [190]:
from sklearn.ensemble import RandomForestClassifier

In [191]:
# Selecting X and y
X, y = df_comb.iloc[:, 1:-1].values, df_comb.iloc[:, -1].values 
y = y.astype(int) 

In [192]:
# Defining the model
forest = RandomForestClassifier(criterion='gini', random_state=0, n_estimators=500, n_jobs=-1)

In [193]:
# Evaluating performance
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [194]:
forest.fit(X_train, y_train)

ValueError: could not convert string to float: 'T'