## Loading data

In [1]:
import numpy as np
import pandas as pd
import geoplotlib as gp
from scipy import stats
from collections import Counter
import pickle
from __future__ import division
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder as LE

In [2]:
df = pd.read_csv("NYPD_Motor_Vehicle_Collisions_reduced_data.csv", low_memory=False)

In [3]:
# Drop rows that are missing a coordinate, the street name or the borough.
df = df.dropna(subset=['LATITUDE', 'LONGITUDE', 'ON STREET NAME', 'BOROUGH'])

# Drop rows whose street name consists only of whitespace.
# The pattern is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape sequence.
df = df[~df['ON STREET NAME'].str.contains(r'^\s+$')]

In [4]:
# Keep only the rows whose latitude AND longitude both have an absolute
# z-score strictly below 1, then show the remaining shape.
coord_zscores = stats.zscore(df[['LATITUDE', 'LONGITUDE']])
within_bounds = (np.abs(coord_zscores) < 1).all(axis=1)
df = df[within_bounds]
df.shape

(656094, 37)

In [5]:
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,Maximum Humidity,Mean Temperature,Min Temperature,Minimum Humidity,Precipitation,Sea Level Pressure,Snow,Snow Depth,Visibility,Wind Speed
1,2017/03/28,0:00,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
4,2017/03/28,0:00,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
7,2017/03/28,0:01,BROOKLYN,11234,40.6224,-73.936646,"(40.6224, -73.936646)",FLATBUSH AVENUE,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
16,2017/03/28,10:02,BROOKLYN,11230,40.62713,-73.97528,"(40.62713, -73.97528)",ELMWOOD AVENUE,1,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9
21,2017/03/28,10:16,BRONX,10455,40.81446,-73.89686,"(40.81446, -73.89686)",BRUCKNER BOULEVARD,0,0,...,100,7,5,89,18.29,1014.0,0.0,0.0,7.0,9


In [6]:
Counter(df['ON STREET NAME']).most_common(5)

[('BROADWAY                        ', 8818),
 ('ATLANTIC AVENUE                 ', 7872),
 ('NORTHERN BOULEVARD              ', 5924),
 ('3 AVENUE                        ', 5688),
 ('FLATBUSH AVENUE                 ', 4681)]

## Preprocessing data for modelling

First, we load and add the speed limit of each street to the dataframe.

In [7]:
# Load the pickled speed-limit lookup. The lookup cell further down indexes it
# as speed_dict[<lower-cased borough>][<stripped street name>].
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
with open('speed_limits/speeds.pickle', 'rb') as handle:
    speed_dict = pickle.load(handle)

In [8]:
df['street_SL'] = np.zeros(df.shape[0])

In [9]:
# Normalise the lookup keys: boroughs are lower-cased and street names have
# their surrounding whitespace removed before they are used as dictionary keys.
boroughs = [borough.lower() for borough in df['BOROUGH']]
streets = [street.strip() for street in df['ON STREET NAME']]


def _speed_limit_for(borough, street):
    """Return speed_dict[borough][street], or NaN when either key is missing."""
    try:
        return speed_dict[borough][street]
    except KeyError:
        return np.nan


# One speed limit (or NaN) per row, in the same order as the rows of df.
speeds = [_speed_limit_for(borough, street)
          for borough, street in zip(boroughs, streets)]

In [10]:
speeds = np.array(speeds)

In [11]:
df['street_SL'] = speeds

In [12]:
df = df.dropna(subset=['street_SL'])

Then, model time as an hour of the day.

In [13]:
# 'TIME' holds strings such as '10:02'; keep only the part before the first
# colon and store it as an integer hour.
df['TIME'] = df['TIME'].str.split(':').str[0].astype('int64')

In [14]:
np.unique(df['TIME'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

Next, create a new column for months only.

In [15]:
# 'Date' holds strings such as '2017/03/28'; the second '/'-separated field is
# the month, stored here as an integer.
df['Month'] = df['Date'].str.split('/').str[1].astype('int64')

In [16]:
np.unique(df['Month'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

Next, we numerically encode the boroughs in NYC (one-hot encoding is applied below). First, we look at how many collisions fall into each borough.

In [17]:
Counter(df['BOROUGH'])

Counter({'BRONX': 79292,
         'BROOKLYN': 198040,
         'MANHATTAN': 168870,
         'QUEENS': 166370})

Any missing borough values are replaced with 'UNKNOWN'. Rows without a borough were already dropped when the data was loaded, so this step is only a safeguard and no 'UNKNOWN' category appears in the encoding below.

In [18]:
# Replace any missing borough with the placeholder 'UNKNOWN'.
# Rows with a NaN BOROUGH were already removed by the dropna() call in the
# loading section, so this only acts as a safeguard.
df['BOROUGH'] = df['BOROUGH'].fillna(value='UNKNOWN')

Now the boroughs can be encoded with one-hot encoding and added to the dataframe for modelling. 

In [19]:
df_BR_dummies = pd.get_dummies(df['BOROUGH'])
df_BR_dummies.head()

Unnamed: 0,BRONX,BROOKLYN,MANHATTAN,QUEENS
1,0,1,0,0
4,0,0,0,1
7,0,1,0,0
16,0,1,0,0
21,1,0,0,0


Add the one-hot encoded columns to the dataframe.

In [20]:
df = df.join(df_BR_dummies)
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,Snow,Snow Depth,Visibility,Wind Speed,street_SL,Month,BRONX,BROOKLYN,MANHATTAN,QUEENS
1,2017/03/28,0,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,0.0,0.0,7.0,9,25.0,3,0,1,0,0
4,2017/03/28,0,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,0.0,0.0,7.0,9,25.0,3,0,0,0,1
7,2017/03/28,0,BROOKLYN,11234,40.6224,-73.936646,"(40.6224, -73.936646)",FLATBUSH AVENUE,0,0,...,0.0,0.0,7.0,9,35.0,3,0,1,0,0
16,2017/03/28,10,BROOKLYN,11230,40.62713,-73.97528,"(40.62713, -73.97528)",ELMWOOD AVENUE,1,0,...,0.0,0.0,7.0,9,25.0,3,0,1,0,0
21,2017/03/28,10,BRONX,10455,40.81446,-73.89686,"(40.81446, -73.89686)",BRUCKNER BOULEVARD,0,0,...,0.0,0.0,7.0,9,30.0,3,1,0,0,0


Next, vehicle types will be encoded with one hot as well.

In [21]:
Counter(df['VEHICLE TYPE CODE 1'])

Counter({nan: 2229,
         'TAXI': 29362,
         'UNKNOWN': 15562,
         'MOTORCYCLE': 3303,
         'VAN': 19883,
         'LIVERY VEHICLE': 7517,
         'FIRE TRUCK': 621,
         'AMBULANCE': 1852,
         'PEDICAB': 21,
         'SMALL COM VEH(4 TIRES) ': 10591,
         'PASSENGER VEHICLE': 338452,
         'OTHER': 17428,
         'LARGE COM VEH(6 OR MORE TIRES)': 9213,
         'SCOOTER': 240,
         'BICYCLE': 1170,
         'SPORT UTILITY / STATION WAGON': 134787,
         'PICK-UP TRUCK': 9176,
         'BUS': 11165})

We will replace NaN values with 'OTHER'.

In [22]:
df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']] = df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']].fillna('OTHER')

Before the one-hot encoding is applied, vehicles are grouped into four classes depending on their size, plus an 'other' class for unspecified vehicles:

In [23]:
# Maps each vehicle class (key) to the raw 'VEHICLE TYPE CODE' labels that
# belong to it. The labels must match the data exactly, including the trailing
# space in 'SMALL COM VEH(4 TIRES) '.
name_dict = {
    'two_wheeler': [
        'BICYCLE',
        'PEDICAB',
        'SCOOTER',
        'MOTORCYCLE',
    ],
    'small': [
        'PASSENGER VEHICLE',
        'TAXI',
    ],
    'medium': [
        'AMBULANCE',
        'SPORT UTILITY / STATION WAGON',
        'PICK-UP TRUCK',
        'SMALL COM VEH(4 TIRES) ',
        'LIVERY VEHICLE',
        'VAN',
    ],
    'large': [
        'BUS',
        'FIRE TRUCK',
        'LARGE COM VEH(6 OR MORE TIRES)',
    ],
    'other': [
        'OTHER',
        'UNKNOWN',
    ],
}

Then we replace the original vehicle types in the dataframe with the class names from the dictionary.

In [24]:
# Work on a copy of the two vehicle-type columns so the raw labels in `df`
# stay untouched.
df_repl = df[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']].copy()

# Invert name_dict (class -> raw labels) into a flat raw-label -> class lookup,
# so every label is rewritten in a single replace() call instead of one
# full-frame replace() per raw label.
old_to_new = {old_name: new_name
              for new_name, old_names in name_dict.items()
              for old_name in old_names}

# Values that are not keys of the lookup are left unchanged, exactly as with
# the label-by-label replacement.
df_repl = df_repl.replace(old_to_new)

Now we can apply one-hot encoding to the vehicle type as well.

In [25]:
df_VT_dummies = pd.get_dummies(df_repl[['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2']], prefix=['VTC1', 'VTC2'])
df_VT_dummies.head()

Unnamed: 0,VTC1_large,VTC1_medium,VTC1_other,VTC1_small,VTC1_two_wheeler,VTC2_large,VTC2_medium,VTC2_other,VTC2_small,VTC2_two_wheeler
1,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,1,0
7,0,0,1,0,0,0,0,1,0,0
16,0,1,0,0,0,0,0,1,0,0
21,0,0,1,0,0,0,0,1,0,0


And we join that to the dataframe as well.

In [26]:
df = df.join(df_VT_dummies)
df.head()

Unnamed: 0,Date,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,...,VTC1_large,VTC1_medium,VTC1_other,VTC1_small,VTC1_two_wheeler,VTC2_large,VTC2_medium,VTC2_other,VTC2_small,VTC2_two_wheeler
1,2017/03/28,0,BROOKLYN,11218,40.65408,-73.97761,"(40.65408, -73.97761)",18 STREET,0,0,...,0,0,0,1,0,0,0,1,0,0
4,2017/03/28,0,QUEENS,11429,40.713715,-73.73144,"(40.713715, -73.73144)",222 STREET,0,0,...,0,1,0,0,0,0,0,0,1,0
7,2017/03/28,0,BROOKLYN,11234,40.6224,-73.936646,"(40.6224, -73.936646)",FLATBUSH AVENUE,0,0,...,0,0,1,0,0,0,0,1,0,0
16,2017/03/28,10,BROOKLYN,11230,40.62713,-73.97528,"(40.62713, -73.97528)",ELMWOOD AVENUE,1,0,...,0,1,0,0,0,0,0,1,0,0
21,2017/03/28,10,BRONX,10455,40.81446,-73.89686,"(40.81446, -73.89686)",BRUCKNER BOULEVARD,0,0,...,0,0,1,0,0,0,0,1,0,0


Lastly, we do one-hot encoding for events. First we start by replacing NaN with Other.

In [27]:
df['Events'] = df['Events'].fillna('Other')

In [28]:
np.unique(df['Events'])

array(['Fog', 'Fog\n\t,\nRain', 'Fog\n\t,\nRain\n\t,\nSnow',
       'Fog\n\t,\nSnow', 'Other', 'Rain', 'Rain\n\t,\nSnow', 'Snow'], dtype=object)

In [29]:
df_EV_dummies = pd.DataFrame(columns=['Rain', 'Snow', 'Fog', 'Other'], index=df.index)

In [30]:
# Initialise every event indicator column with float zeros, one value per row
# of df.
n_rows = df.shape[0]

for event_name in df_EV_dummies.columns:
    df_EV_dummies[event_name] = np.zeros(n_rows)

In [31]:
df_EV_dummies.head()

Unnamed: 0,Rain,Snow,Fog,Other
1,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0


In [32]:
# loop through all rows in the dataframe
for index, row in enumerate(df['Events']):
    
    for event in row.split(','):
        
        event = event.strip()
        
        for c_idx, column in enumerate(['Rain', 'Snow', 'Fog', 'Other']):
            
            if event == column:
                df_EV_dummies.iloc[index, c_idx] = 1

In [33]:
# checking if the code worked
df_EV_dummies[df['Events'] == 'Fog\n\t,\nRain\n\t,\nSnow'].head()

Unnamed: 0,Rain,Snow,Fog,Other
230846,1.0,1.0,1.0,0.0
230847,1.0,1.0,1.0,0.0
230848,1.0,1.0,1.0,0.0
230850,1.0,1.0,1.0,0.0
230851,1.0,1.0,1.0,0.0


Rename columns because of same names in the main df, and remove the Other column as it does not provide valuable information.

In [34]:
# Give the four indicator columns their '_EV' names (positionally), then keep
# only the three informative ones, which discards 'Other_EV'.
event_flag_names = ['Rain_EV', 'Snow_EV', 'Fog_EV', 'Other_EV']
df_EV_dummies.columns = event_flag_names
df_EV_dummies = df_EV_dummies[['Rain_EV', 'Snow_EV', 'Fog_EV']]

In [35]:
df_EV_dummies.head()

Unnamed: 0,Rain_EV,Snow_EV,Fog_EV
1,1.0,0.0,0.0
4,1.0,0.0,0.0
7,1.0,0.0,0.0
16,1.0,0.0,0.0
21,1.0,0.0,0.0


Now we can join the event and main dataframes.

In [36]:
df = df.join(df_EV_dummies)

In [None]:
df.head()

We also model the contributing factor in vehicle one.

In [None]:
# The 16 most frequent values of 'CONTRIBUTING FACTOR VEHICLE 1', as
# (value, count) pairs ordered by decreasing count.
top_factors = Counter(df['CONTRIBUTING FACTOR VEHICLE 1']).most_common(16)

# 'Unspecified' is excluded. Filtering it out (instead of calling list.remove)
# avoids a ValueError when 'Unspecified' is not among the top 16.
cont_to_keep = [factor for factor, _count in top_factors
                if factor != 'Unspecified']
cont_to_keep

In the end we have 338228 observations that had some contributing factor:

In [None]:
df[df['CONTRIBUTING FACTOR VEHICLE 1'].isin(cont_to_keep)].shape

In [None]:
df = df[df['CONTRIBUTING FACTOR VEHICLE 1'].isin(cont_to_keep)]

In [None]:
df_CNT_dummies = pd.get_dummies(df['CONTRIBUTING FACTOR VEHICLE 1'])
df_CNT_dummies.head()

In [None]:
df = df.join(df_CNT_dummies)
df.head()

Now we can select the relevant columns for modelling:

In [37]:
df.columns

Index(['Date', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'Average Humidity', 'Dew Point', 'Events',
       'Max Gust Speed', 'Max Temperature', 'Max Wind Speed',
       'Maximum Humidity', 'Mean Temperature', 'Min Temperature',
       'Minimum Humidity', 'Precipitation', 'Sea Level Pressure', 'Snow',
       'Snow Depth', 'Visibility', 'Wind Speed', 'street_SL', 'Month', 'BRONX',
       'BROOKLYN', 'MANHATTAN', 'QUEENS', 'VTC1_large', 'VTC1_medium',
       'VTC1_other', 'VTC1_small', 'VTC1_two_wheeler', 'VTC2_large',
       'VTC2_medium',

In [38]:
# Build the modelling frame by removing the columns listed here from `df`.
non_feature_columns = [
    'Date',
    'BOROUGH',
    'ZIP CODE',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
    'ON STREET NAME',
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'UNIQUE KEY',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'Events',
    'Dew Point',
]
df_ml = df.drop(non_feature_columns, axis=1)

In [39]:
df_ml.shape

(612572, 42)

Now we need to add our y (any person injured or killed) to our dataframe.

In [40]:
df_ml['Y'] = np.zeros(df.shape[0])

In [41]:
def fill_Y(row):
    """Return 1 if any casualty counter in `row` is greater than zero, else 0.

    `row` must be indexable by a list of column names (e.g. a pandas Series
    holding one dataframe row) and contain the eight 'NUMBER OF ...' injured /
    killed counters.
    """
    casualty_counts = row[[
        'NUMBER OF PERSONS INJURED',
        'NUMBER OF PERSONS KILLED',
        'NUMBER OF PEDESTRIANS INJURED',
        'NUMBER OF PEDESTRIANS KILLED',
        'NUMBER OF CYCLIST INJURED',
        'NUMBER OF CYCLIST KILLED',
        'NUMBER OF MOTORIST INJURED',
        'NUMBER OF MOTORIST KILLED',
    ]]
    anyone_hurt = (casualty_counts > 0).any()
    return 1 if anyone_hurt else 0

In [42]:
# Label a collision as positive (Y = 1) when any of the eight casualty counters
# is greater than zero, and as negative (Y = 0) otherwise.
# The comparison runs column-wise on the whole frame at once; it produces the
# same labels as calling fill_Y on every row but avoids a Python-level function
# call per row.
casualty_columns = [
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]
df_ml['Y'] = (df_ml[casualty_columns] > 0).any(axis=1).astype(int)

In [43]:
df_ml.head(10)

Unnamed: 0,TIME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,Average Humidity,...,VTC1_two_wheeler,VTC2_large,VTC2_medium,VTC2_other,VTC2_small,VTC2_two_wheeler,Rain_EV,Snow_EV,Fog_EV,Y
1,0,0,0,0,0,0,0,0,0,95,...,0,0,0,1,0,0,1.0,0.0,0.0,0
4,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,1,0,1.0,0.0,0.0,0
7,0,0,0,0,0,0,0,0,0,95,...,0,0,0,1,0,0,1.0,0.0,0.0,0
16,10,1,0,1,0,0,0,0,0,95,...,0,0,0,1,0,0,1.0,0.0,0.0,1
21,10,0,0,0,0,0,0,0,0,95,...,0,0,0,1,0,0,1.0,0.0,0.0,0
23,10,0,0,0,0,0,0,0,0,95,...,0,0,0,0,1,0,1.0,0.0,0.0,0
24,10,0,0,0,0,0,0,0,0,95,...,0,0,1,0,0,0,1.0,0.0,0.0,0
30,10,1,0,0,0,0,0,1,0,95,...,0,0,1,0,0,0,1.0,0.0,0.0,1
33,10,2,0,0,0,0,0,2,0,95,...,0,0,0,0,1,0,1.0,0.0,0.0,1
35,10,0,0,0,0,0,0,0,0,95,...,0,0,0,1,0,0,1.0,0.0,0.0,0


In [44]:
Counter(df_ml['Y'])

Counter({0: 494704, 1: 117868})

There are some strange values in some of the rows still that should be numeric - for example in the 'Snow' column there are some rows with 'T' instead of snow depth. This also causes that column to be encoded as strings instead of numbers.

Therefore we need to cast the cells to numbers where we can.

In [45]:
Counter(df_ml['Snow']).most_common(5)

[('0.00', 560204),
 ('T', 25906),
 ('1.02', 2110),
 ('0.25', 1786),
 ('1.27', 1668)]

Those that are encoded as strings:

- Max Gust Speed
- Max Wind Speed
- Precipitation
- Snow
- Snow Depth
- Wind Speed

In [46]:
def cast_to_float(x):
    """Convert `x` to a float, returning `x` unchanged if it cannot be converted.

    Parameters
    ----------
    x : object
        A single cell value, e.g. the string '1.02' or a marker such as 'T'.

    Returns
    -------
    float or object
        `float(x)` when the conversion succeeds, otherwise the original `x`.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors float() raises for unconvertible input are handled;
        # the value is passed through untouched so that non-numeric cells can
        # still be detected afterwards.
        return x

In [47]:
# Columns listed above as being stored as strings.
string_encoded_columns = ['Max Gust Speed', 'Max Wind Speed', 'Precipitation',
                          'Snow', 'Snow Depth', 'Wind Speed']

# Convert each cell to float where possible; cells that cannot be converted
# are left as they are by cast_to_float.
for column in string_encoded_columns:
    df_ml[column] = df_ml[column].apply(cast_to_float)

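Now we can see that the values have been cast to float where possible. The column dtype is still reported as `object` because entries that could not be converted (such as 'T') were left unchanged.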

In [48]:
df_ml.head()['Snow']

1     0
4     0
7     0
16    0
21    0
Name: Snow, dtype: object

In [49]:
df_ml.head().applymap(np.isreal).all(1)

1     True
4     True
7     True
16    True
21    True
dtype: bool

Now we can select those rows that only have numeric values.

In [50]:
df_ml = df_ml[df_ml.applymap(np.isreal).all(1)]

In [51]:
df_ml.shape

(543921, 43)

Then of course we have to drop the columns that directly measure if people were killed or injured:

In [52]:
# The casualty counters that Y was computed from; they are removed so that
# they are not part of the saved modelling frame.
outcome_columns = [
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED',
    'NUMBER OF MOTORIST KILLED',
]
df_ml = df_ml.drop(outcome_columns, axis=1)

In [53]:
df_ml.columns

Index(['TIME', 'Average Humidity', 'Max Gust Speed', 'Max Temperature',
       'Max Wind Speed', 'Maximum Humidity', 'Mean Temperature',
       'Min Temperature', 'Minimum Humidity', 'Precipitation',
       'Sea Level Pressure', 'Snow', 'Snow Depth', 'Visibility', 'Wind Speed',
       'street_SL', 'Month', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS',
       'VTC1_large', 'VTC1_medium', 'VTC1_other', 'VTC1_small',
       'VTC1_two_wheeler', 'VTC2_large', 'VTC2_medium', 'VTC2_other',
       'VTC2_small', 'VTC2_two_wheeler', 'Rain_EV', 'Snow_EV', 'Fog_EV', 'Y'],
      dtype='object')

In [54]:
df_ml.to_csv('NYPD_ML_df_wY.csv', index=False)

## Creating a balanced dataset

In [55]:
df_ml = pd.read_csv('NYPD_ML_df_wY.csv', low_memory=False)

There are still some NaN values in the dataset - we drop them before balancing.

In [56]:
df_ml = df_ml.dropna()

In [57]:
Counter(df_ml['Y'])

Counter({0: 435155, 1: 103891})

First, we have to balance the dataset so that it contains the same number of casualty and non-casualty collisions.

In [58]:
# Split the rows by label: Y == 1 (positives) and Y == 0 (negatives).
labels = df_ml['Y']
pos_all = df_ml.loc[labels == 1]
neg_all = df_ml.loc[labels == 0]

In [59]:
# Number of positive rows; the negatives are undersampled to this size.
no_pos = pos_all.shape[0]

# Draw the negatives with a fixed seed so that the sample, and therefore the
# balanced dataset saved below, is reproducible across runs.
neg_sub = neg_all.sample(n=no_pos, random_state=42)

In [60]:
neg_sub.head()

Unnamed: 0,TIME,Average Humidity,Max Gust Speed,Max Temperature,Max Wind Speed,Maximum Humidity,Mean Temperature,Min Temperature,Minimum Humidity,Precipitation,...,VTC1_two_wheeler,VTC2_large,VTC2_medium,VTC2_other,VTC2_small,VTC2_two_wheeler,Rain_EV,Snow_EV,Fog_EV,Y
268933,14,59,27.0,27,19.0,81,23,18,36,0.0,...,0,0,0,0,1,0,0.0,0.0,0.0,0
231422,8,64,34.0,22,16.0,81,19,17,47,1.52,...,0,0,0,0,1,0,1.0,0.0,0.0,0
108607,18,69,29.0,18,14.0,89,14,11,48,0.0,...,0,0,0,1,0,0,0.0,0.0,0.0,0
409812,10,52,21.0,28,13.0,73,23,18,30,0.0,...,0,0,0,1,0,0,0.0,0.0,0.0,0
279178,12,54,31.0,31,14.0,73,26,22,34,0.0,...,0,0,1,0,0,0,0.0,0.0,0.0,0


In [61]:
df_comb = pd.concat([pos_all, neg_sub])

In [62]:
df_comb.tail()

Unnamed: 0,TIME,Average Humidity,Max Gust Speed,Max Temperature,Max Wind Speed,Maximum Humidity,Mean Temperature,Min Temperature,Minimum Humidity,Precipitation,...,VTC1_two_wheeler,VTC2_large,VTC2_medium,VTC2_other,VTC2_small,VTC2_two_wheeler,Rain_EV,Snow_EV,Fog_EV,Y
412195,8,83,29.0,25,19.0,97,23,20,69,21.59,...,0,0,1,0,0,0,1.0,0.0,0.0,0
413063,14,48,26.0,26,16.0,67,21,17,29,0.0,...,0,0,0,1,0,0,0.0,0.0,0.0,0
530800,19,84,29.0,28,16.0,100,24,19,67,14.48,...,0,0,0,0,1,0,1.0,0.0,1.0,0
109843,20,49,34.0,32,19.0,60,28,24,38,0.0,...,0,0,0,1,0,0,0.0,0.0,0.0,0
159413,10,68,50.0,28,23.0,90,24,21,45,9.91,...,0,0,0,1,0,0,1.0,0.0,1.0,0


In [63]:
df_comb.to_csv('NYPD_ML_df_wY_balanced.csv', index=False)