In [52]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [53]:

# import dataset
df = pd.read_csv('/content/Motor_Vehicle_Collisions_-_Crashes.csv')
df.head()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765.0,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547.0,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903.0,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314.0,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609.0,,,,,


In [44]:
# preprocess the data

In [54]:
# check null values
df.isnull().sum()

CRASH DATE                            0
CRASH TIME                            0
BOROUGH                          283855
ZIP CODE                         283989
LATITUDE                          60943
LONGITUDE                         60943
LOCATION                          60943
ON STREET NAME                   200675
CROSS STREET NAME                415720
OFF STREET NAME                  602544
NUMBER OF PERSONS INJURED             8
NUMBER OF PERSONS KILLED             16
NUMBER OF PEDESTRIANS INJURED         0
NUMBER OF PEDESTRIANS KILLED          0
NUMBER OF CYCLIST INJURED             0
NUMBER OF CYCLIST KILLED              0
NUMBER OF MOTORIST INJURED            0
NUMBER OF MOTORIST KILLED             0
CONTRIBUTING FACTOR VEHICLE 1      3071
CONTRIBUTING FACTOR VEHICLE 2    143573
CONTRIBUTING FACTOR VEHICLE 3    740361
CONTRIBUTING FACTOR VEHICLE 4    787463
CONTRIBUTING FACTOR VEHICLE 5    798022
COLLISION_ID                          1
VEHICLE TYPE CODE 1                6469


In [55]:
# impute missing values
# using mean for numerical values and mode for categorical values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586917 entries, 0 to 586916
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH DATE                     586917 non-null  object 
 1   CRASH TIME                     586917 non-null  object 
 2   BOROUGH                        381875 non-null  object 
 3   ZIP CODE                       381781 non-null  float64
 4   LATITUDE                       539066 non-null  float64
 5   LONGITUDE                      539066 non-null  float64
 6   LOCATION                       539066 non-null  object 
 7   NUMBER OF PERSONS INJURED      586914 non-null  float64
 8   NUMBER OF PERSONS KILLED       586909 non-null  float64
 9   NUMBER OF PEDESTRIANS INJURED  586916 non-null  float64
 10  NUMBER OF PEDESTRIANS KILLED   586916 non-null  float64
 11  NUMBER OF CYCLIST INJURED      586916 non-null  float64
 12  NUMBER OF CYCLIST KILLED      

In [63]:
df.drop(['LOCATION','ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'], axis=1, inplace=True)

In [45]:
df.drop(['ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME'], axis=1, inplace=True)

In [57]:
df.drop(['CRASH DATE', 'CRASH TIME'], axis=1, inplace=True)


In [58]:
# ENCODING THE CATEGORICAL DATA

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['BOROUGH']  = le.fit_transform(df['BOROUGH'])
df['CONTRIBUTING FACTOR VEHICLE 1'] = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 1'])
df['CONTRIBUTING FACTOR VEHICLE 2'] = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 2'])
df['CONTRIBUTING FACTOR VEHICLE 3'] = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 3'])
df['CONTRIBUTING FACTOR VEHICLE 4'] = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 4'])
df['CONTRIBUTING FACTOR VEHICLE 5'] = le.fit_transform(df['CONTRIBUTING FACTOR VEHICLE 5'])
df['VEHICLE TYPE CODE 1'] = le.fit_transform(df['VEHICLE TYPE CODE 1'])
df['VEHICLE TYPE CODE 2'] = le.fit_transform(df['VEHICLE TYPE CODE 2'])
df['VEHICLE TYPE CODE 3'] = le.fit_transform(df['VEHICLE TYPE CODE 3'])
df['VEHICLE TYPE CODE 4'] = le.fit_transform(df['VEHICLE TYPE CODE 4'])
df['VEHICLE TYPE CODE 5'] = le.fit_transform(df['VEHICLE TYPE CODE 5'])

In [64]:
df.head()

Unnamed: 0,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,1,10878.45557,40.557053,-73.618795,2.0,0.0,0,0,0,0,...,51,39,22,15,4455765.0,724,786,108,44,22
1,1,10878.45557,40.557053,-73.618795,1.0,0.0,0,0,0,0,...,51,39,22,15,4513547.0,724,786,108,44,22
2,1,10878.45557,40.557053,-73.618795,0.0,0.0,0,0,0,0,...,51,39,22,15,4541903.0,724,671,108,44,22
3,1,11208.0,40.667202,-73.8665,0.0,0.0,0,0,0,0,...,51,39,22,15,4456314.0,724,786,108,44,22
4,1,11233.0,40.683304,-73.917274,0.0,0.0,0,0,0,0,...,51,39,22,15,4486609.0,724,786,108,44,22


In [65]:
# train test split
X = df.drop('NUMBER OF PERSONS INJURED', axis=1)
y = df['NUMBER OF PERSONS INJURED']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# using lasso regression for feature selection
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
lasso = Lasso()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lasso.fit(X_train_scaled, y_train)

# check which columns is more effective and usefull
selected_columns = X_train.columns[lasso.coef_ != 0]
print(selected_columns)

# create a new dataframe with only the selected columns
X_train_selected = X_train[selected_columns]
X_test_selected = X_test[selected_columns]
new_df = pd.concat([X_train_selected, y_train], axis=1)

# save the new dataframe to a csv file
new_df.to_csv('new_df.csv', index=False)
# new_df.to_csv('new_df.csv', index=False)

Index([], dtype='object')


In [72]:
df = pd.read_csv('/content/new_df.csv')
df.head()

Unnamed: 0,NUMBER OF PERSONS INJURED
0,0.0
1,0.0
2,1.0
3,0.0
4,0.0
