## Model Training

##### Importing Libraries

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Feature Transformation
from sklearn.preprocessing import LabelEncoder

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
# Load the raw incident export; the file is decoded as Latin-1 rather than UTF-8.
df = pd.read_csv(
    'crime.csv',
    encoding='latin-1',
)

In [4]:
# Preview the first five raw rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


##### Preparing X and y variables

In [5]:
# The offense group is the prediction target; every other column starts out as a feature.
# NOTE(review): OFFENSE_CODE, OFFENSE_DESCRIPTION and UCR_PART stay in X at this point and
# look closely tied to the offense group - confirm they do not leak the target.
y = df['OFFENSE_CODE_GROUP']
X = df.drop(columns=['OFFENSE_CODE_GROUP'])

In [6]:
# INCIDENT_NUMBER is a record identifier, so it is removed from the feature frame.
X = X.drop(columns='INCIDENT_NUMBER')

In [7]:
# Number of feature columns left after removing the target and the identifier.
X.shape[1]

15

In [8]:
# Turn SHOOTING into a 0/1 flag: 'Y' becomes 1, anything else (including missing) becomes 0.
# Re-running this cell on the already-converted column would yield all zeros.
shooting_mask = X['SHOOTING'] == 'Y'
X['SHOOTING'] = np.where(shooting_mask, 1, 0)
X['SHOOTING'].head()

0    0
1    0
2    0
3    0
4    0
Name: SHOOTING, dtype: int64

In [9]:
# Integer-encode the target and the first two categorical features.
#
# LabelEncoder (imported in the imports cell at the top of the notebook) remembers
# only the classes of its most recent fit. The target therefore gets its own
# encoder, so predicted class indices can later be decoded with
# target_encoder.inverse_transform(...), and each feature column keeps its own
# fitted encoder in feature_encoders.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

feature_encoders = {}
for column in ('DAY_OF_WEEK', 'DISTRICT'):
    feature_encoders[column] = LabelEncoder()
    X[column] = feature_encoders[column].fit_transform(X[column])

# Backward-compatible name: code that expects `label_encoder` to exist after this
# cell still finds an encoder fitted on DISTRICT, the last column encoded here.
label_encoder = feature_encoders['DISTRICT']

X.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,619,LARCENY ALL OTHERS,7,808,0,2018-09-02 13:00:00,2018,9,3,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,1402,VANDALISM,5,347,0,2018-08-21 00:00:00,2018,8,5,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,3410,TOWED MOTOR VEHICLE,8,151,0,2018-09-03 19:27:00,2018,9,1,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,3114,INVESTIGATE PROPERTY,8,272,0,2018-09-03 21:16:00,2018,9,1,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,3114,INVESTIGATE PROPERTY,4,421,0,2018-09-03 21:05:00,2018,9,1,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [10]:
# Parse the timestamp strings so the .dt accessor can be used on the column.
occurred_on = pd.to_datetime(X['OCCURRED_ON_DATE'])
X['OCCURRED_ON_DATE'] = occurred_on
print(occurred_on.dtype)

datetime64[ns]


In [11]:
# Derive calendar parts from the parsed timestamp. YEAR, MONTH and HOUR already exist
# in the frame and are overwritten in place; DAY is a new column appended at the end.
timestamp_parts = X['OCCURRED_ON_DATE'].dt
for feature_name, part_name in (
    ('YEAR', 'year'),
    ('MONTH', 'month'),
    ('DAY', 'day'),
    ('HOUR', 'hour'),
):
    X[feature_name] = getattr(timestamp_parts, part_name)

In [12]:
# Check that the DAY column was appended next to the existing date parts.
X.head(n=5)

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location,DAY
0,619,LARCENY ALL OTHERS,7,808,0,2018-09-02 13:00:00,2018,9,3,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)",2
1,1402,VANDALISM,5,347,0,2018-08-21 00:00:00,2018,8,5,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)",21
2,3410,TOWED MOTOR VEHICLE,8,151,0,2018-09-03 19:27:00,2018,9,1,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)",3
3,3114,INVESTIGATE PROPERTY,8,272,0,2018-09-03 21:16:00,2018,9,1,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)",3
4,3114,INVESTIGATE PROPERTY,4,421,0,2018-09-03 21:05:00,2018,9,1,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)",3


In [13]:
# Transform location data into latitude and longitude features.
# The "(lat, long)" text is parsed with a single regex pass; group 0 is the
# latitude and group 1 the longitude. Rows whose text does not match become NaN.
coordinates = X['Location'].str.extract(
    r'\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)'
).astype(float)
X['LATITUDE'] = coordinates[0]
X['LONGITUDE'] = coordinates[1]

In [14]:
# LATITUDE / LONGITUDE now carry the coordinates, so the raw source columns are removed.
raw_coordinate_columns = ['Location', 'Lat', 'Long']
X.drop(columns=raw_coordinate_columns, inplace=True)
X.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE
0,619,LARCENY ALL OTHERS,7,808,0,2018-09-02 13:00:00,2018,9,3,13,Part One,LINCOLN ST,2,42.357791,-71.139371
1,1402,VANDALISM,5,347,0,2018-08-21 00:00:00,2018,8,5,0,Part Two,HECLA ST,21,42.306821,-71.0603
2,3410,TOWED MOTOR VEHICLE,8,151,0,2018-09-03 19:27:00,2018,9,1,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429
3,3114,INVESTIGATE PROPERTY,8,272,0,2018-09-03 21:16:00,2018,9,1,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664
4,3114,INVESTIGATE PROPERTY,4,421,0,2018-09-03 21:05:00,2018,9,1,21,Part Three,DELHI ST,3,42.275365,-71.090361


In [15]:
# Keep only the first run of digits of REPORTING_AREA, cast it to float (values without
# digits become NaN), and store the *string* form of that float as REPORTING_AREA_STR.
# The original REPORTING_AREA column is then removed.
# NOTE(review): the stored strings look like '808.0' / 'nan', so any later label encoding
# orders them lexicographically rather than numerically - confirm this is intended.
reporting_area_numeric = X['REPORTING_AREA'].str.extract(r'(\d+)')[0].astype(float)
X['REPORTING_AREA_STR'] = reporting_area_numeric.astype(str)
X.drop('REPORTING_AREA', axis=1, inplace=True)

In [16]:
# REPORTING_AREA_STR should now be the last column and REPORTING_AREA should be gone.
X.head(n=5)

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,DISTRICT,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,LARCENY ALL OTHERS,7,0,2018-09-02 13:00:00,2018,9,3,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,VANDALISM,5,0,2018-08-21 00:00:00,2018,8,5,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,TOWED MOTOR VEHICLE,8,0,2018-09-03 19:27:00,2018,9,1,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,INVESTIGATE PROPERTY,8,0,2018-09-03 21:16:00,2018,9,1,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,INVESTIGATE PROPERTY,4,0,2018-09-03 21:05:00,2018,9,1,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [17]:
# Encode the remaining categorical feature columns as integer codes.
# The target y is deliberately not re-encoded here: it was already converted to
# integer codes in the earlier encoding cell, and fitting a LabelEncoder on those
# codes again would leave the values unchanged while replacing the offense-group
# names in classes_ with plain integers.
label_encoder = LabelEncoder()

for column in ('UCR_PART', 'STREET', 'REPORTING_AREA_STR'):
    # fit_transform re-fits the shared encoder on every column, so after the loop
    # label_encoder only holds the classes of REPORTING_AREA_STR.
    X[column] = label_encoder.fit_transform(X[column])

In [18]:
# Largest encoded class index of the target (number of classes minus one).
y.max()

66

In [19]:
# List the distinct offense-group names from the untouched raw frame.
df.loc[:, 'OFFENSE_CODE_GROUP'].unique()

array(['Larceny', 'Vandalism', 'Towed', 'Investigate Property',
       'Motor Vehicle Accident Response', 'Auto Theft', 'Verbal Disputes',
       'Robbery', 'Fire Related Reports', 'Other', 'Property Lost',
       'Medical Assistance', 'Assembly or Gathering Violations',
       'Larceny From Motor Vehicle', 'Residential Burglary',
       'Simple Assault', 'Restraining Order Violations', 'Violations',
       'Harassment', 'Ballistics', 'Property Found',
       'Police Service Incidents', 'Drug Violation', 'Warrant Arrests',
       'Disorderly Conduct', 'Property Related Damage',
       'Missing Person Reported', 'Investigate Person', 'Fraud',
       'Aggravated Assault', 'License Plate Related Incidents',
       'Firearm Violations', 'Other Burglary', 'Arson', 'Bomb Hoax',
       'Harbor Related Incidents', 'Counterfeiting', 'Liquor Violation',
       'Firearm Discovery', 'Landlord/Tenant Disputes',
       'Missing Person Located', 'Auto Theft Recovery', 'Service',
       'Operating Und

In [20]:
# Remove the free-text description and the raw timestamp; the date parts derived from
# the timestamp are already separate columns.
non_numeric_columns = ['OFFENSE_DESCRIPTION', 'OCCURRED_ON_DATE']
X.drop(columns=non_numeric_columns, inplace=True)
X.head()

Unnamed: 0,OFFENSE_CODE,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,7,0,2018,9,3,13,1,2535,2,42.357791,-71.139371,783
1,1402,5,0,2018,8,5,0,3,2073,21,42.306821,-71.0603,274
2,3410,8,0,2018,9,1,19,2,784,3,42.346589,-71.072429,59
3,3114,8,0,2018,9,1,21,2,3065,3,42.334182,-71.078664,191
4,3114,4,0,2018,9,1,21,2,1240,3,42.275365,-71.090361,356


In [21]:
# Count missing values per feature column.
X.isnull().sum()

OFFENSE_CODE          0
DISTRICT              0
SHOOTING              0
YEAR                  0
MONTH                 0
DAY_OF_WEEK           0
HOUR                  0
UCR_PART              0
STREET                0
DAY                   0
LATITUDE              0
LONGITUDE             0
REPORTING_AREA_STR    0
dtype: int64

In [22]:
# dtype of REPORTING_AREA in the raw frame, for comparison with the encoded feature.
df.dtypes['REPORTING_AREA']

dtype('O')

In [23]:
# dtype of the encoded reporting-area feature.
X.dtypes['REPORTING_AREA_STR']

dtype('int64')

In [26]:
# Number of feature columns after all drops and derived columns.
X.shape[1]

13

In [27]:
# Exploratory check: label-encode the raw SHOOTING column from df (not the 0/1 flag in X).
# The result is only displayed, but fit_transform also re-fits the shared label_encoder,
# so afterwards its classes_ describe SHOOTING.
raw_shooting = df['SHOOTING']
label_encoder.fit_transform(raw_shooting)

array([1, 1, 1, ..., 1, 1, 1])

In [28]:
# Re-inspect the feature frame; X itself was not modified by the SHOOTING check above.
X.head(n=5)

Unnamed: 0,OFFENSE_CODE,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,7,0,2018,9,3,13,1,2535,2,42.357791,-71.139371,783
1,1402,5,0,2018,8,5,0,3,2073,21,42.306821,-71.0603,274
2,3410,8,0,2018,9,1,19,2,784,3,42.346589,-71.072429,59
3,3114,8,0,2018,9,1,21,2,3065,3,42.334182,-71.078664,191
4,3114,4,0,2018,9,1,21,2,1240,3,42.275365,-71.090361,356


In [32]:
# Repeat of the dtype check on the encoded reporting-area feature.
X.dtypes['REPORTING_AREA_STR']

dtype('int64')

In [34]:
# Final look at the all-numeric feature frame.
X.head(n=5)

Unnamed: 0,OFFENSE_CODE,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,7,0,2018,9,3,13,1,2535,2,42.357791,-71.139371,783
1,1402,5,0,2018,8,5,0,3,2073,21,42.306821,-71.0603,274
2,3410,8,0,2018,9,1,19,2,784,3,42.346589,-71.072429,59
3,3114,8,0,2018,9,1,21,2,3065,3,42.334182,-71.078664,191
4,3114,4,0,2018,9,1,21,2,1240,3,42.275365,-71.090361,356


In [52]:
# Create an example array of uniform random values in [0, 1).
# A seeded Generator makes the cell reproduce the same values on every run;
# numpy is already imported as np in the imports cell at the top of the notebook.
EXAMPLE_SEED = 42
EXAMPLE_SHAPE = (254493, 13)  # (rows, columns) of the example matrix

arr = np.random.default_rng(EXAMPLE_SEED).random(EXAMPLE_SHAPE)
arr

array([[0.69421789, 0.90181203, 0.8858282 , ..., 0.85901967, 0.66294448,
        0.52601846],
       [0.96417804, 0.89220036, 0.33166563, ..., 0.78813233, 0.60809365,
        0.07444038],
       [0.03915133, 0.6551909 , 0.18153055, ..., 0.18543371, 0.198534  ,
        0.94897086],
       ...,
       [0.97648505, 0.36063227, 0.65473512, ..., 0.04339091, 0.22869705,
        0.75357944],
       [0.83002827, 0.33256384, 0.61847006, ..., 0.51977739, 0.32421853,
        0.32402335],
       [0.64920019, 0.62934139, 0.95256105, ..., 0.08800473, 0.52233744,
        0.87244737]])

In [54]:
# Take the column at index 1 of the example array (a 1-D view, one value per row).
selected_cols = arr[..., 1]
selected_cols

array([0.90181203, 0.89220036, 0.6551909 , ..., 0.36063227, 0.33256384,
       0.62934139])

In [57]:
# Copy of the example array without the column at index 1; show how many columns remain.
remaining_col = np.delete(arr, 1, axis=1)
remaining_col.shape[1]

12