## **Crime featuring and analysis**

### Load Dataset

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [59]:
data_path = "/content/drive/MyDrive/ML_project/NYPD_Complaint_Data_Current__Year_To_Date__20250325.csv"
df = pd.read_csv(data_path)
field_names = list(df.columns)

  df = pd.read_csv(data_path)


In [60]:
# Drop rows with missing crime severity
crime_df = df[df['LAW_CAT_CD'].notna()]

In [61]:
print (crime_df.columns)
print (crime_df.shape)

Index(['CMPLNT_NUM', 'ADDR_PCT_CD', 'BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM',
       'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT',
       'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'KY_CD', 'LAW_CAT_CD',
       'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PARKS_NM', 'PATROL_BORO', 'PD_CD',
       'PD_DESC', 'PREM_TYP_DESC', 'RPT_DT', 'STATION_NAME', 'SUSP_AGE_GROUP',
       'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT', 'VIC_AGE_GROUP',
       'VIC_RACE', 'VIC_SEX', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude',
       'Longitude', 'Lat_Lon', 'New Georeferenced Column'],
      dtype='object')
(577108, 36)


In [62]:
# Feature Selection by dropping Unrelated columns
features = ['ADDR_PCT_CD', 'JURIS_DESC', 'LOC_OF_OCCUR_DESC',
            'OFNS_DESC', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE',
            'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'LAW_CAT_CD',
            'CMPLNT_FR_TM', 'CMPLNT_TO_TM']
crime_df = crime_df[features]

In [63]:
crime_df['JURIS_DESC'].value_counts()

Unnamed: 0_level_0,count
JURIS_DESC,Unnamed: 1_level_1
N.Y. POLICE DEPT,507634
N.Y. HOUSING POLICE,35025
N.Y. TRANSIT POLICE,27137
PORT AUTHORITY,2716
MTA POLICE DEPT,1868
OTHER,834
DEPT OF CORRECTIONS,636
N.Y.C. DEPT OF HOMELESS SERVICES,244
N.Y. STATE POLICE,201
DISTRICT ATTORNEY OFFICE,158


### Feature Engineering

In [64]:
# Extract hour and day of the week from the crime time
crime_df['CMPLNT_FR_TM'] = pd.to_datetime(crime_df['CMPLNT_FR_TM'], errors='coerce')
crime_df['CMPLNT_TO_TM'] = pd.to_datetime(crime_df['CMPLNT_TO_TM'], errors='coerce')

# Add new duration feature(in Hrs)
crime_df['duration'] = (crime_df['CMPLNT_TO_TM'] - crime_df['CMPLNT_FR_TM']).dt.total_seconds() / 3600

  crime_df['CMPLNT_FR_TM'] = pd.to_datetime(crime_df['CMPLNT_FR_TM'], errors='coerce')
  crime_df['CMPLNT_TO_TM'] = pd.to_datetime(crime_df['CMPLNT_TO_TM'], errors='coerce')


In [66]:
# Extract time-based features
crime_df['crime_hour'] = crime_df['CMPLNT_FR_TM'].dt.hour
crime_df['crime_dayofweek'] = crime_df['CMPLNT_TO_TM'].dt.dayofweek

# Drop the original time columns
crime_df.drop(['CMPLNT_FR_TM', 'CMPLNT_TO_TM'], axis=1, inplace=True)

#### Encoding categorical variables

In [67]:
le = LabelEncoder()
categorical_cols = ['JURIS_DESC', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PREM_TYP_DESC',
                    'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX',
                    'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX']

for col in categorical_cols:
    crime_df[col] = le.fit_transform(crime_df[col].astype(str))

In [68]:
print("Jurisdiction description encoding-",crime_df['JURIS_DESC'].unique())
print("Location of occurance description encoding-",crime_df['LOC_OF_OCCUR_DESC'].unique())

Jurisdiction description encoding- [ 7 17 10  6  5 18  9  3 19 15 12 13  2  0  4  8 20 14 11  1 16]
Location of occurance description encoding- [2 0 1 4 3 5]


In [69]:
crime_df

Unnamed: 0,ADDR_PCT_CD,JURIS_DESC,LOC_OF_OCCUR_DESC,OFNS_DESC,PREM_TYP_DESC,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,LAW_CAT_CD,duration,crime_hour,crime_dayofweek
0,42,7,2,35,47,0,0,0,31,7,2,FELONY,,19,
1,48,7,2,51,52,27,4,2,31,3,2,FELONY,,16,
2,34,17,2,4,62,28,5,3,16,5,2,FELONY,,0,
3,116,7,0,44,72,0,0,0,32,5,1,FELONY,0.183333,16,1.0
4,73,7,0,51,72,27,3,2,15,3,2,FELONY,0.083333,14,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577103,101,7,2,7,62,28,5,1,16,3,2,FELONY,13.500000,9,1.0
577104,105,7,0,25,62,16,3,2,17,3,2,VIOLATION,0.333333,19,1.0
577105,105,7,2,23,64,28,5,2,18,5,2,FELONY,12.283333,0,1.0
577106,113,7,2,34,64,16,3,2,17,7,4,FELONY,0.250000,7,1.0


In [70]:
crime_df['LAW_CAT_CD'] = le.fit_transform(crime_df['LAW_CAT_CD'])

In [71]:
crime_df.dtypes

Unnamed: 0,0
ADDR_PCT_CD,int64
JURIS_DESC,int64
LOC_OF_OCCUR_DESC,int64
OFNS_DESC,int64
PREM_TYP_DESC,int64
SUSP_AGE_GROUP,int64
SUSP_RACE,int64
SUSP_SEX,int64
VIC_AGE_GROUP,int64
VIC_RACE,int64


#### Standarize Data

In [72]:
crime_df['ADDR_PCT_CD'] =(crime_df['ADDR_PCT_CD'] - crime_df['ADDR_PCT_CD'].mean() ) / crime_df['ADDR_PCT_CD'].std()
crime_df['duration'] =(crime_df['duration'] - crime_df['duration'].mean() ) / crime_df['duration'].std()
crime_df['crime_hour'] =(crime_df['crime_hour'] - crime_df['crime_hour'].mean() ) / crime_df['crime_hour'].std()

#### Draw feature correlation

In [74]:
print(crime_df.corr()["LAW_CAT_CD"][:])

ADDR_PCT_CD          0.027480
JURIS_DESC          -0.004826
LOC_OF_OCCUR_DESC    0.035642
OFNS_DESC            0.084562
PREM_TYP_DESC       -0.031266
SUSP_AGE_GROUP       0.051757
SUSP_RACE            0.042853
SUSP_SEX            -0.048809
VIC_AGE_GROUP       -0.071959
VIC_RACE            -0.026523
VIC_SEX             -0.002125
LAW_CAT_CD           1.000000
duration             0.029970
crime_hour           0.026687
crime_dayofweek           NaN
Name: LAW_CAT_CD, dtype: float64


In [75]:
# Fill Values contain Nan with 0
crime_df = crime_df.fillna(0)

In [77]:
# Save the updated dataset
output_path = '/content/drive/MyDrive/ML_project/crime_features_updated.csv'
crime_df.to_csv(output_path, index=False)