### Shooting Data Processing

> <sub>⚠️ **Note**: Internal links (like Table of Contents) work best when this notebook is opened in **Jupyter Notebook** or **nbviewer.org**.<br>
> GitHub does **not support scrolling to sections** inside `.ipynb` files.</sub>

---
######  - [Reading Shooting Dataset](#Reading-Shooting-Dataset)
######  - [Key Preprocessing Steps](#Key-Preprocessing-Steps)
######  - [Adding Distance Metrics](#Adding-Distance-Metrics)
######  - [Writing the dataset](#Writing-the-dataset)
---

#### Reading Shooting Dataset
######  - [_Click here to move back to index_](#Shooting-Data-Processing)

In [5]:
import sys
# Append the relative directory 'src' to the module search path; it is resolved
# against the current working directory. Presumably this is where the local
# Data_Preprocessing module imported in later cells lives — confirm.
sys.path.append('src') 


In [6]:
from Data_Preprocessing import load_data

# Location of the raw incident export, relative to the working directory.
raw_csv_path = 'data/raw/NYPD_Shooting_Incident_Data__Historic__20250417.csv'

# Read the raw file into the working frame used by every later cell.
df = load_data(raw_csv_path)


In [7]:
# Show the first five rows (head() defaults to n=5) as a sanity check on the load.
df.head()

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,...,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,231974218,08/09/2021,01:06:00,BRONX,,40,0.0,,,False,...,,,18-24,M,BLACK,1006343.0,234270.0,40.809673,-73.920193,POINT (-73.92019278899994 40.80967347200004)
1,177934247,04/07/2018,19:48:00,BROOKLYN,,79,0.0,,,True,...,M,WHITE HISPANIC,25-44,M,BLACK,1000082.9375,189064.671875,40.68561,-73.942913,POINT (-73.94291302299996 40.685609672000055)
2,255028563,12/02/2022,22:57:00,BRONX,OUTSIDE,47,0.0,STREET,GROCERY/BODEGA,False,...,(null),(null),25-44,M,BLACK,1020691.0,257125.0,40.872349,-73.868233,POINT (-73.868233 40.872349)
3,25384540,11/19/2006,01:50:00,BROOKLYN,,66,0.0,,PVT HOUSE,True,...,U,UNKNOWN,18-24,M,BLACK,985107.3125,173349.796875,40.64249,-73.996912,POINT (-73.99691224999998 40.642489932000046)
4,72616285,05/09/2010,01:58:00,BRONX,,46,0.0,,MULTI DWELL - APT BUILD,True,...,M,BLACK,<18,F,BLACK,1009853.5,247502.5625,40.845984,-73.907461,POINT (-73.90746098599993 40.84598358900007)


#### Key Preprocessing Steps
######  - [_Click here to move back to index_](#Shooting-Data-Processing)

In [9]:
# Step 1: Filter to relevant columns
# The list order below is also the column order of the resulting frame.
columns_to_keep = [
    "OCCUR_DATE",
    "OCCUR_TIME",
    "BORO",
    "PRECINCT",
    "STATISTICAL_MURDER_FLAG",
    "VIC_AGE_GROUP",
    "VIC_SEX",
    "VIC_RACE",
    "JURISDICTION_CODE",
    "Latitude",
    "Longitude",
]
# Take an explicit copy so later column assignments act on an independent frame.
df = df.loc[:, columns_to_keep].copy()


In [10]:
# Step 2: Parse OCCUR_DATE and extract year, month, year_month
import pandas as pd

# Each lambda receives the frame with the preceding assignments already applied,
# so the derived columns are computed from the parsed OCCUR_DATE.
# Unparseable dates become NaT because of errors='coerce'.
df = df.assign(
    OCCUR_DATE=lambda frame: pd.to_datetime(frame['OCCUR_DATE'], errors='coerce'),
    year=lambda frame: frame['OCCUR_DATE'].dt.year,
    month=lambda frame: frame['OCCUR_DATE'].dt.month,
    year_month=lambda frame: frame['OCCUR_DATE'].dt.to_period('M').astype(str),
)


In [11]:
# Step 3: Parse OCCUR_TIME to time object
def parse_time(value):
    """Parse a clock-time string into a ``datetime.time``.

    Two layouts are tried in order: 12-hour with an AM/PM suffix
    (``"%I:%M:%S %p"``), then 24-hour (``"%H:%M:%S"``).

    Parameters
    ----------
    value : str or None
        Raw time text, e.g. ``"01:06:00"`` or ``"07:48:00 PM"``.

    Returns
    -------
    datetime.time or NaTType
        The parsed time of day, or ``pd.NaT`` when ``value`` is missing or
        matches neither layout.
    """
    # Missing input can never parse. Handle it explicitly rather than relying
    # on an exception being swallowed further down.
    if value is None or (isinstance(value, float) and value != value):
        return pd.NaT
    for fmt in ("%I:%M:%S %p", "%H:%M:%S"):
        try:
            return pd.to_datetime(value, format=fmt).time()
        except (ValueError, TypeError):
            # Only a parse failure means "try the next layout". A bare except
            # would also hide KeyboardInterrupt and genuine programming errors.
            continue
    return pd.NaT

# Convert every raw OCCUR_TIME value element-wise; unparseable entries become NaT.
df['OCCUR_TIME'] = df['OCCUR_TIME'].map(parse_time)


In [12]:
# Step 4: Assign time buckets
def assign_time_bucket(t):
    """Map a time of day to one of four six-hour bucket labels.

    Parameters
    ----------
    t : datetime.time or null
        Time of day to classify. Null values (``None``, ``pd.NaT``) are allowed.

    Returns
    -------
    str or None
        ``"12:01am - 6:00am"``, ``"6:01am - 12:00pm"``, ``"12:01pm - 6:00pm"``
        or ``"6:01pm - 12:00am"``; ``None`` when ``t`` is null.

    Notes
    -----
    Upper bounds are inclusive: exactly 06:00:00 is in the first bucket, and
    anything later (even by a second) moves to the next one. Exactly midnight
    is "12:00am", so it belongs to the evening bucket whose label ends at
    12:00am, not to "12:01am - 6:00am".
    """
    if pd.isnull(t):
        return None
    # Compare plain tuples so the boundary times are not re-parsed from
    # strings on every call.
    clock = (t.hour, t.minute, t.second, t.microsecond)
    if clock == (0, 0, 0, 0):
        # 12:00am sharp closes the evening bucket according to the labels.
        return "6:01pm - 12:00am"
    if clock <= (6, 0, 0, 0):
        return "12:01am - 6:00am"
    if clock <= (12, 0, 0, 0):
        return "6:01am - 12:00pm"
    if clock <= (18, 0, 0, 0):
        return "12:01pm - 6:00pm"
    return "6:01pm - 12:00am"

# Label each incident with its time-of-day bucket; null times yield None.
df['time_bucket'] = df['OCCUR_TIME'].map(assign_time_bucket)


In [13]:
# Step 5: Convert STATISTICAL_MURDER_FLAG to 'Yes'/'No'
# NOTE: any value that is not True/False maps to NaN. That includes the
# 'Yes'/'No' strings a previous run of this cell produced, so run it once per
# fresh load.
df['STATISTICAL_MURDER_FLAG'] = df['STATISTICAL_MURDER_FLAG'].map({True: 'Yes', False: 'No'})

# Step 6: Reorder columns (optional)
# OCCUR_TIME is not listed, so it is dropped here; time_bucket carries the
# time-of-day information forward.
df = df[
    ["OCCUR_DATE", "year", "month", "year_month", "time_bucket", "BORO", "PRECINCT",
     "STATISTICAL_MURDER_FLAG", "VIC_AGE_GROUP", "VIC_SEX", "VIC_RACE",
     "JURISDICTION_CODE", "Latitude", "Longitude"]
]

# Step 7: Filter records from 2010 onward
# Rows whose OCCUR_DATE is NaT fail the comparison and are removed as well.
# .copy() detaches the result from the frame it was sliced from, so adding
# columns to df in later cells does not raise SettingWithCopyWarning.
df = df[df['OCCUR_DATE'] >= pd.Timestamp('2010-01-01')].copy()

# Preview the cleaned data
df.head()

Unnamed: 0,OCCUR_DATE,year,month,year_month,time_bucket,BORO,PRECINCT,STATISTICAL_MURDER_FLAG,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,JURISDICTION_CODE,Latitude,Longitude
0,2021-08-09,2021,8,2021-08,12:01am - 6:00am,BRONX,40,No,18-24,M,BLACK,0.0,40.809673,-73.920193
1,2018-04-07,2018,4,2018-04,6:01pm - 12:00am,BROOKLYN,79,Yes,25-44,M,BLACK,0.0,40.68561,-73.942913
2,2022-12-02,2022,12,2022-12,6:01pm - 12:00am,BRONX,47,No,25-44,M,BLACK,0.0,40.872349,-73.868233
4,2010-05-09,2010,5,2010-05,12:01am - 6:00am,BRONX,46,Yes,<18,F,BLACK,0.0,40.845984,-73.907461
5,2012-07-22,2012,7,2012-07,6:01pm - 12:00am,BRONX,42,No,18-24,M,BLACK,2.0,40.824878,-73.903179


#### Adding Distance Metrics
######  - [_Click here to move back to index_](#Shooting-Data-Processing)

In [15]:
from Data_Preprocessing import distance

df['Times Square Latitude'] = 40.758896
df['Times Square Longitude'] = -73.985130

df['Grand Central Latitude'] = 40.752655
df['Grand Central Longitude'] = -73.977295

df['Times Square Distance'] = df.apply(lambda row: distance(row['Times Square Latitude'], row['Times Square Longitude'], row['Latitude'], row['Longitude']), axis=1)
df['Grand Central Distance'] = df.apply(lambda row: distance(row['Grand Central Latitude'], row['Grand Central Longitude'], row['Latitude'], row['Longitude']), axis=1)

df.head(5)

Unnamed: 0,OCCUR_DATE,year,month,year_month,time_bucket,BORO,PRECINCT,STATISTICAL_MURDER_FLAG,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,JURISDICTION_CODE,Latitude,Longitude,Times Square Latitude,Times Square Longitude,Grand Central Latitude,Grand Central Longitude,Times Square Distance,Grand Central Distance
0,2021-08-09,2021,8,2021-08,12:01am - 6:00am,BRONX,40,No,18-24,M,BLACK,0.0,40.809673,-73.920193,40.758896,-73.98513,40.752655,-73.977295,7.861928,7.959473
1,2018-04-07,2018,4,2018-04,6:01pm - 12:00am,BROOKLYN,79,Yes,25-44,M,BLACK,0.0,40.68561,-73.942913,40.758896,-73.98513,40.752655,-73.977295,8.894628,8.00092
2,2022-12-02,2022,12,2022-12,6:01pm - 12:00am,BRONX,47,No,25-44,M,BLACK,0.0,40.872349,-73.868233,40.758896,-73.98513,40.752655,-73.977295,16.002586,16.172424
4,2010-05-09,2010,5,2010-05,12:01am - 6:00am,BRONX,46,Yes,<18,F,BLACK,0.0,40.845984,-73.907461,40.758896,-73.98513,40.752655,-73.977295,11.687525,11.930603
5,2012-07-22,2012,7,2012-07,6:01pm - 12:00am,BRONX,42,No,18-24,M,BLACK,2.0,40.824878,-73.903179,40.758896,-73.98513,40.752655,-73.977295,10.074168,10.17313


#### Writing the dataset
######  - [_Click here to move back to index_](#Shooting-Data-Processing)

In [17]:
from Data_Preprocessing import write_data

# File name for the processed dataset; write_data decides the target directory.
output_filename = 'Shooting_Data_Processed.csv'

# Save to processed. The call stays the last expression so its return value is
# displayed as the cell output.
write_data(df, output_filename)

Data written to: C:\Users\utkar\Desktop\PyCharm Projects Spring\Machine Learning for Business\data\processed\Shooting_Data_Processed.csv


'data\\processed\\Shooting_Data_Processed.csv'