In [16]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib  # For saving splits and preprocessor

In [17]:
# Load Environment Variables and Connect to DB
# Connection settings are read from the process environment (load_dotenv() is
# called first so a local .env file can supply them); nothing is hardcoded here.
load_dotenv()

DB_USER = os.getenv("db_username")
DB_PASS = os.getenv("db_password")
DB_HOST = os.getenv("db_host")
DB_PORT = os.getenv("db_port")
DB_NAME = os.getenv("db_name")

# Fail fast with a readable message. os.getenv returns None for an unset
# variable, and the f-string below would format that as the literal text
# "None", which only surfaces later as an opaque connection error.
_db_settings = {
    "db_username": DB_USER,
    "db_password": DB_PASS,
    "db_host": DB_HOST,
    "db_port": DB_PORT,
    "db_name": DB_NAME,
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database environment variable(s): " + ", ".join(_missing_settings)
    )

# NOTE: the values are interpolated into the URL verbatim, so characters such
# as '@', ':' or '/' inside the username or password must already be
# percent-encoded in the environment / .env file.
connection_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_url)

In [18]:
# Load Data
# Read the whole stops table into a DataFrame, then report its dimensions and
# the dtype pandas inferred for each column.
query = "SELECT * FROM nypd_vehicle_stop_reports;"
df = pd.read_sql(sql=query, con=engine)
print("Data shape:", df.shape)
print(df.dtypes)

Data shape: (2039862, 19)
event_key                  object
date_occurred              object
time_occurred              object
command_code                int64
vehicle_seized               bool
vehicle_searched             bool
vehicle_search_consent     object
vehicle_checkpoint           bool
force_used                   bool
arrest_made                  bool
summons_issued               bool
vehicle_category           object
reported_age              float64
sex_code                   object
race_description           object
latitude                  float64
longitude                 float64
x_coordinate              float64
y_coordinate              float64
dtype: object


In [19]:
# Initial Cleaning
# Start the working frame X from df without the unique ID column; df itself is
# left unchanged because drop() returns a new DataFrame.
X = df.drop(columns='event_key')


In [20]:
# Convert booleans to integers
# Every bool-dtype column is recast to int (True -> 1, False -> 0); all other
# columns keep their dtype.
bool_cols = X.select_dtypes(include='bool').columns
X = X.astype({col_name: int for col_name in bool_cols})

In [21]:
# Handle Missing Values
# Missing consent values are labelled 'No Search' so they form their own
# category instead of staying NaN.
X['vehicle_search_consent'] = X['vehicle_search_consent'].fillna('No Search')

# reported_age is deliberately NOT imputed here. Fitting a median imputer on
# the full dataset before the train/test split would let test rows influence
# the fill value (data leakage). The numeric pipeline of the preprocessor
# defined later in this notebook already contains SimpleImputer(strategy='median'),
# which is fit on the training split only when the preprocessor is fit.

In [22]:
# Remove spatial columns (not used for now)
# Rebind X to a frame without the four coordinate columns.
spatial_cols = ['latitude', 'longitude', 'x_coordinate', 'y_coordinate']
X = X.drop(columns=spatial_cols)

In [23]:
# Fill minimal random missing values
# The same 'Unknown' placeholder is applied to each of the three categorical
# columns, one column at a time.
for cat_col in ('vehicle_category', 'sex_code', 'race_description'):
    X[cat_col] = X[cat_col].fillna('Unknown')

In [24]:
# Feature Engineering (Date/Time)
# Derives calendar and clock features from date_occurred / time_occurred and
# then removes the two raw columns.
WEEKEND_DAYS = [5, 6]                     # Series.dt.dayofweek: Monday=0 ... Sunday=6
NIGHT_HOURS = [0, 1, 2, 3, 4, 5, 22, 23]  # hours flagged as night: 22:00-05:59

# Parse each raw column exactly once. The parsed time is kept as a datetime
# Series so .dt.hour can be read from it directly, instead of converting to
# datetime.time objects and parsing those a second time.
occurred_date = pd.to_datetime(X['date_occurred'])
occurred_time = pd.to_datetime(X['time_occurred'], format='%H:%M:%S')

X['day_of_week'] = occurred_date.dt.dayofweek
X['month'] = occurred_date.dt.month
X['is_weekend'] = X['day_of_week'].isin(WEEKEND_DAYS).astype(int)

X['hour'] = occurred_time.dt.hour
X['is_night'] = X['hour'].isin(NIGHT_HOURS).astype(int)

# The raw columns are no longer needed once the features exist.
X = X.drop(columns=['date_occurred', 'time_occurred'])

In [25]:
# Final Feature Selection for Modeling
# `features` lists the model inputs in the column order used for X_filtered;
# the target y is the arrest_made column of X.
features = [
    'day_of_week',
    'month',
    'hour',
    'is_weekend',
    'is_night',
    'vehicle_checkpoint',
    'vehicle_category',
    'reported_age',
    'sex_code',
    'race_description',
    'command_code',
]
y = X['arrest_made']
X_filtered = X.loc[:, features]


In [26]:
# Define Preprocessor
# Three column groups, each with its own treatment:
#   numeric     -> median imputation, then standard scaling
#   categorical -> one-hot encoding (handle_unknown='ignore')
#   binary      -> passed through unchanged
numeric_features = ['reported_age', 'day_of_week', 'month', 'hour']
categorical_features = ['command_code', 'vehicle_category', 'sex_code', 'race_description']
binary_features = ['vehicle_checkpoint', 'is_weekend', 'is_night']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# The preprocessor is only constructed here; this cell does not fit it.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('passthrough', 'passthrough', binary_features),
])


In [27]:
# Train/Test Split
# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# of the target the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, **split_options)

In [28]:
# Save Processed Data and Preprocessor
# Each object is written with joblib to a file in the current working
# directory. The preprocessor is saved as constructed; no fit call appears in
# the cells shown in this notebook.
artifacts = {
    'X_train.pkl': X_train,
    'X_test.pkl': X_test,
    'y_train.pkl': y_train,
    'y_test.pkl': y_test,
    'preprocessor.pkl': preprocessor,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("✅ Data preparation complete. Files saved.")

✅ Data preparation complete. Files saved.
