In [1]:
# Data Loading and Exploring
import pandas as pd
import numpy as np
from pathlib import Path
from sodapy import Socrata

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import plotly.express as px

# Data Preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.model_selection import GridSearchCV
import joblib
import tensorflow as tf


# Performance Measurement Metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# Custom Modules
from utils.data_loader import load_crime_dataset
from utils.maps import ChicagoMap

# Suppress all warnings (every category, every module) from here on
import warnings
warnings.simplefilter("ignore")

# Global seaborn theme
sns.set_theme(color_codes=True, style="ticks")


In [2]:
# Where the crime CSV is expected on disk (relative to the notebook)
dataset_path = Path('../datasets/chicago-crime-data.csv')

# Only invoke the project loader when the CSV is absent.
# NOTE(review): presumably load_crime_dataset() writes to this same path -- confirm in utils.data_loader.
if not dataset_path.exists():
    load_crime_dataset()
else:
    print(f"File found: {dataset_path.name}")

File found: chicago-crime-data.csv


In [3]:
# Load the CSV and parse the 'date' column into datetimes in a single chained expression
crime_df = (
    pd.read_csv(dataset_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

##### Data Preprocessing

In [4]:
def preprocess_crime_data(crime_df, n_columns=22):
    """Clean a crime DataFrame and return scaled features plus the arrest target.

    The input frame is NOT modified: every step works on a derived frame, so
    calling this on the notebook-level DataFrame leaves it intact and the cell
    can be re-run safely.

    Steps performed:
      1. keep only the first ``n_columns`` columns,
      2. drop duplicated rows,
      3. drop rows containing any missing value,
      4. rank each ``primary_type`` by its arrest rate (ascending),
      5. encode ``primary_type`` with that rank and cast ``arrest`` to int,
      6. select the feature columns,
      7. standardise the features.

    Parameters
    ----------
    crime_df : pandas.DataFrame
        Must contain 'primary_type', 'arrest' and every column named in
        ``features`` below among its first ``n_columns`` columns.
    n_columns : int, default 22
        Number of leading columns to keep; all later columns are discarded.

    Returns
    -------
    X : numpy.ndarray
        Standardised feature matrix, one row per remaining record.
    Y : pandas.Series
        'arrest' cast to int, in the same row order as ``X`` (the index is
        not reset, so it may be non-contiguous).
    """
    # Step 1: keep the leading columns. Slicing instead of an in-place drop
    # keeps the caller's frame untouched.
    df = crime_df.iloc[:, :n_columns]
    print("Dataset Shape:", df.shape)

    # Step 2: detect and drop duplicates (vectorised count, not builtin sum()).
    print(f"Duplicated rows detected: {df.duplicated().sum()}")
    df = df.drop_duplicates()

    # Step 3: report the number of missing cells, then drop incomplete rows.
    print(f"Missing Values: {df.isna().sum().sum()}")
    df = df.dropna()

    # Step 4: arrest rate (%) per primary_type, rounded to 2 decimals and
    # sorted ascending; the position in that ordering becomes the code.
    arrest_rate_pct = (
        df.groupby('primary_type')['arrest']
        .mean()
        .mul(100)
        .round(2)
        .sort_values()
    )
    encoding_dict = {
        primary_type: rank
        for rank, primary_type in enumerate(arrest_rate_pct.index)
    }
    # NOTE(review): this encoding is derived from the target over ALL rows and
    # the scaler below is also fitted on all rows, while callers split into
    # train/test afterwards -- test-set information leaks into the features.
    # Fixing that needs a split-aware interface; left as-is here.

    # Step 5: encode 'primary_type' (-1 for any unmapped value) and make
    # 'arrest' an integer 0/1 column.
    df = df.assign(
        primary_type_encoded=df['primary_type'].map(encoding_dict).fillna(-1).astype(int),
        arrest=df['arrest'].astype(int),
    )

    # Step 6: feature selection
    features = ['domestic', 'district', 'beat', 'community_area', 'ward',
                'x_coordinate', 'y_coordinate', 'latitude', 'longitude',
                'year', 'primary_type_encoded']
    target = 'arrest'

    X = df[features]
    Y = df[target]

    # Step 7: scale features to zero mean / unit variance
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, Y

##### Model Training and Tuning

In [11]:
def train_and_evaluate_neural_network(df=None, n_arrest=200000, n_no_arrest=500000, random_state=42):
    """Sample both arrest classes, preprocess, split, and print class counts.

    Despite its name, the function as written only prepares the data: it
    draws a fixed-size sample from each arrest class, runs
    ``preprocess_crime_data`` on the combined sample, performs a stratified
    70/30 train/test split and prints the resulting shapes and class counts.
    No model is fitted here and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Source data with an 'arrest' column. When omitted, the notebook-level
        ``crime_df`` is used, which keeps the original zero-argument call working.
    n_arrest : int, default 200000
        Rows sampled (without replacement) from the arrest == True class.
    n_no_arrest : int, default 500000
        Rows sampled (without replacement) from the arrest == False class.
    random_state : int, default 42
        Seed used for sampling, shuffling and the train/test split.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` if a class holds fewer rows than requested.
    """
    if df is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        df = crime_df

    # Separate the classes. The explicit comparisons are kept so that rows
    # whose 'arrest' is neither True nor False are excluded from both groups.
    arrest_true = df[df['arrest'] == True]
    arrest_false = df[df['arrest'] == False]

    # Sample rows from each class
    arrest_true_sample = arrest_true.sample(n=n_arrest, random_state=random_state)
    arrest_false_sample = arrest_false.sample(n=n_no_arrest, random_state=random_state)

    # Combine and shuffle. With the default sizes the classes are NOT equal
    # in size (200k vs 500k), hence "sampled" rather than "balanced".
    sampled_df = pd.concat([arrest_true_sample, arrest_false_sample])
    sampled_df = sampled_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    X, Y = preprocess_crime_data(sampled_df)

    # Step 1: Split the data into training and testing sets
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=random_state, stratify=Y
    )
    print("Data split complete. Training set size:", X_train.shape, "Test set size:", X_test.shape)

    # Step 2: Print counts of True (1) and False (0) in train and test sets.
    # Series.sum() counts in vectorised code instead of iterating row by row.
    print("Training set:")
    print(f"True (1): {(y_train == 1).sum()}, False (0): {(y_train == 0).sum()}")

    print("Test set:")
    print(f"True (1): {(y_test == 1).sum()}, False (0): {(y_test == 0).sum()}")


In [12]:
def train_and_evaluate_random_forest(df):
    """Preprocess ``df``, split it 70/30 (stratified), and print class counts.

    Despite its name, the function as written does not fit a model: it runs
    ``preprocess_crime_data`` on the given frame, performs a stratified
    train/test split with a fixed seed and prints the resulting shapes and
    per-class counts. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Crime data in the layout expected by ``preprocess_crime_data``.
    """
    X, Y = preprocess_crime_data(df)

    # Step 1: Split the data into training and testing sets
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=42, stratify=Y
    )
    print("Data split complete. Training set size:", X_train.shape, "Test set size:", X_test.shape)

    # Step 2: Print counts of True (1) and False (0) in train and test sets.
    # Series.sum() counts in vectorised code; builtin sum() would iterate over
    # every row in Python.
    print("Training set:")
    print(f"True (1): {(y_train == 1).sum()}, False (0): {(y_train == 0).sum()}")

    print("Test set:")
    print(f"True (1): {(y_test == 1).sum()}, False (0): {(y_test == 0).sum()}")

##### Report

In [13]:
# Class-wise sample of the notebook-level crime_df -> preprocess -> stratified 70/30 split;
# prints the resulting shapes and per-class counts.
train_and_evaluate_neural_network()

Dataset Shape: (700000, 22)
Duplicated rows detected: 0
Missing Values: 50977
Splitting data into training and testing sets...
Data split complete. Training set size: (481606, 11) Test set size: (206403, 11)
Training set:
True (1): 137957, False (0): 343649
Test set:
True (1): 59124, False (0): 147279


In [14]:
# Preprocess the full crime_df (no class-wise sampling), split it 70/30 stratified,
# and print the resulting shapes and per-class counts.
train_and_evaluate_random_forest(crime_df)

Dataset Shape: (1408934, 22)
Duplicated rows detected: 0
Missing Values: 102057
Splitting data into training and testing sets...
Data split complete. Training set size: (969004, 11) Test set size: (415288, 11)
Training set:
True (1): 142313, False (0): 826691
Test set:
True (1): 60991, False (0): 354297
