In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef, cohen_kappa_score
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [13]:
# Read the csv file into a pandas DataFrame
murder_target = pd.read_csv(
    Path("../Crime/murder_target.csv"), encoding='iso-8859-1'
)

# Review the DataFrame
murder_target.head()

Unnamed: 0,offense_code,offense_code_extension,offense_type_id,offense_category_id,reported_date,incident_address,geo_lon,geo_lat,district_id,precinct_id,neighborhood_id,victim_count,offense_type_name,murders
0,2999,0,criminal-mischief-other,public-disorder,2/10/2022 3:16:00 AM,1107 N SANTA FE DR,-104.99891,39.733957,1,123.0,lincoln-park,1.0,Criminal mischief - other,0
1,2999,0,criminal-mischief-other,public-disorder,7/8/2021 12:55:00 AM,815 16TH ST,-104.993342,39.746248,6,611.0,cbd,1.0,Criminal mischief - other,0
2,2999,0,criminal-mischief-other,public-disorder,10/29/2020 4:31:00 AM,4745 N FEDERAL BLVD,-105.02552,39.782888,1,111.0,berkeley,1.0,Criminal mischief - other,0
3,2999,0,criminal-mischief-other,public-disorder,9/7/2018 9:58:00 AM,65 S FEDERAL BLVD,-105.02533,39.715357,4,411.0,barnum,1.0,Criminal mischief - other,0
4,2999,0,criminal-mischief-other,public-disorder,5/13/2020 10:00:00 AM,12295 E ALBROOK DR,-104.845074,39.783082,5,521.0,montbello,1.0,Criminal mischief - other,0


In [24]:
murder_target.columns

Index(['offense_code', 'offense_code_extension', 'offense_type_id',
       'offense_category_id', 'reported_date', 'incident_address', 'geo_lon',
       'geo_lat', 'district_id', 'precinct_id', 'neighborhood_id',
       'victim_count', 'offense_type_name', 'murders'],
      dtype='object')

In [25]:
murder_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386978 entries, 0 to 386977
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   offense_code            386978 non-null  int64  
 1   offense_code_extension  386978 non-null  int64  
 2   offense_type_id         386978 non-null  object 
 3   offense_category_id     386978 non-null  object 
 4   reported_date           386865 non-null  object 
 5   incident_address        371362 non-null  object 
 6   geo_lon                 371096 non-null  float64
 7   geo_lat                 371096 non-null  float64
 8   district_id             386808 non-null  object 
 9   precinct_id             386865 non-null  float64
 10  neighborhood_id         386176 non-null  object 
 11  victim_count            386865 non-null  float64
 12  offense_type_name       386978 non-null  object 
 13  murders                 386978 non-null  int64  
dtypes: float64(4), int64

In [14]:
# Select features and target
X = murder_target[['district_id', 'precinct_id', 'neighborhood_id', 'victim_count']].copy()
y = murder_target['murders']


In [15]:
# Ensure categorical columns are strings
X.loc[:, 'district_id'] = X['district_id'].astype(str)
X.loc[:, 'neighborhood_id'] = X['neighborhood_id'].astype(str)

# Ensure numerical columns are numeric
X.loc[:, 'precinct_id'] = pd.to_numeric(X['precinct_id'], errors='coerce')
X.loc[:, 'victim_count'] = pd.to_numeric(X['victim_count'], errors='coerce')

In [16]:
# Handle missing values
X = X.fillna(-1)

In [17]:
# One-hot encode categorical features
categorical_features = ['district_id', 'neighborhood_id']
numerical_features = ['precinct_id', 'victim_count']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)


In [18]:
# Create a pipeline for preprocessing
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Sample a subset of data (e.g., 10%)
X_sampled = X.sample(frac=0.1, random_state=42)
y_sampled = y[X_sampled.index]


In [19]:
# Check memory usage
print(f"Original data size: {X.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
print(f"Sampled data size: {X_sampled.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

Original data size: 52.43 MB
Sampled data size: 5.54 MB


In [20]:
# Preprocess the sampled data
X_preprocessed_sampled = preprocessing_pipeline.fit_transform(X_sampled)
print("Preprocessing done")

Preprocessing done


In [21]:
# Adjust SMOTETomek parameters for efficiency
smotetomek = SMOTETomek(random_state=42, n_jobs=-1, sampling_strategy='auto')

# Apply SMOTETomek to the sampled data
X_resampled_sampled, y_resampled_sampled = smotetomek.fit_resample(X_preprocessed_sampled, y_sampled)
print("Resampling done with sampled data")



Resampling done with sampled data


In [22]:
# Split the resampled sampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled_sampled, y_resampled_sampled, test_size=0.2, random_state=42)
print("Train-test split done")

# Set up a simplified parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'class_weight': ['balanced']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           scoring='f1')

Train-test split done


In [23]:
# Fit the model
grid_search.fit(X_train, y_train)
print("Grid search done")

# Make predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Generate classification report
print(classification_report(y_test, y_pred, zero_division=1))

# Display confusion matrix
print(confusion_matrix(y_test, y_pred))

# Calculate Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc}")

# Calculate Cohen's Kappa
kappa = cohen_kappa_score(y_test, y_pred)
print(f"Cohen's Kappa: {kappa}")


Grid search done
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7673
           1       0.97      0.90      0.94      7789

    accuracy                           0.94     15462
   macro avg       0.94      0.94      0.94     15462
weighted avg       0.94      0.94      0.94     15462

[[7485  188]
 [ 779 7010]]
Matthews Correlation Coefficient (MCC): 0.8775490771933643
Cohen's Kappa: 0.8749838187837562
