<table><tr>
<td> <img src=https://www.baltimorepolice.org/themes/custom/bpd/images/bpd_logo.png alt="Drawing" style="height: 250px;"/> </td>
<td> <img src="https://beam-images.warnermediacdn.com/BEAM_LWM_DELIVERABLES/1bc3aff5-0d6a-4c0b-8ed0-5716ca30ab3b/fbbc7a604f327cfa8a7bbe614a89be13a246d266.jpg?host=wbd-images.prod-vod.h264.io&amp;partner=beamcom" style="height: 250px;"/> </td>
</tr></table>

MACHINE LEARNING
=

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='New_BPD.csv')

INGENIERÍA DE CARACTERÍSTICAS
=============================

In [16]:
# Keep only rows where every prediction target (District, Race, Age) is present,
# then integer-encode the two categorical targets.
# .assign() builds a new DataFrame, so nothing is written into the frame
# returned by dropna(); assigning columns on that result directly can raise
# pandas' SettingWithCopyWarning, which the global warnings filter would hide.
df_clean = df.dropna(subset=['District', 'Race', 'Age']).assign(
    District=lambda frame: frame['District'].astype('category').cat.codes,
    Race=lambda frame: frame['Race'].astype('category').cat.codes,
)

# Model inputs (shared by every model below) and one target series per task
features = ['Year', 'Month', 'DayOfWeek', 'Hour', 'Latitude', 'Longitude']
X = df_clean[features]
y_district = df_clean['District']
y_race = df_clean['Race']
y_age = df_clean['Age']

In [None]:
# ENTRENAMIENTO Y EVALUACION DE MODELOS
# =====================================

In [None]:
# Predicción de Distritos

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
district_split = train_test_split(X, y_district, test_size=0.3, random_state=42)
X_train, X_test, y_train_district, y_test_district = district_split

# Fit a 100-tree random forest classifier on the training portion
# (fit() returns the estimator itself, so construction and fitting can be chained)
rf_district = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train_district)

# Predict the encoded district for every held-out row
y_pred_district = rf_district.predict(X_test)

# Print the per-class precision / recall / F1 table
print("Classification Report for District Prediction")
district_report = classification_report(y_test_district, y_pred_district)
print(district_report)

Classification Report for District Prediction
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2566
           1       1.00      0.99      0.99      3139
           2       0.99      1.00      0.99      2707
           3       0.99      0.98      0.98      1560
           4       0.99      1.00      1.00      2117
           5       0.99      1.00      1.00      2870
           6       0.99      1.00      0.99      3372
           7       0.99      0.99      0.99      2092
           8       0.98      0.99      0.98      3153

    accuracy                           0.99     23576
   macro avg       0.99      0.99      0.99     23576
weighted avg       0.99      0.99      0.99     23576



In [None]:
# Predicción de Raza

In [11]:
# 70/30 train/test split of the shared features against the encoded race target
race_split = train_test_split(X, y_race, test_size=0.3, random_state=42)
X_train, X_test, y_train_race, y_test_race = race_split

# Fit a 100-tree random forest classifier on the training portion
# (fit() returns the estimator itself, so construction and fitting can be chained)
rf_race = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train_race)

# Predict the encoded race for every held-out row
y_pred_race = rf_race.predict(X_test)

# Print the per-class precision / recall / F1 table
# NOTE(review): the stored output shows very uneven class supports, so the
# overall accuracy figure should be read alongside the per-class rows.
print("Classification Report for Race Prediction")
race_report = classification_report(y_test_race, y_pred_race)
print(race_report)

Classification Report for Race Prediction
              precision    recall  f1-score   support

           0       0.11      0.02      0.03        64
           1       0.87      0.95      0.91     19452
           2       0.17      0.02      0.03        65
           3       0.41      0.14      0.21       495
           4       0.55      0.34      0.42      3500

    accuracy                           0.84     23576
   macro avg       0.42      0.29      0.32     23576
weighted avg       0.81      0.84      0.82     23576



In [None]:
# AFINACIÓN DE HIPERPARÁMETROS
# ============================

In [None]:
# GridSearchCV para encontrar los mejores hiperparámetros

In [15]:
# Hyperparameter grid: every (n_estimators, max_depth) combination is evaluated
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}


def run_forest_grid_search(X_fit, y_fit, param_grid, cv=3, scoring='accuracy', n_jobs=-1):
    """Grid-search a seeded RandomForestClassifier and return the fitted search.

    Parameters
    ----------
    X_fit, y_fit
        Training features and target handed to ``GridSearchCV.fit``.
    param_grid : dict
        Hyperparameter names mapped to the candidate values to try.
    cv : int
        Number of cross-validation folds.
    scoring : str
        Metric used to rank the candidates.
    n_jobs : int
        Number of parallel workers; -1 uses every available core. The base
        estimator has a fixed ``random_state``, so parallelism only affects
        run time, not the selected parameters.

    Returns
    -------
    GridSearchCV
        The fitted search object (``fit`` returns the object itself).
    """
    search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    return search.fit(X_fit, y_fit)


# NOTE: X_train is whatever the most recently executed split cell left behind.
# The split cells all pass the same X, test_size and random_state, so the rows
# line up with both y_train_district and y_train_race.

# Grid search for the district model
grid_search_district = run_forest_grid_search(X_train, y_train_district, param_grid)
print(f"Mejores parámetros para distritos: {grid_search_district.best_params_}")

# Grid search for the race model
grid_search_race = run_forest_grid_search(X_train, y_train_race, param_grid)
print(f"Mejores parámetros para razas: {grid_search_race.best_params_}")

Mejores parámetros para distritos: {'max_depth': 20, 'n_estimators': 200}
Mejores parámetros para razas: {'max_depth': 20, 'n_estimators': 200}


In [16]:
# INTERPRETACIÓN DE RESULTADOS
# ============================

In [18]:
# 70/30 train/test split of the shared features against the age target
age_split = train_test_split(X, y_age, test_size=0.3, random_state=42)
X_train, X_test, y_train_age, y_test_age = age_split

# Fit a 100-tree random forest regressor on the training portion
# (fit() returns the estimator itself, so construction and fitting can be chained)
rf_age = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_age)

# Predict an age for every held-out row
y_pred_age = rf_age.predict(X_test)

# Report the mean squared error between true and predicted ages
mse = mean_squared_error(y_true=y_test_age, y_pred=y_pred_age)
print(f"Mean Squared Error for Age Prediction: {mse}")

Mean Squared Error for Age Prediction: 132.8079319480721


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets