In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import geopandas as gpd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('New_BPD.csv')

In [3]:
# INGENIERÍA DE CARACTERÍSTICAS
# =============================

# Eliminamos filas con valores nulos en columnas clave
df_clean = df.dropna(subset=['District', 'Race', 'Age', 'OffenseCategory'])

# Convertimos 'District' y 'Race' a variables categóricas
df_clean['District'] = df_clean['District'].astype('category').cat.codes
df_clean['Race'] = df_clean['Race'].astype('category').cat.codes
df_clean['Age'] = df_clean['Age'].astype('category').cat.codes
df_clean['OffenseCategory'] = df_clean['OffenseCategory'].astype('category').cat.codes

# Seleccionamos las características y la variable objetivo
features = ['Year', 'Month', 'DayOfWeek', 'Hour', 'Latitude', 'Longitude']
X = df_clean[features]
y_district = df_clean['District']
y_race = df_clean['Race']
y_age = df_clean['Age']
y_offense = df_clean['OffenseCategory']

ENTRENAMIENTO Y EVALUACION DE MODELOS:

In [10]:
# Predicción de Distritos

# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train_district, y_test_district = train_test_split(X, y_district, test_size=0.3, random_state=42)

# Entrenamos un RandomForestClassifier
rf_district = RandomForestClassifier(n_estimators=100, random_state=42)
rf_district.fit(X_train, y_train_district)

# Realizamos predicciones
y_pred_district = rf_district.predict(X_test)

# Evaluamos el modelo
mse = mean_squared_error(y_test_district, y_pred_district)
print(f"Mean Squared Error for District Prediction: {mse}")
print("Classification Report for District Prediction")
print(classification_report(y_test_district, y_pred_district))

Mean Squared Error for District Prediction: 0.23419783656979018
Classification Report for District Prediction
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2569
           1       1.00      0.99      0.99      3138
           2       0.99      1.00      0.99      2568
           3       0.99      0.98      0.99      1499
           4       0.99      1.00      0.99      2029
           5       1.00      1.00      1.00      2840
           6       1.00      1.00      1.00      3180
           7       0.99      0.98      0.99      2118
           8       0.97      0.99      0.98      3078

    accuracy                           0.99     23019
   macro avg       0.99      0.99      0.99     23019
weighted avg       0.99      0.99      0.99     23019



In [12]:
# Predicción de Raza

# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train_race, y_test_race = train_test_split(X, y_race, test_size=0.3, random_state=42)

# Entrenamos un RandomForestClassifier
rf_race = RandomForestClassifier(n_estimators=100, random_state=42)
rf_race.fit(X_train, y_train_race)

# Realizamos predicciones
y_pred_race = rf_race.predict(X_test)

# Evaluamos el modelo
mse = mean_squared_error(y_test_race, y_pred_race)
print(f"Mean Squared Error for Race Prediction: {mse}")
print("Classification Report for Race Prediction")
print(classification_report(y_test_race, y_pred_race))

Mean Squared Error for Race Prediction: 1.2750336678396108
Classification Report for Race Prediction
              precision    recall  f1-score   support

           0       0.25      0.03      0.06        59
           1       0.87      0.95      0.91     19011
           2       0.12      0.02      0.03        66
           3       0.47      0.13      0.20       515
           4       0.54      0.34      0.42      3368

    accuracy                           0.84     23019
   macro avg       0.45      0.29      0.32     23019
weighted avg       0.81      0.84      0.82     23019



In [27]:
# Predicción de Edad
# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train_age, y_test_age = train_test_split(X, y_age, test_size=0.3, random_state=42)

# Entrenamos un RandomForestRegressor
rf_age = RandomForestRegressor(n_estimators=100, random_state=42)
rf_age.fit(X_train, y_train_age)

# Realizamos predicciones
y_pred_age = rf_age.predict(X_test)

# Evaluamos el modelo
mse = mean_squared_error(y_test_age, y_pred_age)
print(f"Mean Squared Error for Age Prediction: {mse}")
print("Classification Report for Age Prediction")
print(classification_report(y_test_age, y_pred_age))

Mean Squared Error for Age Prediction: 134.1449380325116
Classification Report for Age Prediction


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [18]:
# AFINACIÓN DE HIPERPARÁMETROS
# ============================
# GridSearchCV para encontrar los mejores hiperparámetros
# Parámetros a probar
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Grid search para el modelo de distritos
grid_search_district = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search_district.fit(X_train, y_train_district)
print(f"Mejores parámetros para distritos: {grid_search_district.best_params_}")

# Grid search para el modelo de razas
grid_search_race = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search_race.fit(X_train, y_train_race)
print(f"Mejores parámetros para razas: {grid_search_race.best_params_}")

Mejores parámetros para distritos: {'max_depth': 20, 'n_estimators': 200}
Mejores parámetros para razas: {'max_depth': 20, 'n_estimators': 100}


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report, r2_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Cargar y limpiar los datos
df = pd.read_csv('New_BPD.csv')

In [36]:
# Eliminar filas con valores nulos en columnas clave
df_clean = df.dropna(subset=['District', 'Race', 'Age', 'OffenseCategory'])

# Convertir variables categóricas
df_clean['District'] = df_clean['District'].astype('category').cat.codes
df_clean['Race'] = df_clean['Race'].astype('category').cat.codes
df_clean['Age'] = df_clean['Age'].astype('category').cat.codes
df_clean['OffenseCategory'] = df_clean['OffenseCategory'].astype('category').cat.codes
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [9]:
# Seleccionar características y variables objetivo
features = ['Year', 'Month', 'DayOfWeek', 'Hour', 'Latitude', 'Longitude']
X = df_clean[features]
y_district = df_clean['District']
y_race = df_clean['Race']
y_age = df_clean['Age']

# Normalización de características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# ---------------------------
# PREDICCIÓN DE DISTRITOS
# ---------------------------
X_train, X_test, y_train_district, y_test_district = train_test_split(X_scaled, y_district, test_size=0.3, random_state=42)

# Entrenamiento y ajuste de hiperparámetros usando GridSearchCV
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
rf_district = RandomForestClassifier(random_state=42)
grid_search_district = GridSearchCV(rf_district, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search_district.fit(X_train, y_train_district)

In [None]:
# Mejor modelo de RandomForestClassifier
best_rf_district = grid_search_district.best_estimator_

In [None]:
# ---------------------------
# PREDICCIÓN DE RAZA (Manejo del Desbalanceo)
# ---------------------------
# Aplicar SMOTE para el balanceo de clases
X_train_race, X_test_race, y_train_race, y_test_race = train_test_split(X_scaled, y_race, test_size=0.3, random_state=42)
smote = SMOTE(random_state=42)
X_train_race_smote, y_train_race_smote = smote.fit_resample(X_train_race, y_train_race)

In [32]:
# Evaluación del modelo
y_pred_district = best_rf_district.predict(X_test)
print(f"Mean Squared Error for District Prediction: {mean_squared_error(y_test_district, y_pred_district)}")
print("Classification Report for District Prediction")
print(classification_report(y_test_district, y_pred_district))

Mean Squared Error for District Prediction: 0.18945219166775273
Classification Report for District Prediction
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2569
           1       1.00      0.99      0.99      3138
           2       0.99      1.00      0.99      2568
           3       0.99      0.98      0.99      1499
           4       0.99      1.00      0.99      2029
           5       0.99      1.00      1.00      2840
           6       1.00      1.00      1.00      3180
           7       0.99      0.99      0.99      2118
           8       0.98      0.99      0.98      3078

    accuracy                           0.99     23019
   macro avg       0.99      0.99      0.99     23019
weighted avg       0.99      0.99      0.99     23019



In [26]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Entrenamiento y ajuste de hiperparámetros
rf_race = RandomForestClassifier(random_state=42)
grid_search_race = GridSearchCV(rf_race, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search_race.fit(X_train_race_smote, y_train_race_smote)

# Mejor modelo
best_rf_race = grid_search_race.best_estimator_

NameError: name 'X_train_race_smote' is not defined

In [None]:
# Evaluación del modelo
y_pred_race = best_rf_race.predict(X_test_race)
print(f"Mean Squared Error for Race Prediction: {mean_squared_error(y_test_race, y_pred_race)}")
print("Classification Report for Race Prediction")
print(classification_report(y_test_race, y_pred_race))

In [11]:
# ---------------------------
# PREDICCIÓN DE EDAD
# ---------------------------
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(X_scaled, y_age, test_size=0.3, random_state=42)

In [13]:
# Entrenamiento y ajuste de hiperparámetros usando GridSearchCV
rf_age = RandomForestRegressor(random_state=42)
param_grid_reg = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search_age = GridSearchCV(rf_age, param_grid_reg, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search_age.fit(X_train_age, y_train_age)

# Mejor modelo de RandomForestRegressor
best_rf_age = grid_search_age.best_estimator_

In [20]:
# Evaluación del modelo
y_pred_age = best_rf_age.predict(X_test_age)
mse_age = mean_squared_error(y_test_age, y_pred_age)
r2_age = r2_score(y_test_age, y_pred_age)
print(f"Mean Squared Error for Age Prediction: {mse_age}")
print(f"R^2 Score for Age Prediction: {r2_age}")

Mean Squared Error for Age Prediction: 129.22467305366166
R^2 Score for Age Prediction: 0.0787392667078225


In [22]:
# ---------------------------
# VISUALIZACIÓN Y ANÁLISIS EXPLORATORIO
# ---------------------------
# Correlación entre características
plt.figure(figsize=(10, 8))
sns.heatmap(df_clean.corr(), annot=True, cmap='coolwarm')
plt.title('Correlación entre características')
plt.show()

ValueError: could not convert string to float: 'M'

<Figure size 1000x800 with 0 Axes>