In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Cargar el dataset
df = pd.read_csv('New_BPD.csv')

# Mostrar las columnas del DataFrame
print("Columnas del DataFrame:", df.columns)

# Preprocesamiento de datos
# Convertir variables categóricas a numéricas con get_dummies
df = pd.get_dummies(df, columns=['Sex', 'Race', 'District', 'DayOfWeek', 'OffenseCategory'], drop_first=True)

# Manejar valores nulos (si hay)
df = df.dropna()

# Verificar el resultado del preprocesamiento
print("Columnas del DataFrame después de get_dummies y dropna:", df.columns)

# Separar las características (X) y la variable objetivo (y)
# Aquí debes asegurarte de que la columna objetivo esté correctamente especificada
X = df.drop(['Year', 'Month', 'Latitude', 'Longitude'], axis=1, errors='ignore')
# Asegurémonos de que el nombre de la columna objetivo sea correcto
# Si OffenseCategory_* no existe, revisaremos las columnas y elegiremos una válida
objective_column = [col for col in df.columns if 'OffenseCategory' in col]
if objective_column:
    y = df[objective_column[0]]
else:
    raise KeyError("La columna 'OffenseCategory' no se encuentra en el DataFrame")

# Dividir el dataset en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Entrenar el modelo RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predicciones
y_pred = clf.predict(X_test)

# Evaluación del modelo
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Afinación de hiperparámetros
# Definir los parámetros a buscar
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Mejor estimador
best_clf = grid_search.best_estimator_

# Predicciones con el mejor estimador
y_pred_best = best_clf.predict(X_test)

# Evaluación del mejor modelo
print("Best Classification Report:\n", classification_report(y_test, y_pred_best))
print("Best Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Columnas del DataFrame: Index(['Age', 'Sex', 'Race', 'District', 'Year', 'Month', 'DayOfWeek', 'Hour',
       'OffenseCategory', 'Latitude', 'Longitude'],
      dtype='object')
Columnas del DataFrame después de get_dummies y dropna: Index(['Age', 'Year', 'Month', 'Hour', 'Latitude', 'Longitude', 'Sex_M',
       'Race_B', 'Race_I', 'Race_U', 'Race_W', 'District_Eastern',
       'District_Northeastern', 'District_Northern', 'District_Northwestern',
       'District_Southeastern', 'District_Southern', 'District_Southwestern',
       'District_Western', 'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3',
       'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6',
       'OffenseCategory_Alcohol Violations', 'OffenseCategory_Animal Crimes',
       'OffenseCategory_Arson', 'OffenseCategory_Child Abuse',
       'OffenseCategory_Domestic Crimes', 'OffenseCategory_Drug Offenses',
       'OffenseCategory_Environmental Violations',
       'OffenseCategory_Fire Incidents', 'OffenseCategory_Fraud',
       'Offens

150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
67 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

Best Classification Report:
               precision    recall  f1-score   support

       False       1.00      1.00      1.00     22932
        True       1.00      1.00      1.00        19

    accuracy                           1.00     22951
   macro avg       1.00      1.00      1.00     22951
weighted avg       1.00      1.00      1.00     22951

Best Confusion Matrix:
 [[22932     0]
 [    0    19]]


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76501 entries, 0 to 78665
Data columns (total 48 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Age                                       76501 non-null  float64
 1   Year                                      76501 non-null  int64  
 2   Month                                     76501 non-null  int64  
 3   Hour                                      76501 non-null  float64
 4   Latitude                                  76501 non-null  float64
 5   Longitude                                 76501 non-null  float64
 6   Sex_M                                     76501 non-null  bool   
 7   Race_B                                    76501 non-null  bool   
 8   Race_I                                    76501 non-null  bool   
 9   Race_U                                    76501 non-null  bool   
 10  Race_W                                 