In [1]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='New_BPD.csv')

In [3]:
# Data cleaning: discard duplicate rows first, then any row that still
# contains a missing value. Written as one chained expression that rebinds
# `df`, instead of two in-place mutations.
df = df.drop_duplicates().dropna()

In [4]:
# Separate the target column (y) from the remaining feature columns (X)
y = df['OffenseCategory']
X = df.drop(columns=['OffenseCategory'])

In [5]:
# One-hot encode the categorical feature columns as dummy variables;
# all other columns of X are passed through unchanged.
X = pd.get_dummies(
    data=X,
    columns=['Sex', 'Race', 'District'],
)

In [6]:
# Hold out 30% of the rows as a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Balance the classes with RandomOverSampler, applied to the training split
# only; the test split keeps its original class distribution.
# NOTE(review): the resampled data is later handed to GridSearchCV (cv=5), so
# resampling happens before the CV folds are drawn and the CV scores may be
# optimistic. Consider resampling inside a pipeline instead; confirm.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Feature scaling: statistics are learned from the resampled training data
# and then applied unchanged to both the training and the test features.
scaler = StandardScaler().fit(X_train_ros)
X_train_scaled = scaler.transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

# Label encoding: the encoder is fitted on the resampled training labels and
# reused for the test labels.
# NOTE(review): a label that occurs only in y_test would be unknown to the
# encoder; verify that cannot happen for this dataset/split.
label_encoder = LabelEncoder()
y_train_ros_encoded = label_encoder.fit_transform(y_train_ros)
y_test_encoded = label_encoder.transform(y_test)

In [7]:
# Candidate estimators for GridSearchCV, keyed by display name.
# The same keys are used to look up each model's grid in `params` below.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: when this notebook ran,
    # XGBoost reported it as an unused parameter, and the labels given to the
    # models are already integer-encoded with LabelEncoder.
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
    'KNN': KNeighborsClassifier()
}

# Hyper-parameter grids, one per entry in `models` (keys must match).
params = {
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30]
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }
}

In [8]:
# Helper that runs GridSearchCV for each model and evaluates it on the test set
def grid_search_evaluate(models, params, X_train, y_train, X_test, y_test, cv=5, scoring='accuracy'):
    """Tune every model with GridSearchCV and report its test-set performance.

    For each entry in ``models`` a grid search is fitted on the training data,
    the best parameters are printed, and a classification report plus
    confusion matrix for the test data are printed.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator.
    params : dict
        Maps the same model names to the parameter grid searched for that model.
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and labels used for the printed evaluation.
    cv : optional
        Forwarded to ``GridSearchCV(cv=...)``. Defaults to 5.
    scoring : optional
        Forwarded to ``GridSearchCV(scoring=...)``. Defaults to ``'accuracy'``.
        NOTE(review): the recorded reports show very uneven class supports;
        a class-balanced metric may be a better selection criterion; confirm.

    Returns
    -------
    dict
        Maps each model name to the ``best_estimator_`` found by its search.

    Raises
    ------
    KeyError
        If a model in ``models`` has no parameter grid in ``params``.
    """
    # Fail before any (potentially long) search starts, rather than part-way
    # through the loop when the missing grid is first looked up.
    missing_grids = [name for name in models if name not in params]
    if missing_grids:
        raise KeyError(f"No parameter grid defined for: {missing_grids}")

    best_models = {}
    for model_name, estimator in models.items():
        print(f"Running GridSearchCV for {model_name}...")
        grid_search = GridSearchCV(estimator, params[model_name], cv=cv, scoring=scoring, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_estimator = grid_search.best_estimator_
        best_models[model_name] = best_estimator
        print(f"Best params for {model_name}: {grid_search.best_params_}")

        y_pred = best_estimator.predict(X_test)
        # zero_division=0 keeps the reported 0.00 for classes that are never
        # predicted, without emitting one warning per metric as seen in the
        # recorded output of this notebook.
        print(classification_report(y_test, y_pred, zero_division=0))
        print(confusion_matrix(y_test, y_pred))
    return best_models

# Tune and evaluate all candidate models on the scaled features and the
# integer-encoded labels; keep the best estimator found for each model.
best_models = grid_search_evaluate(
    models=models,
    params=params,
    X_train=X_train_scaled,
    y_train=y_train_ros_encoded,
    X_test=X_test_scaled,
    y_test=y_test_encoded,
)

Running GridSearchCV for RandomForest...
Best params for RandomForest: {'max_depth': None, 'n_estimators': 200}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.55      0.09      0.15       128
           1       0.50      0.03      0.05        38
           2       0.00      0.00      0.00         2
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00        17
           5       0.33      0.05      0.08        84
           6       0.56      0.82      0.67      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       1.00      0.12      0.22         8
          11       0.65      0.29      0.40       443
          12       0.00      0.00      0.00         7
          13       0.50      0.29      0.36         7
          14       1.00      0.13      0.24        15
          15       0.73      0.86      0.79     11710
          16       0.52      0.19      0.28      2061
          17       0.51      0.15      0.24       265
          18       0.60    

Parameters: { "use_label_encoder" } are not used.



Best params for XGBoost: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.38      0.12      0.18       128
           1       0.25      0.05      0.09        38
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        17
           5       0.20      0.06      0.09        84
           6       0.60      0.77      0.67      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         8
          11       0.49      0.46      0.48       443
          12       0.33      0.14      0.20         7
          13       0.17      0.14      0.15         7
          14       0.67      0.13      0.22        15
          15       0.76      0.78      0.77     11710
          16       0.40      0.34      0.37      2061
          17       0.37      0.21      0.27       265
          18       0.28    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Compare the best models: print each tuned estimator's classification report
# and confusion matrix on the scaled test set.
print("Comparación de los mejores modelos:")
for model_name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    print(f"Resultados para {model_name}:")
    # zero_division=0 keeps the reported 0.00 for classes that are never
    # predicted, without the repeated metric warnings seen in the recorded
    # output of this cell.
    print(classification_report(y_test_encoded, y_pred, zero_division=0))
    print(confusion_matrix(y_test_encoded, y_pred))
    print("\n")


Comparación de los mejores modelos:
Resultados para RandomForest:
              precision    recall  f1-score   support

           0       0.55      0.09      0.15       128
           1       0.50      0.03      0.05        38
           2       0.00      0.00      0.00         2
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00        17
           5       0.33      0.05      0.08        84
           6       0.56      0.82      0.67      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       1.00      0.12      0.22         8
          11       0.65      0.29      0.40       443
          12       0.00      0.00      0.00         7
          13       0.50      0.29      0.36         7
          14       1.00      0.13      0.24        15
          15       0.73      0.86      0.79     11710
          16       0.52      0.19      0.28      2061
          17   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Resultados para XGBoost:
              precision    recall  f1-score   support

           0       0.38      0.12      0.18       128
           1       0.25      0.05      0.09        38
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        17
           5       0.20      0.06      0.09        84
           6       0.60      0.77      0.67      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         8
          11       0.49      0.46      0.48       443
          12       0.33      0.14      0.20         7
          13       0.17      0.14      0.15         7
          14       0.67      0.13      0.22        15
          15       0.76      0.78      0.77     11710
          16       0.40      0.34      0.37      2061
          17       0.37      0.21      0.27       265
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Resultados para KNN:
              precision    recall  f1-score   support

           0       0.07      0.09      0.08       128
           1       0.05      0.05      0.05        38
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        17
           5       0.03      0.04      0.03        84
           6       0.51      0.60      0.55      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00         0
          11       0.21      0.28      0.24       443
          12       0.00      0.00      0.00         7
          13       0.25      0.29      0.27         7
          14       0.00      0.00      0.00        15
          15       0.75      0.57      0.65     11710
          16       0.19      0.24      0.21      2061
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
