In [8]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the dataset and clean it in one chained expression, so re-running this
# cell always rebuilds `df` from the CSV instead of mutating a frame left over
# from an earlier execution.
df = (
    pd.read_csv('New_BPD.csv')
    .drop_duplicates()  # remove exact duplicate rows
    .dropna()           # remove rows that contain any missing value
)

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73942 entries, 0 to 78665
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              73942 non-null  float64
 1   Sex              73942 non-null  object 
 2   Race             73942 non-null  object 
 3   District         73942 non-null  object 
 4   Year             73942 non-null  int64  
 5   Month            73942 non-null  int64  
 6   DayOfWeek        73942 non-null  int64  
 7   Hour             73942 non-null  float64
 8   OffenseCategory  73942 non-null  object 
 9   Latitude         73942 non-null  float64
 10  Longitude        73942 non-null  float64
dtypes: float64(4), int64(3), object(4)
memory usage: 6.8+ MB
None


In [3]:
# Target: the offense category of each record
y = df['OffenseCategory']
# Features: every remaining column of the cleaned frame
X = df.drop(columns=['OffenseCategory'])

In [4]:
# One-hot encode the categorical feature columns
X = pd.get_dummies(data=X, columns=['Sex', 'Race', 'District'])

# Hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Balance the classes with random oversampling. Only the training split is
# resampled; the test split keeps its original class distribution.
# NOTE(review): the resampled rows are what GridSearchCV(cv=5) later splits,
# so copies of the same row can presumably end up on both sides of a fold and
# make the cross-validation scores optimistic -- confirm before relying on them.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Standardize the features: the scaler is fitted on the oversampled training
# data and then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train_ros)
X_train_scaled = scaler.transform(X_train_ros)
X_test_scaled = scaler.transform(X_test)

# Encode the string labels as integers; the encoder is fitted on the
# oversampled training labels and reused for the test labels.
label_encoder = LabelEncoder().fit(y_train_ros)
y_train_ros_encoded = label_encoder.transform(y_train_ros)
y_test_encoded = label_encoder.transform(y_test)

In [5]:
# RandomForest: grid search over the forest size and tree depth using 5-fold
# cross-validation on the oversampled, scaled training data.
print("GridSearchCV for RandomForest...")
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
rf_grid_search.fit(X_train_scaled, y_train_ros_encoded)
best_rf_model = rf_grid_search.best_estimator_
print(f"Best params for RandomForest: {rf_grid_search.best_params_}")

# Evaluate the selected model on the held-out test split
y_rf_pred = best_rf_model.predict(X_test_scaled)
print("RandomForest Classification Report:")
# zero_division=0 reports 0.00 for classes the model never predicts, as
# before, but without emitting an UndefinedMetricWarning for each metric.
print(classification_report(y_test_encoded, y_rf_pred, zero_division=0))
print("RandomForest Confusion Matrix:")
print(confusion_matrix(y_test_encoded, y_rf_pred))

GridSearchCV for RandomForest...
Best params for RandomForest: {'max_depth': None, 'n_estimators': 200}
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.10      0.17       128
           1       1.00      0.03      0.05        38
           2       0.00      0.00      0.00         2
           3       1.00      0.33      0.50         3
           4       0.00      0.00      0.00        17
           5       0.40      0.05      0.09        84
           6       0.56      0.83      0.67      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       1.00      0.12      0.22         8
          11       0.65      0.28      0.39       443
          12       0.00      0.00      0.00         7
          13       0.50      0.29      0.36         7
          14       1.00      0.13      0.24        15
          15       0.73      0.86      0.79     1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# The report shows that the RandomForest model performs poorly on many classes,
# especially the minority ones: their precision, recall and F1-score are very low
# (several are 0.00). The model struggles to predict these classes correctly,
# most likely because of the class imbalance in the data: only the training split
# was oversampled, while the test split keeps the original skewed distribution.

In [6]:
# XGBoost: grid search over the number of trees, tree depth and learning rate
# using 5-fold cross-validation on the oversampled, scaled training data.
print("GridSearchCV for XGBoost...")
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}
# `use_label_encoder` is no longer passed: the installed xgboost ignores it
# and warned "Parameters: { "use_label_encoder" } are not used." on each fit.
# The labels are already integer-encoded by `label_encoder`.
xgb_grid_search = GridSearchCV(
    XGBClassifier(eval_metric='mlogloss', random_state=42),
    xgb_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
xgb_grid_search.fit(X_train_scaled, y_train_ros_encoded)
best_xgb_model = xgb_grid_search.best_estimator_
print(f"Best params for XGBoost: {xgb_grid_search.best_params_}")

# Evaluate the selected model on the held-out test split
y_xgb_pred = best_xgb_model.predict(X_test_scaled)
print("XGBoost Classification Report:")
# zero_division=0 reports 0.00 for classes the model never predicts, as
# before, but without emitting an UndefinedMetricWarning for each metric.
print(classification_report(y_test_encoded, y_xgb_pred, zero_division=0))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test_encoded, y_xgb_pred))

GridSearchCV for XGBoost...


Parameters: { "use_label_encoder" } are not used.



Best params for XGBoost: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200}
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.12      0.19       128
           1       0.12      0.03      0.04        38
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.50      0.06      0.11        17
           5       0.33      0.08      0.13        84
           6       0.60      0.77      0.68      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         8
          11       0.50      0.47      0.48       443
          12       0.50      0.14      0.22         7
          13       0.29      0.29      0.29         7
          14       0.67      0.13      0.22        15
          15       0.76      0.77      0.77     11710
          16       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# KNN: grid search over the neighbourhood size and the vote weighting scheme
# using 5-fold cross-validation on the oversampled, scaled training data.
print("GridSearchCV for KNN...")
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
knn_grid_search = GridSearchCV(
    KNeighborsClassifier(),
    knn_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
knn_grid_search.fit(X_train_scaled, y_train_ros_encoded)
best_knn_model = knn_grid_search.best_estimator_
print(f"Best params for KNN: {knn_grid_search.best_params_}")

# Evaluate the selected model on the held-out test split
y_knn_pred = best_knn_model.predict(X_test_scaled)
print("KNN Classification Report:")
# zero_division=0 reports 0.00 for classes with no predicted or no true
# samples, as before, but without emitting an UndefinedMetricWarning each time.
print(classification_report(y_test_encoded, y_knn_pred, zero_division=0))
print("KNN Confusion Matrix:")
print(confusion_matrix(y_test_encoded, y_knn_pred))


GridSearchCV for KNN...
Best params for KNN: {'n_neighbors': 3, 'weights': 'distance'}
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.07      0.09      0.08       128
           1       0.05      0.05      0.05        38
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        17
           5       0.03      0.04      0.03        84
           6       0.51      0.60      0.55      3916
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00         0
          11       0.21      0.28      0.24       443
          12       0.00      0.00      0.00         7
          13       0.25      0.29      0.27         7
          14       0.00      0.00      0.00        15
          15       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
