Ajuste Fino de Hiperparámetros con GridSearchCV y RandomForest

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Cargar el dataset
df = pd.read_csv('New_BPD.csv')

# Limpieza de datos
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Selección de características y etiquetas
X = df.drop('OffenseCategory', axis=1)
y = df['OffenseCategory']

# Identificar características numéricas y categóricas
numeric_features = ['Age', 'Year', 'Month', 'DayOfWeek', 'Hour', 'Latitude', 'Longitude']
categorical_features = ['Sex', 'Race', 'District']

# Crear el preprocesador con escalamiento y codificación
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Crear el pipeline con preprocesamiento y RandomForest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ajustar los hiperparámetros usando GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 150, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Mejores parámetros: {grid_search.best_params_}")

# Evaluar el modelo con el mejor ajuste
y_pred = grid_search.predict(X_test)
print("Random Forest con ajuste fino:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))





Mejores parámetros: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Random Forest con ajuste fino:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

          Administrative       0.67      0.06      0.11       128
      Alcohol Violations       1.00      0.03      0.05        38
           Drug Offenses       0.59      0.79      0.67      3916
Environmental Violations       0.00      0.00      0.00         3
                   Fraud       0.00      0.00      0.00         8
          Investigations       0.71      0.23      0.34       443
  Obstruction of Justice       1.00      0.07      0.12        15
                   Other       0.71      0.90      0.79     11750
         Property Crimes       0.55      0.14      0.22      2061
     Public Order Crimes       0.57      0.12      0.20       265
                 Robbery       0.68      0.08      0.14       386
              Sex Crimes       0.54      0.40      0.46       200
      Traffic Violations       0.00      0.00      0.00        12
          Violent Crimes       0.45      0.15      0.23      2853
      Wea

XGBoost

In [4]:
import xgboost as xgb
from imblearn.pipeline import Pipeline as ImbPipeline

# Crear el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Crear el pipeline con XGBoost
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42, eval_metric='mlogloss'))
])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ajustar los hiperparámetros usando GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Mejores parámetros: {grid_search.best_params_}")

# Evaluar el modelo con el mejor ajuste
y_pred = grid_search.predict(X_test)
print("XGBoost:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))




ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\TONAC\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14], got ['Administrative' 'Alcohol Violations' 'Drug Offenses'
 'Environmental Violations' 'Fraud' 'Investigations'
 'Obstruction of Justice' 'Other' 'Property Crimes' 'Public Order Crimes'
 'Robbery' 'Sex Crimes' 'Traffic Violations' 'Violent Crimes'
 'Weapons Violations']


Combinar Modelos

In [8]:
from sklearn.ensemble import VotingClassifier

# Crear los modelos
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')

# Crear el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Crear el ensemble de modelos
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('xgb', xgb_model)
], voting='soft')

# Crear el pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', voting_clf)
])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ajustar y evaluar el modelo
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Ensemble de modelos:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Ensemble de modelos:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

          Administrative       0.88      0.05      0.10       128
      Alcohol Violations       1.00      0.03      0.05        38
           Drug Offenses       0.57      0.81      0.67      3916
Environmental Violations       0.00      0.00      0.00         3
                   Fraud       0.00      0.00      0.00         8
          Investigations       0.62      0.18      0.28       443
  Obstruction of Justice       1.00      0.13      0.24        15
                   Other       0.71      0.91      0.79     11750
         Property Crimes       0.64      0.13      0.22      2061
     Public Order Crimes       0.57      0.09      0.16       265
                 Robbery       0.48      0.03      0.05       386
              Sex Crimes       0.62      0.43      0.51       200
      Traffic Violations       0.00      0.00      0.00        12
          Violent Crimes       0.45      0.12      0.20      2853
      Wea

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Reducción de Dimensionalidad con PCA

In [10]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Crear el preprocesador con PCA
preprocessor = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )),
    ('pca', PCA(n_components=0.95))  # Mantener el 95% de la varianza
])

# Crear el pipeline con PCA y RandomForest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ajustar y evaluar el modelo
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Random Forest con PCA:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))



Random Forest con PCA:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

          Administrative       0.60      0.02      0.05       128
      Alcohol Violations       1.00      0.03      0.05        38
           Drug Offenses       0.54      0.76      0.63      3916
Environmental Violations       0.00      0.00      0.00         3
                   Fraud       0.00      0.00      0.00         8
          Investigations       0.53      0.12      0.20       443
  Obstruction of Justice       0.00      0.00      0.00        15
                   Other       0.69      0.89      0.78     11750
         Property Crimes       0.43      0.06      0.11      2061
     Public Order Crimes       0.58      0.08      0.15       265
                 Robbery       0.70      0.04      0.08       386
              Sex Crimes       0.37      0.16      0.22       200
      Traffic Violations       0.00      0.00      0.00        12
          Violent Crimes       0.37      0.10      0.16      2853
      Wea

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
