# XGBoost

In [7]:
import os

import cupy as cp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [8]:
# Read the pre-split feature tables first (train, then test) ...
X_train, X_test = (
    pd.read_excel(path)
    for path in (
        '../data/processed/X_train_sin_outliers_raw.xlsx',
        '../data/processed/X_test_sin_outliers_raw.xlsx',
    )
)

# ... followed by the matching label tables, in the same order.
y_train, y_test = (
    pd.read_excel(path)
    for path in (
        '../data/processed/y_train.xlsx',
        '../data/processed/y_test.xlsx',
    )
)

In [None]:
### This (disabled) snippet is meant to avoid the CPU/GPU data-device mismatch error
# NOTE(review): XGBoost documents Booster.inplace_predict as taking array-like
# data (e.g. the CuPy array itself) rather than a DMatrix — confirm before
# re-enabling this snippet.

# # To avoid the warning, move X_test to the GPU and use inplace_predict:
# X_test_gpu = cp.array(X_test.values)         # Convert the data to a CuPy array
# dtest_gpu = xgb.DMatrix(X_test_gpu)            # Build a DMatrix from the GPU data
# booster = model_untrained.get_booster()        # Get the underlying booster
# y_pred_test_gpu = booster.inplace_predict(dtest_gpu)  # Predict on the GPU

# # Convert the predictions (if needed) and evaluate
# y_pred_test_gpu_bin = [1 if pred >= 0.5 else 0 for pred in y_pred_test_gpu]
# print('Test accuracy (inplace_predict): {0:0.4f}'.format(accuracy_score(y_test, y_pred_test_gpu_bin)))

In [9]:
# Baseline: XGBoost classifier with no hyper-parameter tuning, histogram tree
# method, fitted on the CUDA device with a fixed seed.
model_untrained = xgb.XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    device='cuda',
    seed=42,
)
model_untrained.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train, y_pred_test = (
    model_untrained.predict(features) for features in (X_train, X_test)
)

print('Train accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Test accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)))

Train accuracy score: 1.0000
Test accuracy score: 0.8961


In [None]:
# Deliberately small search space: seven hyper-parameters with two candidates
# each (2**7 = 128 combinations); every other entry is a single-value list and
# is therefore held fixed across the search.
param_grid = dict(
    objective=['binary:logistic'],
    eval_metric=['error'],
    learning_rate=[0.05, 0.1],       # trimmed list of options
    max_depth=[3, 5],                # trimmed to two values
    min_child_weight=[1, 3],         # trimmed to two options
    gamma=[0, 0.1],                  # fewer options
    subsample=[0.8, 1.0],            # trimmed to two options
    colsample_bytree=[0.8, 1.0],     # trimmed to two options
    reg_alpha=[0, 0.001],            # fewer options
    reg_lambda=[1],                  # fixed: 1 is usually a good starting point
    seed=[42],                       # for reproducibility
)

In [11]:
# 5-fold cross-validated grid search over `param_grid`, scored by accuracy.
# n_jobs=1 runs the candidate fits one at a time.
# NOTE(review): presumably chosen so parallel workers do not compete for the
# single GPU — confirm.
grid = GridSearchCV(model_untrained, param_grid, scoring='accuracy', cv=5, n_jobs=1)

grid.fit(X_train, y_train)

# Best hyper-parameter combination and its mean cross-validated accuracy.
print("Mejores parámetros:", grid.best_params_)
print("Mejor accuracy:", grid.best_score_)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# configuration refit on the full training set.
best_model = grid.best_estimator_
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

print('Train accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Test accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)))

Mejores parámetros: {'colsample_bytree': 0.8, 'eval_metric': 'error', 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'objective': 'binary:logistic', 'reg_alpha': 0.001, 'reg_lambda': 1, 'seed': 42, 'subsample': 0.8}
Mejor accuracy: 0.8957616953218711
Train accuracy score: 0.9593
Test accuracy score: 0.8831
