# XGBoost

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [2]:
# Load the pre-split training data (features, then target) from the processed folder.
X_train, y_train = (
    pd.read_excel('../data/processed/X_train_sin_outliers_raw.xlsx'),
    pd.read_excel('../data/processed/y_train.xlsx'),
)

# Load the matching held-out test split.
X_test, y_test = (
    pd.read_excel('../data/processed/X_test_sin_outliers_raw.xlsx'),
    pd.read_excel('../data/processed/y_test.xlsx'),
)

In [3]:
# Wrap features and labels in DMatrix objects, the container consumed by the
# native XGBoost API calls used later in this notebook (xgb.cv / xgb.train).
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [4]:
# Baseline: an XGBClassifier with default hyperparameters (no tuning), used as
# the reference point for the grid search further down.
# `random_state` is the documented scikit-learn-style seed argument of the
# wrapper; `seed` is only forwarded to the booster through **kwargs.
model_untrained = xgb.XGBClassifier(random_state=42)
model_untrained.fit(X_train, y_train)

# Score on both splits so the train/test gap (overfitting) is visible.
y_pred_train = model_untrained.predict(X_train)
y_pred_test = model_untrained.predict(X_test)
print('Train accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Test accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_test)))


Train accuracy score: 1.0000
Test accuracy score: 0.8896


In [5]:
import itertools
import numpy as np

# Hyperparameter search space: each key is an XGBoost parameter name and each
# value is the list of candidates to try for it.
param_grid = {
    'objective': ['binary:logistic'],
    'eval_metric': ['error'],
    # Reduced set of candidates for each tuned parameter (two values each).
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.001],
    # Fixed to a single value, since 1 is usually a good starting point.
    'reg_lambda': [1],
    # Fixed seed for reproducibility.
    'seed': [42],
}

# Parallel lists of parameter names and their candidate lists, in dict order,
# ready to be expanded into every combination.
keys = list(param_grid)
values_list = list(param_grid.values())

In [6]:
# Exhaustive grid search: every combination in `values_list` is scored with
# 3-fold stratified cross-validation and the most accurate one is kept.
best_accuracy = -np.inf
best_params = None
best_num_round = 0

for combination in itertools.product(*values_list):
    # Build the parameter dict for the current combination.
    params = dict(zip(keys, combination))
    # Run on the GPU: 'hist' tree method on the 'cuda' device.
    # NOTE(review): presumably needs a CUDA-enabled XGBoost build — confirm.
    params['tree_method'] = 'hist'
    params['device'] = 'cuda'

    # Cross-validate (3 folds) on classification error, with early stopping
    # to avoid over-training.
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=300,
        nfold=3,
        stratified=True,
        metrics='error',
        early_stopping_rounds=100,
        verbose_eval=False
    )

    # Take the round with the lowest mean validation error instead of the last
    # row: the history is only cut back to the best iteration when early
    # stopping actually fires, so after a full 300-round run the last row can
    # be worse than the best one. argmin returns the first minimum, i.e. the
    # earliest round that reaches the best error.
    test_error = cv_results['test-error-mean']
    best_round_idx = int(test_error.to_numpy().argmin())
    mean_error = test_error.iloc[best_round_idx]

    # Accuracy is derived from the error: accuracy = 1 - error.
    mean_accuracy = 1 - mean_error
    print("Parámetros evaluados:", params, "-> Accuracy:", mean_accuracy)

    # Keep this combination if it beats the best one seen so far.
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_params = params.copy()
        # Rounds are 1-based counts, row positions are 0-based.
        best_num_round = best_round_idx + 1

print("\nMejor Accuracy:", best_accuracy)
print("Mejores hiperparámetros:", best_params)
print("Mejor número de boosting rounds:", best_num_round)

Parámetros evaluados: {'objective': 'binary:logistic', 'eval_metric': 'error', 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'gamma': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 1, 'seed': 42, 'tree_method': 'hist', 'device': 'cuda'} -> Accuracy: 0.8892475689462777
Parámetros evaluados: {'objective': 'binary:logistic', 'eval_metric': 'error', 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'gamma': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.001, 'reg_lambda': 1, 'seed': 42, 'tree_method': 'hist', 'device': 'cuda'} -> Accuracy: 0.8892475689462777
Parámetros evaluados: {'objective': 'binary:logistic', 'eval_metric': 'error', 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'gamma': 0, 'subsample': 0.8, 'colsample_bytree': 1.0, 'reg_alpha': 0, 'reg_lambda': 1, 'seed': 42, 'tree_method': 'hist', 'device': 'cuda'} -> Accuracy: 0.8892555396142197
Parámetros evaluados: {'objective': 'binary:logistic',

In [7]:
# Train the final booster on the training matrix with the hyperparameters and
# number of boosting rounds selected by the grid search.
final_model = xgb.train(best_params, dtrain, num_boost_round=best_num_round)

In [8]:
import numpy as np
from sklearn.metrics import accuracy_score

# Predict on the test matrix; the output is the positive-class probability.
preds = final_model.predict(dtest)

# Turn probabilities into hard 0/1 labels using a 0.5 threshold.
pred_labels = (preds > 0.5).astype(int)

# Report accuracy on the test set.
accuracy = accuracy_score(y_test, pred_labels)
print("Accuracy del modelo final:", accuracy)

Accuracy del modelo final: 0.8896103896103896
