In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from xgboost import XGBClassifier
import optuna

# Cargar los datos
X = pd.read_csv("../../Data/preprocessed_training_simple_dataset.csv", index_col="respondent_id")
y = pd.read_csv("../../Data/training_set_labels.csv", index_col="respondent_id", usecols=["respondent_id", "h1n1_vaccine", "seasonal_vaccine"])

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [2]:
def objective(trial):
    # Hiperparámetros sugeridos por Optuna
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 2.0),
    }
    
    # Crear modelos para cada etiqueta
    model_h1n1 = XGBClassifier(**params, random_state=42)
    model_seasonal = XGBClassifier(**params, random_state=42)
    
    # Entrenar modelos
    model_h1n1.fit(X_train, y_train['h1n1_vaccine'])
    model_seasonal.fit(X_train, y_train['seasonal_vaccine'])
    
    # Predicciones de probabilidad
    y_proba_h1n1 = model_h1n1.predict_proba(X_test)[:, 1]
    y_proba_seasonal = model_seasonal.predict_proba(X_test)[:, 1]
    
    # Calcular AUROC para cada etiqueta
    roc_auc_h1n1 = roc_auc_score(y_test['h1n1_vaccine'], y_proba_h1n1)
    roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_proba_seasonal)
    
    # Retornar la media de las AUROCs como métrica objetivo
    return (roc_auc_h1n1 + roc_auc_seasonal) / 2

In [3]:
# Silenciar Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Configurar el estudio
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)

# Ejecutar la optimización
study.optimize(objective, n_trials=50)

# Mostrar los mejores hiperparámetros
print("Mejores hiperparámetros:", study.best_params)
print("Mejor puntuación AUROC promedio:", study.best_value)

Mejores hiperparámetros: {'learning_rate': 0.04212430307464364, 'max_depth': 10, 'n_estimators': 277, 'min_child_weight': 6, 'gamma': 4.282630483852609, 'subsample': 0.5937995709661643, 'colsample_bytree': 0.8692863672978095, 'scale_pos_weight': 1.0905098880402837}
Mejor puntuación AUROC promedio: 0.870771494689784


In [4]:
# Usar los mejores hiperparámetros para entrenar los modelos finales
best_params = study.best_params

# Modelos finales
final_model_h1n1 = XGBClassifier(**best_params, random_state=42)
final_model_seasonal = XGBClassifier(**best_params, random_state=42)

# Entrenar modelos
final_model_h1n1.fit(X_train, y_train['h1n1_vaccine'])
final_model_seasonal.fit(X_train, y_train['seasonal_vaccine'])

# Guardar modelos
import joblib
joblib.dump(final_model_h1n1, "Modelos/XGBoost_h1n1_model.pkl")
joblib.dump(final_model_seasonal, "Modelos/XGBoost_seasonal_model.pkl")

['Modelos/XGBoost_seasonal_model.pkl']

In [10]:
y_pred_h1n1 = final_model_h1n1.predict(X_test)
y_pred_seasonal = final_model_seasonal.predict(X_test)

# Predicciones de probabilidad en el conjunto de prueba
y_proba_h1n1 = final_model_h1n1.predict_proba(X_test)[:, 1]
y_proba_seasonal = final_model_seasonal.predict_proba(X_test)[:, 1]

# Calcular AUROC para cada etiqueta
roc_auc_h1n1 = roc_auc_score(y_test['h1n1_vaccine'], y_proba_h1n1)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_proba_seasonal)

print(f"AUROC para H1N1: {roc_auc_h1n1}")
print(f"AUROC para vacuna estacional: {roc_auc_seasonal}")

# Informe de clasificación
print("Reporte de Clasificación H1N1:")
print(classification_report(y_test['h1n1_vaccine'], y_pred_h1n1))

print("Reporte de Clasificación Seasonal:")
print(classification_report(y_test['seasonal_vaccine'], y_pred_seasonal))

AUROC para H1N1: 0.8776323078066868
AUROC para vacuna estacional: 0.8639106815728813
Reporte de Clasificación H1N1:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4207
           1       0.72      0.54      0.62      1135

    accuracy                           0.86      5342
   macro avg       0.80      0.74      0.77      5342
weighted avg       0.85      0.86      0.85      5342

Reporte de Clasificación Seasonal:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      2854
           1       0.77      0.77      0.77      2488

    accuracy                           0.79      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.79      0.79      0.79      5342



In [11]:
# Cargar el dataset de prueba preprocesado
test_set_preprocessed = pd.read_csv("../../Data/preprocessed_test_simple_dataset.csv")

# Asegurarse de que respondent_id esté disponible
respondent_ids = test_set_preprocessed['respondent_id']

# Eliminar respondent_id del conjunto de características
X_test_final = test_set_preprocessed.drop(columns=['respondent_id'])

# Predicciones de probabilidad para H1N1
y_proba_h1n1_test = final_model_h1n1.predict_proba(X_test_final)[:, 1]  # Probabilidades de la clase positiva (1)

# Predicciones de probabilidad para vacuna estacional
y_proba_seasonal_test = final_model_seasonal.predict_proba(X_test_final)[:, 1]  # Probabilidades de la clase positiva (1)

# Crear el DataFrame de submission
submission = pd.DataFrame({
    "respondent_id": respondent_ids,
    "h1n1_vaccine": y_proba_h1n1_test,
    "seasonal_vaccine": y_proba_seasonal_test
})

# Guardar el archivo de submission
submission.to_csv("Submissions/XGBoost_Optuna_submission.csv", index=False)
print("Archivo 'XGBoost_Optuna_submission.csv' generado correctamente.")

Archivo 'XGBoost_Optuna_submission.csv' generado correctamente.
