In [1]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import graphviz
import joblib
import optuna




In [2]:
X_train = pd.read_csv("../Data/Prueba_Ruxi/preprocessed_training_dataset.csv", index_col="respondent_id")
X_test = pd.read_csv("../Data/Prueba_Ruxi/preprocessed_test_dataset.csv", index_col="respondent_id")
y_train = pd.read_csv("../Data/Prueba_Ruxi/labels_train_dataset.csv", index_col="respondent_id")
y_test =  pd.read_csv("../Data/Prueba_Ruxi/labels_test_dataset.csv", index_col="respondent_id")

# AdaBoost

Si no se especifica el parámetro base_estimator, se utiliza DecisionTreeClassifier por defecto. En el caso de AdaBoost, es interesante tener clasificadores débiles que produzcan ajustes diversos a los datos. Se prueben otros clasificadores débiles, como LogisticRegression o 1NN (por ejemplo).

El learning rate controla cómo “sobreajustamos” a los datos que resultaron incorrectos en la iteración previa. Pruebe a jugar incrementando el learning rate con learning_rate frente al número de iteraciones n_estimators, intentando reducir al máximo el segundo argumento.

# Árbol de clasificación 

# Optimización con Bayes Search

In [41]:

# Crear el modelo base con class_weight='balanced'
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(class_weight='balanced'),  
    random_state=42
)

# Envolverlo en MultiOutputClassifier para predicción multietiqueta
multi_ada = MultiOutputClassifier(ada_model)

# Espacio de búsqueda de hiperparámetros
search_space = {
    'estimator__n_estimators': Integer(50, 300),  
    'estimator__learning_rate': Real(0.01, 1.0, prior='log-uniform'), 
    'estimator__estimator__max_depth': Integer(1, 10), 
    'estimator__estimator__min_samples_split': Integer(2, 20), 
    'estimator__estimator__min_samples_leaf': Integer(1, 10), 
    'estimator__algorithm': ['SAMME'], 
}

# Configurar BayesSearchCV
opt = BayesSearchCV(
    multi_ada,
    search_space,
    cv=5,
    n_iter=100,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1
)
opt.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros
print("Mejores hiperparámetros:", opt.best_params_)

Mejores hiperparámetros: OrderedDict([('estimator__algorithm', 'SAMME'), ('estimator__estimator__max_depth', 7), ('estimator__estimator__min_samples_leaf', 1), ('estimator__estimator__min_samples_split', 2), ('estimator__learning_rate', 0.06982974562429808), ('estimator__n_estimators', 162)])


In [38]:

# Guardar el mejor modelo optimizado
joblib.dump(opt.best_estimator_, "../Modelos/Prueba_Ruxi/AdaBoost_arbol_bayes.pkl")
print("Modelo guardado como 'AdaBoost_arbol_bayes.pkl'")

# Guardar resultados
results = pd.DataFrame(opt.cv_results_)
columns = [
    'mean_test_score',  
    'param_estimator__n_estimators',  
    'param_estimator__learning_rate', 
    'param_estimator__estimator__max_depth',
    'param_estimator__estimator__min_samples_split',
    'param_estimator__estimator__min_samples_leaf',
    'param_estimator__algorithm',  
]
results_table = results[columns].copy()
results_table.to_csv('../Results/Prueba_Ruxi/AdaBoost_arbol_bayes_results_table.csv', index=False)
results_table.sort_values(by=['mean_test_score'], ascending=False).head(5)


Modelo guardado como 'AdaBoost_arbol_bayes.pkl'


Unnamed: 0,mean_test_score,param_estimator__n_estimators,param_estimator__learning_rate,param_estimator__estimator__max_depth,param_estimator__estimator__min_samples_split,param_estimator__estimator__min_samples_leaf,param_estimator__algorithm
99,0.841315,162,0.06983,7,2,1,SAMME
51,0.841189,300,0.061375,7,20,10,SAMME
67,0.841177,274,0.072081,7,20,1,SAMME
58,0.841163,241,0.049956,7,20,10,SAMME
52,0.841133,300,0.072636,7,20,10,SAMME


In [40]:
# Evaluacion en train
best_model = joblib.load("../Modelos/Prueba_Ruxi/AdaBoost_arbol_bayes.pkl")
y_pred_proba = best_model.predict_proba(X_train)
y_pred = best_model.predict(X_train)

# Calcular ROC AUC para cada label
roc_auc_scores = [
    roc_auc_score(y_train.iloc[:, i], y_pred_proba[i][:, 1]) for i in range(y_train.shape[1])
]

# Calcular Accuracy para cada label
accuracy_scores = [
    accuracy_score(y_train.iloc[:, i], y_pred_proba[i].argmax(axis=1)) for i in range(y_train.shape[1])
]

print("Medidas de evaluación-----------------------------------------------")
print("ROC AUC Scores del mejor modelo:", roc_auc_scores)
print("Accuracy Scores del mejor modelo:", accuracy_scores, "\n")
print("Reporte de Clasificación:")
print(classification_report(y_train, y_pred, zero_division=0))

print("Probabilidades-----------------------------------------------------")
print("Primeras 5 probabilities para label 0:", y_pred_proba[0][:5, 1])
print("Primeras 5 probabilities para label 1:", y_pred_proba[1][:5, 1])

Medidas de evaluación-----------------------------------------------
ROC AUC Scores del mejor modelo: [0.8559092720649415, 0.8868424175543236]
Accuracy Scores del mejor modelo: [0.8000468055230517, 0.8026211092908964] 

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.52      0.75      0.61      4539
           1       0.78      0.80      0.79      9947

   micro avg       0.68      0.78      0.73     14486
   macro avg       0.65      0.77      0.70     14486
weighted avg       0.70      0.78      0.73     14486
 samples avg       0.37      0.39      0.37     14486

Probabilidades-----------------------------------------------------
Primeras 5 probabilities para label 0: [0.72667418 0.79763732 0.11920292 0.11920292 0.13474615]
Primeras 5 probabilities para label 1: [0.75551147 0.7739071  0.73269013 0.16065331 0.46835083]


# Optimización con Optuna

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier

def objective(trial):
    # Definir el espacio de búsqueda de hiperparámetros
    max_depth = trial.suggest_int('max_depth', 1, 10)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    algorithm = trial.suggest_categorical('algorithm', ['SAMME'])

    # Crear el modelo base con DecisionTreeClassifier
    base_estimator = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight='balanced',
    )

    # Crear el modelo de AdaBoostClassifier
    ada_model = AdaBoostClassifier(
        estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        algorithm=algorithm,
        random_state=42
    )

    # Envolver el modelo en MultiOutputClassifier para predicción multietiqueta
    multi_ada = MultiOutputClassifier(ada_model)

    # Realizar la validación cruzada para calcular el ROC AUC
    roc_auc_scores = cross_val_score(
        multi_ada, X_train, y_train, cv=5, scoring='roc_auc'
    )

    # Promediar los ROC AUC obtenidos durante la validación cruzada
    score = roc_auc_scores.mean()

    return score

# Crear un estudio de optimización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  


[I 2025-01-23 12:06:17,969] A new study created in memory with name: no-name-926d8e57-8a21-4a64-9f3b-91c140324ba6
[I 2025-01-23 12:06:43,583] Trial 0 finished with value: 0.8274721993415619 and parameters: {'max_depth': 3, 'n_estimators': 193, 'learning_rate': 0.1653638212177839, 'min_samples_split': 12, 'min_samples_leaf': 7, 'algorithm': 'SAMME'}. Best is trial 0 with value: 0.8274721993415619.
[I 2025-01-23 12:07:23,758] Trial 1 finished with value: 0.8196249132970154 and parameters: {'max_depth': 2, 'n_estimators': 275, 'learning_rate': 0.04838639067165953, 'min_samples_split': 9, 'min_samples_leaf': 2, 'algorithm': 'SAMME'}. Best is trial 0 with value: 0.8274721993415619.
[I 2025-01-23 12:08:21,360] Trial 2 finished with value: 0.8286982366072035 and parameters: {'max_depth': 4, 'n_estimators': 138, 'learning_rate': 0.01856062250549623, 'min_samples_split': 19, 'min_samples_leaf': 7, 'algorithm': 'SAMME'}. Best is trial 2 with value: 0.8286982366072035.
[I 2025-01-23 12:08:48,433]

In [7]:

# Entrenar el mejor modelo con los parámetros encontrados
best_params = study.best_params
best_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf']
    ),
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    algorithm=best_params['algorithm'],
    random_state=42
)
multi_best_model = MultiOutputClassifier(best_model)

# Ajustar el mejor modelo con los datos de entrenamiento
multi_best_model.fit(X_train, y_train)

# Guardar el mejor modelo
joblib.dump(multi_best_model, "../Modelos/Prueba_Ruxi/AdaBoost_arbol_optuna.pkl")
print("Mejor modelo guardado como 'AdaBoost_arbol_optuna.pkl'")

Mejor modelo guardado como 'AdaBoost_arbol_optuna.pkl'


In [8]:
print("Mejores parámetros encontrados:", study.best_params)

# Guardar resultados
trials_df = study.trials_dataframe()

columns = [
    'value',  
    'params_learning_rate',  
    'params_n_estimators', 
    'params_max_depth',
    'params_min_samples_leaf',
    'params_min_samples_split',
    'params_max_depth', 
    'params.algorithm'  
]

trials_df = trials_df[trials_df.columns.intersection(columns)]
trials_df.sort_values(by='value', ascending=False, inplace=True)

results_csv_path = '../Results/Prueba_Ruxi/AdaBoost_arbol_optuna_results_table.csv'
trials_df.to_csv(results_csv_path, index=False)
trials_df.head(5)

Mejores parámetros encontrados: {'max_depth': 7, 'n_estimators': 227, 'learning_rate': 0.12621186329750728, 'min_samples_split': 8, 'min_samples_leaf': 10, 'algorithm': 'SAMME'}


Unnamed: 0,value,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators
13,0.840304,0.126212,7,10,8,227
12,0.84023,0.083004,8,10,8,295
5,0.840037,0.152149,7,8,11,141
18,0.839856,0.098249,8,4,2,258
15,0.83955,0.043199,6,9,5,226


In [9]:
# Evaluacion en train
best_model = joblib.load("../Modelos/Prueba_Ruxi/AdaBoost_arbol_optuna.pkl")
y_pred_proba = best_model.predict_proba(X_train)
y_pred = best_model.predict(X_train)

# Calcular ROC AUC para cada label
roc_auc_scores = [
    roc_auc_score(y_train.iloc[:, i], y_pred_proba[i][:, 1]) for i in range(y_train.shape[1])
]

# Calcular Accuracy para cada label
accuracy_scores = [
    accuracy_score(y_train.iloc[:, i], y_pred_proba[i].argmax(axis=1)) for i in range(y_train.shape[1])
]

print("Medidas de evaluación-----------------------------------------------")
print("ROC AUC Scores del mejor modelo:", roc_auc_scores)
print("Accuracy Scores del mejor modelo:", accuracy_scores, "\n")
print("Reporte de Clasificación:")
print(classification_report(y_train, y_pred, zero_division=0))

print("Probabilidades-----------------------------------------------------")
print("Primeras 5 probabilities para label 0:", y_pred_proba[0][:5, 1])
print("Primeras 5 probabilities para label 1:", y_pred_proba[1][:5, 1])

Medidas de evaluación-----------------------------------------------
ROC AUC Scores del mejor modelo: [0.8851280829951715, 0.8920962041190001]
Accuracy Scores del mejor modelo: [0.861362040720805, 0.8085654107184648] 

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.76      0.51      0.61      4539
           1       0.80      0.78      0.79      9947

   micro avg       0.79      0.70      0.74     14486
   macro avg       0.78      0.65      0.70     14486
weighted avg       0.79      0.70      0.73     14486
 samples avg       0.37      0.35      0.35     14486

Probabilidades-----------------------------------------------------
Primeras 5 probabilities para label 0: [0.42505038 0.39107267 0.13239643 0.21232802 0.24617575]
Primeras 5 probabilities para label 1: [0.65670746 0.677444   0.65941646 0.25544606 0.43632201]


# Evaluación en test 

In [55]:
# Evaluacion en test
best_model = joblib.load("../Modelos/Prueba_Ruxi/AdaBoost_arbol_optuna.pkl")
y_pred_proba = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)

# Calcular ROC AUC para cada label
roc_auc_scores = [
    roc_auc_score(y_test.iloc[:, i], y_pred_proba[i][:, 1]) for i in range(y_test.shape[1])
]

# Calcular Accuracy para cada label
accuracy_scores = [
    accuracy_score(y_test.iloc[:, i], y_pred_proba[i].argmax(axis=1)) for i in range(y_test.shape[1])
]

print("Medidas de evaluación-----------------------------------------------")
print("ROC AUC Scores del mejor modelo:", roc_auc_scores)
print("Accuracy Scores del mejor modelo:", accuracy_scores, "\n")
print("Reporte de Clasificación:")
print(classification_report(y_test, y_pred, zero_division=0))

print("Probabilidades-----------------------------------------------------")
print("Primeras 5 probabilities para label 0:", y_pred_proba[0][:5, 1])
print("Primeras 5 probabilities para label 1:", y_pred_proba[1][:5, 1])

Medidas de evaluación-----------------------------------------------
ROC AUC Scores del mejor modelo: [0.8388333268760164, 0.8559335687262418]
Accuracy Scores del mejor modelo: [0.8399475851740921, 0.7834144515162861] 

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.68      0.47      0.56      1135
           1       0.77      0.76      0.76      2488

   micro avg       0.75      0.67      0.71      3623
   macro avg       0.72      0.62      0.66      3623
weighted avg       0.74      0.67      0.70      3623
 samples avg       0.35      0.34      0.34      3623

Probabilidades-----------------------------------------------------
Primeras 5 probabilities para label 0: [0.23851672 0.57967882 0.24948673 0.35552768 0.33252167]
Primeras 5 probabilities para label 1: [0.48358444 0.5822577  0.27803864 0.3685768  0.41425405]


# Submission test

In [21]:
# Cargar el dataset de prueba preprocesado
test_set_preprocessed = pd.read_csv("../Data/Prueba_Ruxi/preprocessed_submission_dataset.csv")

# Asegurarse de que respondent_id esté disponible
respondent_ids = test_set_preprocessed['respondent_id']

# Eliminar respondent_id del conjunto de características
X_test_final = test_set_preprocessed.drop(columns=['respondent_id'])

# Realizar predicciones de probabilidad
best_model = joblib.load("../Modelos/AdaBoost_optuna_best_model.pkl")
y_proba_test = best_model.predict_proba(X_test_final)

# Extraer las probabilidades para la clase positiva (1)
y_proba_h1n1_test = y_proba_test[0][:, 1]  # Probabilidades para H1N1
y_proba_seasonal_test = y_proba_test[1][:, 1]  # Probabilidades para vacuna estacional

# Crear el DataFrame de submission
submission = pd.DataFrame({
    "respondent_id": respondent_ids,
    "h1n1_vaccine": y_proba_h1n1_test,
    "seasonal_vaccine": y_proba_seasonal_test
})

# Guardar el archivo de submission
submission.to_csv("../Results/AdaBoost_arbol_optuna_submission.csv", index=False)
print("Archivo 'AdaBoost_arbol_optuna_submission.csv' generado correctamente.")

Archivo 'AdaBoost_optuna_SMME.R_submission.csv' generado correctamente.
