# Notebook to Implement Model Training - LGBM

---

### 1) Setup

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import balanced_accuracy_score, make_scorer, f1_score, recall_score,precision_score
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, cross_validate

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed passed as random_state to the models, CV splitter and Boruta below
SEED = 42

# CSVs of the 'onlynormalized' dataset, resolved against the working directory
TRAIN_CLINICAL_FILENAME = 'train_set_clinical.csv'
TEST_CLINICAL_FILENAME = 'test_set_clinical.csv'

# CSVs of the 'artigo' dataset, stored under an absolute Windows path
# NOTE(review): these paths are machine-specific; confirm they exist before running
TRAIN_CLINICAL_ARTIGO_FILENAME = r'E:\work\MAC5832_prognostico_covid\train_set_clinical.csv'
TEST_CLINICAL_ARTIGO_FILENAME = r'E:\work\MAC5832_prognostico_covid\test_set_clinical.csv'

---

### 2) Read and Preprocess Data

In [3]:
def _load_clinical_split(csv_path):
    """Read one ';'-separated clinical CSV and index it by its "ID" column."""
    return pd.read_csv(csv_path, sep=";", index_col="ID")


# dataset name -> {'train': DataFrame, 'test': DataFrame}
datasets = {
    name: {
        'train': _load_clinical_split(train_csv),
        'test': _load_clinical_split(test_csv),
    }
    for name, (train_csv, test_csv) in (
        ('onlynormalized', (TRAIN_CLINICAL_FILENAME, TEST_CLINICAL_FILENAME)),
        ('artigo', (TRAIN_CLINICAL_ARTIGO_FILENAME, TEST_CLINICAL_ARTIGO_FILENAME)),
    )
}


In [4]:
##### Preprocessing all datasets
def _preprocess_clinical(frame):
    """Return a cleaned copy of one clinical split.

    Rows containing any NaN are dropped and the "Sex" column is encoded as
    integers (Female "F" -> 1, anything else -> 0).

    A "Sex" column that is already numeric is returned unchanged. Without this
    guard, running the cell a second time would compare the encoded integers
    with "F" and silently turn every value into 0.
    """
    cleaned = frame.dropna()
    if pd.api.types.is_numeric_dtype(cleaned["Sex"]):
        return cleaned
    # assign() builds a new frame, so the caller's frame is never mutated
    return cleaned.assign(Sex=np.where(cleaned["Sex"] == "F", 1, 0))


for d_key in datasets:
    for d_type in datasets[d_key]:
        # Rebind the entry instead of mutating in place: the cell is idempotent
        datasets[d_key][d_type] = _preprocess_clinical(datasets[d_key][d_type])

---

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

def evaluate_test(groundtruth, predicted, print_result=True):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    groundtruth : array-like
        True labels.
    predicted : array-like
        Labels predicted by the model.
    print_result : bool, default True
        When true, print the four metrics with four decimal places.

    Returns
    -------
    tuple
        ``(accuracy, specificity, sensitivity)``. Balanced accuracy is
        computed and printed, but it is not part of the returned tuple.
    """
    bal_accuracy = balanced_accuracy_score(groundtruth, predicted)
    accuracy = accuracy_score(groundtruth, predicted)

    # The flattened confusion matrix must have exactly four cells (two classes)
    tn, fp, fn, tp = confusion_matrix(groundtruth, predicted).ravel()
    specificity, sensitivity = tn / (tn + fp), tp / (tp + fn)

    if print_result:
        report = (
            "\n [test:]",
            f'Balanced accuracy: {bal_accuracy:.4f}',
            f'Accuracy: {accuracy:.4f}',
            f'Specificity:  {specificity:.4f}',
            f'Sensitivity:  {sensitivity:.4f}',
        )
        print("\n".join(report))
    return (accuracy, specificity, sensitivity)

# Get Features and Target
def getFeaturesTargets(dataset_name):
    """Split the train and test frames of one dataset into features and target.

    ``dataset_name`` is looked up in the module-level ``datasets`` mapping.
    The target is the "Group" column; the features are all other columns.

    Returns
    -------
    tuple
        ``(X, y, X_test, y_test)``.
    """
    splits = datasets[dataset_name]
    train_frame, test_frame = splits['train'], splits['test']
    return (
        train_frame.drop(columns="Group"),
        train_frame["Group"],
        test_frame.drop(columns="Group"),
        test_frame["Group"],
    )

### 3) Baseline Model Training and CV

In [6]:
# Dataset evaluated in this cell
dataset_name = 'onlynormalized'

# Untuned LightGBM classifier: only the random seed is set
clf = lgb.LGBMClassifier(random_state=SEED)

# Training features/target and test features/target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# 5-fold cross-validation repeated 10 times
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

# Scorer built from balanced accuracy
metric_scorer = make_scorer(balanced_accuracy_score)

# Empty grid: the search evaluates a single candidate
parameters = {}

# GridSearchCV is used instead of cross_val_score / cross_validate because the
# fitted search also provides best_estimator_, which is scored on the test set
search = GridSearchCV(
    clf,
    parameters,
    scoring=metric_scorer,
    cv=rkf,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)
search.fit(X, y)

print(f"Dataset: {dataset_name}")
print(f"Balanced accuracy mean_train: {search.cv_results_['mean_train_score'][0]}, mean_val: {search.cv_results_['mean_test_score'][0]} ")

# Score the refitted estimator on the test split
predicted = search.best_estimator_.predict(X_test)
test_score = evaluate_test(y_test, predicted)



Fitting 50 folds for each of 1 candidates, totalling 50 fits
Dataset: onlynormalized
Balanced accuracy mean_train: 1.0, mean_val: 0.7703995514289632 

 [test:]
Balanced accuracy: 0.4578
Accuracy: 0.5156
Specificity:  0.6429
Sensitivity:  0.2727


In [48]:
# Dataset evaluated in this cell
dataset_name = 'artigo'

# Untuned LightGBM classifier: only the random seed is set
clf = lgb.LGBMClassifier(random_state=SEED)

# Training features/target and test features/target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# 5-fold cross-validation repeated 10 times
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

# Scorer built from balanced accuracy
metric_scorer = make_scorer(balanced_accuracy_score)

# Empty grid: the search evaluates a single candidate
parameters = {}

# GridSearchCV is used instead of cross_val_score / cross_validate because the
# fitted search also provides best_estimator_, which is scored on the test set
search = GridSearchCV(
    clf,
    parameters,
    scoring=metric_scorer,
    cv=rkf,
    n_jobs=-1,
    verbose=4,
    return_train_score=True,
)
search.fit(X, y)

print(f"Dataset: {dataset_name}")
print(f"Balanced accuracy mean_train: {search.cv_results_['mean_train_score'][0]}, mean_val: {search.cv_results_['mean_test_score'][0]} ")

# Score the refitted estimator on the test split
predicted = search.best_estimator_.predict(X_test)
test_score = evaluate_test(y_test, predicted)



Fitting 50 folds for each of 1 candidates, totalling 50 fits
Dataset: artigo
Balanced accuracy mean_train: 1.0, mean_val: 0.778942822657219 

 [test:]
Balanced accuracy: 0.5390
Accuracy: 0.5938
Specificity:  0.7143
Sensitivity:  0.3636


---

### 4) Experiments

##### 4.1) Hyper Parameter Optimization with Optuna

In [8]:
# Dataset used by the objective functions defined below in this cell
dataset_name = 'onlynormalized'

# Training features/target and test features/target; objective() and
# detailed_objective() read these names as globals
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

def _suggest_lgbm_params(trial):
    """Build one LightGBM parameter dict from ``trial``.

    Four settings are fixed (objective, metric, verbosity, boosting_type) and
    eight values are requested from the trial, always in the same order.
    """
    return {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # This value feeds n_estimators but is registered with the trial under
        # the name "max_depth"; the label is kept so the name recorded with
        # each trial does not change.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000),
    }


# Objective maximized by the Optuna study
def objective(trial):
    """Return the mean cross-validated score of one sampled LightGBM model.

    Reads the notebook globals ``X``, ``y``, ``metric_scorer`` and ``rkf``.
    """
    sampled = _suggest_lgbm_params(trial)
    model = lgb.LGBMClassifier(random_state=SEED, **sampled)
    fold_scores = cross_val_score(model, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: extra metrics for the best trial
def detailed_objective(trial):
    """Fit the configuration described by ``trial`` and score it on the test set.

    Prints the parameters and the test metrics, then returns
    ``(balanced_accuracy, accuracy, specificity, sensitivity)``. Reads the
    notebook globals ``X``, ``y``, ``X_test`` and ``y_test``.
    """
    param = _suggest_lgbm_params(trial)

    # Show the configuration being evaluated
    for name, value in param.items():
        print(f"    {name}: {value}")

    model = lgb.LGBMClassifier(random_state=SEED, **param)
    model.fit(X, y)
    predicted = model.predict(X_test)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # The flattened confusion matrix must have exactly four cells (two classes)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)



In [10]:
# The study is persisted in a local SQLite file; load_if_exists=True re-opens an
# existing study of the same name so new trials are appended to it.
# Any other Optuna-supported database URL can be used here; read credentials
# from the environment instead of writing them into the notebook.
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")
study = optuna.create_study(
    study_name="hyperparameters-optimization-dataset-onlynormalized",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=24)
# Release the database connections through the storage object created above,
# rather than reaching into the study's private attributes.
storage.engine.dispose()

[32m[I 2022-06-19 15:51:44,976][0m Using an existing study with name 'hyperparameters-optimization-dataset-onlynormalized' instead of creating a new one.[0m
[32m[I 2022-06-19 15:51:51,800][0m Trial 477 finished with value: 0.8426439402101167 and parameters: {'lambda_l1': 1.6947649124215238e-05, 'lambda_l2': 1.7727160333021608e-06, 'num_leaves': 224, 'feature_fraction': 0.5317283124140778, 'bagging_fraction': 0.9157726859779072, 'bagging_freq': 3, 'min_child_samples': 36, 'max_depth': 769}. Best is trial 284 with value: 0.8513628685126363.[0m
[32m[I 2022-06-19 15:51:58,408][0m Trial 478 finished with value: 0.8244992943932572 and parameters: {'lambda_l1': 7.769379430697606e-05, 'lambda_l2': 9.025557313955381e-05, 'num_leaves': 245, 'feature_fraction': 0.5537686680283244, 'bagging_fraction': 0.8744268054219735, 'bagging_freq': 3, 'min_child_samples': 40, 'max_depth': 715}. Best is trial 284 with value: 0.8513628685126363.[0m
[32m[I 2022-06-19 15:52:04,764][0m Trial 479 finishe

In [11]:
# Best trial of the study according to the optimized metric
trial = study.best_trial
# Objective value that trial reached during the search
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Fit that trial's configuration on the training data and report test metrics
detailed_objective(trial)

Best trial Score from HP Opt: 0.8514

    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 2.1215664393174326e-05
    lambda_l2: 8.364884776331308e-06
    num_leaves: 236
    feature_fraction: 0.5374514135397644
    bagging_fraction: 0.9175306516826528
    bagging_freq: 3
    min_child_samples: 35
    n_estimators: 721
[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636


(0.5151515151515151, 0.5625, 0.6666666666666666, 0.36363636363636365)

In [8]:
# Dataset used by the objective functions defined below in this cell
dataset_name = 'artigo'

# Training features/target and test features/target; objective() and
# detailed_objective() read these names as globals
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

def _suggest_lgbm_params(trial):
    """Build one LightGBM parameter dict from ``trial``.

    Four settings are fixed (objective, metric, verbosity, boosting_type) and
    eight values are requested from the trial, always in the same order.
    """
    return {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # This value feeds n_estimators but is registered with the trial under
        # the name "max_depth"; the label is kept so the name recorded with
        # each trial does not change.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000),
    }


# Objective maximized by the Optuna study
def objective(trial):
    """Return the mean cross-validated score of one sampled LightGBM model.

    Reads the notebook globals ``X``, ``y``, ``metric_scorer`` and ``rkf``.
    """
    sampled = _suggest_lgbm_params(trial)
    model = lgb.LGBMClassifier(random_state=SEED, **sampled)
    fold_scores = cross_val_score(model, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: extra metrics for the best trial
def detailed_objective(trial):
    """Fit the configuration described by ``trial`` and score it on the test set.

    Prints the parameters and the test metrics, then returns
    ``(balanced_accuracy, accuracy, specificity, sensitivity)``. Reads the
    notebook globals ``X``, ``y``, ``X_test`` and ``y_test``.
    """
    param = _suggest_lgbm_params(trial)

    # Show the configuration being evaluated
    for name, value in param.items():
        print(f"    {name}: {value}")

    model = lgb.LGBMClassifier(random_state=SEED, **param)
    model.fit(X, y)
    predicted = model.predict(X_test)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # The flattened confusion matrix must have exactly four cells (two classes)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)



In [10]:
# The storage is created explicitly so its engine can be released afterwards
# without reaching into the study's private attributes.
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")
study = optuna.create_study(
    study_name="hyperparameters-optimization-dataset-artigo",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=47)
# Release the database connections held by the storage engine
storage.engine.dispose()

[32m[I 2022-06-19 17:06:11,982][0m Using an existing study with name 'hyperparameters-optimization-dataset-artigo' instead of creating a new one.[0m
[32m[I 2022-06-19 17:06:18,116][0m Trial 464 finished with value: 0.828176264052425 and parameters: {'lambda_l1': 1.2735183239886915e-07, 'lambda_l2': 1.5782761557349205e-05, 'num_leaves': 245, 'feature_fraction': 0.8863833140162415, 'bagging_fraction': 0.42540645052818044, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 594}. Best is trial 434 with value: 0.8421816732459148.[0m
[32m[I 2022-06-19 17:06:24,419][0m Trial 465 finished with value: 0.831773483611332 and parameters: {'lambda_l1': 5.216964389261414e-08, 'lambda_l2': 2.754247860055373e-05, 'num_leaves': 237, 'feature_fraction': 0.9194655769227538, 'bagging_fraction': 0.41093934359270445, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 940}. Best is trial 434 with value: 0.8421816732459148.[0m
[32m[I 2022-06-19 17:06:32,562][0m Trial 466 finished with v

In [11]:
# Best trial of the study according to the optimized metric
trial = study.best_trial
# Objective value that trial reached during the search
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Fit that trial's configuration on the training data and report test metrics
detailed_objective(trial)

Best trial Score from HP Opt: 0.8422

    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 1.0904157458907538e-07
    lambda_l2: 3.6012185136024024e-05
    num_leaves: 238
    feature_fraction: 0.8891389078723603
    bagging_fraction: 0.43967936337751173
    bagging_freq: 1
    min_child_samples: 17
    n_estimators: 955
[TEST]
balanced_accuracy: 0.5758, accuracy: 0.6562, specificity: 0.8333, sensitivity: 0.3182


(0.5757575757575758, 0.65625, 0.8333333333333334, 0.3181818181818182)

### Resultados

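- Aumento de performance *considerável* na validação cruzada em relação ao baseline do LGBM (acurácia balanceada média de ~0,77–0,78 para ~0,84–0,85); no conjunto de teste o ganho foi mais modesto (0,4578 → 0,5152 e 0,5390 → 0,5758).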

**dataset apenas dado normalizado**

De:
```
Balanced accuracy mean_train: 1.0, mean_val: 0.7703995514289632 

 [test:]
Balanced accuracy: 0.4578
Accuracy: 0.5156
Specificity:  0.6429
Sensitivity:  0.2727
```
Para:
```
Best trial Score from HP Opt: 0.8514
[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636
```


**dataset do artigo**

De: 
```
Balanced accuracy mean_train: 1.0, mean_val: 0.778942822657219 

 [test:]
Balanced accuracy: 0.5390
Accuracy: 0.5938
Specificity:  0.7143
Sensitivity:  0.3636
```


Para:
```
Best trial Score from HP Opt: 0.8422

[TEST]
balanced_accuracy: 0.5758, accuracy: 0.6562, specificity: 0.8333, sensitivity: 0.3182
```


##### 4.3) Features Scaler + Hyper Parameter Optimization with Optuna

In [7]:
# Dataset used by the objective functions defined below in this cell
dataset_name = 'onlynormalized'

# Training features/target and test features/target; objective() and
# detailed_objective() read these names as globals
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

def _suggest_scaler(trial):
    """Ask ``trial`` for a scaler name and return a new instance of that scaler."""
    choice = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Any value other than "minmax" / "standard" selects RobustScaler
    scaler_cls = {"minmax": MinMaxScaler, "standard": StandardScaler}.get(choice, RobustScaler)
    return scaler_cls()


def _suggest_lgbm_params(trial):
    """Build one LightGBM parameter dict from ``trial``.

    Four settings are fixed (objective, metric, verbosity, boosting_type) and
    eight values are requested from the trial, always in the same order.
    """
    return {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # This value feeds n_estimators but is registered with the trial under
        # the name "max_depth"; the label is kept so the name recorded with
        # each trial does not change.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000),
    }


# Objective function maximized by the Optuna study
def objective(trial):
    """Return the mean cross-validated score of one scaler + LightGBM pipeline.

    Reads the notebook globals ``X``, ``y``, ``metric_scorer`` and ``rkf``.
    """
    # The scaler is requested before the LightGBM values
    scaler = _suggest_scaler(trial)
    param = _suggest_lgbm_params(trial)
    pipeline = Pipeline([("scaler", scaler),
                         ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    fold_scores = cross_val_score(pipeline, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: extra metrics for the best trial
def detailed_objective(trial):
    """Fit the pipeline described by ``trial`` and score it on the test set.

    Prints the scaler, the parameters and the test metrics, then returns
    ``(balanced_accuracy, accuracy, specificity, sensitivity)``. Reads the
    notebook globals ``X``, ``y``, ``X_test`` and ``y_test``.
    """
    scaler = _suggest_scaler(trial)
    param = _suggest_lgbm_params(trial)
    pipeline = Pipeline([("scaler", scaler),
                         ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # Show the configuration being evaluated
    print(f"    scaler: {scaler}")
    for name, value in param.items():
        print(f"    {name}: {value}")

    pipeline.fit(X, y)
    predicted = pipeline.predict(X_test)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # The flattened confusion matrix must have exactly four cells (two classes)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

In [13]:
# The storage is created explicitly so its engine can be released afterwards
# without reaching into the study's private attributes.
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")
study = optuna.create_study(
    study_name="featuresscaler-dataset-onlynormalized",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=94)
# Release the database connections held by the storage engine
storage.engine.dispose()

[32m[I 2022-06-19 22:14:28,572][0m Using an existing study with name 'featuresscaler-dataset-onlynormalized' instead of creating a new one.[0m
[32m[I 2022-06-19 22:14:43,105][0m Trial 417 finished with value: 0.8277601934099613 and parameters: {'scalers': 'robust', 'lambda_l1': 0.011195539775695448, 'lambda_l2': 4.464338314561773e-08, 'num_leaves': 7, 'feature_fraction': 0.8392039098759433, 'bagging_fraction': 0.6150426341272669, 'bagging_freq': 2, 'min_child_samples': 23, 'max_depth': 755}. Best is trial 335 with value: 0.8467695101166463.[0m
[32m[I 2022-06-19 22:14:50,624][0m Trial 418 finished with value: 0.8313084666451539 and parameters: {'scalers': 'robust', 'lambda_l1': 0.0015538359120800304, 'lambda_l2': 1.293947068931389e-07, 'num_leaves': 2, 'feature_fraction': 0.6605514487360987, 'bagging_fraction': 0.5887862408159592, 'bagging_freq': 2, 'min_child_samples': 24, 'max_depth': 718}. Best is trial 335 with value: 0.8467695101166463.[0m
[32m[I 2022-06-19 22:15:01,922]

In [14]:
# Best trial of the study according to the optimized metric
trial = study.best_trial
# Objective value that trial reached during the search
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Fit that trial's pipeline on the training data and report test metrics
detailed_objective(trial)

Best trial Score from HP Opt: 0.8468

    scaler: RobustScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 0.0011860769735769989
    lambda_l2: 1.598081004309406e-07
    num_leaves: 2
    feature_fraction: 0.8021333697259749
    bagging_fraction: 0.5925999330747219
    bagging_freq: 2
    min_child_samples: 24
    n_estimators: 746
[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636


(0.5151515151515151, 0.5625, 0.6666666666666666, 0.36363636363636365)

In [7]:
# Dataset used by the objective functions defined below in this cell
dataset_name = 'artigo'

# Training features/target and test features/target; objective() and
# detailed_objective() read these names as globals
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

def _suggest_scaler(trial):
    """Ask ``trial`` for a scaler name and return a new instance of that scaler."""
    choice = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Any value other than "minmax" / "standard" selects RobustScaler
    scaler_cls = {"minmax": MinMaxScaler, "standard": StandardScaler}.get(choice, RobustScaler)
    return scaler_cls()


def _suggest_lgbm_params(trial):
    """Build one LightGBM parameter dict from ``trial``.

    Four settings are fixed (objective, metric, verbosity, boosting_type) and
    eight values are requested from the trial, always in the same order.
    """
    return {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # This value feeds n_estimators but is registered with the trial under
        # the name "max_depth"; the label is kept so the name recorded with
        # each trial does not change.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000),
    }


# Objective function maximized by the Optuna study
def objective(trial):
    """Return the mean cross-validated score of one scaler + LightGBM pipeline.

    Reads the notebook globals ``X``, ``y``, ``metric_scorer`` and ``rkf``.
    """
    # The scaler is requested before the LightGBM values
    scaler = _suggest_scaler(trial)
    param = _suggest_lgbm_params(trial)
    pipeline = Pipeline([("scaler", scaler),
                         ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    fold_scores = cross_val_score(pipeline, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: extra metrics for the best trial
def detailed_objective(trial):
    """Fit the pipeline described by ``trial`` and score it on the test set.

    Prints the scaler, the parameters and the test metrics, then returns
    ``(balanced_accuracy, accuracy, specificity, sensitivity)``. Reads the
    notebook globals ``X``, ``y``, ``X_test`` and ``y_test``.
    """
    scaler = _suggest_scaler(trial)
    param = _suggest_lgbm_params(trial)
    pipeline = Pipeline([("scaler", scaler),
                         ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # Show the configuration being evaluated
    print(f"    scaler: {scaler}")
    for name, value in param.items():
        print(f"    {name}: {value}")

    pipeline.fit(X, y)
    predicted = pipeline.predict(X_test)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # The flattened confusion matrix must have exactly four cells (two classes)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

In [15]:
# The storage is created explicitly so its engine can be released afterwards
# without reaching into the study's private attributes.
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")
study = optuna.create_study(
    study_name="featuresscaler-dataset-artigo",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=29)
# Release the database connections held by the storage engine
storage.engine.dispose()

[32m[I 2022-06-20 00:23:01,498][0m Using an existing study with name 'featuresscaler-dataset-artigo' instead of creating a new one.[0m
[32m[I 2022-06-20 00:23:09,198][0m Trial 472 finished with value: 0.8321781089997033 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0002717685202229414, 'lambda_l2': 1.3998091509273321e-05, 'num_leaves': 193, 'feature_fraction': 0.4006724477838006, 'bagging_fraction': 0.45054468514183377, 'bagging_freq': 3, 'min_child_samples': 21, 'max_depth': 931}. Best is trial 328 with value: 0.8400733946565991.[0m
[32m[I 2022-06-20 00:23:17,502][0m Trial 473 finished with value: 0.8185162875909779 and parameters: {'scalers': 'standard', 'lambda_l1': 1.7999753572154908e-05, 'lambda_l2': 0.0007097769850095756, 'num_leaves': 188, 'feature_fraction': 0.464072415270713, 'bagging_fraction': 0.49180288056453936, 'bagging_freq': 3, 'min_child_samples': 15, 'max_depth': 953}. Best is trial 328 with value: 0.8400733946565991.[0m
[32m[I 2022-06-20 00:23:21,1

In [16]:
# Best trial of the study according to the optimized metric
trial = study.best_trial
# Objective value that trial reached during the search
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Fit that trial's pipeline on the training data and report test metrics
detailed_objective(trial)

Best trial Score from HP Opt: 0.8407

    scaler: StandardScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 0.0014846922151639575
    lambda_l2: 0.0008713399784637161
    num_leaves: 190
    feature_fraction: 0.48199760968416155
    bagging_fraction: 0.44043645321243474
    bagging_freq: 3
    min_child_samples: 21
    n_estimators: 899
[TEST]
balanced_accuracy: 0.5617, accuracy: 0.6094, specificity: 0.7143, sensitivity: 0.4091


(0.5616883116883117, 0.609375, 0.7142857142857143, 0.4090909090909091)

### Resultados

**dataset apenas dado normalizado**

De:
```
Best trial Score from HP Opt: 0.8514
[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636
```
Para:
```
Best trial Score from HP Opt: 0.8468

[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636
(0.5151515151515151, 0.5625, 0.6666666666666666, 0.36363636363636365)
```

**dataset do artigo**

De: 
```
Best trial Score from HP Opt: 0.8422

[TEST]
balanced_accuracy: 0.5758, accuracy: 0.6562, specificity: 0.8333, sensitivity: 0.3182
```

Para:
```
Best trial Score from HP Opt: 0.8407

[TEST]
balanced_accuracy: 0.5617, accuracy: 0.6094, specificity: 0.7143, sensitivity: 0.4091
(0.5616883116883117, 0.609375, 0.7142857142857143, 0.4090909090909091)
```

- No experimento do Lucas a adição de um estágio de feature scaling antes do treinamento parece ter ajudado o modelo. Dessa vez, essa melhoria não ficou tão clara em nenhum dos dois datasets.

##### 4.4) Boruta Feature Selection + Features Scaler + Hyper Parameter Optimization with Optuna

In [7]:
# Dataset used by the feature selection and objective functions in this cell
dataset_name = 'onlynormalized'

# Training features/target and test features/target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# Random forest used by Boruta as its estimator: all cores, class weights
# balanced against the label frequencies, trees limited to depth 5
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Boruta feature selector wrapped around the random forest
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

# The selector is given plain arrays rather than DataFrames.
# NOTE(review): selection is fitted on the whole training set, before the
# repeated K-fold CV run in objective(); the validation folds therefore took
# part in choosing the features - confirm this is intended.
feat_selector.fit(np.array(X), y)

print("Number of selected features: ", feat_selector.n_features_)

# Names of the columns kept by the selector
print(X.columns[feat_selector.support_])

# Keep only the selected features in both splits; the functions below read
# X_transform and X_test_transform as globals
X_transform = feat_selector.transform(np.array(X))
X_test_transform = feat_selector.transform(np.array(X_test))

# Single source of truth for the search space, shared by both objective functions
def _build_pipeline(trial):
    """Build the scaler + LightGBM pipeline described by an Optuna trial.

    Parameters
    ----------
    trial
        Optuna trial object; only its ``suggest_*`` methods are used, so it
        can be a running trial or a stored one such as ``study.best_trial``.

    Returns
    -------
    tuple
        ``(scaler, param, clf)``: the scaler instance, the dict of keyword
        arguments given to ``LGBMClassifier`` and the unfitted ``Pipeline``.
    """
    # The scaler is itself a tuned hyper-parameter
    scaler_name = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Any name other than 'minmax' / 'standard' falls back to RobustScaler
    scaler_classes = {"minmax": MinMaxScaler, "standard": StandardScaler}
    scaler = scaler_classes.get(scaler_name, RobustScaler)()

    # LGBM hyper-parameter search space
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the Optuna parameter is named "max_depth" but it feeds
        # n_estimators. The name is kept because the trials already saved in
        # the study storage carry it; renaming would stop matching them.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }

    # Model pipeline for this trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    return scaler, param, clf


# Define objective function to maximize the metric
def objective(trial):
    """Return the mean cross-validated score of the pipeline suggested by ``trial``.

    Uses the notebook globals ``X_transform``, ``y``, ``metric_scorer`` and
    ``rkf`` (the last two are defined outside this cell).
    """
    _, _, clf = _build_pipeline(trial)

    # NOTE(review): X_transform comes from a Boruta selector fitted on the whole
    # of X before cross-validation, so the folds are not independent of the
    # feature selection and this score may be optimistic.
    balanced_accuracy = np.mean(cross_val_score(clf, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))

    return balanced_accuracy


# Define a detailed objective function to get more metrics of best trial
def detailed_objective(trial):
    """Refit the trial's pipeline on the training data and report test metrics.

    Prints the scaler and the LightGBM parameters, fits on
    ``X_transform`` / ``y`` and evaluates on ``X_test_transform`` / ``y_test``.

    Returns
    -------
    tuple
        ``(bal_accuracy, accuracy, specificity, sensitivity)``.
    """
    scaler, param, clf = _build_pipeline(trial)

    # Print params for best trial
    print(f"    scaler: {scaler}")
    for key, value in param.items():
        print("    {}: {}".format(key, value))

    clf.fit(X_transform, y)

    # Calculate more evaluation metrics on the test features
    predicted = clf.predict(X_test_transform)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # For a binary problem the flattened confusion matrix is tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn+fp)
    sensitivity  = tp / (tp+fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	609
Iteration: 	9 / 100
Confirmed: 	6
Tentative: 	34
Rejected: 	609
Iteration: 	10 / 100
Confirmed: 	6
Tentative: 	34
Rejected: 	609
Iteration: 	11 / 100
Confirmed: 	6
Tentative: 	34
Rejected: 	609
Iteration: 	12 / 100
Confirmed: 	11
Tentative: 	29
Rejected: 	609
Iteration: 	13 / 100
Confirmed: 	11
Tentative: 	27
Rejected: 	611
Iteration: 	14 / 100
Confirmed: 	11
Tentative: 	27
Rejected: 	611
Iteration: 	15 / 100
Confirmed: 	11
Tentative: 	27
Rejected: 	611
Iteration: 	16 / 100
Confirmed: 	11
T

In [9]:
# Open the SQLite-backed storage explicitly so its engine can be released
# without reaching through the study's private `_storage._backend` chain
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")

# Resume the study when it already exists in the database, otherwise create it
study = optuna.create_study(
    study_name="boruta-featuresscaler-dataset-onlynormalized",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

try:
    study.optimize(objective, n_trials=70)
finally:
    # Release the database engine even when the search fails or is interrupted
    storage.engine.dispose()

[32m[I 2022-06-20 01:13:32,233][0m Using an existing study with name 'boruta-featuresscaler-dataset-onlynormalized' instead of creating a new one.[0m
[32m[I 2022-06-20 01:13:33,782][0m Trial 431 finished with value: 0.8603349241917353 and parameters: {'scalers': 'standard', 'lambda_l1': 2.4217399148527214e-07, 'lambda_l2': 0.11100207147210794, 'num_leaves': 77, 'feature_fraction': 0.6359970122800791, 'bagging_fraction': 0.6500114995582859, 'bagging_freq': 7, 'min_child_samples': 13, 'max_depth': 136}. Best is trial 297 with value: 0.8986074211306407.[0m
[32m[I 2022-06-20 01:13:36,303][0m Trial 432 finished with value: 0.8842394686067284 and parameters: {'scalers': 'standard', 'lambda_l1': 4.1066986648423137e-07, 'lambda_l2': 0.3015711040713209, 'num_leaves': 66, 'feature_fraction': 0.93049033810581, 'bagging_fraction': 0.6217094903570615, 'bagging_freq': 7, 'min_child_samples': 19, 'max_depth': 987}. Best is trial 297 with value: 0.8986074211306407.[0m
[32m[I 2022-06-20 01:13

In [10]:
# Get the best trial of the study, based on the optimized metric score
trial = study.best_trial
# Best cross-validation score found by the hyper-parameter optimization
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Replay the best trial: prints its parameters and the test-set metrics;
# the returned metrics tuple is displayed as the cell output
detailed_objective(trial)

Best trial Score from HP Opt: 0.8986

    scaler: StandardScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 1.9930245598572226e-07
    lambda_l2: 0.07315458408775775
    num_leaves: 214
    feature_fraction: 0.6575315561043651
    bagging_fraction: 0.6325570898222838
    bagging_freq: 7
    min_child_samples: 16
    n_estimators: 965
[TEST]
balanced_accuracy: 0.5271, accuracy: 0.5781, specificity: 0.6905, sensitivity: 0.3636


(0.527056277056277, 0.578125, 0.6904761904761905, 0.36363636363636365)

In [11]:
# Select a dataset ('artigo' is one of the keys of the `datasets` dict)
dataset_name = 'artigo'

# Get features and target for the chosen dataset
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# Define the random forest classifier, using all cores (n_jobs=-1) and
# class weights balanced against the label frequencies in y
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Define the Boruta feature selection method around the random forest above;
# verbose=2 prints the Confirmed/Tentative/Rejected counts of every iteration
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

# Fit the selector on X / y (presumably the training split - confirm against
# getFeaturesTargets); the features are handed over as a plain NumPy array
feat_selector.fit(np.array(X), y)

print("Number of selected features: ", feat_selector.n_features_)

# Column names of the selected features (support_ is used as a mask over X.columns)
print(X.columns[feat_selector.support_])

# Keep only the most important features, applying the same fitted selector
# to both the training and the test matrices
X_transform = feat_selector.transform(np.array(X))
X_test_transform = feat_selector.transform(np.array(X_test))

# Single source of truth for the search space, shared by both objective functions
def _build_pipeline(trial):
    """Build the scaler + LightGBM pipeline described by an Optuna trial.

    Parameters
    ----------
    trial
        Optuna trial object; only its ``suggest_*`` methods are used, so it
        can be a running trial or a stored one such as ``study.best_trial``.

    Returns
    -------
    tuple
        ``(scaler, param, clf)``: the scaler instance, the dict of keyword
        arguments given to ``LGBMClassifier`` and the unfitted ``Pipeline``.
    """
    # The scaler is itself a tuned hyper-parameter
    scaler_name = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Any name other than 'minmax' / 'standard' falls back to RobustScaler
    scaler_classes = {"minmax": MinMaxScaler, "standard": StandardScaler}
    scaler = scaler_classes.get(scaler_name, RobustScaler)()

    # LGBM hyper-parameter search space
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the Optuna parameter is named "max_depth" but it feeds
        # n_estimators. The name is kept so that this function stays
        # consistent with the parameter name logged for every trial.
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }

    # Model pipeline for this trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    return scaler, param, clf


# Define objective function to maximize the metric
def objective(trial):
    """Return the mean cross-validated score of the pipeline suggested by ``trial``.

    Uses the notebook globals ``X_transform``, ``y``, ``metric_scorer`` and
    ``rkf`` (the last two are defined outside this cell).
    """
    _, _, clf = _build_pipeline(trial)

    # NOTE(review): X_transform comes from a Boruta selector fitted on the whole
    # of X before cross-validation, so the folds are not independent of the
    # feature selection and this score may be optimistic.
    balanced_accuracy = np.mean(cross_val_score(clf, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))

    return balanced_accuracy


# Define a detailed objective function to get more metrics of best trial
def detailed_objective(trial):
    """Refit the trial's pipeline on the training data and report test metrics.

    Prints the scaler and the LightGBM parameters, fits on
    ``X_transform`` / ``y`` and evaluates on ``X_test_transform`` / ``y_test``.

    Returns
    -------
    tuple
        ``(bal_accuracy, accuracy, specificity, sensitivity)``.
    """
    scaler, param, clf = _build_pipeline(trial)

    # Print params for best trial
    print(f"    scaler: {scaler}")
    for key, value in param.items():
        print("    {}: {}".format(key, value))

    clf.fit(X_transform, y)

    # Calculate more evaluation metrics on the test features
    predicted = clf.predict(X_test_transform)

    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    # For a binary problem the flattened confusion matrix is tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
    specificity = tn / (tn+fp)
    sensitivity  = tp / (tp+fn)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	59
Rejected: 	501
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	56
Rejected: 	501
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	56
Rejected: 	501
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	56
Rejected: 	501
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	47
Rejected: 	501
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	47
Rejected: 	501
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	47
Rejected: 	501
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	47
Rejected: 	501
Iteration: 	16 / 100
Confirmed: 	19
T

In [15]:
# Open the SQLite-backed storage explicitly so its engine can be released
# without reaching through the study's private `_storage._backend` chain
storage = optuna.storages.RDBStorage("sqlite:///experiments.db")

# Resume the study when it already exists in the database, otherwise create it
study = optuna.create_study(
    study_name="boruta-featuresscaler-dataset-artigo",
    storage=storage,
    direction="maximize",
    load_if_exists=True
)

try:
    study.optimize(objective, n_trials=500)
finally:
    # Release the database engine even when the search fails or is interrupted
    storage.engine.dispose()

[32m[I 2022-06-20 01:19:50,533][0m A new study created in RDB with name: boruta-featuresscaler-dataset-artigo[0m
[32m[I 2022-06-20 01:19:51,306][0m Trial 0 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 2.3076737351660542e-07, 'lambda_l2': 0.0007515276678931005, 'num_leaves': 240, 'feature_fraction': 0.4594147332067577, 'bagging_fraction': 0.6414780846917948, 'bagging_freq': 6, 'min_child_samples': 58, 'max_depth': 427}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-20 01:19:52,040][0m Trial 1 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 2.2571663082391177e-07, 'lambda_l2': 8.027165680988265, 'num_leaves': 52, 'feature_fraction': 0.9128508227593485, 'bagging_fraction': 0.7679427344965074, 'bagging_freq': 2, 'min_child_samples': 90, 'max_depth': 902}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-20 01:19:52,960][0m Trial 2 finished with value: 0.5 and parameters: {'scalers': 'robust', 'lambda_l1': 7.86

In [16]:
# Get the best trial of the study, based on the optimized metric score
trial = study.best_trial
# Best cross-validation score found by the hyper-parameter optimization
print(f'Best trial Score from HP Opt: {trial.values[0]:.4f}\n')

# Replay the best trial: prints its parameters and the test-set metrics;
# the returned metrics tuple is displayed as the cell output
detailed_objective(trial)

Best trial Score from HP Opt: 0.8770

    scaler: RobustScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 3.1642271775352635e-08
    lambda_l2: 0.001552432242725747
    num_leaves: 139
    feature_fraction: 0.6945210692949899
    bagging_fraction: 0.7239607431842036
    bagging_freq: 7
    min_child_samples: 11
    n_estimators: 554
[TEST]
balanced_accuracy: 0.5509, accuracy: 0.6094, specificity: 0.7381, sensitivity: 0.3636


(0.550865800865801, 0.609375, 0.7380952380952381, 0.36363636363636365)

### Resultados

**Dataset apenas com dados normalizados (`onlynormalized`)**

De:
```
Best trial Score from HP Opt: 0.8514
[TEST]
balanced_accuracy: 0.5152, accuracy: 0.5625, specificity: 0.6667, sensitivity: 0.3636
```
Para:
```
Best trial Score from HP Opt: 0.8986

[TEST]
balanced_accuracy: 0.5271, accuracy: 0.5781, specificity: 0.6905, sensitivity: 0.3636
(0.527056277056277, 0.578125, 0.6904761904761905, 0.36363636363636365)
```

**dataset do artigo**

De: 
```
Best trial Score from HP Opt: 0.8422

[TEST]
balanced_accuracy: 0.5758, accuracy: 0.6562, specificity: 0.8333, sensitivity: 0.3182
```

Para:
```
Best trial Score from HP Opt: 0.8770

  
[TEST]
balanced_accuracy: 0.5509, accuracy: 0.6094, specificity: 0.7381, sensitivity: 0.3636
(0.550865800865801, 0.609375, 0.7380952380952381, 0.36363636363636365)
```

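- No dataset "onlynormalized", pré-selecionar as features mais importantes auxiliou a obter uma performance ainda melhor a partir do pipeline com Robust Scaler e LightGBM: o score de validação subiu de 0.8514 para 0.8986 e a balanced accuracy no teste de 0.5152 para 0.5271.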

- Já no dataset do artigo, apesar de uma melhora no score de validação (de 0.8422 para 0.8770), o resultado no teste não melhorou (balanced accuracy de 0.5758 para 0.5509).