# Notebook para gerar e testar o Dataset v2



---

### 1) Setup

In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import balanced_accuracy_score, make_scorer, f1_score, recall_score,precision_score
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV, cross_validate

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

from sklearn.model_selection import train_test_split


In [3]:
# Global random seed shared by the CV splitter, Boruta and the LightGBM models
SEED = 42

# "artigo" dataset: original train/test split
ARTIGO_TRAIN_CLINICAL_FILENAME = "datasets/artigo/train_set_clinical.csv"
ARTIGO_TEST_CLINICAL_FILENAME = "datasets/artigo/test_set_clinical.csv"

# "artigo" dataset: stratified re-split (written by the one-off code in section 2.1)
ARTIGOV2_TRAIN_CLINICAL_FILENAME = "datasets/artigo_v2/train_set_clinical.csv"
ARTIGOV2_TEST_CLINICAL_FILENAME = "datasets/artigo_v2/test_set_clinical.csv"

# "normalizado" dataset: original train/test split
NORMALIZADO_TRAIN_CLINICAL_FILENAME = "datasets/normalizado/train_set_clinical.csv"
NORMALIZADO_TEST_CLINICAL_FILENAME = "datasets/normalizado/test_set_clinical.csv"

# "normalizado" dataset: stratified re-split (written by the one-off code in section 2.1)
NORMALIZADOV2_TRAIN_CLINICAL_FILENAME = "datasets/normalizado_v2/train_set_clinical.csv"
NORMALIZADOV2_TEST_CLINICAL_FILENAME = "datasets/normalizado_v2/test_set_clinical.csv"

---

### 2) Read and Preprocess Data

In [4]:
# Train/test CSV paths of every dataset variant, in the order they are loaded
_split_files = {
    'normalizado': (NORMALIZADO_TRAIN_CLINICAL_FILENAME, NORMALIZADO_TEST_CLINICAL_FILENAME),
    'normalizado_v2': (NORMALIZADOV2_TRAIN_CLINICAL_FILENAME, NORMALIZADOV2_TEST_CLINICAL_FILENAME),
    'artigo': (ARTIGO_TRAIN_CLINICAL_FILENAME, ARTIGO_TEST_CLINICAL_FILENAME),
    'artigo_v2': (ARTIGOV2_TRAIN_CLINICAL_FILENAME, ARTIGOV2_TEST_CLINICAL_FILENAME),
}

# datasets[name]['train' | 'test'] -> DataFrame read from a ';'-separated CSV indexed by "ID"
datasets = {
    name: {
        'train': pd.read_csv(train_path, sep=";", index_col="ID"),
        'test': pd.read_csv(test_path, sep=";", index_col="ID"),
    }
    for name, (train_path, test_path) in _split_files.items()
}


### 2.1) Reparticiona os datasets de forma balanceada


Estávamos percebendo que o aumento no score de validação (~0.80) não estava levando a um aumento similar no score de teste (~0.58). Talvez a razão seja que o conjunto de teste atual foi colhido em momento posterior ao conjunto de treinamento.

A ideia desse reparticionamento é diluir um possível data drift entre os conjuntos de teste e de treinamento.

O código abaixo só foi rodado uma vez.

In [23]:
# One-off repartitioning script, kept commented out for provenance: it merges the original
# train/test splits, re-splits them with stratification on the last column (the target) and
# writes the result to the *_v2 CSV files.
# NOTE(review): test_size=0.323 appears chosen to reproduce the original 132/64 split sizes — confirm.
# NOTE(review): the split uses random_state=0 rather than SEED.
# NOTE(review): pandas 1.5 renamed `to_csv(line_terminator=...)` to `lineterminator`;
# adjust the four to_csv calls before re-running this on a recent pandas.

# normalizado_all = pd.concat([ datasets['normalizado']['train'], datasets['normalizado']['test']], axis=0)
# artigo_all = pd.concat([ datasets['artigo']['train'], datasets['artigo']['test']], axis=0)


# X_train, X_test, Y_train, Y_test = train_test_split(normalizado_all.iloc[:,:-1], normalizado_all.iloc[:,-1],
#                                                     stratify=normalizado_all.iloc[:,-1], 
#                                                     test_size=0.323, random_state=0)

# datasets['normalizado_v2'] = {
#     'train': pd.concat([X_train, Y_train], axis=1),
#     'test': pd.concat([X_test, Y_test], axis=1)
# }

# X_train, X_test, Y_train, Y_test = train_test_split(artigo_all.iloc[:,:-1], artigo_all.iloc[:,-1],
#                                                     stratify=artigo_all.iloc[:,-1], 
#                                                     test_size=0.323, random_state=0)

# datasets['artigo_v2'] = {
#     'train': pd.concat([X_train, Y_train], axis=1),
#     'test': pd.concat([X_test, Y_test], axis=1)
# }


# datasets['normalizado_v2']['train'].to_csv(NORMALIZADOV2_TRAIN_CLINICAL_FILENAME, sep=';', line_terminator='\n')
# datasets['normalizado_v2']['test'].to_csv(NORMALIZADOV2_TEST_CLINICAL_FILENAME, sep=';', line_terminator='\n')
# datasets['artigo_v2']['train'].to_csv(ARTIGOV2_TRAIN_CLINICAL_FILENAME, sep=';', line_terminator='\n')
# datasets['artigo_v2']['test'].to_csv(ARTIGOV2_TEST_CLINICAL_FILENAME, sep=';', line_terminator='\n')

In [5]:
# Class balance check: count MILD vs SEVERE samples in each split of every dataset
for name, splits in datasets.items():
    print(f'Dataset {name}')
    # (line prefix, line suffix, frame) — the test line keeps its trailing space
    for prefix, suffix, frame in (('Training', '', splits['train']),
                                  ('    Test', ' ', splits['test'])):
        mild = (frame['Group'] == 'MILD').sum()
        severe = (frame['Group'] == 'SEVERE').sum()
        total = mild + severe
        print(f'{prefix} MILD: {mild} ({mild/total:.2}), SEVERE: {severe} ({severe/total:.2}) = {total}{suffix}')
    print()

Dataset normalizado
Training MILD: 81 (0.61), SEVERE: 51 (0.39) = 132
    Test MILD: 42 (0.66), SEVERE: 22 (0.34) = 64 

Dataset normalizado_v2
Training MILD: 83 (0.63), SEVERE: 49 (0.37) = 132
    Test MILD: 40 (0.62), SEVERE: 24 (0.38) = 64 

Dataset artigo
Training MILD: 81 (0.61), SEVERE: 51 (0.39) = 132
    Test MILD: 42 (0.66), SEVERE: 22 (0.34) = 64 

Dataset artigo_v2
Training MILD: 83 (0.63), SEVERE: 49 (0.37) = 132
    Test MILD: 40 (0.62), SEVERE: 24 (0.38) = 64 



In [6]:
##### Preprocessing all datasets
# Each split is rebuilt (not mutated in place) and the Sex encoding is guarded, so that
# re-running this cell is safe: encoding an already-encoded column would compare 0/1
# against "F" and silently turn every value into 0.
for d_key in datasets.keys():
    for d_type in list(datasets[d_key].keys()):

        # Drop rows containing any NaN value
        frame = datasets[d_key][d_type].dropna()

        # Convert Sex column to an integer flag (Female: 1, Male: 0), only if still raw
        if not pd.api.types.is_numeric_dtype(frame["Sex"]):
            frame = frame.assign(Sex=np.where(frame["Sex"] == "F", 1, 0))

        datasets[d_key][d_type] = frame

---

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

def evaluate_test(groundtruth, predicted, print_result=True):
    """Score predictions against the ground-truth labels of a held-out set.

    Computes balanced accuracy, accuracy, specificity and sensitivity; the last two are
    derived from the confusion matrix, which must unpack into exactly four cells.

    Args:
        groundtruth: true labels.
        predicted: labels predicted by the model.
        print_result: when true, print the four metrics with 4 decimals.

    Returns:
        (accuracy, specificity, sensitivity). Balanced accuracy is printed but not returned.
    """
    bal_accuracy = balanced_accuracy_score(groundtruth, predicted)
    accuracy = accuracy_score(groundtruth, predicted)

    # Flattened matrix order is (tn, fp, fn, tp).
    # NOTE(review): presumably SEVERE is the positive class (labels sorted alphabetically) — confirm.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(groundtruth, predicted).ravel()
    specificity = true_neg / (true_neg + false_pos)
    sensitivity = true_pos / (true_pos + false_neg)

    if print_result:
        report_lines = [
            "\n [test:]",
            f'Balanced accuracy: {bal_accuracy:.4f}',
            f'Accuracy: {accuracy:.4f}',
            f'Specificity:  {specificity:.4f}',
            f'Sensitivity:  {sensitivity:.4f}',
        ]
        print("\n".join(report_lines))

    return (accuracy, specificity, sensitivity)

# Split a dataset into features and target
def getFeaturesTargets(dataset_name):
    """Return (X, y, X_test, y_test) for the dataset registered under `dataset_name`.

    The target is the "Group" column; the features are every other column. The frames are
    looked up in the module-level `datasets` dict.
    """
    splits = datasets[dataset_name]
    train_df, test_df = splits['train'], splits['test']
    return (
        train_df.drop(columns="Group"),
        train_df["Group"],
        test_df.drop(columns="Group"),
        test_df["Group"],
    )

# Defining RepeatedKFold Cross Validator
# 5 folds x 10 repeats = 50 train/validation splits per evaluation, seeded for reproducibility.
# NOTE(review): RepeatedKFold does not stratify by class; with the ~60/40 MILD/SEVERE ratio,
# RepeatedStratifiedKFold may give more stable folds — confirm before switching, since it
# would change every CV score reported in this notebook.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)

# Define metric scorer
# Wraps balanced accuracy so it can be passed as `scoring=` to the CV helpers below.
metric_scorer = make_scorer(balanced_accuracy_score)

### 3) Baseline Model Training and CV

In [12]:
# Baseline: untuned LightGBM evaluated on one dataset
def baseline(dataset_name):
    """Cross-validate and test an untuned LightGBM classifier on `dataset_name`.

    Runs the repeated K-fold CV through GridSearchCV with an empty grid (a single
    candidate), prints the mean train/validation balanced accuracy, then evaluates the
    refitted estimator on the test split.

    Args:
        dataset_name: key of the dataset in the module-level `datasets` dict.

    Returns:
        dict with the CV and test metrics, so results can be tabulated instead of being
        copied by hand from the printed output.
    """
    # Define Classifier (or pipeline)
    clf = lgb.LGBMClassifier(random_state=SEED)

    # Get Features and Target
    X, y, X_test, y_test = getFeaturesTargets(dataset_name)

    # No parameters to search now
    parameters = {}

    # GridSearchCV is used instead of cross_val_score / cross_validate because it also
    # refits on the whole training set, giving an estimator to evaluate on the test set
    search = GridSearchCV(clf, parameters, n_jobs=-1, verbose=4, scoring=metric_scorer, cv=rkf, return_train_score=True)
    search.fit(X, y)

    mean_train = search.cv_results_['mean_train_score'][0]
    mean_val = search.cv_results_['mean_test_score'][0]

    print(f"\n\nDataset: {dataset_name}")
    print(f"Balanced accuracy mean_train: {mean_train}, mean_val: {mean_val} ")

    predicted = search.best_estimator_.predict(X_test)
    accuracy, specificity, sensitivity = evaluate_test(y_test, predicted)

    return {
        "mean_train": mean_train,
        "mean_val": mean_val,
        # evaluate_test prints balanced accuracy but does not return it
        "test_balanced_accuracy": balanced_accuracy_score(y_test, predicted),
        "test_accuracy": accuracy,
        "test_specificity": specificity,
        "test_sensitivity": sensitivity,
    }


# Run the baseline on every dataset, keeping the metrics per dataset
baseline_results = {}
for dataset in datasets.keys():
    baseline_results[dataset] = baseline(dataset)


Fitting 50 folds for each of 1 candidates, totalling 50 fits


Dataset: normalizado
Balanced accuracy mean_train: 1.0, mean_val: 0.7703995514289632 

 [test:]
Balanced accuracy: 0.4578
Accuracy: 0.5156
Specificity:  0.6429
Sensitivity:  0.2727
Fitting 50 folds for each of 1 candidates, totalling 50 fits


Dataset: normalizado_v2
Balanced accuracy mean_train: 1.0, mean_val: 0.6109552646356052 

 [test:]
Balanced accuracy: 0.6875
Accuracy: 0.7344
Specificity:  0.8750
Sensitivity:  0.5000
Fitting 50 folds for each of 1 candidates, totalling 50 fits


Dataset: artigo
Balanced accuracy mean_train: 1.0, mean_val: 0.778942822657219 

 [test:]
Balanced accuracy: 0.5390
Accuracy: 0.5938
Specificity:  0.7143
Sensitivity:  0.3636
Fitting 50 folds for each of 1 candidates, totalling 50 fits


Dataset: artigo_v2
Balanced accuracy mean_train: 1.0, mean_val: 0.6979568114633905 

 [test:]
Balanced accuracy: 0.7333
Accuracy: 0.7812
Specificity:  0.9250
Sensitivity:  0.5417


### Resultados

| metrics                     	| normalizado 	| normalizado_v2 	| artigo 	| artigo_v2 	|
|-----------------------------	|-------------	|----------------	|--------	|-----------	|
| mean_train*            	| 100%     	| 100%   	| 100%   	| 100%      	|
| mean_val*              	| 77.03%   	| 61.09% 	| 77.89% 	| 69.79%    	|
| test balanced_accuracy 	| 45.78%   	| 68.75% 	| 53.90% 	| 73.33%    	|
| test accuracy          	| 51.56%   	| 73.44% 	| 59.38% 	| 78.12%    	|
| test specificity       	| 64.29%   	| 87.50% 	| 71.43% 	| 92.50%    	|
| test sensitivity       	| 27.27%   	| 50.00% 	| 36.36% 	| 54.17%    	|

\* = balanced_accuracy


Alguns pontos
- Todos os modelos parecem estar no regime de overfitting.
- O score de validação piora nos datasets v2, reparticionados. Talvez a validação tenha ficado mais difícil.
- As métricas de teste melhoram nos datasets v2.

---

##### 4.4) Boruta Feature Selection + Features Scaler + Hyper Parameter Optimization with Optuna

```
normalizado_v2
```

In [34]:
# Select a dataset
dataset_name = 'normalizado_v2'

# Get Features and Target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# Random forest used by Boruta as its importance estimator: all cores,
# class weights balanced against the label frequencies, shallow trees (max_depth=5)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Boruta feature selection method, seeded with SEED
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

# NOTE(review): the selector is fitted on the full training set before cross-validation,
# so the CV scores later computed on X_transform may be optimistically biased (the
# selection has already seen every validation fold) — confirm.
feat_selector.fit(np.array(X), y)

print("Number of selected features: ", feat_selector.n_features_)

print(X.columns[feat_selector.support_])

# Keep only the selected features, for both train and test
X_transform = feat_selector.transform(np.array(X))
X_test_transform = feat_selector.transform(np.array(X_test))

# Objective function maximized by the Optuna study
def objective(trial):
    """Return the mean repeated-CV balanced accuracy of a scaler + LightGBM pipeline.

    The scaler type and the LightGBM hyperparameters are suggested by `trial`; the
    pipeline is scored on the Boruta-selected training features (X_transform, y).
    """
    # The scaler is itself a tuned choice; anything other than minmax/standard means robust
    scaler_by_name = {"minmax": MinMaxScaler, "standard": StandardScaler}
    chosen = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    scaler = scaler_by_name.get(chosen, RobustScaler)()

    # Fixed LightGBM settings ...
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    # ... followed by the tuned ones, suggested in a fixed order
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth".
        # The name is kept because trials already persisted in the study use it.
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("scaler", scaler),
                            ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # CV metric to maximize
    fold_scores = cross_val_score(model, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: refit the pipeline of a given trial and report test-set metrics
def detailed_objective(trial):
    """Rebuild the pipeline described by `trial`, fit it and evaluate it on the test split.

    Prints the scaler and every LightGBM parameter, fits on (X_transform, y), predicts
    X_test_transform and prints the metrics computed against y_test.

    Returns:
        (balanced_accuracy, accuracy, specificity, sensitivity)
    """
    # Anything other than minmax/standard means robust
    scaler_by_name = {"minmax": MinMaxScaler, "standard": StandardScaler}
    chosen = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    scaler = scaler_by_name.get(chosen, RobustScaler)()

    # Fixed LightGBM settings followed by the trial's values, in a fixed order
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth"
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("scaler", scaler),
                            ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # Show the configuration being evaluated
    print(f"    scaler: {scaler}")
    for name, setting in param.items():
        print(f"    {name}: {setting}")

    model.fit(X_transform, y)
    predicted = model.predict(X_test_transform)

    # Test-set metrics; the flattened confusion matrix order is (tn, fp, fn, tp)
    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predicted).ravel()
    specificity = true_neg / (true_neg + false_pos)
    sensitivity = true_pos / (true_pos + false_neg)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	649
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	29
Rejected: 	620
Iteration: 	16 / 100
Confirmed: 	0
Tentat

In [41]:
# Create the persisted study for this experiment, or resume it if it already exists.
# NOTE(review): "nomalizado" in the study name looks like a typo for "normalizado". It is
# left untouched because the name is the lookup key of the stored study; renaming it would
# start a new, empty study.
study = optuna.create_study(
    study_name="boruta-featuresscaler-dataset-nomalizado_v2",
    storage="sqlite:///experiments.db",
    direction="maximize",
    load_if_exists=True
)

# Every run of this cell adds up to 100 more trials to the stored study
study.optimize(objective, n_trials=100)
# Dispose of the storage's database engine once optimization is done.
# NOTE(review): this reaches into private Optuna attributes (_storage._backend) and may
# break across Optuna versions.
study._storage._backend.engine.dispose()

[32m[I 2022-06-21 08:10:47,572][0m Using an existing study with name 'boruta-featuresscaler-dataset-nomalizado_v2' instead of creating a new one.[0m
[32m[I 2022-06-21 08:10:50,116][0m Trial 400 finished with value: 0.7918107436509141 and parameters: {'scalers': 'robust', 'lambda_l1': 0.00027096113850948606, 'lambda_l2': 2.2139443840013307e-08, 'num_leaves': 156, 'feature_fraction': 0.703823632322998, 'bagging_fraction': 0.8091520383075381, 'bagging_freq': 1, 'min_child_samples': 9, 'max_depth': 489}. Best is trial 213 with value: 0.8103478984465826.[0m
[32m[I 2022-06-21 08:10:51,707][0m Trial 401 finished with value: 0.795222825420194 and parameters: {'scalers': 'standard', 'lambda_l1': 0.005812630361475851, 'lambda_l2': 3.6580510659283626e-08, 'num_leaves': 181, 'feature_fraction': 0.4537670813368496, 'bagging_fraction': 0.5150177402872773, 'bagging_freq': 1, 'min_child_samples': 5, 'max_depth': 395}. Best is trial 213 with value: 0.8103478984465826.[0m
[32m[I 2022-06-21 08:

In [42]:
# Best trial of the study according to the CV metric
trial = study.best_trial
# Its score during the hyperparameter search
best_cv_score = trial.values[0]
print(f'Best trial Score from HP Opt: {best_cv_score:.4f}\n')

# Refit that configuration and evaluate it on the test split
detailed_objective(trial)

Best trial Score from HP Opt: 0.8103

    scaler: StandardScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 0.0034481481436410702
    lambda_l2: 2.3643530816733582e-08
    num_leaves: 152
    feature_fraction: 0.7049291692912046
    bagging_fraction: 0.5598808081710741
    bagging_freq: 1
    min_child_samples: 5
    n_estimators: 434
[TEST]
balanced_accuracy: 0.6333, accuracy: 0.6875, specificity: 0.8500, sensitivity: 0.4167


(0.6333333333333333, 0.6875, 0.85, 0.4166666666666667)

```
artigo_v2 com scaler
```

In [8]:
# Select a dataset
dataset_name = 'artigo_v2'

# Get Features and Target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# Random forest used by Boruta as its importance estimator: all cores,
# class weights balanced against the label frequencies, shallow trees (max_depth=5)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Boruta feature selection method, seeded with SEED
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

# NOTE(review): the selector is fitted on the full training set before cross-validation,
# so the CV scores later computed on X_transform may be optimistically biased (the
# selection has already seen every validation fold) — confirm.
feat_selector.fit(np.array(X), y)

print("Number of selected features: ", feat_selector.n_features_)

print(X.columns[feat_selector.support_])

# Keep only the selected features, for both train and test
X_transform = feat_selector.transform(np.array(X))
X_test_transform = feat_selector.transform(np.array(X_test))

# Objective function maximized by the Optuna study
def objective(trial):
    """Return the mean repeated-CV balanced accuracy of a scaler + LightGBM pipeline.

    The scaler type and the LightGBM hyperparameters are suggested by `trial`; the
    pipeline is scored on the Boruta-selected training features (X_transform, y).
    """
    # The scaler is itself a tuned choice; anything other than minmax/standard means robust
    scaler_by_name = {"minmax": MinMaxScaler, "standard": StandardScaler}
    chosen = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    scaler = scaler_by_name.get(chosen, RobustScaler)()

    # Fixed LightGBM settings ...
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    # ... followed by the tuned ones, suggested in a fixed order
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth".
        # The name is kept because trials already persisted in the study use it.
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("scaler", scaler),
                            ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # CV metric to maximize
    fold_scores = cross_val_score(model, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: refit the pipeline of a given trial and report test-set metrics
def detailed_objective(trial):
    """Rebuild the pipeline described by `trial`, fit it and evaluate it on the test split.

    Prints the scaler and every LightGBM parameter, fits on (X_transform, y), predicts
    X_test_transform and prints the metrics computed against y_test.

    Returns:
        (balanced_accuracy, accuracy, specificity, sensitivity)
    """
    # Anything other than minmax/standard means robust
    scaler_by_name = {"minmax": MinMaxScaler, "standard": StandardScaler}
    chosen = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    scaler = scaler_by_name.get(chosen, RobustScaler)()

    # Fixed LightGBM settings followed by the trial's values, in a fixed order
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth"
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("scaler", scaler),
                            ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # Show the configuration being evaluated
    print(f"    scaler: {scaler}")
    for name, setting in param.items():
        print(f"    {name}: {setting}")

    model.fit(X_transform, y)
    predicted = model.predict(X_test_transform)

    # Test-set metrics; the flattened confusion matrix order is (tn, fp, fn, tp)
    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predicted).ravel()
    specificity = true_neg / (true_neg + false_pos)
    sensitivity = true_pos / (true_pos + false_neg)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	519
Iteration: 	9 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	10 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	11 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	15 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	16 / 100
Confirmed: 	6
Tentat

In [17]:
# Create the persisted study for this experiment, or resume it if it already exists
study = optuna.create_study(
    study_name="boruta-featuresscaler-dataset-artigo_v2",
    storage="sqlite:///experiments.db",
    direction="maximize",
    load_if_exists=True
)

# Every run of this cell adds up to 20 more trials to the stored study
study.optimize(objective, n_trials=20)
# Dispose of the storage's database engine once optimization is done.
# NOTE(review): this reaches into private Optuna attributes (_storage._backend) and may
# break across Optuna versions.
study._storage._backend.engine.dispose()

[32m[I 2022-06-21 11:06:08,306][0m Using an existing study with name 'boruta-featuresscaler-dataset-artigo_v2' instead of creating a new one.[0m
[32m[I 2022-06-21 11:06:10,548][0m Trial 481 finished with value: 0.847306312495554 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.475281353016381e-05, 'lambda_l2': 2.2036117180354706e-08, 'num_leaves': 80, 'feature_fraction': 0.4103144688495338, 'bagging_fraction': 0.5914977904912272, 'bagging_freq': 4, 'min_child_samples': 9, 'max_depth': 483}. Best is trial 294 with value: 0.8598238376690389.[0m
[32m[I 2022-06-21 11:06:11,967][0m Trial 482 finished with value: 0.8464005130146771 and parameters: {'scalers': 'minmax', 'lambda_l1': 6.290298320799815e-05, 'lambda_l2': 3.279209064258963e-08, 'num_leaves': 74, 'feature_fraction': 0.43102767121241736, 'bagging_fraction': 0.5805406161244663, 'bagging_freq': 4, 'min_child_samples': 7, 'max_depth': 446}. Best is trial 294 with value: 0.8598238376690389.[0m
[32m[I 2022-06-21 11:06:13,3

In [18]:
# Best trial of the study according to the CV metric
trial = study.best_trial
# Its score during the hyperparameter search
best_cv_score = trial.values[0]
print(f'Best trial Score from HP Opt: {best_cv_score:.4f}\n')

# Refit that configuration and evaluate it on the test split
detailed_objective(trial)

Best trial Score from HP Opt: 0.8598

    scaler: MinMaxScaler()
    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 0.00108303951692939
    lambda_l2: 2.1243714516894663e-07
    num_leaves: 103
    feature_fraction: 0.40116015489878404
    bagging_fraction: 0.5804983211493618
    bagging_freq: 4
    min_child_samples: 12
    n_estimators: 455
[TEST]
balanced_accuracy: 0.7208, accuracy: 0.7656, specificity: 0.9000, sensitivity: 0.5417


(0.7208333333333333, 0.765625, 0.9, 0.5416666666666666)

```
artigo_v2 sem scaler
```

In [9]:
# Select a dataset
dataset_name = 'artigo_v2'

# Get Features and Target
X, y, X_test, y_test = getFeaturesTargets(dataset_name)

# Random forest used by Boruta as its importance estimator: all cores,
# class weights balanced against the label frequencies, shallow trees (max_depth=5)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# Boruta feature selection method, seeded with SEED
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

# NOTE(review): the selector is fitted on the full training set before cross-validation,
# so the CV scores later computed on X_transform may be optimistically biased (the
# selection has already seen every validation fold) — confirm.
feat_selector.fit(np.array(X), y)

print("Number of selected features: ", feat_selector.n_features_)

print(X.columns[feat_selector.support_])

# Keep only the selected features, for both train and test
X_transform = feat_selector.transform(np.array(X))
X_test_transform = feat_selector.transform(np.array(X_test))

# Objective function maximized by the Optuna study (no feature scaling)
def objective(trial):
    """Return the mean repeated-CV balanced accuracy of a LightGBM-only pipeline.

    The LightGBM hyperparameters are suggested by `trial`; the pipeline is scored on the
    Boruta-selected training features (X_transform, y).
    """
    # Fixed LightGBM settings ...
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    # ... followed by the tuned ones, suggested in a fixed order
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth".
        # The name is kept because trials already persisted in the study use it.
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # CV metric to maximize
    fold_scores = cross_val_score(model, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
    return np.mean(fold_scores)


# Detailed objective: refit the pipeline of a given trial and report test-set metrics
def detailed_objective(trial):
    """Rebuild the LightGBM-only pipeline described by `trial` and evaluate it on the test split.

    Prints every LightGBM parameter, fits on (X_transform, y), predicts X_test_transform
    and prints the metrics computed against y_test.

    Returns:
        (balanced_accuracy, accuracy, specificity, sensitivity)
    """
    # Fixed LightGBM settings followed by the trial's values, in a fixed order
    param = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt"}
    param.update(
        lambda_l1=trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        lambda_l2=trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        # NOTE: the number of estimators is stored under the Optuna name "max_depth"
        n_estimators=trial.suggest_int("max_depth", 100, 1000),
    )

    model = Pipeline(steps=[("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])

    # Show the configuration being evaluated
    for name, setting in param.items():
        print(f"    {name}: {setting}")

    model.fit(X_transform, y)
    predicted = model.predict(X_test_transform)

    # Test-set metrics; the flattened confusion matrix order is (tn, fp, fn, tp)
    bal_accuracy = balanced_accuracy_score(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predicted).ravel()
    specificity = true_neg / (true_neg + false_pos)
    sensitivity = true_pos / (true_pos + false_neg)

    print('[TEST]')
    print(f'balanced_accuracy: {bal_accuracy:.4f}, accuracy: {accuracy:.4f}, specificity: {specificity:.4f}, sensitivity: {sensitivity:.4f}')
    return (bal_accuracy, accuracy, specificity, sensitivity)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	560
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	41
Rejected: 	519
Iteration: 	9 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	10 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	11 / 100
Confirmed: 	2
Tentative: 	39
Rejected: 	519
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	15 / 100
Confirmed: 	4
Tentative: 	27
Rejected: 	529
Iteration: 	16 / 100
Confirmed: 	6
Tentat

In [14]:
# Create the persisted study for this experiment, or resume it if it already exists
study = optuna.create_study(
    study_name="boruta-dataset-artigo_v2",
    storage="sqlite:///experiments.db",
    direction="maximize",
    load_if_exists=True
)

# Every run of this cell adds up to 100 more trials to the stored study
study.optimize(objective, n_trials=100)
# Dispose of the storage's database engine once optimization is done.
# NOTE(review): this reaches into private Optuna attributes (_storage._backend) and may
# break across Optuna versions.
study._storage._backend.engine.dispose()

[32m[I 2022-06-21 22:40:52,104][0m Using an existing study with name 'boruta-dataset-artigo_v2' instead of creating a new one.[0m
[32m[I 2022-06-21 22:40:55,082][0m Trial 400 finished with value: 0.8491278896112487 and parameters: {'lambda_l1': 1.4427153197852581e-08, 'lambda_l2': 6.0278463855587e-07, 'num_leaves': 233, 'feature_fraction': 0.6745637896537282, 'bagging_fraction': 0.4958665892033316, 'bagging_freq': 1, 'min_child_samples': 14, 'max_depth': 836}. Best is trial 336 with value: 0.8591918981001799.[0m
[32m[I 2022-06-21 22:40:56,803][0m Trial 401 finished with value: 0.8368561999146366 and parameters: {'lambda_l1': 2.8691587816263718e-08, 'lambda_l2': 3.5760413898827236e-07, 'num_leaves': 187, 'feature_fraction': 0.6929259037749645, 'bagging_fraction': 0.6075351177635184, 'bagging_freq': 1, 'min_child_samples': 11, 'max_depth': 745}. Best is trial 336 with value: 0.8591918981001799.[0m
[32m[I 2022-06-21 22:40:59,026][0m Trial 402 finished with value: 0.827074896422

In [15]:
# Best trial of the study according to the CV metric
trial = study.best_trial
# Its score during the hyperparameter search
best_cv_score = trial.values[0]
print(f'Best trial Score from HP Opt: {best_cv_score:.4f}\n')

# Refit that configuration and evaluate it on the test split
detailed_objective(trial)

Best trial Score from HP Opt: 0.8592

    objective: binary
    metric: binary_logloss
    verbosity: -1
    boosting_type: gbdt
    lambda_l1: 2.146567505636963e-08
    lambda_l2: 8.27476901340718e-07
    num_leaves: 220
    feature_fraction: 0.5865055793795527
    bagging_fraction: 0.44814575746694135
    bagging_freq: 1
    min_child_samples: 9
    n_estimators: 774
[TEST]
balanced_accuracy: 0.7625, accuracy: 0.7969, specificity: 0.9000, sensitivity: 0.6250


(0.7625, 0.796875, 0.9, 0.625)

### Resultados

| metrics                     	| normalizado 	| normalizado_v2 	| artigo 	| artigo_v2 	| artigo_v2 sem scaler  |
|-----------------------------	|-------------	|----------------	|--------	|-----------    | --------------------  |
| Best trial score (mean_val) 	| 89.86%      	| 81.03%         	| 84.22% 	| 85.98%    	| 85.92%                |
| test balanced_accuracy      	| 52.71%      	| 63.33%         	| 57.58% 	| 72.08%    	| 76.25%                |
| test accuracy               	| 57.81%      	| 68.75%         	| 65.62% 	| 76.56%    	| 79.69%                |
| test specificity            	| 69.05%      	| 85.00%         	| 83.33% 	| 90.00%    	| 90.00%                |
| test sensitivity            	| 36.36%      	| 41.67%         	| 31.82% 	| 54.17%    	| 62.50%                |

