<a href="https://colab.research.google.com/github/Stefano-t/bioinf-lab/blob/main/prediction_ae_hg38.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download the cached results
!git clone https://github.com/LucaCappelletti94/bioinformatics_practice.git
!mv bioinformatics_practice/Notebooks/active_enhancers_performance .
!rm -rf bioinformatics_practice

Cloning into 'bioinformatics_practice'...
remote: Enumerating objects: 5677, done.[K
remote: Counting objects: 100% (223/223), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 5677 (delta 114), reused 221 (delta 112), pack-reused 5454[K
Receiving objects: 100% (5677/5677), 435.20 MiB | 30.79 MiB/s, done.
Resolving deltas: 100% (274/274), done.
Checking out files: 100% (5308/5308), done.


In [2]:
print("installing dependency...")
!wget "https://raw.githubusercontent.com/LucaCappelletti94/bioinformatics_practice/master/requirements.txt" &> /dev/null
%pip install -r requirements.txt &> /dev/null
#print("downloading epigenomic_dataset...")
#%pip install epigenomic_dataset &> /dev/null && echo "epigenomic_dataset installed!"
print("downloading humanize 3.6.0...")
%pip install -Iv humanize==3.6.0 &> /dev/null && echo "humanize 3.6.0 installed!"
print("downloading extra_keras_metrics...")
%pip install extra_keras_metrics &> /dev/null && echo "extra_keras_metrics installed!"

!rm requirements.txt
print("finish!")

installing dependency...
downloading humanize 3.6.0...
humanize 3.6.0 installed!
downloading extra_keras_metrics...
extra_keras_metrics installed!
finish!


In [3]:
%tensorflow_version 2.x
import tensorflow as tf
from epigenomic_dataset import load_epigenomes
from sklearn.impute import KNNImputer
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm.auto import tqdm
from tqdm.keras import TqdmCallback

from sklearn.tree import DecisionTreeClassifier
from cache_decorator import Cache

from typing import Dict, List
from sanitize_ml_labels import sanitize_ml_labels
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score


In [4]:
# load the dataset

X, y = load_epigenomes(
    cell_line = "K562",
    dataset = "fantom",
    region = "enhancers",
    window_size = 256,
    root = "datasets"
)

In [5]:
# the strand in the enhancer is not relevant
# the columns have values which is equal to the average times the protein 
# apperers in the given window_size 
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,chrom,SMAD5,NCOA2,ZNF23,NR2F6,CEBPZ,SNIP1,ZBTB7A,ZNF133,NRF1,TAF1,STAG1,ZFX,STAT5A,E2F1,H2AFZ,TAL1,NFRKB,HNRNPUL1,SKIL,RBM34,ZBTB11,RBFOX2,ZNF175,GATA2,XRCC3,NFIC,ATF1,ETV6,ZNF444,MEF2D,ZNF700,SMARCA4,NFXL1,HMBOX1,IKZF1,TBX18,IRF2,H3K27ac,ZKSCAN8,PATZ1,...,ZEB2,KLF10,EP300,HDAC1,CEBPB,H3K4me1,ZNF319,SMARCE1,CBFA2T3,ZNF134,RUNX1,RNF2,TEAD1,MYC,ZNF148,ZNF766,HES1,MCM5,ZNF589,TARDBP,KLF6,ZNF583,ZNF696,PTRF,RFX5,UBTF,TEAD2,SMARCA5,HLTF,ZNF384,SIN3A,SMARCC2,ZFP91,E2F4,NR3C1,SAFB2,E4F1,NCOR1,NCOA1,KLF1
chrom,chromStart,chromEnd,strand,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
chr10,100006381,100006637,.,0.90,0.90,1.37,0.67,0.58,1.19,0.79,0.86,1.22,0.00,0.92,0.81,0.82,1.79,2.88,1.49,0.31,0.75,1.05,1.67,5.19,0.78,1.76,0.76,0.51,1.32,0.38,1.57,1.82,0.39,0.75,3.75,0.82,3.52,3.39,1.31,0.66,3.44,0.66,0.99,...,2.58,0.86,4.60,1.63,1.62,3.01,0.52,5.60,3.88,0.65,0.52,1.03,1.04,2.91,0.66,1.26,1.33,0.34,1.21,1.16,1.74,0.69,0.70,0.81,0.51,2.13,0.55,1.67,1.43,1.39,0.41,3.41,1.07,1.15,1.19,0.99,2.74,2.38,0.82,0.69
chr10,100008146,100008402,.,0.73,1.06,0.74,0.42,0.61,0.81,0.95,0.59,0.65,1.46,1.25,0.89,1.40,0.56,0.94,2.28,0.65,0.71,0.96,0.46,1.23,0.74,1.80,1.91,0.85,1.79,0.85,0.93,0.40,0.72,0.45,1.40,0.47,3.73,1.81,0.63,0.80,1.69,0.82,0.98,...,1.62,1.21,4.86,1.33,2.81,8.04,0.90,2.38,9.35,0.85,0.68,1.00,0.90,1.34,1.22,1.02,1.67,0.73,1.07,0.88,0.59,0.52,1.03,1.06,1.51,2.88,1.31,0.72,0.84,1.38,0.94,1.20,0.77,1.54,0.88,2.21,1.11,3.11,0.49,1.05
chr10,100014418,100014674,.,0.77,0.33,0.53,0.58,0.82,1.98,0.15,0.99,0.85,0.51,1.23,0.45,0.50,0.84,0.87,0.99,1.06,1.57,0.58,0.58,0.82,0.00,0.93,1.67,1.30,0.35,1.21,0.83,0.72,1.10,0.78,2.18,0.55,1.59,1.06,0.35,1.29,1.31,1.27,0.85,...,1.17,0.46,0.85,1.39,0.92,1.90,1.05,0.77,1.08,0.86,0.98,0.57,0.74,0.92,0.42,1.11,1.32,1.17,0.80,0.87,1.14,0.80,0.28,0.53,0.55,1.46,1.05,1.23,1.56,1.26,0.72,0.95,0.39,0.00,0.42,0.45,1.56,0.61,0.97,0.96
chr10,100020216,100020472,.,0.87,0.80,0.83,0.63,0.82,0.94,0.56,0.81,0.74,2.38,0.77,0.33,0.32,0.78,0.95,0.69,0.95,1.61,0.77,0.32,0.45,0.05,0.82,1.30,0.88,0.83,0.73,0.45,0.64,0.28,0.26,1.11,0.95,0.67,0.70,0.36,0.57,0.66,0.26,0.83,...,1.28,1.29,0.55,0.87,0.38,1.01,0.62,0.87,0.34,0.78,0.80,0.86,1.26,0.48,0.26,0.53,1.06,0.99,0.32,0.55,0.41,0.65,0.74,1.30,1.30,0.80,0.77,1.03,0.90,1.15,0.74,0.73,0.55,0.15,1.02,0.32,1.06,0.57,0.03,1.07
chr10,100043528,100043784,.,0.64,0.53,0.03,0.60,0.99,0.22,0.28,0.55,0.69,2.04,1.28,1.36,0.44,0.30,0.33,0.30,1.06,0.74,0.70,0.00,0.65,0.15,1.29,1.02,0.66,0.24,0.41,0.94,0.65,0.74,0.77,0.79,0.73,0.64,1.00,0.47,0.22,0.00,1.08,0.40,...,1.03,0.26,0.24,0.47,0.68,0.26,0.39,1.09,0.56,0.29,0.67,1.13,0.33,1.26,0.56,0.94,1.12,1.20,0.79,0.46,0.49,0.84,0.71,0.44,1.30,0.24,0.74,1.44,1.07,0.67,0.30,0.69,1.32,0.00,0.70,0.51,0.38,1.26,1.73,0.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY,7520247,7520503,.,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
chrY,7724272,7724528,.,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.09,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.17,0.00,0.00,0.00
chrY,7770029,7770285,.,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
chrY,7796295,7796551,.,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [6]:
y

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,K562
chrom,chromStart,chromEnd,strand,Unnamed: 4_level_1
chr10,100006381,100006637,.,0
chr10,100008146,100008402,.,0
chr10,100014418,100014674,.,0
chr10,100020216,100020472,.,0
chr10,100043528,100043784,.,0
...,...,...,...,...
chrY,7520247,7520503,.,0
chrY,7724272,7724528,.,0
chrY,7770029,7770285,.,0
chrY,7796295,7796551,.,0


count the NaNs

In [7]:
print(f"The total number of nan is: {X.isna().sum().sum()}")
print(f"Max nan in cols: {X.isna().sum().max()}")

The total number of nan is: 102
Max nan in cols: 93


In [8]:
imputer = KNNImputer() # default to 5 neighbours
# fit_transform creates an nparray, so it's necessary to convert back to DataFrame
X = pd.DataFrame(
    imputer.fit_transform(X),
    index = X.index,
    columns = X.columns
)

scaling the values

`RobustScaler` scales values in range of 10-90 percentiles.

In [9]:
scaler = RobustScaler()
scaler.fit_transform(X)

array([[ 0.31578947,  0.43902439,  1.03278689, ...,  2.14666667,
         0.08333333, -0.32142857],
       [ 0.01754386,  0.82926829,  0.        , ...,  3.12      ,
        -0.46666667,  0.32142857],
       [ 0.0877193 , -0.95121951, -0.3442623 , ..., -0.21333333,
         0.33333333,  0.16071429],
       ...,
       [-1.26315789, -1.75609756, -1.21311475, ..., -1.02666667,
        -1.28333333, -1.55357143],
       [-1.26315789, -1.75609756, -1.21311475, ..., -1.02666667,
        -1.28333333, -1.55357143],
       [-1.26315789, -1.75609756, -1.21311475, ..., -1.02666667,
        -1.28333333, -1.55357143]])

In [10]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from multiprocessing import cpu_count

@Cache(
    cache_path=[
        "active_enhancers_performance/{function_name}/kept_features_{_hash}.json",
        "active_enhancers_performance/{function_name}/discarded_features_{_hash}.json"
    ],
    args_to_ignore=[
        "X_train", "y_train"
    ]
)
def execute_boruta_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    holdout_number: int,
    max_iter: int = 100
):
    """Returns tuple with list of kept features and list of discared features.
    
    Parameters
    --------------------------
    X_train: pd.DataFrame,
        The data reserved for the input of the training of the Boruta model.
    y_train: np.ndarray,
        The data reserved for the output of the training of the Boruta model.
    holdout_number: int,
        The current holdout number.
    max_iter: int = 100,
        Number of iterations to run Boruta for.
    """
    # Create the Boruta model
    boruta_selector = BorutaPy(
        # Defining the model that Boruta should use.
        RandomForestClassifier(n_jobs=cpu_count(), class_weight='balanced_subsample', max_depth=5),
        # We leave the number of estimators to be decided by Boruta
        n_estimators='auto',
        verbose=False,
        alpha=0.05, # p_value
        # In practice one would run at least 100-200 times,
        # until all tentative features are exausted.
        max_iter=max_iter, 
        random_state=42,
    )
    # Fit the Boruta model
    boruta_selector.fit(X_train.values, y_train)
    
    # Get the kept features and discarded features
    kept_features = list(X_train.columns[boruta_selector.support_])
    discarded_features = list(X_train.columns[~boruta_selector.support_])
    
    # Filter out the unused featured.
    return kept_features, discarded_features

In [11]:
# method to evaluate the models

def evaluate_model_prediction(
    y_true: np.ndarray,
    y_pred: np.ndarray
) -> Dict[str, float]:
  """Return the evaluation of the given predictions.

  Parameters
  -----------------------------
  y_true: np.ndarray,
    the ground truth labels.
  y_pred: np.ndarray,
    the predicted labels.
  
  Raises
  -----------------------------
  ValueError,
    if the two given array do no have the same shape.

  Returns
  -----------------------------
  Dictionary with the performance metrics.
  """
  y_true = y_true.flatten()
  y_pred = y_pred.flatten()
  if y_true.shape != y_pred.shape:
    raise ValueError(
        "The two arrays do not have the same shape: {} != {}".format(
            y.true.shape, y_pred.shape
        )
    )

  float_metrics = average_precision_score, roc_auc_score
  int_metrics = (accuracy_score, ) # tuple of one elem
  return {
      ** { # `**` unwrap the dict as a linear sequence of key-value pairs
          sanitize_ml_labels(metric.__name__): metric(
              y_true,
              y_pred
          )
          for metric in float_metrics
      },
      ** {
          sanitize_ml_labels(metric.__name__): metric(
              y_true, 
              np.round(y_pred).astype(int)
          )
          for metric in int_metrics
      }
  }

In [12]:
def all_model_evalutation(
    y_train_true: np.ndarray,
    y_train_pred: np.ndarray,
    y_test_true: np.ndarray,
    y_test_pred: np.ndarray,
    model_name: str,
    holdout_number: int,
    use_feature_selection: bool
) -> List[Dict]:

  info = {
      "model_name": model_name,
      "holdout_number": holdout_number,
      "use_feature_selection": use_feature_selection
  }

  return [
      {
        ** evaluate_model_prediction(y_train_true, y_train_pred),
        "run_type": "train",
        ** info
      },
      {
        ** evaluate_model_prediction(y_test_true, y_test_pred),
        "run_type": "test",
        ** info  
      },
  ]

In [13]:
@Cache(
    cache_path="active_enhancers_performance/{function_name}/{_hash}.json",
    args_to_ignore=[
      "X_train", "X_test", "y_train", "y_test"
    ]
)
def train_decision_tree(
    X_train: np.ndarray, 
    X_test: np.ndarray, 
    y_train: np.ndarray,
    y_test: np.ndarray,
    holdout_number: int,
    use_feature_selection: bool
) -> Dict[str, float]:
  tree = DecisionTreeClassifier(
      max_depth = 10,
      min_samples_leaf = 50
  )
  tree.fit(X_train,y_train)
  y_train_pred = tree.predict(X_train)
  y_test_pred = tree.predict(X_test)
  return all_model_evalutation(
      y_train, 
      y_train_pred,
      y_test,
      y_test_pred,
      "Decision tree",
      holdout_number,
      use_feature_selection
  )

In [14]:
from sklearn.ensemble import RandomForestClassifier
from multiprocess import cpu_count

@Cache(
    cache_path="active_enhancers_performance/{function_name}/{_hash}.json",
    args_to_ignore=[
      "X_train", "X_test", "y_train", "y_test"
    ]
)
def train_random_forest(
    X_train: np.ndarray, 
    X_test: np.ndarray, 
    y_train: np.ndarray,
    y_test: np.ndarray,
    holdout_number: int,
    use_feature_selection: bool
) -> Dict[str, float]:
  forest = RandomForestClassifier(
      n_estimators = 600,
      class_weight = "balanced_subsample",
      max_depth = 5,
      min_samples_leaf = 100,
      n_jobs = cpu_count(),
      verbose=True
  )
  forest.fit(X_train,y_train)
  y_train_pred = forest.predict(X_train)
  y_test_pred = forest.predict(X_test)
  return all_model_evalutation(
      y_train, 
      y_train_pred,
      y_test,
      y_test_pred,
      "Random Forest",
      holdout_number,
      use_feature_selection
  )

In [15]:
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from extra_keras_metrics import get_minimal_multiclass_metrics

@Cache(
    cache_path="active_enhancers_performance/{function_name}/{_hash}.json",
    args_to_ignore=[
        "X_train", "X_test", "y_train", "y_test"
    ]
)
def train_perceptron(
    X_train: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    holdout_number: int,
    use_feature_selection: bool
) -> Dict[str, float]:
    """Return performance of a Perceptron.
    
    Parameters
    ----------------------
    X_train: np.ndarray,
        Data reserved for the input during training of the model.
    X_test: np.ndarray,
        Data reserved for the input during  test of the model.
    y_train: np.ndarray,
        Data reserved for the output during  training of the model.
    y_test: np.ndarray,
        Data reserved for the output during  test of the model.
    holdout_number: int,
        Number of the holdout.
    use_feature_selection: bool,
        Whether the model is trained using features that have
        been selected with Boruta or not.
        
    Returns
    ----------------------
    Dictionary with the model perfomance.
    """
    perceptron = Sequential()
    perceptron.add(Dense(
        1,
        activation="sigmoid",
        input_shape=( (X_train.shape[1], ) )
    ))

    perceptron.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=get_minimal_multiclass_metrics()
    )
    perceptron.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=1000,
        batch_size=1024,
        verbose=False,
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                patience=10
            ),
            TqdmCallback(verbose=1)
        ]
    )
    y_train_pred = perceptron.predict(X_train)
    y_test_pred = perceptron.predict(X_test)
    return all_model_evalutation(
        y_train,
        y_train_pred,
        y_test,
        y_test_pred,
        "Perceptron",
        holdout_number,
        use_feature_selection
    )

In [16]:
@Cache(
    cache_path="active_enhancers_performance/{function_name}/{_hash}.json",
    args_to_ignore=[
        "X_train", "X_test", "y_train", "y_test"
    ]
)
def train_ffnn(
    X_train: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_test: np.ndarray,
    holdout_number: int,
    use_feature_selection: bool
) -> Dict[str, float]:
    """Return performance of a Feed Forward Neural Network.
    
    Parameters
    ----------------------
    X_train: np.ndarray,
        Data reserved for the input during training of the model.
    X_test: np.ndarray,
        Data reserved for the input during  test of the model.
    y_train: np.ndarray,
        Data reserved for the output during  training of the model.
    y_test: np.ndarray,
        Data reserved for the output during  test of the model.
    holdout_number: int,
        Number of the holdout.
    use_feature_selection: bool,
        Whether the model is trained using features that have
        been selected with Boruta or not.
        
    Returns
    ----------------------
    Dictionary with the model perfomance.
    """
    ffnn = Sequential([
        InputLayer( (X_train.shape[1], ) ),
        Dense(128, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    ffnn.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=get_minimal_multiclass_metrics()
    )
    ffnn.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=1000,
        batch_size=1024,
        verbose=False,
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                patience=10
            ),
            TqdmCallback(verbose=1)
        ]
    )
    y_train_pred = ffnn.predict(X_train)
    y_test_pred = ffnn.predict(X_test)
    return all_model_evalutation(
        y_train,
        y_train_pred,
        y_test,
        y_test_pred,
        "FFNN",
        holdout_number,
        use_feature_selection
    )

In [17]:
number_of_splits = 10

holdouts_generator = StratifiedShuffleSplit(
  n_splits = number_of_splits,
  test_size = 0.2, # 20% of the dataset as test set
)

In [20]:
all_performance = []

for holdout_number, (train_indices, test_indices) in tqdm(
    # returns a generator over indices of train/test set
    enumerate(holdouts_generator.split(X, y)),
    total=number_of_splits,
    desc="Computing holdouts"
):
  for use_feature_selection in tqdm(
      (True, False),
      desc="Running feature selection",
      leave=False
  ):

    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

    # do it once for all the models
    if use_feature_selection:
      kept_features, discarded_features = execute_boruta_feature_selection(
          X_train,
          y_train.values.ravel(),
          holdout_number
      )
      # We filter the DataFrames columns using the features that
      # Boruta has decided we should keep
      X_train = X_train[kept_features]
      X_test = X_test[kept_features]
    
    for train_model in tqdm(
        (train_decision_tree, train_perceptron, train_ffnn, train_random_forest),
        desc="Training model",
        leave=False
    ):
      performance = train_model(
          X_train.values,
          X_test.values,
          y_train.values,
          y_test.values,
          holdout_number,
          use_feature_selection
      )
      all_performance += performance

all_performance = pd.DataFrame(all_performance)

HBox(children=(FloatProgress(value=0.0, description='Computing holdouts', max=10.0, style=ProgressStyle(descri…

HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.7s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   58.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.6s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.6s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   58.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.7s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   58.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.9s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.7s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   44.0s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.6s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.9s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.6s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.8s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.8s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   44.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.9s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.6s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   43.8s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   58.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.9s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   44.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.8s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Running feature selection', max=2.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    4.6s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   19.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   44.1s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:   59.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.5s finished


HBox(children=(FloatProgress(value=0.0, description='Training model', max=4.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   26.3s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   59.8s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:  1.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s





[Parallel(n_jobs=2)]: Done 600 out of 600 | elapsed:    0.4s finished


In [None]:
# !rm -rf active_enhancers/train_perceptron/

In [21]:
all_performance

Unnamed: 0,AUPRC,AUROC,Accuracy,run_type,model_name,holdout_number,use_feature_selection
0,0.178086,0.577171,0.919096,train,Decision tree,0,True
1,0.130156,0.552953,0.910010,test,Decision tree,0,True
2,0.313158,0.768667,0.915264,train,Perceptron,0,True
3,0.308579,0.762128,0.915225,test,Perceptron,0,True
4,0.444145,0.824829,0.921763,train,FFNN,0,True
...,...,...,...,...,...,...,...
155,0.302698,0.760867,0.914672,test,Perceptron,9,False
156,0.475350,0.834612,0.921387,train,FFNN,9,False
157,0.285336,0.757052,0.914672,test,FFNN,9,False
158,0.179244,0.729710,0.757091,train,Random Forest,9,False


In [22]:
# !zip -r /content/cache_layer.zip /content/active_enhancers_performance
# from google.colab import files
# files.download("/content/cache_layer.zip")

  adding: content/active_enhancers_performance/ (stored 0%)
  adding: content/active_enhancers_performance/train_random_forest/ (stored 0%)
  adding: content/active_enhancers_performance/train_random_forest/f5d842d49e782d6538ca2dc8b745b5f8520064f3c3e3a72eab797075c4e1592e.json (deflated 48%)
  adding: content/active_enhancers_performance/train_random_forest/a454ca49d241d3a48ba4435623c4ebaa32c0807736995942436dfa4af6582ecd.json (deflated 47%)
  adding: content/active_enhancers_performance/train_random_forest/f4c8ac41ee6aaca37cb585d466f430291407820577cc9e9bd5b15ca51ad93b8a.json.metadata (deflated 63%)
  adding: content/active_enhancers_performance/train_random_forest/35cf8e82bd981966aacede5dab9753a8bb85f724ad5ffdd534b14c5c59274b35.json (deflated 48%)
  adding: content/active_enhancers_performance/train_random_forest/00b2079fb4fdbc03f066560600f2cf21be738809614e8a5688f434db0f74bc61.json.metadata (deflated 56%)
  adding: content/active_enhancers_performance/train_random_forest/1ac9e454be6a68c

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Slightly adapting the dataframe in order to visualiza it better
all_performance["use_feature_selection"] = [
    "Feature Selection" if use_selection else "No feature selection"
    for use_selection in all_performance["use_feature_selection"]
]
all_performance = all_performance.drop(columns=["holdout_number"])

In [29]:
# TODO: change barplot method because this doesn't work

from barplots import barplots

barplots(
    all_performance,
    groupby=["model_name", "use_feature_selection", "run_type"],
    orientation="horizontal",
    height=8
)


HBox(children=(FloatProgress(value=0.0, description='Rendering barplots', layout=Layout(flex='2'), max=3.0, st…

Exception in thread Thread-321:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 470, in _handle_results
    task = get()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 251, in recv
    return _ForkingPickler.loads(buf.getbuffer())
  File "/usr/local/lib/python3.7/dist-packages/matplotlib/figure.py", line 2038, in __setstate__
    mgr = plt._backend_mod.new_figure_manager_given_figure(num, self)
AttributeError: module 'ipykernel.pylab.backend_inline' has no attribute 'new_figure_manager_given_figure'

Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python

AssertionError: ignored

  File "/usr/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
KeyboardInterrupt


# wilcoxon test

use it when you need to evaluate which model
outperform another model, and also to understand if it is important to perform feature selection or not.

In [None]:
# from scipy.stats import wilcoxon
# 
# # Here we will be doing a statistical test.
# models = df[
#     (df.run_type == "test")
# ]
# 
# ffnn_scores = models[models.model=="Perceptron"]
# mlp_scores = models[models.model=="DecisionTreeClassifier"]
# 
# alpha = 0.01
# 
# for metric in ffnn_scores.columns[-4:]:
#     print(metric)
#     a,  b = ffnn_scores[metric], mlp_scores[metric]
#     stats, p_value = wilcoxon(a, b)
#     if p_value > alpha:
#         print(p_value, "The two models performance are statistically identical.")
#     else:
#         print(p_value, "The two models performance are different")
#         if a.mean() > b.mean():
#             print("The first model is better")
#         else:
#             print("The second model is better")