## 0. Import libraries

In [None]:
import random
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib as mpl
import networkx as nx
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from IPython.display import display
from sklearn.compose import make_column_selector as selector, ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from imblearn.over_sampling import RandomOverSampler, BorderlineSMOTE, SMOTE, ADASYN

# import auxiliary functions:
from utils.helpers import plot_label_dist, plot_bn_graph, drop_col, drop_dups, set_coltypes, merge_bins, bin_features, select_edges


import warnings
# suppress every warning category for the rest of the session:
warnings.filterwarnings('ignore')
# hex colour palette handed to the plotting helpers (e.g. plot_label_dist):
colormap = [ '#90c3dd',  '#ffffbf', '#f98e52', '#9e0142', '#4575b4']

2025-05-14 10:53:54.660677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 10:53:54.677284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-14 10:53:54.682181: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-14 10:53:54.695279: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Inspect the data
* load the data,
* display the structure,
* display the statistics for each of the descriptors,
* drop descriptors with low variability (a single value accounts for 95% or more of the observations).

In [None]:
# show every column when a DataFrame is displayed and use the seaborn "darkgrid" style:
pd.set_option('display.max_columns', None)
sns.set_style("darkgrid")

dir = '/home/yulia/Documents/' # folder holding the CSV files - set your directory here (NOTE(review): `dir` shadows the Python builtin)

# load the three comma-separated data sets from `dir`:
data_tve = pd.read_csv(os.path.join(dir, 'laccase-f-tve.csv'), sep=',')
data_mth = pd.read_csv(os.path.join(dir, 'laccase-f-mth.csv'), sep=',')
data_bpu = pd.read_csv(os.path.join(dir, 'laccase-bpu-lac.csv'), sep=',')

# plot the label distribution of the bpu-lac data
# (the second positional argument is forwarded to the project helper; elsewhere in this file a seed is passed in that position):
plot_label_dist(data_bpu, 12, colormap=colormap, title='Distribution of examples by target, bpu-lac')

## 2. Define preprocessing rules for the ordinal and continuous descriptors

For the ordinals, we are essentially choosing between 2 strategies:

* using coarser binning
* using one-hot-encoding.

For the continuous, we are choosing between 

* min-max scaling (with the additional option to log-transform)
* standard scaling (with the additional option to log-transform).


In [None]:
def process_ordinals(ord_cols_train, ord_cols_test, ohe=False):
    '''
    function that encodes the ordinal columns; the encoding is always fitted on the
    training data and then applied to the test data

    args:
    * ord_cols_train: pd.DataFrame of ordinal descriptors containing training examples
    * ord_cols_test: pd.DataFrame of ordinal descriptors containing test examples
    * ohe: if True, one-hot-encoding is used, otherwise - coarser binning (via the project
      helper `merge_bins`) with subsequent label-encoding; defaults to False

    returns:
    * (encoded training frame, encoded test frame); the input frames are not modified

    raises:
    * ValueError if `ohe` is neither True nor False
    '''

    # work on copies so that the caller's frames are never written to:
    ord_cols_train_cp = ord_cols_train.copy()
    ord_cols_test_cp = ord_cols_test.copy()

    if ohe == True:
        # one-hot encode ordinals (categories unseen in training become all-zero rows):
        print('Applying one-hot encoding to ordinal features...')
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        train_encoded = encoder.fit_transform(ord_cols_train_cp)
        ohe_labels = encoder.get_feature_names_out(ord_cols_train.columns.to_list())
        ord_cols_train = pd.DataFrame(train_encoded, index=ord_cols_train.index, columns=ohe_labels)
        ord_cols_test = pd.DataFrame(encoder.transform(ord_cols_test_cp), index=ord_cols_test.index, columns=ohe_labels)

    elif ohe == False:
        print('Applying binning with label-encoding to ordinal features...')
        # bin ordinals from the training set and label-encode:
        ord_cols_train = ord_cols_train_cp.apply(merge_bins)

        le = LabelEncoder()
        invdict = {'var': [], 'bin': [], 'label': []}
        for i, col in enumerate(list(ord_cols_train.columns)):
            le.fit(ord_cols_train.iloc[:, i])
            ord_cols_train.iloc[:, i] = le.transform(ord_cols_train.iloc[:, i])
            labels = ord_cols_train.iloc[:, i].unique()
            inv = le.inverse_transform(labels)
            invdict['var'].append([col] * len(inv))
            invdict['label'].append(labels)
            invdict['bin'].append(inv)

        # long-format lookup table (variable, bin, label) used to encode the test data:
        le_inv = pd.DataFrame(invdict)
        le_inv_exp = le_inv.explode(['var', 'bin', 'label']).reset_index(drop=True)

        for col in list(ord_cols_test_cp.columns):
            lookup_table = le_inv_exp[le_inv_exp['var'] == col]

            # presumably `merge_bins` yields intervals, so this becomes an IntervalIndex - TODO confirm
            interval_index = pd.Index(lookup_table['bin'])
            mapped_bins = interval_index.get_indexer(ord_cols_test_cp[col])  # map values to bins

            # get_indexer returns -1 for values matching no training bin; `.iloc[-1]` below then
            # picks the LAST lookup entry for them. Behaviour is kept, but it is no longer silent.
            n_unmatched = int((mapped_bins == -1).sum())
            if n_unmatched > 0:
                print(f"Warning: {n_unmatched} test value(s) of '{col}' match no training bin "
                      f"and receive the label of the last lookup entry.")

            labs = lookup_table['label'].iloc[mapped_bins].reset_index(drop=True)
            # write into the copy, not into the caller's frame:
            ord_cols_test_cp[col] = labs.to_numpy()

        ord_cols_test = ord_cols_test_cp

    else:
        raise ValueError('Invalid preprocessing method')

    return ord_cols_train, ord_cols_test
    



def process_continuous(cont_cols_train, cont_cols_test, log=False, scaling=None):
    '''
    function that preprocesses continuous columns:
    * optionally log-transforms the columns whose training values are all strictly positive
    * optionally scales the columns (min-max or standardization), using statistics
      computed on the training data only

    args:
    * cont_cols_train: pd.DataFrame of continuous descriptors containing training examples
    * cont_cols_test: pd.DataFrame of continuous descriptors containing test examples
    * log: if True, applies the natural log to the strictly positive training columns;
      non-positive (or missing) test values in those columns are set to 0.0
    * scaling: 'min-max', 'standard' or None (any other value leaves the data unscaled)

    returns:
    * (transformed training frame, transformed test frame); the input frames are not modified
    '''

    # work on copies so that the caller's frames are never written to:
    cont_cols_train = cont_cols_train.copy()
    cont_cols_test = cont_cols_test.copy()

    if log:
        # transform only columns where all training values are strictly positive:
        positive_columns = [col for col in cont_cols_train.columns if (cont_cols_train[col] > 0).all()]
        print('Log-transforming descriptors with non-negative values.\n')

        s = 0
        if positive_columns:
            cont_cols_train[positive_columns] = np.log(cont_cols_train[positive_columns])

            test_subset = cont_cols_test[positive_columns]
            test_is_positive = test_subset > 0
            # count only the log-transformed columns that hold non-positive test values:
            s = int((~test_is_positive.all()).sum())
            # mask non-positive values before taking the log, then set their logs to 0.0:
            cont_cols_test[positive_columns] = np.log(test_subset.where(test_is_positive)).fillna(0.0)

        if s > 0:
            print(f'{s} columns in the test data contain non-positive values. The logs for non-positive values will be set to 0.\nTry turning off the log-option.\n')

    if scaling == 'min-max':
        train_min = cont_cols_train.min()
        train_max_min = cont_cols_train.max() - train_min
        # scale only the columns where no division by 0 occurs:
        sc_cols = train_max_min.index[train_max_min != 0].tolist()
        print('Applying min-max scaling to continuous features...')
        if sc_cols:
            cont_cols_train[sc_cols] = cont_cols_train[sc_cols].subtract(train_min[sc_cols]).div(train_max_min[sc_cols])
            cont_cols_test[sc_cols] = cont_cols_test[sc_cols].subtract(train_min[sc_cols]).div(train_max_min[sc_cols])

    elif scaling == 'standard':
        train_sd = cont_cols_train.std()
        # scale only the columns where no division by 0 occurs:
        sc_cols = train_sd.index[train_sd != 0].tolist()
        print('Applying standard scaling to continuous features...')
        if sc_cols:
            train_mean = cont_cols_train[sc_cols].mean()
            cont_cols_train[sc_cols] = cont_cols_train[sc_cols].subtract(train_mean).div(train_sd[sc_cols])
            cont_cols_test[sc_cols] = cont_cols_test[sc_cols].subtract(train_mean).div(train_sd[sc_cols])

    return cont_cols_train, cont_cols_test


## 3. Define the preprocessing function

In [None]:
def preprocess(data, seed, preprocessing=None, log=False, ohe=False):

    '''
    function that preprocesses the input as follows:
    * removes duplicates and sets the column dtypes (project helpers `drop_dups`, `set_coltypes`)
    * splits the input set into training (80%) and test (20%) sets
    * drops low-variability features, decided on the training set only (project helper `drop_col`)
    * transforms the descriptors according to `preprocessing`
    * drops observations with missing values together with their labels

    args:
    * data: pd.DataFrame holding the descriptors and the target column 'Oxd'
    * seed: random_state of the train/test split
    * preprocessing: one of
        None: only performs the clean-up and the data split
        'scaling': treats all features as continuous and standardizes them
        'mixed': encodes the ordinals (see `ohe`) and standardizes the continuous predictors
    * log: forwarded to `process_continuous` (log-transform of strictly positive columns)
    * ohe: forwarded to `process_ordinals` (one-hot encoding instead of binning)

    returns preprocessed training data, preprocessed test data, training labels and test labels

    raises ValueError for an unknown `preprocessing` option
    '''

    # STEP 1. Remove duplicates and define dtypes:
    data = drop_dups(data)
    data = set_coltypes(data)

    # STEP 2. Split the data into train and test, separate the target and the descriptors:
    train_data, test_data, train_labels, test_labels = train_test_split(
        data.drop('Oxd', axis=1), data['Oxd'], test_size=0.2, random_state=seed)
    plot_label_dist(train_labels, seed)

    # STEP 3. Drop cols with low variability in train and keep the same columns in test:
    train_data = drop_col(train_data)
    test_data = test_data[train_data.columns]

    # STEP 4. Preprocess the variables depending on the preprocessing method:
    if preprocessing == 'scaling':
        processed_train, processed_test = process_continuous(train_data, test_data, log=log, scaling='standard')

    elif preprocessing == 'mixed':
        # separate features into different datatypes
        # NOTE(review): int64 AND float64 columns are treated as ordinal here, so only other dtypes
        # reach process_continuous; the GN variant of this function selects 'int64' only - confirm
        # against `set_coltypes` which of the two is intended.
        ord_feat_train = train_data.select_dtypes(include=['int64', 'float64'])
        ord_feat_test = test_data.select_dtypes(include=['int64', 'float64'])

        # process ordinals
        ord_feat_train_cp, ord_feat_test_cp = process_ordinals(ord_feat_train, ord_feat_test, ohe=ohe)

        cont_feat_train = train_data.select_dtypes(exclude=['int64', 'float64'])
        cont_feat_test = test_data.select_dtypes(exclude=['int64', 'float64'])

        # process continuous
        cont_feat_train_cp, cont_feat_test_cp = process_continuous(cont_feat_train, cont_feat_test, log=log, scaling='standard') # 'min-max'

        processed_train = pd.concat([ord_feat_train_cp, cont_feat_train_cp], axis=1)
        processed_test = pd.concat([ord_feat_test_cp, cont_feat_test_cp], axis=1)

    elif preprocessing is None:
        print('Variables will not be transformed...')
        processed_train, processed_test = train_data, test_data

    else:
        raise ValueError('Invalid preprocessing method')

    # STEP 5. Drop observations with missing values from BOTH sets and keep the labels aligned
    # (positional boolean masks, so the features and labels always lose the same rows):
    train_complete = processed_train.notna().all(axis=1).to_numpy()
    test_complete = processed_test.notna().all(axis=1).to_numpy()

    if not (train_complete.all() and test_complete.all()):
        print('Either the training or test data contain missing values! Dropping obs with NaNs...\n')
        processed_train = processed_train.loc[train_complete]
        train_labels = train_labels.loc[train_complete]
        processed_test = processed_test.loc[test_complete]
        test_labels = test_labels.loc[test_complete]

    return processed_train, processed_test, train_labels, test_labels

## GN data augmentation

This section contains the code modifications for augmenting the training data with samples contaminated by Gaussian noise (GN).

*NOTE:* 
The noise is injected into the training data prior to preprocessing. Formally, for each descriptor $j = 1, \dots, p$ the perturbed feature is defined as $x_{\cdot j} + \xi_j$, where $\xi_j \sim \mathcal{N}(0, (\eta \sigma_j)^2)$, $\sigma_j$ is the sample standard deviation of descriptor $j$ in the training set and $\eta$ represents the noise intensity (here $\eta = 0.2$). To accommodate ordinal descriptors, the noised values are rounded to the nearest integer and clipped to remain within the observed bounds. The target labels remain unchanged.


```python

def process_continuous(cont_cols_train, cont_cols_test, log=False, scaling=None, noise_std=0.2, random_state=0):
    '''
    function that augments the continuous training columns with a Gaussian-noise (GN)
    contaminated copy and then preprocesses them:
    * appends a noised copy of the training data (noise sd = noise_std * column sd), doubling the rows
    * optionally log-transforms the columns whose augmented training values are all strictly positive
    * optionally scales the columns (min-max or standardization), using statistics
      computed on the augmented training data

    args:
    * cont_cols_train: pd.DataFrame of continuous descriptors containing training examples
    * cont_cols_test: pd.DataFrame of continuous descriptors containing test examples
    * log: if True, applies the natural log; non-positive (or missing) test values are set to 0.0
    * scaling: 'min-max', 'standard' or None (any other value leaves the data unscaled)
    * noise_std: noise intensity as a fraction of each column's standard deviation; defaults to 0.2
    * random_state: seed of the local random generator used for the noise; defaults to 0

    returns:
    * (augmented and transformed training frame, transformed test frame); the augmented frame
      keeps the original index twice (original rows first, noised rows second); the input
      frames are not modified
    '''

    # work on a copy so that the caller's test frame is never written to:
    cont_cols_test = cont_cols_test.copy()

    # one local generator for all columns: the noise is independent across descriptors and
    # the global numpy random state is left untouched
    rng = np.random.RandomState(random_state)

    # add noise to original (unstandardized) features:
    cont_cols_train_gn = cont_cols_train.copy()
    n_rows = cont_cols_train.shape[0]
    for col in cont_cols_train.columns:
        std_dev = cont_cols_train[col].std()
        noise = rng.normal(0, noise_std * std_dev, size=n_rows)
        cont_cols_train_gn[col] = cont_cols_train_gn[col] + noise

    # augment the original data with the noised data:
    print('Non-contaminated data: ', cont_cols_train.shape)
    print('Contaminated data: ', cont_cols_train_gn.shape)

    cont_cols_train = pd.concat([cont_cols_train, cont_cols_train_gn])
    print('Augmented data: ', cont_cols_train.shape)
    print('')

    if log:
        # the noise may push values to zero or below, so positivity is checked on the AUGMENTED data:
        positive_columns = [col for col in cont_cols_train.columns if (cont_cols_train[col] > 0).all()]
        print('Log-transforming descriptors with non-negative values.\n')

        s = 0
        if positive_columns:
            cont_cols_train[positive_columns] = np.log(cont_cols_train[positive_columns])

            test_subset = cont_cols_test[positive_columns]
            test_is_positive = test_subset > 0
            # count only the log-transformed columns that hold non-positive test values:
            s = int((~test_is_positive.all()).sum())
            # mask non-positive values before taking the log, then set their logs to 0.0:
            cont_cols_test[positive_columns] = np.log(test_subset.where(test_is_positive)).fillna(0.0)

        if s > 0:
            print(f'{s} columns in the test data contain non-positive values. The logs for non-positive values will be set to 0.\nTry turning off the log-option.\n')

    if scaling == 'min-max':
        train_min = cont_cols_train.min()
        train_max_min = cont_cols_train.max() - train_min
        # scale only the columns where no division by 0 occurs:
        sc_cols = train_max_min.index[train_max_min != 0].tolist()
        print('Applying min-max scaling to continuous features...')
        if sc_cols:
            cont_cols_train[sc_cols] = cont_cols_train[sc_cols].subtract(train_min[sc_cols]).div(train_max_min[sc_cols])
            cont_cols_test[sc_cols] = cont_cols_test[sc_cols].subtract(train_min[sc_cols]).div(train_max_min[sc_cols])

    elif scaling == 'standard':
        train_sd = cont_cols_train.std()
        # scale only the columns where no division by 0 occurs:
        sc_cols = train_sd.index[train_sd != 0].tolist()
        print('Applying standard scaling to continuous features...')
        if sc_cols:
            train_mean = cont_cols_train[sc_cols].mean()
            cont_cols_train[sc_cols] = cont_cols_train[sc_cols].subtract(train_mean).div(train_sd[sc_cols])
            cont_cols_test[sc_cols] = cont_cols_test[sc_cols].subtract(train_mean).div(train_sd[sc_cols])

    return cont_cols_train, cont_cols_test


def process_ordinals(ord_cols_train, ord_cols_test, ohe=False, noise_std=0.2, random_state=0):
    '''
    function that augments the ordinal training columns with a Gaussian-noise (GN)
    contaminated copy and then encodes them; the encoding is fitted on the augmented
    training data and applied to the test data

    args:
    * ord_cols_train: pd.DataFrame of ordinal descriptors containing training examples
    * ord_cols_test: pd.DataFrame of ordinal descriptors containing test examples
    * ohe: if True, one-hot-encoding is used, otherwise - coarser binning (via the project
      helper `merge_bins`) with subsequent label-encoding; defaults to False
    * noise_std: noise intensity as a fraction of each column's standard deviation; defaults to 0.2
    * random_state: seed of the local random generator used for the noise; defaults to 0

    returns:
    * (augmented and encoded training frame, encoded test frame); the augmented frame keeps
      the original index twice (original rows first, noised rows second); the input frames
      are not modified

    raises:
    * ValueError if `ohe` is neither True nor False
    '''

    # one local generator for all columns: the noise is independent across descriptors and
    # the global numpy random state is left untouched
    rng = np.random.RandomState(random_state)

    # augment ordinal features carefully: round to the nearest integer and clip to the observed range
    ord_cols_train_gn = ord_cols_train.copy()
    n_rows = ord_cols_train.shape[0]
    for col in ord_cols_train.columns:
        std_dev = ord_cols_train[col].std()
        noise = rng.normal(0, noise_std * std_dev, size=n_rows)
        noised = np.round(ord_cols_train_gn[col] + noise).clip(ord_cols_train[col].min(), ord_cols_train[col].max())
        # ensure ordinal values remain integers
        ord_cols_train_gn[col] = noised.astype(int)

    # augment the original data with the noised data:
    print('Non-contaminated data: ', ord_cols_train.shape)
    print('Contaminated data: ', ord_cols_train_gn.shape)

    ord_cols_train = pd.concat([ord_cols_train, ord_cols_train_gn])
    print('Augmented data: ', ord_cols_train.shape)
    print('')

    # work on copies so that the caller's frames are never written to:
    ord_cols_train_cp = ord_cols_train.copy()
    ord_cols_test_cp = ord_cols_test.copy()

    if ohe == True:
        # one-hot encode ordinals (categories unseen in training become all-zero rows):
        print('Applying one-hot encoding to ordinal features...')
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        train_encoded = encoder.fit_transform(ord_cols_train_cp)
        ohe_labels = encoder.get_feature_names_out(ord_cols_train.columns.to_list())
        ord_cols_train = pd.DataFrame(train_encoded, index=ord_cols_train.index, columns=ohe_labels)
        ord_cols_test = pd.DataFrame(encoder.transform(ord_cols_test_cp), index=ord_cols_test.index, columns=ohe_labels)

    elif ohe == False:
        print('Applying binning with label-encoding to ordinal features...')
        # bin ordinals from the training set and label-encode:
        ord_cols_train = ord_cols_train_cp.apply(merge_bins)

        le = LabelEncoder()
        invdict = {'var': [], 'bin': [], 'label': []}
        for i, col in enumerate(list(ord_cols_train.columns)):
            le.fit(ord_cols_train.iloc[:, i])
            ord_cols_train.iloc[:, i] = le.transform(ord_cols_train.iloc[:, i])
            labels = ord_cols_train.iloc[:, i].unique()
            inv = le.inverse_transform(labels)
            invdict['var'].append([col] * len(inv))
            invdict['label'].append(labels)
            invdict['bin'].append(inv)

        # long-format lookup table (variable, bin, label) used to encode the test data:
        le_inv = pd.DataFrame(invdict)
        le_inv_exp = le_inv.explode(['var', 'bin', 'label']).reset_index(drop=True)

        for col in list(ord_cols_test_cp.columns):
            lookup_table = le_inv_exp[le_inv_exp['var'] == col]

            # presumably `merge_bins` yields intervals, so this becomes an IntervalIndex - TODO confirm
            interval_index = pd.Index(lookup_table['bin'])
            mapped_bins = interval_index.get_indexer(ord_cols_test_cp[col])  # map values to bins

            # get_indexer returns -1 for values matching no training bin; `.iloc[-1]` below then
            # picks the LAST lookup entry for them. Behaviour is kept, but it is no longer silent.
            n_unmatched = int((mapped_bins == -1).sum())
            if n_unmatched > 0:
                print(f"Warning: {n_unmatched} test value(s) of '{col}' match no training bin "
                      f"and receive the label of the last lookup entry.")

            labs = lookup_table['label'].iloc[mapped_bins].reset_index(drop=True)
            # write into the copy, not into the caller's frame:
            ord_cols_test_cp[col] = labs.to_numpy()

        ord_cols_test = ord_cols_test_cp

    else:
        raise ValueError('Invalid preprocessing method')

    return ord_cols_train, ord_cols_test
    


def preprocess(data, seed, preprocessing='mixed', log=False, ohe=True):

    '''
    function that preprocesses the input for the GN-augmentation experiments as follows:
    * removes duplicates and sets the column dtypes (project helpers `drop_dups`, `set_coltypes`)
    * splits the input set into training (80%) and test (20%) sets
    * drops low-variability features, decided on the training set only (project helper `drop_col`)
    * transforms the descriptors according to `preprocessing`; the 'scaling' and 'mixed'
      options also append a noise-contaminated copy of the training data (done inside
      `process_ordinals` / `process_continuous`)
    * duplicates the training labels whenever the training data were augmented
    * drops observations with missing values together with their labels

    args:
    * data: pd.DataFrame holding the descriptors and the target column 'Oxd'
    * seed: random_state of the train/test split
    * preprocessing: one of
        None: only performs the clean-up and the data split (no augmentation)
        'scaling': treats all features as continuous and standardizes them
        'mixed': encodes the int64 (ordinal) columns (see `ohe`) and standardizes the remaining ones
    * log: forwarded to `process_continuous` (log-transform of strictly positive columns)
    * ohe: forwarded to `process_ordinals` (one-hot encoding instead of binning)

    returns preprocessed training data, preprocessed test data, training labels and test labels

    raises ValueError for an unknown `preprocessing` option
    '''

    # STEP 1. Remove duplicates and define dtypes:
    data = drop_dups(data)
    data = set_coltypes(data)

    # STEP 2. Split the data into train and test, separate the target and the descriptors:
    train_data, test_data, train_labels, test_labels = train_test_split(
        data.drop('Oxd', axis=1), data['Oxd'], test_size=0.20, random_state=seed)

    plot_label_dist(train_labels, seed)

    # STEP 3. Drop cols with low variability in train and keep the same columns in test:
    train_data = drop_col(train_data)
    test_data = test_data[train_data.columns]

    # STEP 4. Preprocess the variables depending on the preprocessing method:
    if preprocessing == 'scaling':
        processed_train, processed_test = process_continuous(train_data, test_data, log=log, scaling='standard')
        augmented = True

    elif preprocessing == 'mixed':
        # separate features into different datatypes
        ord_feat_train = train_data.select_dtypes(include='int64')
        ord_feat_test = test_data.select_dtypes(include='int64')

        # process ordinals
        ord_feat_train_cp, ord_feat_test_cp = process_ordinals(ord_feat_train, ord_feat_test, ohe=ohe)

        cont_feat_train = train_data.select_dtypes(exclude='int64')
        cont_feat_test = test_data.select_dtypes(exclude='int64')

        # process continuous
        cont_feat_train_cp, cont_feat_test_cp = process_continuous(cont_feat_train, cont_feat_test, log=log, scaling='standard') # 'min-max'

        processed_train = pd.concat([ord_feat_train_cp, cont_feat_train_cp], axis=1)
        processed_test = pd.concat([ord_feat_test_cp, cont_feat_test_cp], axis=1)
        augmented = True

    elif preprocessing is None:
        print('Variables will not be transformed...')
        processed_train, processed_test = train_data, test_data
        # no process_* call, hence no noised copy: the labels must NOT be duplicated
        augmented = False

    else:
        raise ValueError('Invalid preprocessing method')

    # STEP 5. Augment labels (as they are) so that they match the doubled training rows:
    if augmented:
        train_labels = pd.concat([train_labels, train_labels])

    # STEP 6. Drop observations with missing values from BOTH sets and keep the labels aligned
    # (positional boolean masks, which also work with the duplicated index of the augmented data):
    train_complete = processed_train.notna().all(axis=1).to_numpy()
    test_complete = processed_test.notna().all(axis=1).to_numpy()

    if not (train_complete.all() and test_complete.all()):
        print('Either the training or test data contain missing values! Dropping obs with NaNs...\n')
        processed_train = processed_train.loc[train_complete]
        train_labels = train_labels.loc[train_complete]
        processed_test = processed_test.loc[test_complete]
        test_labels = test_labels.loc[test_complete]

    return processed_train, processed_test, train_labels, test_labels

```

In [None]:
# test the preprocessing function on the bpu-lac data (split seed 123, 'mixed' preprocessing, one-hot encoded ordinals, no log-transform):
X_train_bpu, X_test_bpu, y_train_bpu, y_test_bpu  = preprocess(data_bpu, 123, preprocessing='mixed', log=False, ohe=True)

INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


The following columns have been dropped:
nTB,nAB,nH,nCsp3,nR09,D/Dtr09,Psi_e_t,P_VSA_m_1,P_VSA_p_1,nCt,nRCOOR,nRNR2,nRNO,nC=N-N<,nHDon
Non-contaminated data:  (136, 21)
Contaminated data:  (136, 21)
Augmented data:  (272, 21)

Applying one-hot encoding to ordinal features...
Non-contaminated data:  (136, 52)
Contaminated data:  (136, 52)
Augmented data:  (272, 52)

Applying standard scaling to continuous features...


In [52]:
# sanity check: print the shapes of the feature frames and label vectors returned by preprocess
print(X_train_bpu.shape)
print(X_test_bpu.shape)
print(y_train_bpu.shape)
print(y_test_bpu.shape)

(272, 222)
(34, 222)
(272,)
(34,)


## 4. Create the class for training the classical classifiers

The class defines the hyperparameter tuning procedure for each of the classifiers (LogReg, SVC, RFC and GradBoostC): the best hyperparameters are selected from predefined grids using 5-fold CV for a specific seed (data split). Optionally, the training data can be balanced with one of the upsampling methods from the `imblearn` library, such as RandomOverSampler, BorderlineSMOTE, SMOTE or ADASYN.

In [53]:
class RunClf:
    '''
    defines the inner loop of the classifier's hyperparameter tuning by
    selecting the best hyperparams using 5-fold CV for a certain seed
    * * * * * * * * * * * * *
    inputs: features,
            labels,
            method ('LogReg', 'SVC', 'RFC' or 'GradBoostC'),
            balanced (a sampler object exposing `fit_resample`, e.g. an imblearn over-sampler, or None),
            scoring (scoring passed to GridSearchCV; defaults to 'f1')
    outputs: `fit_tuned` returns the best estimator found for a data split defined by a single seed,
             together with the (possibly resampled) features and labels it was tuned on;
             the best estimator is also stored in `self.fitted`
    raises: KeyError if `method` is not one of the configured classifiers
    all models are created with random_state=0 and the CV splitter is seeded with `cvseed`,
    so the data split (defined "outside") is the only randomness that varies between runs

    NOTE(review): when `balanced` is given, the data are resampled BEFORE the cross-validation,
    so resampled points can end up in the validation folds; consider resampling inside the CV
    loop if unbiased CV scores are needed.
    '''

    def __init__(self, features, labels, method, balanced=None, scoring='f1'):
        self.features = features
        self.labels = labels
        self.method = method
        self.balanced = balanced
        self.scoring = scoring
        self.fitted = None  # set by fit_tuned
        self.model_configs = {
                    'LogReg': {
                        'params': {
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': [0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 1],
                            'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 15]
                                },
                        'model': LogisticRegression(max_iter=1000, random_state=0, solver='saga'),
                                },

                    'SVC':{
                        'params': {
                            'C': [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 20],
                            'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
                            },
                        # probability=True enables predict_proba; seeded like the other models
                        'model': SVC(probability=True, random_state=0),
                        },

                    'RFC':{
                        'params': {
                          'n_estimators': [25, 50, 75, 100, 200, 300, 500, 1000],
                          'max_depth': [None, 5, 7, 10, 20],
                          'min_samples_split': [5, 10, 15, 25],
                          'min_samples_leaf': [5, 10, 20]
                          },
                        'model': RandomForestClassifier(random_state=0),
                    },

                    'GradBoostC':{
                        'params': {
                          'n_estimators': [25, 50, 75, 100, 200, 300, 500, 1000],
                          'min_samples_split': [5, 10, 15, 25],
                          'min_samples_leaf': [5, 10, 20]
                          },
                        'model': GradientBoostingClassifier(random_state=0, criterion='friedman_mse'),
                    },
            }

        # fail with a readable message (still a KeyError) for an unknown classifier name:
        if method not in self.model_configs:
            raise KeyError(f"Unknown method '{method}'; expected one of {list(self.model_configs)}")

        self.params = self.model_configs[method]['params']
        self.model = self.model_configs[method]['model']

    def resample_data(self):
        '''balances the stored data with the `balanced` sampler; returns resampled features and labels'''
        resampled_features, resampled_labels = self.balanced.fit_resample(self.features, self.labels)
        return resampled_features, resampled_labels

    def fit_tuned(self, cvseed=42):
        '''
        runs the grid search (stratified, shuffled 5-fold CV seeded with `cvseed`) on the
        optionally resampled data
        returns: best estimator, features used for tuning, labels used for tuning
        '''
        if self.balanced:
            resampled_features, resampled_labels = self.resample_data()
        else:
            resampled_features, resampled_labels = self.features, self.labels

        search = GridSearchCV(self.model, self.params, scoring=self.scoring,
                                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=cvseed),
                                error_score=np.nan)
        search.fit(resampled_features, resampled_labels)
        fitted = search.best_estimator_
        # keep the tuned model on the instance as well (the attribute was never filled before):
        self.fitted = fitted

        return fitted, resampled_features, resampled_labels

## 5. Define the model evaluation function


Creates a classification report (Accuracy, Precision, Recall, F1 and AUROC) and plots the ROC curve together with the confusion matrix.

In [54]:
# BorderlineSMOTE over-sampler, used as the default `balanced` argument of evaluate_model:
# resamples the minority class only ('borderline-1' variant, m_neighbors=10), seeded for reproducibility
blsmote = BorderlineSMOTE(sampling_strategy='minority',
                          kind='borderline-1',
                          m_neighbors=10,
                          random_state=0)



def plot_roc(fpr, tpr, cm, method, seed):
    '''
    plots the ROC curve (left) and the confusion-matrix heatmap (right) side by side and
    saves the figure to plots/roc_auc_{method}_{seed}.png

    args:
    * fpr, tpr: false / true positive rates, e.g. as returned by sklearn's roc_curve
    * cm: confusion matrix of integer counts (annotated with the 'd' format)
    * method: classifier name, used in the file name
    * seed: seed of the data split, used in the file name

    returns None; the figure is closed after saving
    '''

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

    # calculate ROC AUC
    roc_auc = auc(fpr, tpr)

    # plot the ROC curve, shade the area under it and add the random-classifier diagonal
    ax1.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:0.2f})', color='steelblue')
    ax1.fill_between(fpr, 0, tpr, alpha=0.2, color='steelblue')  # auc shading
    ax1.plot([0, 1], [0, 1], linestyle='--', color='orange')  # random classifier

    # adjust ticks, labels, and title
    ax1.tick_params(axis='both', labelsize=8)
    ax1.set_xlabel('False Positive Rate', fontsize=9)
    ax1.set_ylabel('True Positive Rate', fontsize=9)
    ax1.set_title(f'ROC Curve, AUC = {roc_auc:0.2f}', fontsize=11)
    ax1.legend(loc="lower right")

    # plot the confusion matrix heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='crest', annot_kws={"fontsize":8}, ax=ax2)
    cbar = ax2.collections[0].colorbar
    cbar.ax.tick_params(labelsize=8)
    ax2.tick_params(axis='both', labelsize=8)
    ax2.set_xlabel('Predicted', fontsize=9)
    ax2.set_ylabel('True', fontsize=9)
    ax2.set_title('Confusion Matrix', fontsize=11)

    # save the figure; create the output folder first so that savefig cannot fail on a fresh checkout
    os.makedirs('plots', exist_ok=True)
    plt.savefig(f'plots/roc_auc_{method}_{seed}.png', bbox_inches='tight')
    plt.close(fig)

In [None]:
def evaluate_model(data, method, seeds, balanced=blsmote, threshold=0.5):
    """Tune and score one classifier type on several train/test splits.

    For every seed in `seeds` the data is passed through `preprocess`, a
    `method` model is tuned with `RunClf(...).fit_tuned()`, and Accuracy,
    Precision, Recall, F1 and ROC AUC are computed on the resampled training
    data and on the test data. The test ROC curve and confusion matrix are
    handed to `plot_roc`.

    Parameters
    ----------
    data : input data forwarded to `preprocess`.
    method : classifier name forwarded to `RunClf`; also stored in the
        'Method' column of the result.
    seeds : iterable of seeds, one per data split; used as the result index.
    balanced : sampler forwarded to `RunClf` (defaults to `blsmote`).
    threshold : probabilities strictly above this value are labelled 1.

    Returns
    -------
    (res, models, test_pts) : a DataFrame of train/test metrics indexed by
        seed, the fitted models, and the `[X_test, y_test]` pairs, the two
        lists being in seed order.
    """
    np.random.seed(0)
    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1', 'AUC')
    scores = []  # per seed: (train metrics, train cm, test metrics, test cm)
    models = []
    test_pts = []

    def to_labels(probs):
        # hard 0/1 labels from the predicted probabilities
        return [int(i > threshold) for i in probs]

    def summarize(labels, preds, probs):
        # metrics ordered as in `metric_names`, plus the ROC curve behind the AUC
        accuracy = accuracy_score(labels, preds)
        precision = precision_score(labels, preds)
        recall = recall_score(labels, preds)
        f1 = f1_score(labels, preds)
        fpr, tpr, _ = roc_curve(np.array(labels), np.array(probs))
        return (accuracy, precision, recall, f1, auc(fpr, tpr)), fpr, tpr

    for s in seeds:
        print(f'* * * {method} for seed={s} * * * \n')
        X_train, X_test, y_train, y_test = preprocess(data, s, preprocessing='mixed', log=False, ohe=True)

        print(f'Data pre-processed, fitting a {method} model...')

        tuner = RunClf(X_train, y_train, method, balanced=balanced)
        fitted, resampled_features, resampled_labels = tuner.fit_tuned()
        print(f'Best model: {fitted} \n')

        # second column of predict_proba, then thresholded into labels
        probs_train = fitted.predict_proba(np.array(resampled_features))[:, 1]
        probs_test = fitted.predict_proba(np.array(X_test))[:, 1]
        preds_train = to_labels(probs_train)
        preds_test = to_labels(probs_test)

        print(f'Predicted test probabilities: {probs_test} \n')
        print(f'Predicted test labels: {preds_test} \n')

        train_metrics, _, _ = summarize(resampled_labels, preds_train, probs_train)
        # the ROC curve kept for plotting is the test one
        test_metrics, fpr, tpr = summarize(y_test, preds_test, probs_test)
        acc_test, prec_test, rec_test, f1_test, roc_auc_test = test_metrics

        print(f'Accuracy: {acc_test}, Precision: {prec_test}, Recall: {rec_test}, F1: {f1_test}, AUC: {roc_auc_test} \n')

        cm_train = confusion_matrix(resampled_labels, preds_train, labels=fitted.classes_)
        cm_test = confusion_matrix(y_test, preds_test, labels=fitted.classes_)
        # plot ROC curve and confusion matrix:
        plot_roc(fpr, tpr, cm_test, method, s)

        scores.append((train_metrics, cm_train, test_metrics, cm_test))
        models.append(fitted)
        test_pts.append([X_test, y_test])

    # one column per metric and subset, train columns first, 'Method' last
    columns = {}
    for subset, slot in (('train', 0), ('test', 2)):
        for position, name in enumerate(metric_names):
            columns[f'{name} ({subset})'] = [entry[slot][position] for entry in scores]
    columns['Method'] = method

    res = pd.DataFrame(columns, index=seeds)
    return res, models, test_pts
    

## 6. Train the classifiers

This will train for all 10 different data splits (as specified by the parameter `seeds`) and all classifiers (as specified in the `methods` list).

Seeds used in the experiments:
* **f-tve:** `[54321, 4321, 1234, 1, 123456, 98765, 56789, 5, 567890, 9876]` 
* **f-mth:** `[4321, 321, 123, 2, 12345, 9876, 5678, 6, 67890, 876]`
* **bpu-lac:** `[321, 21, 12, 3, 1234, 987, 567, 7, 7890, 76]`

In [None]:
# f-mth data is used for illustration:
# one evaluate_model run (over all ten seeds) per classifier in `methods`
methods = ['LogReg', 'SVC', 'RFC', 'GradBoostC']
stats_mth = [
    evaluate_model(data_mth, name, seeds=[4321, 321, 123, 2, 12345, 9876, 5678, 6, 67890, 876])
    for name in methods
]

Each element of `stats_mth` is the `(metrics, models, test points)` tuple returned by `evaluate_model` for one method; the individual components are retrieved with list comprehensions.

In [None]:
# evaluate_model returns (metrics, models, test points); split them per method
metrics_mth = [metrics for metrics, _, _ in stats_mth]  # metrics
clfs_mth = [fitted_models for _, fitted_models, _ in stats_mth]  # classifiers - sklearn objects
test_points_mth = [points for _, _, points in stats_mth]  # test data

## Feature Importance (based on the RFC features) 

Access RFC feature importance scores.

The test data is indexed as `test_points_mth[method][seed][part]`: the first index selects the method (2 for `RFC`, following the order of the `methods` list), the second selects the seed (0 for `4321`, the first entry of the `seeds` list), and the last selects the design matrix (0) or the labels (1). For example, `test_points_mth[2][0][0]` is the design matrix and `test_points_mth[2][0][1]` holds the corresponding labels.


In [None]:
# RFC for the first seed (index 2 in `methods`, index 0 in the seeds list):
rfc = clfs_mth[2][0]
X_test_mth = test_points_mth[2][0][0] # test data corresponding to the RFC classifier and the first seed
# predicted labels for the test data:
rfc_pred = rfc.predict(X_test_mth.to_numpy())

# bar plot of the feature scores:
plt.figure(figsize=(8, 8))
# The importances belong to the fitted classifier, not to its predictions
# (`rfc_pred` is the output of `predict`, so indexing it yields a label).
# NOTE(review): assumes `rfc` is the fitted forest itself; if the tuned model
# turns out to be a Pipeline, read the attribute from its final step - confirm.
feature_scores = pd.Series(rfc.feature_importances_, index=X_test_mth.columns).sort_values(ascending=False)
fscores = feature_scores.nlargest(n=len(feature_scores[feature_scores>0.01]), keep='first') # select all features with scores>0.01
sns.barplot(x=fscores, y=fscores.index, hue=fscores.index, palette='RdYlBu', alpha=0.95, edgecolor='black', linewidth=0.25)

plt.xlabel('Feature Importance Score')
plt.ylabel('Features')

### Plot the Bayesian Network and the retrieved Markov Blanket
Example for the bpu-lac data and seed=21

In [None]:
rndseed = 21

dbpu = drop_dups(data_bpu)
dbpu = set_coltypes(dbpu)

# split the data into train and test, separate the target and the descriptors:
train_data, test_data, train_labels, test_labels = train_test_split(dbpu.drop('Oxd', axis=1), dbpu['Oxd'], test_size=0.2, random_state=rndseed)
plot_label_dist(train_labels, rndseed)


# drop cols with low variability in train, discretize the rest of the columns:
train_data = drop_col(train_data)
test_data = test_data[train_data.columns]  # keep the same columns in test as in train
train_binned = bin_features(train_data)
# structure learning (the target is added so it takes part in the search):
train_binned['Oxd'] = train_labels
hc = HillClimbSearch(train_binned)
best_model = hc.estimate(scoring_method=BicScore(train_binned), max_iter=200)

# instantiate and fit the Bayesian Network:
model = BayesianNetwork(best_model.edges())
model.fit(train_binned, estimator=MaximumLikelihoodEstimator)

plot_bn_graph(model.edges(), rndseed, reduced=False)

# select relevant edges using the Markov blanket:
relevant_edges = select_edges(model)
plot_bn_graph(relevant_edges, rndseed, reduced=True, colormap=colormap)

# Unique nodes of the selected edges, without the target 'Oxd'.
# dict.fromkeys de-duplicates while keeping first-seen order, so the column
# order below does not depend on set iteration order (string hashes are
# randomised per interpreter session), and no error is raised when 'Oxd'
# is not among the nodes.
edges = list(dict.fromkeys(node for edge in relevant_edges for node in edge if node != 'Oxd'))
train_data, test_data = train_data[edges], test_data[edges]