## Explanation
Two functions are defined:
1. Forward feature selection (FFS), up to 100 features. Returns the fitted selection pipeline, from which the optimal feature subset for each algorithm is read.
2. Hyperparameter optimisation. Runs a grid search on the features selected by FFS to find the optimal hyperparameters (HP).

These are called in the third section, where the data is loaded, the algorithms are run and the output is stored.
Afterwards, PlotCurves should be run.

The algorithms were trained on an online server, which is why several modules have to be installed first and why the code refers to ./datafolder, where the files were stored. For replication, these paths have to be altered.


# Install needed modules

# Define functions
### Forward feature selection up to 100 parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import mlxtend
import xgboost as xgb

def FFSalgorithm(X, y, k_features, modelName, setName, toSave = True):
    """Run forward feature selection (FFS) for one classifier and optionally pickle the result.

    The 'category' and 'number' dtype columns of X are each standardised with a
    StandardScaler inside a ColumnTransformer (categorical columns first, then
    numerical ones); the transformed matrix is fed to an mlxtend
    SequentialFeatureSelector that is scored with 5-fold cross-validated ROC AUC.

    NOTE(review): columns of any other dtype are not listed in the transformer
    and are therefore presumably dropped (ColumnTransformer default remainder).
    The feature indices reported by the selector refer to the transformed
    column order, not to the column order of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary target.
    k_features : int or tuple or str
        Passed unchanged to SequentialFeatureSelector(k_features=...).
    modelName : {'LR', 'RF', 'XGB'}
        Which classifier to wrap in the selector.
    setName : object
        Dataset identifier; only used in the output file name.
    toSave : bool, default True
        When truthy, the fitted pipeline is pickled to
        './/datafolder//FFS_set{setName}_full_{modelName}', falling back to the
        current working directory when that location cannot be written.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline with the steps 'preprocessing' and 'sfs'.

    Raises
    ------
    ValueError
        If modelName is not one of 'LR', 'RF' or 'XGB'.
    """
    # Imported here because this cell's import list does not contain pickle;
    # without it the function only works if a later cell has been run first.
    import pickle

    # Resolve the classifier first so an unsupported name fails immediately
    # instead of surfacing later as an unbound local variable.
    if modelName == 'LR':
        classifier = LogisticRegression(max_iter=5000, penalty = 'none', class_weight = None)
    elif modelName == 'RF':
        classifier = RandomForestClassifier(max_depth = 10, n_estimators=200, random_state=42, min_samples_leaf = 20)
    elif modelName == 'XGB':
        # nthread=1: parallelism is handled by the selector (n_jobs=-1) below.
        classifier = xgb.XGBClassifier(objective="binary:logistic", random_state=42, eta = 0.1, gamma = 0.1, min_child_weight=7,
                                       n_estimators=200, colsample_bytree=0.8, max_depth=5, reg_alpha=1e-05,subsample=0.9,
                                       use_label_encoder=False, verbosity = 0, nthread = 1)
    else:
        raise ValueError("Unknown modelName {!r}; expected 'LR', 'RF' or 'XGB'".format(modelName))

    categorical = list(X.select_dtypes('category').columns)
    numerical = list(X.select_dtypes('number').columns)
    transformer = ColumnTransformer(transformers=[('cat', StandardScaler(), categorical),
                                                  ('num', StandardScaler(), numerical)])

    sfs = SFS(classifier, k_features=k_features, forward=True, floating=False,
              scoring='roc_auc', n_jobs=-1, cv=5, verbose=3)

    pipe = Pipeline([('preprocessing', transformer),
                     ('sfs', sfs)])

    pipe.fit(X, y)

    if toSave:
        filename = 'FFS_set{}_full_{}'.format(setName, modelName)
        try:
            with open('.//datafolder//' + filename, 'wb') as handle:
                pickle.dump(pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)
        except OSError:
            # The data folder is missing or not writable: keep the (expensive)
            # result by writing it to the current working directory instead.
            with open(filename, 'wb') as handle:
                pickle.dump(pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return pipe

### Hyperparameter optimisation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.linear_model import LogisticRegression
import pickle
import xgboost as xgb

def execute_grid(X, y, param_grid, classifier, cv_splits = 5, randseed=0):
    """Fit a cross-validated grid search over a scaling + classifier pipeline.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its 'category' and 'number' dtype columns are each
        standardised with a StandardScaler.
    y : array-like
        Target passed to GridSearchCV.fit.
    param_grid : dict or list of dict
        Grid for GridSearchCV; keys address the pipeline steps
        ('preprocessing', 'classifier').
    classifier : callable
        Estimator class (or factory); it is called without arguments to
        create the estimator placed in the 'classifier' step.
    cv_splits : int, default 5
        Passed to GridSearchCV as cv.
    randseed : int, default 0
        Seed given to numpy's global random generator before anything else.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object, scored with ROC AUC.
    """
    # Seed numpy's global generator up front, as the first action.
    np.random.seed(randseed)

    # Split the columns by dtype; each group gets its own scaler.
    cat_columns = list(X.select_dtypes('category').columns)
    num_columns = list(X.select_dtypes('number').columns)
    column_scaler = ColumnTransformer(
        transformers=[
            ('cat', StandardScaler(), cat_columns),
            ('num', StandardScaler(), num_columns),
        ]
    )

    # Instantiate the estimator with its defaults; the grid sets the rest.
    estimator = classifier()
    model_pipeline = Pipeline(steps=[
        ('preprocessing', column_scaler),
        ('classifier', estimator),
    ])

    search = GridSearchCV(
        estimator=model_pipeline,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv_splits,
        n_jobs=-1,
        verbose=3,
    )
    search.fit(X, y)
    return search

def TuningModel(X,y, modelName, setName):
    """Grid-search the hyperparameters of one model and pickle the fitted search.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (in this notebook: the columns selected by FFS).
    y : array-like
        Binary target.
    modelName : {'LR', 'RF', 'XGB'}
        Selects both the estimator class and its parameter grid.
    setName : object
        Dataset identifier; only used in the output file name.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search returned by execute_grid. It is also pickled to
        './/datafolder//Tuning_set{setName}_{modelName}', falling back to the
        current working directory when that location cannot be written.

    Raises
    ------
    KeyError
        If modelName is not one of 'LR', 'RF' or 'XGB'.
    """
    # Fail fast on an unsupported model, before any grid is built.
    if modelName not in ('LR', 'RF', 'XGB'):
        raise KeyError("Unknown modelName {!r}; expected 'LR', 'RF' or 'XGB'".format(modelName))

    model_grid = {
        'LR': LogisticRegression,
        'RF': RandomForestClassifier,
        'XGB': xgb.XGBClassifier,
    }

    param_grid = {}

    # Logistic regression: unpenalised fits with two solvers, plus an
    # elastic-net grid (l1_ratio 0 = pure L2, 1 = pure L1) which needs 'saga'.
    param_grid['LR'] = [
        {
            'classifier__penalty': ['none'],
            'classifier__solver': ['lbfgs', 'saga'],
            'classifier__class_weight': [None],
            'classifier__max_iter': [5000],
        },
        {
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__penalty': ['elasticnet'],
            'classifier__solver': ['saga'],
            'classifier__l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
            'classifier__class_weight': [None],
            'classifier__max_iter': [5000],
        },
    ]

    # XGBoost: nthread is fixed to 1 because the grid search itself already
    # runs with n_jobs=-1.
    param_grid['XGB'] = [
        {
            'classifier__eta': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 8],
            'classifier__min_child_weight': [5, 7, 9],
            'classifier__gamma': [0, 0.1, 0.2],
            'classifier__subsample': [0.9],
            'classifier__colsample_bytree': [0.8],
            'classifier__n_estimators': [300],
            'classifier__reg_alpha': [1e-5, 0.1],
            'classifier__random_state': [42],
            'classifier__objective': ["binary:logistic"],
            'classifier__use_label_encoder': [False],
            'classifier__nthread': [1],
        }
    ]

    # Random forest: only 'sqrt' is listed for max_features. For classifiers
    # 'auto' meant the same thing, so listing both doubled the search, and
    # 'auto' is no longer accepted by recent scikit-learn releases.
    param_grid['RF'] = [
        {
            'classifier__bootstrap': [True],
            'classifier__max_depth': [3, 5, 7, 10, 20, None],
            'classifier__max_features': ['sqrt'],
            'classifier__min_samples_leaf': [1, 2, 4, 8],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__n_estimators': [200, 600, 1000, 2000],
        }
    ]

    gridsearch = execute_grid(X, y, param_grid[modelName], model_grid[modelName])

    filename = 'Tuning_set{}_{}'.format(setName, modelName)
    try:
        with open('.//datafolder//' + filename, 'wb') as handle:
            pickle.dump(gridsearch, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except OSError:
        # The data folder is missing or not writable: keep the (expensive)
        # result by writing it to the current working directory instead.
        with open(filename, 'wb') as handle:
            pickle.dump(gridsearch, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return gridsearch

# Load data and run FFS and hyperparameter optimisation for different datasets and algorithms

In [None]:
import pickle
import datetime
from sklearn.model_selection import train_test_split
import pandas as pd

# The normal ratio is 1:1 cases:controls. If new_ratio is set to an integer >1, the ratio is 1:new_ratio.
# This may be beneficial for training; patients will be removed accordingly from the test set.
# Not being used for this paper
new_ratio = 1

# Alter the combis for the algorithm and dataset desired.
# Each entry is [dataset number, model name]; the model name must be 'LR', 'RF' or 'XGB'.
combis = [[1,'XGB'],[2,'RF'],[3,'LR']]

for dataset, datamethod in combis:

    #--------------------------------------------------------------------------------------------------------------------#
    # Load the data

    print('time before loading: {}'.format(datetime.datetime.now()))

    # NOTE: unpickling can execute arbitrary code; only load data files from a trusted source.
    with open('.//datafolder//DataMatrix{}.pickle'.format(dataset), 'rb') as f:
        X, y = pickle.load(f)

    # Encode booleans as 1/0.
    X.replace(True,1, inplace = True)
    X.replace(False,0, inplace = True)

    # Stratified 80/20 split with a fixed seed; only the training part is used below.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify = y)
    del X, y

    # Not being used for this paper, ratio training is 1:1
    if new_ratio>1:

        with open('.//datafolder//DataMatrix{}_extra.pickle'.format(dataset), 'rb') as f:
            X, y = pickle.load(f)

        X.replace(True,1, inplace = True)
        X.replace(False,0, inplace = True)

        from random import sample, seed
        # Number of cases in the training set; (new_ratio-1) times as many
        # extra rows are drawn from the additional file, with a fixed seed.
        len_cases = y_train['target'].sum()
        seed(0)
        tosample_list = sample(list(X.index),len_cases*(new_ratio-1))
        X = X.loc[tosample_list]
        y = y.loc[tosample_list]
        X_train = pd.concat([X_train, X])
        y_train = pd.concat([y_train, y])
        del X, y

    #--------------------------------------------------------------------------------------------------------------------#
    # Perform FFS

    print('time before FFS: {}'.format(datetime.datetime.now()))
    # Dataset 1 is searched up to 2 features, all other datasets up to 100.
    k_features = 2 if dataset == 1 else 100
    pipe = FFSalgorithm(X_train, y_train.iloc[:,0], k_features, datamethod, dataset, toSave = True)

    #--------------------------------------------------------------------------------------------------------------------#
    # Select best parameters

    # One entry per subset size tried; pick the size with the highest mean CV score.
    full_dict = pipe['sfs'].get_metric_dict()
    list_avg_value = [full_dict[x]['avg_score'] for x in full_dict]
    max_ind = max(full_dict, key=lambda key: full_dict[key]['avg_score'])
    best_features = [int(x) for x in full_dict[max_ind]['feature_names']]

    # The selector ran on the output of the 'preprocessing' step, so the indices
    # in best_features refer to that matrix: categorical columns first, then the
    # numerical ones, with columns of any other dtype left out. Translate them
    # back to column names instead of using them as positions in X_train, which
    # would pick the wrong columns whenever the two orders differ.
    # (Assumes the selector reports positional names '0', '1', ... for array input.)
    transformed_columns = (list(X_train.select_dtypes('category').columns)
                           + list(X_train.select_dtypes('number').columns))
    best_columns = [transformed_columns[idx] for idx in best_features]

    #--------------------------------------------------------------------------------------------------------------------#
    # Hyperparameter search

    print('time before Tuning: {}'.format(datetime.datetime.now()))
    TuningModel(X_train.loc[:,best_columns], y_train.iloc[:,0], datamethod, dataset)

    print('time after Tuning: {}'.format(datetime.datetime.now()))
