Author: Paweł Chruszczewski

Objective: As part of the study, I tried to determine whether including the structural probability of default from the Merton model (PD) as an explanatory variable in classification models improves the prediction of company bankruptcy. To conduct the study, I used balance sheet data and quotations of American non-financial companies for the years 1985–2019, taken from the Compustat database. I obtained information on corporate insolvency from the UCLA LoPucki database. On the basis of the obtained results, there are no grounds to claim that including the PD variable obtained from the Merton model significantly improves the predictive ability of the models. For some classifiers, the mean ROC-AUC of the model with the PD variable is only slightly higher than that of the models without the PD variable, and the tests performed showed no statistically significant difference between the ROC-AUCs of the models at the 0.01, 0.05 and 0.1 significance levels. Nevertheless, the conducted study and the literature review suggest that the variable-selection procedure and the number of variables may have the greatest impact on the rejection of the hypothesis. For this reason, it is suggested to repeat the study using a step-wise selection algorithm. Code using this algorithm is also available in this notebook.

Code and libraries: This project requires Python 3 (I used Python 3.9). The following Python libraries are also required:

<li> numpy
<li> pandas
<li> warnings
<li> matplotlib
<li> scikit-learn
<li> xgboost
<li> scipy
<li> seaborn
<li> itertools
<li> math
<li> mlxtend

In [None]:
import numpy as np
import pandas as pd
import warnings

## Plotting libraries
# import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

## Sklearn Libraries
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve, roc_auc_score, precision_score, accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import get_scorer

## XGBoost Librarires
from xgboost import XGBClassifier

# pickle library
import pickle

## Scipy Libraries
from scipy.stats.mstats import winsorize
from scipy.stats import f
from scipy.stats import norm
from scipy.stats import chi2
from scipy.stats import ttest_ind
from scipy.stats import randint
from scipy.stats import yeojohnson

#statistics
from statistics import stdev 

#itertools
from itertools import combinations, permutations

#mlxtend
from mlxtend.evaluate import paired_ttest_5x2cv
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#math
import math

# Define random state
random_state = 2020
np.random.seed(random_state)
warnings.filterwarnings('ignore')

In [None]:
import ipynb.fs.full.DeLong_Test as delong

In [None]:
import ipynb.fs.full.Bootstrap_Test as bootstrap

## Import Dataset & Initial Data Analysis

In [None]:
data = pd.read_csv('Dataset.csv')

In [None]:
data.drop(columns=['Unnamed: 0'], inplace = True)

In [None]:
data.describe().transpose()

In [None]:
data.count().sort_values(ascending=False)

## 1. Data Preprocessing

In [None]:
X = data.loc[:, data.columns != 'y']
y = data.loc[:, data.columns == 'y']

In [None]:
def winsorize_all(predictors):
    """
    Winsorizes every column of the given dataframe: extreme values (including inf)
    are replaced by the values at the 0.01 and 0.99 percentiles.
    Parameters:
        predictors: dataframe with predictors as columns

        return: new dataframe with winsorized columns; the dataframe passed in is not modified
    """
    # Work on a copy: the input may be a slice of another dataframe, and writing
    # into it would silently change (or warn about changing) the caller's data.
    winsorized = predictors.copy()
    for col in winsorized.columns:
        winsorized[col] = winsorize(winsorized[col], limits=0.01)
    return winsorized

In [None]:
X = winsorize_all(X)

In [None]:
X.skew()

In [None]:
# # predictors distribution
# for i, col in enumerate(X.columns):
#     plt.figure(i)
#     sns.countplot(x=col, data=X);

In [None]:
# Perform first split
xtrain, xtest, ytrain, ytest = train_test_split(X, 
                                                y, 
                                                test_size=0.2, 
                                                stratify = y,
                                                random_state=42)

In [None]:
xtrain_pd = xtrain.loc[:,xtrain.columns == 'pd']
xtest_pd = xtest.loc[:,xtest.columns == 'pd']

In [None]:
xtrain_nopd = xtrain.loc[:,xtrain.columns != 'pd']
xtest_nopd = xtest.loc[:,xtest.columns != 'pd']

## 2. Hyperparameter Tuning

### Naive Bayes

In [None]:
var_smoothing = np.logspace(-1,1, num=50)

In [None]:
gnb_grid = {'gaussiannb__var_smoothing': var_smoothing}

In [None]:
gnb = GaussianNB()

### Quadratic Discriminant Analysis

In [None]:
qda_params = [0.1,0.2,0.3,0.4,0.5]

In [None]:
qda_grid = {'quadraticdiscriminantanalysis__reg_param': qda_params}

In [None]:
qda = QuadraticDiscriminantAnalysis()

### Logistic Regression

In [None]:
# the norm used in the penalization
logreg_penalty = ['l1', 'l2', 'elasticnet', None]

In [None]:
# Inverse of regularization strength
logreg_c = [0.1, 1, 10, 100, 1000]

In [None]:
# Algorithm to use in the optimization problem
logreg_solver = ['newton-cg','liblinear', 'saga', 'lbfgs']

In [None]:
logreg_weight = ['balanced', None]

In [None]:
# Search space for the LogisticRegression step of the pipeline
# (keys use the '<step name>__<parameter>' convention of pipeline parameters).
# NOTE(review): penalty and solver are sampled independently, so some drawn
# combinations may not be supported together by scikit-learn - confirm and
# consider restricting the search to valid penalty/solver pairs.
logreg_grid = {'logisticregression__penalty' : logreg_penalty,
               'logisticregression__C' : logreg_c,
               'logisticregression__solver' : logreg_solver,
               'logisticregression__class_weight': logreg_weight}

In [None]:
logreg = LogisticRegression(random_state = random_state)

### Decision Trees

In [None]:
# Criterion to split on
dt_criterion = ['gini', 'entropy']

In [None]:
# The strategy used to choose the split at each node
# dt_splitter = ['best', 'random']

In [None]:
# Maximum number of levels in tree
dt_max_depth = [int(x) for x in np.linspace(1, 20, 5)]

In [None]:
# Add the default as a possible value
dt_max_depth.append(None)

In [None]:
# The minimum number of samples required to split an internal node
# dt_min_samples_split = [int(x) for x in np.linspace(2, 40, 10)]

In [None]:
# dt_min_impurity_decrease = [float(x) for x in np.linspace(0, 0.3, 6)]

In [None]:
# The minimum number of samples required to be at a leaf node
dt_min_samples_leaf = [int(x) for x in np.linspace(1, 5, 5)]

In [None]:
# Number of features to consider at every split
# ('auto' is left out: for classifiers it is an alias of 'sqrt', and newer
# scikit-learn releases no longer accept it)
dt_max_features = ['sqrt', 'log2']

In [None]:
# Weights associated with classes
# ('balanced_subsample' is only meaningful for forests, which resample per tree;
# a single decision tree accepts 'balanced', a dict or None)
dt_class = ['balanced', None]

In [None]:
dt_grid = {'decisiontreeclassifier__criterion': dt_criterion,
#            'decisiontreeclassifier__splitter': dt_splitter,
           'decisiontreeclassifier__max_depth': dt_max_depth,
#            'decisiontreeclassifier__min_samples_split': dt_min_samples_split,
#            'decisiontreeclassifier__min_impurity_decrease': dt_min_impurity_decrease,
           'decisiontreeclassifier__min_samples_leaf':dt_min_samples_leaf,
           'decisiontreeclassifier__max_features':dt_max_features,
           'decisiontreeclassifier__class_weight': dt_class}

In [None]:
dt = DecisionTreeClassifier(random_state = random_state)

### Random Forest

In [None]:
# Number of trees in Random Forest: an evenly spaced core range (100..300)
# followed by a few much smaller and much larger forest sizes
rf_n_estimators = [int(n_trees) for n_trees in np.linspace(100, 300, 3)] + [10, 50, 1000, 1500]

In [None]:
rf_max_depth = [int(x) for x in np.linspace(1, 20, 5)]

In [None]:
rf_max_depth.append(None)

In [None]:
# Number of features to consider at every split
# ('auto' is left out: for classifiers it is an alias of 'sqrt', and newer
# scikit-learn releases no longer accept it)
rf_max_features = ['sqrt', 'log2']

In [None]:
rf_min_samples_leaf = [int(x) for x in np.linspace(1, 5, 5)]

In [None]:
rf_criterion = ['gini', 'entropy']

In [None]:
# rf_min_samples_split = [int(x) for x in np.linspace(2, 40, 20)]

In [None]:
# rf_min_impurity_decrease = [float(x) for x in np.linspace(0, 0.3, 6)]

In [None]:
# Method of selecting samples for training each tree
# rf_bootstrap = [True, False]

In [None]:
rf_class = ['balanced_subsample', 'balanced']

In [None]:
rf_grid = {'randomforestclassifier__n_estimators': rf_n_estimators,
           'randomforestclassifier__max_depth': rf_max_depth,
           'randomforestclassifier__max_features': rf_max_features,
           'randomforestclassifier__criterion': rf_criterion,
#            'randomforestclassifier__min_samples_split': rf_min_samples_split,
#            'randomforestclassifier__min_impurity_decrease': rf_min_impurity_decrease,
           'randomforestclassifier__min_samples_leaf':rf_min_samples_leaf,
#            'randomforestclassifier__bootstrap': rf_bootstrap,
           'randomforestclassifier__class_weight': rf_class
          }

In [None]:
rf = RandomForestClassifier(random_state = random_state)

### AdaBoost

In [None]:
# Maximum number of levels in tree
adab_max_depth = [1,2,5,10,20]

In [None]:
# Number of trees to be used
adab_n_estimators = [20,50,100,200,500]

In [None]:
# Learning rate
adab_eta = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]

In [None]:
adab_algorithm = ['SAMME', 'SAMME.R']

In [None]:
adab_grid = {'adaboostclassifier__base_estimator__max_depth': adab_max_depth,
             'adaboostclassifier__n_estimators': adab_n_estimators,
             'adaboostclassifier__learning_rate': adab_eta,
             'adaboostclassifier__algorithm': adab_algorithm}

In [None]:
adab = AdaBoostClassifier(base_estimator = RandomForestClassifier(random_state=random_state, class_weight = 'balanced'), random_state=random_state)

### XGBoost

In [None]:
# Number of trees to be used
xgb_n_estimators = [20,50,100,200,500]

In [None]:
# Maximum number of levels in tree
xgb_max_depth = [1,2,5,10,20]

In [None]:
# Minimum number of instaces needed in each node
xgb_min_child_weight = [1,2,3,4,5]

In [None]:
# Tree construction algorithm used in XGBoost
xgb_tree_method = ['auto', 'exact', 'approx']

In [None]:
# Learning rate
xgb_eta = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]

In [None]:
# Minimum loss reduction required to make further partition
xgb_gamma = [x for x in np.linspace(0, 0.5, 6)]

In [None]:
# Learning objective used
# xgb_objective = ['binary:logistic']

In [None]:
# xgb_lambda = [10,20,50,100]

In [None]:
# Balancing of positive and negative weights
# The numeric candidate is the class ratio 51002 / 426 computed from the training
# labels in the cells below, i.e. the same value that is passed to XGBClassifier.
# NOTE(review): presumably negatives / positives - confirm against value_counts().
xgb_weight = [119.72300469483568, None]

In [None]:
xgb_colsample_bytree = [x for x in np.linspace(0.1, 1, 5)]

In [None]:
subsample_bytree = [x for x in np.linspace(0.1, 1, 5)]

In [None]:
ytrain.y.value_counts()

In [None]:
51002/426

In [None]:
# Search space for the XGBClassifier step of the pipeline.
# The row-subsampling ratio is exposed by XGBoost as 'subsample'; the previous key
# 'subsample_bytree' is not a known XGBoost parameter, so it was never tuned.
xgb_grid = {'xgbclassifier__n_estimators': xgb_n_estimators,
            'xgbclassifier__max_depth': xgb_max_depth,
            'xgbclassifier__min_child_weight': xgb_min_child_weight,
            'xgbclassifier__tree_method': xgb_tree_method,
            'xgbclassifier__learning_rate': xgb_eta,
            'xgbclassifier__gamma': xgb_gamma,
#             'xgbclassifier__objective': xgb_objective,
#             'xgbclassifier__reg_lambda':xgb_lambda,
            'xgbclassifier__colsample_bytree':xgb_colsample_bytree,
            'xgbclassifier__subsample':subsample_bytree,
            'xgbclassifier__scale_pos_weight': xgb_weight}

In [None]:
xgb =  XGBClassifier(random_state = random_state, objective = 'binary:logistic', scale_pos_weight = 119.72300469483568)

In [None]:
len(xtrain.columns)

### K-nearest Neighbors

In [None]:
knn_leaf_size = [int(x) for x in np.linspace(1, 55, 5)]

In [None]:
knn_weights = ['uniform','distance']

In [None]:
knn_n_neighbors = [int(x) for x in np.linspace(1, 30, 5)]

In [None]:
knn_p= [1, 2]

In [None]:
knn_metric = ['minkowski', 'euclidean', 'manhattan']

In [None]:
knn_grid = {'kneighborsclassifier__leaf_size':knn_leaf_size,
            'kneighborsclassifier__weights':knn_weights,
            'kneighborsclassifier__n_neighbors':knn_n_neighbors,
            'kneighborsclassifier__p': knn_p,
            'kneighborsclassifier__metric': knn_metric}

In [None]:
knn = KNeighborsClassifier()

### Multi-layer Perceptron

In [None]:
# for simplicity I use a single architecture: two hidden layers of 26 neurons each
# (three layers when the output layer is counted)
# NOTE(review): 26 is presumably the number of features in the data set - confirm
mlp_hidden_layer_sizes = [(26,26)]

In [None]:
mlp_activation = ['tanh', 'relu', 'logistic']

In [None]:
mlp_solver = ['lbfgs', 'sgd', 'adam']

In [None]:
mlp_alpha = np.linspace(0.0001,0.1,10)

In [None]:
mlp_eta = ['constant','invscaling','adaptive']

In [None]:
mlp_grid = {'mlpclassifier__hidden_layer_sizes': mlp_hidden_layer_sizes,
            'mlpclassifier__activation': mlp_activation,
            'mlpclassifier__solver': mlp_solver,
            'mlpclassifier__alpha': mlp_alpha,
            'mlpclassifier__learning_rate': mlp_eta
}

In [None]:
mlp = MLPClassifier(random_state = random_state)

### SVC

In [None]:
# Inverse of regularization strength
svc_c = [0.1, 1, 10, 100, 1000]

In [None]:
# kernel selects the type of hyperplane used to separate the data; 
# ‘linear’ will use a linear hyperplane (a line in the case of 2D data). ‘rbf’ and ‘poly’ uses a non linear hyper-plane;
svc_kernel = ['linear', 'rbf', 'sigmoid']

In [None]:
# when kernel set to ‘poly’, the degree of the polynomial used to find the hyperplane to split the data
# svc_degree = [int(x) for x in np.linspace(0, 10, 10)]

In [None]:
# parameter for non linear hyperplanes
# besides numeric values SVC only accepts the presets 'scale' and 'auto';
# None is not a valid gamma and would make every such candidate fail
svc_gamma = [0.1, 1, 10, 100, 1000, 'scale', 'auto']

In [None]:
svc_weight = ['balanced', None]

In [None]:
svc_grid = {'svc__C' : svc_c,
            'svc__kernel':svc_kernel,
#             'svc__degree':svc_degree,
            'svc__gamma':svc_gamma,
            'svc__class_weight':svc_weight}

In [None]:
svc = SVC(random_state = random_state, probability = True)

In [None]:
svc

## 3. Modelling

In [None]:
class Create_classifier(object):
    """
    Tunes, fits and evaluates a list of classifiers on one set of predictors.
    Every classifier is wrapped in a pipeline (Yeo-Johnson power transform, PCA, classifier)
    and tuned with a randomized hyperparameter search scored by ROC AUC.
    """

    def __init__(self, n_splits, base_models, grids):
        
        """
        Parameters:
            n_splits: number of folds in k-fold cross-validation
            base_models: List with set of classifiers
            grids: list with set of parameters grids for classifiers (same order as base_models)
        """
        self.n_splits = n_splits
        self.base_models = base_models
        self.grids = grids

    @staticmethod
    def _model_name(model):
        """Returns the lower-cased name in front of the first '(' of the model's string form."""
        return str(model).split('(')[0].lower()

    def predict(self, x_train, y_train, x_test, y_test, chosen_set = ''):
        """
        The function normalizes predictors, searches hyperparameters space, reduces the predictors using principal component analysis and predicts the results for best set of classifiers
        Parameters:
            x_train: training set; dataframe with predictors as columns
            y_train: training set; dataframe with dependent variable as column
            x_test: test set; dataframe with predictors as columns
            y_test: test set; dataframe with dependent variable as column
            chosen_set: designation of the data set for which the model is created: 'ALL' - all predictors, 'PDE' - all predictors, excluding PD, 'PD' - only PD predictor

            return:
                roc_auc_scores: metric on the basis of which the classifier is assessed, from sklearn.metrics.roc_auc_score
                test_pred: dataframe with predictions of the classifiers as floats of the probability of being class 1
                classifiers: list of [chosen_set, index of the model, best fitted classifier] entries
                
        """
        
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state = random_state)

        model_names = [self._model_name(clf) for clf in self.base_models]

        roc_auc_scores = pd.DataFrame(columns = model_names)
        test_pred = pd.DataFrame(np.zeros((x_test.shape[0], len(model_names))),
                                 columns = pd.MultiIndex.from_product([[chosen_set], model_names]))
        classifiers = []

        for i, (name, clf) in enumerate(zip(model_names, self.base_models)):

            pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
                                    PCA(n_components = 0.99),
                                    clf)

            # random_state makes the sampled hyperparameter candidates reproducible.
            search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = self.grids[i], cv = cv, n_jobs=-1, verbose=True, scoring = 'roc_auc', refit = True, n_iter = 50, random_state = random_state)

            # Labels arrive as a one-column dataframe; estimators expect a 1-D vector.
            search.fit(x_train, np.ravel(y_train))

            # The search fits clones of pipe_lr, so pipe_lr itself is never fitted:
            # the tuned classifier has to be taken from best_estimator_.
            best_clf = search.best_estimator_.steps[-1][1]

            filename = 'model_'+str(chosen_set)+name+'_saved.sav'
            with open(filename, 'wb') as model_file:
                pickle.dump(best_clf, model_file)

            classifiers.append([chosen_set, i, best_clf])

            predict_rdf = search.best_estimator_.predict_proba(x_test)[:,1]
            # Assign through the full column key; writing into test_pred[chosen_set]
            # may only modify a temporary copy.
            test_pred[(chosen_set, name)] = predict_rdf.astype('float64')

            roc_auc_scores.loc[0, name] = roc_auc_score(y_test, predict_rdf)

        return roc_auc_scores, test_pred, classifiers
    
    def joined_scores(self, predict_df):
        """
        The function joins the predictions of the predict function
        Parameters
            predict_df: list with dataframes; each dataframe in a list contains predictions of the classifiers as floats of the probability of being class 1

            return:
                predict_df_all: dataframe with predictions of the classifiers for all three datasets ('ALL', 'PDE', 'PD')
                
        """
        predict_df_all = pd.concat(predict_df, axis = 1)

        return predict_df_all

In [None]:
# Alternative version of Create_classifier, kept commented out: it selects predictors
# with forward sequential feature selection (SFS) instead of PCA.
# The docstrings are commented out as well, so that this cell can be executed as-is.

# class Create_classifier(object):
#     def __init__(self, n_splits, base_models, grids):
#         self.n_splits = n_splits
#         self.base_models = base_models
#         self.grids = grids

#     def predict(self, x_train, y_train, x_test, y_test, chosen_set = ''):

#         """
#         The function normalizes predictors, searches hyperparameters space, chooses best set of predictors, using forward-selection algorithm and predicts the results for best set of classifiers
#         Parameters:
#             x_train: training set; dataframe with predictors as columns
#             y_train: training set; dataframe with dependent variable as column
#             x_test: test set; dataframe with predictors as columns
#             y_test: test set; dataframe with dependent variable as column
#             chosen_set: designation of the data set for which the model is created: 'ALL' - all predictors, 'PDE' - all predictors, excluding PD, 'PD' - only PD predictor
#
#             return:
#                 roc_auc_scores: metric on the basis of which the classifier is assessed, from sklearn.metrics.roc_auc_score
#                 test_pred: dataframe with predictions of the classifiers as floats of the probability of being class 1
#                 classifiers: list of best classifiers' instances fitted to the model
#
#         """

#         cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state = random_state)

#         roc_auc_scores = pd.DataFrame(columns = [str(i) for i in self.base_models])
#         test_pred = pd.DataFrame(np.zeros((x_test.shape[0], len(self.base_models))), columns=[str(i).split('(')[0].lower() for i in self.base_models])
#         test_pred.columns = pd.MultiIndex.from_product([[chosen_set], test_pred.columns])
#         feat_selected = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
#         feat_importance = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
#         estimators = []         

#         for i, clf in enumerate(self.base_models):

#             pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
#                                     SFS(estimator=clf, k_features='best', forward=True, floating=False, scoring='roc_auc',cv=cv),
#                                     clf)

#             search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = self.grids[i], cv = cv, n_jobs=-1, verbose=True, scoring = 'roc_auc', iid = True, refit = True, n_iter = 100)
#             search = search.fit(x_train, y_train)

#             search.fit(x_train, y_train)

#             filename = 'model_'+str(chosen_set)+str(clf)+'_saved.sav'
#             pickle.dump(search.best_estimator_.steps[1][1], open(filename, 'wb'))

#             estimators.append([chosen_set, i, search.best_estimator_.steps[1][1]])

#             predict_rdf = search.best_estimator_.predict_proba(x_test)[:,1]
#             test_pred[chosen_set][str(clf).split('(')[0].lower()] = predict_rdf.astype('float64')

#             roc_auc_scores.loc[0,str(clf)] = roc_auc_score(y_test, predict_rdf)

#             feat_est = search.best_estimator_.steps[1][1].k_feature_idx_

#             for j in feat_est:
#                 feat_selected.iloc[j,i] = 1

#             for j in x_train.columns:
#                 feat_est = dict(zip(x_train.columns, search.best_estimator_.steps[1][1].k_feature_idx_))
#                 feat_selected.loc[str(j), str(clf)] = feat_est[str(j)]

#                 try:
#                     importances = dict(zip(x_train.columns, search.best_estimator_.named_steps[str(clf).split('(')[0].lower()].feature_importances_))
#                     feat_importance.loc[str(j), str(clf)] = importances[str(j)]
#                 except Exception:
#                     pass

#         return roc_auc_scores, test_pred, estimators

#     def joined_scores(self, predict_df):
#         """
#         The function joins the predictions of the predict function
#         Parameters
#             predict_df: list with dataframes; each dataframe in a list contains predictions of the classifiers as floats of the probability of being class 1
#
#             return:
#                 test_pred: dataframe with predictions of the classifiers for all three sets of predictors ('ALL', 'PDE', 'PD')
#
#         """
#             roc_auc_all = pd.concat(self.roc_auc)
#             predict_df_all = pd.concat(predict_df, axis = 1)
#             return predict_df_all

In [None]:
class Scoring(object):
    """
    Statistical comparison of classifier predictions obtained for the three sets of
    predictors ('ALL', 'PDE', 'PD'): DeLong test, bootstrap test and likelihood ratio test.
    """

    def __init__(self, base_models):
        """
        Parameters:
            base_models: list with set of classifiers; their lower-cased names are the
                         column labels expected in the prediction dataframes
        """
        self.base_models = base_models

    @staticmethod
    def _model_name(model):
        """Returns the lower-cased name in front of the first '(' of the model's string form."""
        return str(model).split('(')[0].lower()

    def _paired_tests(self, predict_df_all, labels, test_name, test_fn):
        """
        Runs one paired test for every comparison of interest.
        Parameters:
            predict_df_all: dataframe with predictions of the classifiers for all three sets of predictors ('ALL', 'PDE', 'PD')
            labels: test set; dataframe with dependent variable as column
            test_name: label used as the top column level of the returned dataframes
            test_fn: callable(y_true, predictions_1, predictions_2) returning a single number

            return:
                Test_df_sets: results of comparing 'ALL' vs 'PDE' and 'ALL' vs 'PD' for every classifier
                Test_df_all: results of comparing every pair of classifiers built on the 'ALL' set
        """
        names = [self._model_name(clf) for clf in self.base_models]
        y_true = labels.values.ravel()

        def run(first_pred, second_pred):
            # Some implementations return a one-element array; store a plain float.
            return float(np.squeeze(test_fn(y_true, first_pred, second_pred)))

        Test_df_sets = pd.DataFrame(np.zeros((2, len(names))), index=['ALL/PDE','ALL/PD'],
                                    columns=pd.MultiIndex.from_product([[test_name], names]))

        for name in names:
            all_pred = predict_df_all['ALL'][name]
            # Assign through the full (test, classifier) key; writing into
            # Test_df_sets[test_name] may only modify a temporary copy.
            Test_df_sets.loc['ALL/PDE', (test_name, name)] = run(all_pred, predict_df_all['PDE'][name])
            Test_df_sets.loc['ALL/PD', (test_name, name)] = run(all_pred, predict_df_all['PD'][name])

        Test_df_all = pd.DataFrame(list(combinations(names, 2)), columns = ['1st Algorithm', '2nd Algorithm'])
        Test_df_all['score'] = 0.0  # float column, because the test results are floats
        Test_df_all.columns = pd.MultiIndex.from_product([[test_name], Test_df_all.columns])

        for j in range(Test_df_all.shape[0]):
            first = Test_df_all.loc[j, (test_name, '1st Algorithm')]
            second = Test_df_all.loc[j, (test_name, '2nd Algorithm')]
            Test_df_all.loc[j, (test_name, 'score')] = run(predict_df_all['ALL'][first], predict_df_all['ALL'][second])

        return Test_df_sets, Test_df_all

    def delong_test(self, predict_df_all, labels):
        
        """
        Runs delong.delong_roc_test for the ROC AUCs of pairs of classifiers
        NOTE(review): presumably the returned score is a p-value (or its logarithm) - confirm in DeLong_Test
        Parameters:
            predict_df_all: dataframe with predictions of the classifiers for all three sets of predictors ('ALL', 'PDE', 'PD')
            labels: test set; dataframe with dependent variable as column

            return: 
                Test_df_sets: dataframe with results of paired test comparing ROC AUCs of different sets ('ALL' vs 'PDE' and 'ALL' vs 'PD') for all classifiers
                Test_df_all: dataframe with results of paired test comparing ROC AUCs for all classifiers built based on set of all variables ('ALL')
        """

        return self._paired_tests(predict_df_all, labels, 'DeLong Test', delong.delong_roc_test)
    
    def bootstrap_test(self, predict_df_all, labels):
        
        """
        Runs bootstrap.pvalue (with roc_auc_score as the score function) for pairs of classifiers
        Parameters:
            predict_df_all: dataframe with predictions of the classifiers for all three sets of predictors ('ALL', 'PDE', 'PD')
            labels: test set; dataframe with dependent variable as column

            return: 
                Test_df_sets: dataframe with results of paired test comparing ROC AUCs of different sets ('ALL' vs 'PDE' and 'ALL' vs 'PD') for all classifiers
                Test_df_all: dataframe with results of paired test comparing ROC AUCs for all classifiers built based on set of all variables ('ALL')
        """

        def bootstrap_pvalue(y_true, first_pred, second_pred):
            return bootstrap.pvalue(y_true, first_pred, second_pred, score_fun=roc_auc_score)

        return self._paired_tests(predict_df_all, labels, 'Bootstrap Test', bootstrap_pvalue)
    
    def likelihood_RT(self, predict_df_all, estimators, x_train, y_test):
        
        """
        Computes p-value of Likelihood Ratio Test comparing the Logistic Regression model built on all
        predictors ('ALL') with the Logistic Regression models built on the 'PDE' and 'PD' sets
        Parameters:
            predict_df_all: dataframe with predictions of the classifiers for all three sets of predictors ('ALL', 'PDE', 'PD')
            estimators: list of best classifiers' instances fitted to the model (not used; kept for backward compatibility)
            x_train: training set; dataframe with predictors as columns (its number of columns is used as degrees of freedom)
            y_test: test set; dataframe with dependent variable as column

            return: 
                Test_df_sets: dataframe with p-values of the test for 'ALL' vs 'PDE' and 'ALL' vs 'PD'
        """

        # The prediction columns are labelled with lower-cased classifier names, so the
        # logistic regression column is looked up by name instead of through a global list.
        lr_name = 'logisticregression'
        if lr_name not in [self._model_name(clf) for clf in self.base_models]:
            raise ValueError("likelihood_RT requires a LogisticRegression model in base_models")

        def log_likelihood(chosen_set):
            # log_loss with normalize=False is the summed negative log-likelihood.
            return -log_loss(y_test, predict_df_all[chosen_set][lr_name], normalize=False)

        alt_log_likelihood = log_likelihood('ALL')

        # NOTE(review): the degrees of freedom are the number of columns of x_train for both
        # comparisons, and the likelihoods are evaluated on the test set - confirm this is intended.
        dof = x_train.shape[1]

        Test_df_sets = pd.DataFrame((np.zeros((2, 1))), index=['ALL/PDE','ALL/PD'],
                                    columns=pd.MultiIndex.from_product([['LRT'], [lr_name]]))

        for row, null_set in (('ALL/PDE', 'PDE'), ('ALL/PD', 'PD')):
            G = 2 * (alt_log_likelihood - log_likelihood(null_set))
            Test_df_sets.loc[row, ('LRT', lr_name)] = chi2.sf(G, dof)

        return Test_df_sets

In [None]:
def yeoj_graph(x_train, lbd_list, feature='', filename='yeo-johnson.png', dpi=1200):
    
    """
    Normalization of a selected predictor using the Yeo-Johnson transformation for various λ parameters.
    Parameters:
        x_train: training set; dataframe with predictors as columns
        lbd_list: list of λ parameters for which the transformation is conducted
        feature: string name of predictor
        filename: path of the image file the graph is saved to
        dpi: resolution of the saved image

        return: Graph showing the non-transformed against transformed values of predictor for various λ parameters.
    """

    # The sorted raw values and the colour scale do not depend on λ: compute them once.
    raw_sorted = np.sort(x_train[feature].values.ravel())

    n_lines = len(lbd_list)
    # One grey shade per λ, indexed 1..n_lines (max() keeps the scale valid for an empty list).
    shade_scale = mpl.colors.Normalize(vmin=1, vmax=max(n_lines, 1))
    shades = mpl.cm.ScalarMappable(norm=shade_scale, cmap=mpl.cm.Greys)
    shades.set_array([])

    plt.figure(figsize=(8,6))

    for line_no, lbd in enumerate(lbd_list, start=1):
        transformed_sorted = np.sort(yeojohnson(x_train[feature], lmbda=lbd))
        plt.plot(raw_sorted, transformed_sorted, c=shades.to_rgba(line_no), label='λ = '+str(lbd))

    plt.legend(loc=0)
    plt.ylabel("ψ(λ,x)", fontsize=15)
    plt.xlabel("x", fontsize=15)
    plt.savefig(filename, dpi=dpi)
    
    return plt.show()

In [None]:
def roc_comparison_sets(predict_df_all, y_test, chosen_set = ''):

    """
    ROC Curves comparing the sets of predictors for each classifier
    Parameters:
        predict_df_all: dataframe with predictions of the classifiers; its column labels are
                        (set, classifier) pairs for the datasets ('ALL', 'PDE', 'PD')
        y_test: test set; dataframe with dependent variable as column
        chosen_set: optional designation of a single data set ('ALL' - all predictors,
                    'PDE' - all predictors, excluding PD, 'PD' - only PD predictor);
                    when empty (default) every set in predict_df_all is used

        return: None. One figure is drawn per column, holding the ROC curve of that column's
                classifier for every set encountered so far; each figure is shown and saved
                as '<classifier><last set encountered>.png'.
    """

    # The loop below unpacks every column label into (set, classifier), so the
    # frame has to keep both label levels. Selecting with a list keeps them;
    # a scalar selection would drop the set level and break the unpacking.
    if chosen_set:
        predict_df = predict_df_all[[str(chosen_set)]]
    else:
        predict_df = predict_df_all

    y_true = y_test.values.ravel()
    curves = {}       # (set, classifier) -> (fpr, tpr, auc)
    seen_sets = []    # sets in order of first appearance

    for set_name, clf in predict_df:
        yproba = predict_df[set_name][clf]

        fpr, tpr, _ = roc_curve(y_true, yproba)
        curves[(set_name, clf)] = (fpr, tpr, roc_auc_score(y_true, yproba))
        if set_name not in seen_sets:
            seen_sets.append(set_name)

        plt.figure(figsize=(8, 6))

        for other_set in seen_sets:
            curve = curves.get((other_set, clf))
            if curve is None:
                # This classifier has not been scored on that set (yet).
                continue
            set_fpr, set_tpr, set_auc = curve
            plt.plot(set_fpr, set_tpr,
                     label="{}, AUC={:.3f}".format(str(other_set), set_auc))

        # Chance-level diagonal for reference.
        plt.plot([0,1], [0,1], color='gray', linestyle='--')

        plt.xticks(np.arange(0.0, 1.1, step=0.1))
        plt.xlabel("False Positive Rate", fontsize=15)

        plt.yticks(np.arange(0.0, 1.1, step=0.1))
        plt.ylabel("True Positive Rate", fontsize=15)

        plt.legend(prop={'size':13}, loc='lower right')

        plt.savefig(str(clf)+str(seen_sets[-1])+'.png',  dpi=1200)
        plt.show()
        plt.close()

In [None]:
def graph_ci_alternative(predict_df_all, y_test, base_models=None):

    """
    Confidence Intervals for given set of predictors
    Parameters:
        predict_df_all: dataframe with predictions of the classifiers for all sets of predictors
                        ('ALL', 'PDE', 'PD'); its column labels are (set, classifier) pairs
        y_test: test set; dataframe with dependent variable as column
        base_models: List with set of classifiers, used only for the x tick labels;
                     when None the labels are derived from the classifier column labels

        return: None. One graph per set of predictors showing, for every classifier, the
                bootstrap interval (solid) and the DeLong interval (dashed); each graph is
                shown and saved as 'plot<set>ci.png'.
    """
    y_true = y_test.values.ravel()

    intervals = {}     # (set, classifier) -> (bootstrap result, delong result)
    set_names = []     # sets in order of first appearance
    clf_names = []     # classifiers in order of first appearance

    for set_name, clf in predict_df_all:
        yproba = predict_df_all[set_name][clf]

        # The results need their own names: assigning to `delong` / `bootstrap`
        # would make those names local to this function and the helper lookups
        # on the right-hand side would raise UnboundLocalError.
        delong_ci = delong.calc_auc_ci(y_true, yproba, alpha=0.95)
        bootstrap_ci = bootstrap.score_stat_ci(y_true, yproba, roc_auc_score)

        intervals[(set_name, clf)] = (bootstrap_ci, delong_ci)
        if set_name not in set_names:
            set_names.append(set_name)
        if clf not in clf_names:
            clf_names.append(clf)

    SMALL_SIZE = 10
    MEDIUM_SIZE = 12
    BIGGER_SIZE = 14

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    label_source = clf_names if base_models is None else base_models
    x_ticks = [str(model).split('(')[0].lower() for model in label_source]

    for set_name in set_names:

        plt.figure(figsize=(8,6))

        for n, clf in enumerate(clf_names):
            entry = intervals.get((set_name, clf))
            if entry is None:
                # No predictions for this classifier on this set: leave a gap.
                continue
            boot_ci, delong_ci = entry

            # NOTE(review): elements [0] and [1] of each helper result are used
            # as the two interval bounds - confirm against the return layout of
            # bootstrap.score_stat_ci and delong.calc_auc_ci.
            plt.errorbar(x=n+1,
                         y=(boot_ci[1] + boot_ci[0])/2,
                         yerr=[(boot_ci[1] - boot_ci[0])/2],
                         fmt='ok',
                         capsize = 10)

            # Shifted by 0.1 so both intervals of a classifier sit side by side.
            delong_bar = plt.errorbar(x=n+1.1,
                                      y=(delong_ci[1] + delong_ci[0])/2,
                                      yerr=[(delong_ci[1] - delong_ci[0])/2],
                                      fmt='ok',
                                      capsize = 10)
            delong_bar[-1][0].set_linestyle('--')

        # Ticks start at 1.05 so each label is centred between the two bars
        # drawn at n+1 and n+1.1.
        plt.xticks(np.arange(1.05,len(x_ticks)+0.5,1), x_ticks, rotation=90)
        plt.ylabel("ROC AUC Przedział Ufności", fontsize=15)
        plt.tight_layout()

        plt.savefig('plot'+str(set_name)+'ci.png', dpi=1200)
        plt.show()
        plt.close()

### Prediction

In [None]:
# Predictor sets: 'ALL' - all predictors, 'PDE' - all predictors excluding PD,
# 'PD' - only the PD predictor.
chosen_set = ['ALL', 'PDE', 'PD']
base_models = [gnb, qda, logreg, dt, rf, xgb, adab, knn, svc, mlp]
n_splits = 4
# One parameter grid per base model, in the same order as base_models.
grids = [gnb_grid, qda_grid, logreg_grid, dt_grid, rf_grid, xgb_grid, adab_grid, knn_grid, svc_grid, mlp_grid]
lgb_stack = Create_classifier(n_splits=n_splits, base_models=base_models, grids=grids)

# One prediction run per predictor set. The following cell combines
# test_pred, test_pred2 and test_pred3, so all three have to exist when the
# notebook is run top to bottom.
# NOTE(review): assumes test_pred2 / test_pred3 are meant to be the 'PDE' and
# 'PD' runs (same order as chosen_set) - confirm against joined_scores.
roc_auc_scores, test_pred, classifiers = lgb_stack.predict(xtrain, ytrain, xtest, ytest, chosen_set=chosen_set[0])
test_pred2 = lgb_stack.predict(xtrain, ytrain, xtest, ytest, chosen_set=chosen_set[1])[1]
test_pred3 = lgb_stack.predict(xtrain, ytrain, xtest, ytest, chosen_set=chosen_set[2])[1]

In [None]:
# Collect the three prediction results in one list for joined_scores.
predict_df = [
    test_pred,
    test_pred2,
    test_pred3,
]

In [None]:
# Combine the results from the three datasets into one object.
predict_df_all = lgb_stack.joined_scores(
    predict_df,
)

### Evaluation

In [None]:
# Scoring helper built on the same list of base models used for prediction.
lgb_score = Scoring(
    base_models=base_models,
)

In [None]:
# bootstrap_test is unpacked into three values. The first target used to be
# named df_sets_bootstrap as well, so the first value was overwritten by the
# third and lost. It now gets its own name; df_all_bootstrap and
# df_sets_bootstrap keep exactly the values they were bound to before.
# NOTE(review): confirm what each of the three results holds and rename
# df_first_bootstrap accordingly.
df_first_bootstrap, df_all_bootstrap, df_sets_bootstrap = lgb_score.bootstrap_test(predict_df_all, ytest)

In [None]:
# Run delong_test on the combined predictions and keep both returned objects.
(df_sets_delong, df_all_delong) = lgb_score.delong_test(
    predict_df_all,
    ytest,
)

In [None]:
# Likelihood-ratio test on the combined predictions (the stray trailing ':'
# made this statement a SyntaxError).
df_likelihood = lgb_score.likelihood_RT(predict_df_all, ytest, xtrain)

### Visualization

In [None]:
# Yeo-Johnson graph: one curve per λ value listed below.
lbd_list = [
    -2, -1.5, -1, -0.5,
    0,
    0.5, 1, 1.5, 2,
]
# NOTE(review): feature is left empty here; yeoj_graph looks it up as a column
# of xtrain, so fill in the predictor name to plot.
yeoj_graph(x_train=xtrain, lbd_list=lbd_list, feature='')

In [None]:
# Confidence-interval graphs. graph_ci_alternative takes the list of base
# models as its third argument (used for the x tick labels); it was missing.
graph_ci_alternative(predict_df_all, ytest, base_models)

In [None]:
# ROC curve graphs comparing the predictor sets (chosen_set left at its default).
roc_comparison_sets(
    predict_df_all=predict_df_all,
    y_test=ytest,
)