Author: Paweł Chruszczewsju

Objective: The primary goal of this code is to develop a machine learning model that can classify the labels properly and address dataset imbalance. The dataset has three classes (1, 2, and 3). There are three problems in the challenge. In each problem, there is some specific task that needs to be done. In the following subsections, I describe three techniques I used to overcome the data imbalance problem.

Codes and libraries: This project requires Python  3. I have Used python 3.9. The following Python libraries are also required:

<li> numpy
<li> pandas
<li> matplotlib
<li> scikit-learn
<li> xgboost
<li> scipy
<li> seaborn
<li> itertools
<li> math
<li> mlxtend

In [635]:
import numpy as np
import pandas as pd
import warnings

## Plotting libraries
# import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

## Sklearn Libraries
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFECV
from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve, roc_auc_score, precision_score, accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import get_scorer

## XGBoost Librarires
from xgboost import XGBClassifier

## Scipy Libraries
from scipy.stats.mstats import winsorize
from scipy import stats
from scipy.stats import norm
from scipy.stats import chi2

#itertools
from itertools import combinations, permutations

#mlxtend
from mlxtend.evaluate import paired_ttest_5x2cv

#math
import math

# Define random state
random_state = 2020
np.random.seed(random_state)
warnings.filterwarnings('ignore')

In [395]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = random_state)

pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
                        RFECV(estimator = logreg, step = 1, cv=cv, scoring = 'roc_auc'),
                        logreg)

search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = logreg_grid, cv = cv, n_jobs=-1, verbose=True, scoring = 'roc_auc')

In [399]:
search.fit(xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.6min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2020, shuffle=True),
                   estimator=Pipeline(steps=[('powertransformer',
                                              PowerTransformer()),
                                             ('rfecv',
                                              RFECV(cv=StratifiedKFold(n_splits=5, random_state=2020, shuffle=True),
                                                    estimator=LogisticRegression(random_state=2020),
                                                    scoring='roc_auc')),
                                             ('logisticregression',
                                              LogisticRegression(random_state=2020...
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'logisticregression__class_w

In [427]:
predict = search.predict_proba(xtest)[:,1]

In [428]:
roc_auc_score(ytest,predict)

0.9305790057411274

## Import Dataset & Initial Data Analysis

In [2]:
data = pd.read_csv('Fundamentals_test4.csv')

In [3]:
data.drop(columns=['Unnamed: 0','gdp_change'], inplace = True)

In [4]:
data.shape

(64399, 24)

In [5]:
data.head()

Unnamed: 0,pd,y,atch,empch,salech,roech,ptbch,dlcpdlttdebit,nwcdta,redat,...,dtdat,actdlct,quickratio,bvdmv,nidseq,actdnat,ebitdxint,redsale,nidsale,ebitdsale
0,2.585663e-20,0.0,0.275937,0.189045,0.132754,0.016827,1.190518,2.508597,0.356337,0.162688,...,0.286958,2.008176,0.85573,0.405124,0.120632,1.475566,0.141434,0.13007,0.046393,0.091455
1,0.0002021916,0.0,0.18561,0.016854,0.202329,-0.006262,-0.63472,1.678391,0.372817,0.183627,...,0.205406,2.078004,0.86275,0.545357,0.11437,1.257911,0.121963,0.144769,0.051514,0.096485
2,0.001698752,0.0,0.212075,0.099448,0.165826,0.026716,0.143538,2.079429,0.371033,0.205122,...,0.254955,2.029756,0.720055,0.505766,0.141086,1.384712,0.157226,0.168131,0.061078,0.100498
3,5.37383e-15,0.0,0.250723,0.276382,0.16891,0.003535,0.332061,2.15125,0.361805,0.215033,...,0.269305,2.006557,0.778922,0.433039,0.144621,1.498123,0.175031,0.188591,0.061064,0.109792
4,5.429146000000001e-23,0.0,0.090154,-0.043307,0.09478,-0.009272,0.745122,2.265693,0.47599,0.24052,...,0.273216,3.040201,1.077016,0.327398,0.135348,1.453859,0.213208,0.210052,0.057668,0.105313


In [6]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pd,64399.0,0.060438,0.176708,1.34543e-307,1.493709e-24,1.618681e-09,0.00257,1.0
y,64399.0,0.008277,0.090599,0.0,0.0,0.0,0.0,1.0
atch,64399.0,inf,,-0.9995402,-0.04050683,0.05274254,0.184456,inf
empch,64399.0,inf,,-1.0,-0.04444444,0.02155172,0.126582,inf
salech,64399.0,inf,,-6.487889,-0.03418465,0.07043707,0.215762,inf
roech,64399.0,,,-inf,-0.09432236,-0.004729832,0.057598,inf
ptbch,64399.0,,,-inf,-0.5686304,-0.002560469,0.50181,inf
dlcpdlttdebit,64399.0,inf,,-12243.81,0.0,1.303295,4.207701,inf
nwcdta,64399.0,-0.415334,102.483575,-25968.52,0.02709664,0.1732806,0.364668,1.0
redat,64399.0,-3.792834,142.177284,-26097.6,-0.3597986,0.1084332,0.331912,140.581479


In [7]:
data.count().sort_values(ascending=False)

ebitdsale        64399
nidsale          64399
y                64399
atch             64399
empch            64399
salech           64399
roech            64399
ptbch            64399
dlcpdlttdebit    64399
nwcdta           64399
redat            64399
ebitdat          64399
mvaluedtd        64399
saledat          64399
nidat            64399
dtdat            64399
actdlct          64399
quickratio       64399
bvdmv            64399
nidseq           64399
actdnat          64399
ebitdxint        64399
redsale          64399
pd               64399
dtype: int64

## Data Preprocessing

In [8]:
X = data.loc[:, data.columns != 'y']
y = data.loc[:, data.columns == 'y']

In [9]:
# changing extreme values(inf) to the 0.01 percentile and 0.99 percentile
def winsorize_all(predictors):
    for col in predictors.columns: 
         predictors[col] = winsorize(predictors[col], limits=0.01)
    return predictors

In [10]:
X = winsorize_all(X)

In [11]:
X.skew()

pd               3.452901
atch             3.684233
empch            3.167673
salech           8.485622
roech            0.739635
ptbch           -0.193292
dlcpdlttdebit    0.758185
nwcdta          -4.467631
redat           -5.782641
ebitdat         -4.703092
mvaluedtd        4.949149
saledat          1.550898
nidat           -5.048107
dtdat            3.238351
actdlct          3.254847
quickratio       3.619614
bvdmv            1.067228
nidseq          -0.544845
actdnat          1.136374
ebitdxint        0.595351
redsale         -8.369578
nidsale         -8.130412
ebitdsale       -8.331175
dtype: float64

In [None]:
# predictors distribution
for i, col in enumerate(X.columns):
    plt.figure(i)
    sns.countplot(x=col, data=X)

In [12]:
# Perform first split
xtrain, xtest, ytrain, ytest = train_test_split(X, 
                                                y, 
                                                test_size=0.3, 
                                                stratify = y,
                                                random_state=42)

## 1. To handle data imbalance issue, I have used the following three techniques :
### A. Create ensemble class

In [None]:
# Random Forest All

In [13]:
# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]
rf_n_estimators.append(1500)
rf_n_estimators.append(2000)

In [14]:
# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]

In [15]:
# Add the default as a possible value
rf_max_depth.append(None)

In [16]:
# Number of features to consider at every split
rf_max_features = ['auto', 'sqrt', 'log2']

In [17]:
rf_min_samples_leaf = [int(x) for x in np.linspace(1, 55, 11)]

In [18]:
# Criterion to split on
rf_criterion = ['gini', 'entropy']

In [19]:
# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 40, 20)]

In [20]:
# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [float(x) for x in np.linspace(0, 0.3, 6)]

In [21]:
# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

In [22]:
# Weights associated with classes
rf_class = ['balanced_subsample', 'balanced', None]

In [23]:
rf_grid = {'randomforestclassifier__n_estimators': rf_n_estimators,
           'randomforestclassifier__max_depth': rf_max_depth,
           'randomforestclassifier__max_features': rf_max_features,
           'randomforestclassifier__criterion': rf_criterion,
           'randomforestclassifier__min_samples_split': rf_min_samples_split,
           'randomforestclassifier__min_impurity_decrease': rf_min_impurity_decrease,
           'randomforestclassifier__min_samples_leaf':rf_min_samples_leaf,
           'randomforestclassifier__bootstrap': rf_bootstrap,
           'randomforestclassifier__class_weight': rf_class
          }

In [25]:
rdf = RandomForestClassifier(random_state = random_state)

In [148]:
# base_models = [rdf]
# n_splits = 5
# grids = [rf_grid]
# lgb_stack = Create_classifier(n_splits = n_splits, base_models = base_models, grids = grids)        
# roc_auc_scores, feat_selected, feat_importance = lgb_stack.predict(xtrain, ytrain, xtest, ytest)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 81.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 108.3min finished


In [27]:
# the norm used in the penalization
logreg_penalty = ['l1', 'l2', 'elasticnet', None]

In [28]:
# Inverse of regularization strength
logreg_c = np.logspace(-4, 4, 20)

In [29]:
# Algorithm to use in the optimization problem
logreg_solver = ['newton-cg','liblinear', 'saga', 'lbfgs']

In [30]:
logreg_weight = ['balanced', None]

In [31]:
logreg_grid = {'logisticregression__penalty' : logreg_penalty,
               'logisticregression__C' : logreg_c,
               'logisticregression__solver' : logreg_solver,
               'logisticregression__class_weight': logreg_weight}

In [32]:
logreg = LogisticRegression(random_state = random_state)

In [409]:
# Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

In [410]:
# Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

In [411]:
# Minimum number of instaces needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

In [412]:
# Tree construction algorithm used in XGBoost
xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']

In [413]:
# Learning rate
xgb_eta = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]

In [414]:
xgb_eta

[0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]

In [415]:
# Minimum loss reduction required to make further partition
xgb_gamma = [x for x in np.linspace(0, 0.5, 6)]

In [416]:
xgb_gamma

[0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5]

In [417]:
# Learning objective used
xgb_objective = ['binary:logistic']

In [418]:
xgb_lambda = [10,20,50,100]

In [419]:
# Balancing of positive and negative weights
xgb_weight = [119.85522788203754, None]

In [420]:
xgb_grid = {'xgbclassifier__n_estimators': xgb_n_estimators,
            'xgbclassifier__max_depth': xgb_max_depth,
            'xgbclassifier__min_child_weight': xgb_min_child_weight,
            'xgbclassifier__tree_method': xgb_tree_method,
            'xgbclassifier__learning_rate': xgb_eta,
            'xgbclassifier__gamma': xgb_gamma,
            'xgbclassifier__objective': xgb_objective,
            'xgbclassifier__reg_lambda':xgb_lambda,
            'xgbclassifier__scale_pos_weight': xgb_weight}

In [421]:
xgb =  XGBClassifier(random_state = random_state)

In [429]:
chosen_set = ['ALL', 'PDE', 'PD']
base_models = [logreg, rdf, xgb]
n_splits = 5
grids = [logreg_grid, rf_grid, xgb_grid]
lgb_stack = Create_classifier(n_splits = n_splits, base_models = base_models, grids = grids)        
roc_auc_scores, feat_selected, feat_importance, test_pred, models = lgb_stack.predict(xtrain, ytrain, xtest, ytest, chosen_set = chosen_set[0])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 78.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 106.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 87.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 119.0min finished


In [438]:
test_pred

Unnamed: 0_level_0,ALL,ALL,ALL
Unnamed: 0_level_1,LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain', interaction_constraints=None,\n learning_rate=None, max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n n_estimators=100, n_jobs=None, num_parallel_tree=None,\n random_state=2020, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None, tree_method=None,\n validate_parameters=None, verbosity=None)"
0,0.000183,0.015745,0.0
1,0.002008,0.243919,0.0
2,0.000287,0.041810,0.0
3,0.112291,0.329563,0.0
4,0.000903,0.000962,0.0
...,...,...,...
19315,0.000321,0.000000,0.0
19316,0.000044,0.000000,0.0
19317,0.000079,0.000383,0.0
19318,0.020542,0.350903,0.0


In [440]:
test_pred['ALL']['RandomForestClassifier(random_state=2020)']

0        0.015745
1        0.243919
2        0.041810
3        0.329563
4        0.000962
           ...   
19315    0.000000
19316    0.000000
19317    0.000383
19318    0.350903
19319    0.009197
Name: RandomForestClassifier(random_state=2020), Length: 19320, dtype: float64

In [430]:
roc_auc_scores

Unnamed: 0,LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain', interaction_constraints=None,\n learning_rate=None, max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n n_estimators=100, n_jobs=None, num_parallel_tree=None,\n random_state=2020, reg_alpha=None, reg_lambda=None,\n scale_pos_weight=None, subsample=None, tree_method=None,\n validate_parameters=None, verbosity=None)"
0,0.93039,0.951309,0.947079


In [98]:
lgb_score = Scoring(roc_auc = roc_auc_scores, predict_df = test_pred, base_models = base_models)        

In [69]:
class Create_classifier(object):
    def __init__(self, n_splits, base_models, grids):
        self.n_splits = n_splits
        self.base_models = base_models
        self.grids = grids

    def predict(self, x_train, y_train, x_test, y_test, chosen_set = ''):
        
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state = random_state)
                  
        roc_auc_scores = pd.DataFrame(columns = [str(i) for i in self.base_models])
        test_pred = pd.DataFrame(np.zeros((x_test.shape[0], len(self.base_models))), columns=[str(i) for i in self.base_models])
        test_pred.columns = pd.MultiIndex.from_product([[chosen_set], test_pred.columns])
        feat_selected = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
        feat_importance = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
        models = []         
        
        for i, clf in enumerate(self.base_models):
        
            pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
                                    RFECV(estimator = clf, step = 1, cv=cv, scoring = 'roc_auc'),
                                    clf)
                  
            search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = self.grids[i], cv = cv, n_jobs=-1, verbose=True, scoring = 'roc_auc')
            search.fit(x_train, y_train)
            
            models.append([chosen_set, i, search.best_estimator_])
            
            predict_rdf = search.best_estimator_.predict_proba(x_test)[:,1]
            test_pred[chosen_set][str(clf)] = predict_rdf
                  
            roc_auc_scores.loc[0,str(clf)] = roc_auc_score(y_test, predict_rdf)
                  
            for j in x_train.columns:
                feat_est = dict(zip(x_train.columns, search.best_estimator_.named_steps["rfecv"].ranking_))
                feat_selected.loc[str(j), str(clf)] = feat_est[str(j)]
                
                try:
                    importances = dict(zip(x_train.columns, search.best_estimator_.named_steps[str(clf).split('(')[0].lower()].feature_importances_))
                    feat_importance.loc[str(j), str(clf)] = importances[str(j)]
                except Exception:
                    pass
                
        return roc_auc_scores, feat_selected, feat_importance, test_pred, models

In [636]:
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def compute_midrank_weight(x, sample_weight):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    cumulative_weight = np.cumsum(sample_weight[J])
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = cumulative_weight[i:j].mean()
        i = j
    T2 = np.empty(N, dtype=np.float)
    T2[J] = T
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    p_val = np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
    
    p_val[p_val>1]=1
    
    return p_val


def compute_ground_truth_statistics(ground_truth, sample_weight=None):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = sample_weight[order]

    return order, label_1_count, ordered_sample_weight


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    sample_weight = None
    order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
        ground_truth, sample_weight)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    sample_weight = None
    order, label_1_count, a = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return math.exp(calc_pvalue(aucs, delongcov))

def calc_auc_ci(y_true, y_pred, alpha=0.95):
    auc, auc_cov = delong_roc_variance(y_true,y_pred)
    auc_std = np.sqrt(auc_cov)
    lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
    ci_delong = norm.ppf(
        lower_upper_q,
        loc=auc,
        scale=auc_std)

    ci_delong[ci_delong > 1] = 1
    
    return ci_delong

In [637]:
import numpy as np
from scipy.stats import percentileofscore


def score_ci(
    y_true,
    y_pred,
    score_fun,
    n_bootstraps=2000,
    confidence_level=0.95,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute confidence interval for given score function based on labels and predictions using bootstrapping.
    :param y_true: 1D list or array of labels.
    :param y_pred: 1D list or array of predictions corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param confidence_level: Confidence level for computing confidence interval. (default: 0.95)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
    need at least one positive and one negative sample. (default: True)
    :return: Score evaluated on labels and predictions, lower confidence interval, upper confidence interval, array of
    bootstrapped scores.
    """

    assert len(y_true) == len(y_pred)

    score = score_fun(y_true, y_pred)
    _, ci_lower, ci_upper, scores = score_stat_ci(
        y_true=y_true,
        y_preds=y_pred,
        score_fun=score_fun,
        n_bootstraps=n_bootstraps,
        confidence_level=confidence_level,
        seed=seed,
        reject_one_class_samples=reject_one_class_samples,
    )

    return score, ci_lower, ci_upper, scores


def score_stat_ci(
    y_true,
    y_preds,
    score_fun,
    stat_fun=np.mean,
    n_bootstraps=2000,
    confidence_level=0.95,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute confidence interval for given statistic of a score function based on labels and predictions using
    bootstrapping.
    :param y_true: 1D list or array of labels.
    :param y_preds: A list of lists or 2D array of predictions corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param stat_fun: Statistic for which confidence interval is computed. (e.g. np.mean)
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param confidence_level: Confidence level for computing confidence interval. (default: 0.95)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
    need at least one positive and one negative sample. (default: True)
    :return: Mean score statistic evaluated on labels and predictions, lower confidence interval, upper confidence
    interval, array of bootstrapped scores.
    """

    y_true = np.array(y_true)
    y_preds = np.atleast_2d(y_preds)
    assert all(len(y_true) == len(y) for y in y_preds)

    np.random.seed(seed)
    scores = []
    for i in range(n_bootstraps):
        readers = np.random.randint(0, len(y_preds), len(y_preds))
        indices = np.random.randint(0, len(y_true), len(y_true))
        if reject_one_class_samples and len(np.unique(y_true[indices])) < 2:
            continue
        reader_scores = []
        for r in readers:
            reader_scores.append(score_fun(y_true[indices], y_preds[r][indices]))
        scores.append(stat_fun(reader_scores))

    mean_score = np.mean(scores)
    sorted_scores = np.array(sorted(scores))
    alpha = (1.0 - confidence_level) / 2.0
    ci_lower = sorted_scores[int(round(alpha * len(sorted_scores)))]
    ci_upper = sorted_scores[int(round((1.0 - alpha) * len(sorted_scores)))]
    return mean_score, ci_lower, ci_upper, scores


def pvalue(
    y_true,
    y_pred1,
    y_pred2,
    score_fun,
    n_bootstraps=2000,
    two_tailed=True,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute p-value for hypothesis that score function for model I predictions is higher than for model II predictions
    using bootstrapping.
    :param y_true: 1D list or array of labels.
    :param y_pred1: 1D list or array of predictions for model I corresponding to elements in y_true.
    :param y_pred2: 1D list or array of predictions for model II corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param two_tailed: Whether to use two-tailed test. (default: True)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
    need at least one positive and one negative sample. (default: True)
    :return: Computed p-value, array of bootstrapped differences of scores.
    """

    assert len(y_true) == len(y_pred1)
    assert len(y_true) == len(y_pred2)

    return pvalue_stat(
        y_true=y_true,
        y_preds1=y_pred1,
        y_preds2=y_pred2,
        score_fun=score_fun,
        n_bootstraps=n_bootstraps,
        two_tailed=two_tailed,
        seed=seed,
        reject_one_class_samples=reject_one_class_samples,
    )


def pvalue_stat(
    y_true,
    y_preds1,
    y_preds2,
    score_fun,
    stat_fun=np.mean,
    n_bootstraps=1000,
    two_tailed=True,
    seed=None,
    reject_one_class_samples=True,
):
    """
    Compute p-value for hypothesis that given statistic of score function for model I predictions is higher than for
    model II predictions using bootstrapping.
    :param y_true: 1D list or array of labels.
    :param y_preds1: A list of lists or 2D array of predictions for model I corresponding to elements in y_true.
    :param y_preds2: A list of lists or 2D array of predictions for model II corresponding to elements in y_true.
    :param score_fun: Score function for which confidence interval is computed. (e.g. sklearn.metrics.accuracy_score)
    :param stat_fun: Statistic for which p-value is computed. (e.g. np.mean)
    :param n_bootstraps: The number of bootstraps. (default: 2000)
    :param two_tailed: Whether to use two-tailed test. (default: True)
    :param seed: Random seed for reproducibility. (default: None)
    :param reject_one_class_samples: Whether to reject bootstrapped samples with only one label. For scores like AUC we
    need at least one positive and one negative sample. (default: True)
    :return: Computed p-value, array of bootstrapped differences of scores.
    """

    y_true = np.array(y_true)
    y_preds1 = np.atleast_2d(y_preds1)
    y_preds2 = np.atleast_2d(y_preds2)
    assert all(len(y_true) == len(y) for y in y_preds1)
    assert all(len(y_true) == len(y) for y in y_preds2)

    np.random.seed(seed)
    z = []
    for i in range(n_bootstraps):
        readers1 = np.random.randint(0, len(y_preds1), len(y_preds1))
        readers2 = np.random.randint(0, len(y_preds2), len(y_preds2))
        indices = np.random.randint(0, len(y_true), len(y_true))
        if reject_one_class_samples and len(np.unique(y_true[indices])) < 2:
            continue
        reader_scores = []
        for r in readers1:
            reader_scores.append(score_fun(y_true[indices], y_preds1[r][indices]))
        score1 = stat_fun(reader_scores)
        reader_scores = []
        for r in readers2:
            reader_scores.append(score_fun(y_true[indices], y_preds2[r][indices]))
        score2 = stat_fun(reader_scores)
        z.append(score1 - score2)

    p = percentileofscore(z, 0.0, kind="weak") / 100.0
    if two_tailed:
        p *= 2.0
    return p, z

def method(x,y):
    roc_auc_score(x,y)

def bootstrap_error_estimate(pred, truth, method, method_name="", alpha=0.95, sample_frac=0.5, iterations=100):
    """
    Generate a bootstrapped estimate of confidence intervals
    :param pred: list of predicted values
    :param truth: list of experimental values
    :param method: method to evaluate performance, e.g. matthews_corrcoef
    :param method_name: name of the method for the progress bar
    :param alpha: confidence limit (e.g. 0.95 for 95% confidence interval)
    :param sample_frac: fraction to resample for bootstrap confidence interval
    :param iterations: number of iterations for resampling
    :return: lower and upper bounds for confidence intervals
    """
    index_list = range(0, len(pred))
    num_samples = int(len(index_list) * sample_frac)
    stats = []
    for _ in range(0, iterations):
        sample_idx = resample(index_list, n_samples=num_samples)
        pred_sample = [pred[x] for x in sample_idx]
        truth_sample = [truth[x] for x in sample_idx]
        stats.append(method(truth_sample, pred_sample))
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha+((1.0-alpha)/2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    
    ci_boot = np.array([lower,upper])
    
    return ci_boot

In [638]:
class Scoring(object):
    def __init__(self, roc_auc, predict_df, base_models):
        self.roc_auc = roc_auc
        self.predict_df = predict_df
        self.base_models = base_models

    def joined_scores(self):
        roc_auc_all = pd.concat(self.roc_auc)
        predict_df_all = pd.concat(self.predict_df, axis = 1)
        return roc_auc_all, predict_df_all

    def delong_test(predict_df_all, labels):
        
        Test_df_sets = pd.DataFrame(np.zeros((2, len(base_models))), index=['ALL/PDE','ALL/PD'], columns=[str(i) for i in base_models])
        Test_df_sets.columns = pd.MultiIndex.from_product([['DeLong Test'], Test_df_sets.columns])
        
        Test_df_all = pd.DataFrame(list(combinations(Test_df['DeLong Test'].columns,2)),columns = ['1st Algorithm', '2nd Algorithm', 'score'])
        Test_df_all['score'] = 0
        Test_df_all.columns = pd.MultiIndex.from_product([['DeLong Test'], Test_df_all.columns])    
            
        for i, clf in enumerate(self.base_models):
        
            Test_df['DeLong Test'].loc['ALL/PDE',str(clf)] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PDE'][str(clf)])
            Test_df['DeLong Test'].loc['ALL/PD',str(clf)] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PD'][str(clf)])
            
        for j in len(range(Test_df_all.shape[0])):
            Test_df_all['DeLong Test'].loc[i,'score'] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][Test_df_all['DeLong Test'].loc[i,'1st Algorithm']], predict_df_all['ALL'][Test_df_all['DeLong Test'].loc[i,'2nd Algorithm']])
    
        return Test_df_sets, Test_df_all
    
    def bootstrap_test(predict_df_all, labels):
    
        Test_df_sets = pd.DataFrame(np.zeros((2, len(base_models))), index=['ALL/PDE','ALL/PD'], columns=[str(i) for i in base_models])
        Test_df_sets.columns = pd.MultiIndex.from_product([['Bootstrap Test'], Test_df_sets.columns])
        
        Test_df_all = pd.DataFrame(list(combinations(Test_df['Bootstrap Test'].columns,2)),columns = ['1st Algorithm', '2nd Algorithm', 'score'])
        Test_df_all['score'] = 0
        Test_df_all.columns = pd.MultiIndex.from_product([['Bootstrap Test'], Test_df_all.columns])
            
        for i, clf in enumerate(self.base_models):
        
            Test_df['Bootstrap Test'].loc['ALL/PDE',str(clf)] = pvalue(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PDE'][str(clf)], score_fun=roc_auc_score)
            Test_df['Bootstrap Test'].loc['ALL/PD',str(clf)] = pvalue(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PD'][str(clf)], score_fun=roc_auc_score)
            
        for j in len(range(Test_df_all.shape[0])):
            Test_df_all['Bootstrap Test'].loc[i,'score'] = pvalue(labels.values.ravel(), predict_df_all['ALL'][Test_df_all['Bootstrap Test'].loc[i,'1st Algorithm']], predict_df_all['ALL'][Test_df_all['Bootstrap Test'].loc[i,'2nd Algorithm']], score_fun=roc_auc_score)
    
        return Test_df_sets, Test_df_all
    
    def likelihood_RT(predict_df_all, labels):
    
        Test_df_sets = pd.DataFrame((np.zeros((2, 1))), index=['ALL/PDE','ALL/PD'], columns=['LogisticRegression()'])
        Test_df_sets.columns = pd.MultiIndex.from_product([['LRT'], Test_df_sets.columns])

        alt_log_likelihood = -log_loss(labels,
                                       predict_df_all['ALL']['LogisticRegression()'],
                                       normalize=False)
        null_log_likelihood = -log_loss(ytest,
                                        predict_df_all['PDE']['LogisticRegression()'],
                                        normalize=False)
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        p_log_l = chi2.sf(G, x_train.shape[1])
        
        alt_log_likelihood = -log_loss(labels,
                                       predict_df_all['ALL']['LogisticRegression()'],
                                       normalize=False)
        null_log_likelihood = -log_loss(ytest,
                                        predict_df_all['PD']['LogisticRegression()'],
                                        normalize=False)
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        
        p_log_2 = chi2.sf(G, x_train.shape[1])
        
        Test_df_sets['LRT'].loc['ALL/PDE','LogisticRegression()'] = p_log_l
        Test_df_sets['LRT'].loc['ALL/PD','LogisticRegression()'] = p_log_2

        return Test_df_sets
    
    def combined_ftest_5x2cv(estimator1, estimator2, x_train, y_train, scoring, random_seed):

        if isinstance(scoring, str):
            scorer = get_scorer(scoring)
        else:
            scorer = scoring

        variances = []
        differences = []

        def score_diff(X_1, X_2, y_1, y_2):

            estimator1.fit(X_1, y_1)
            estimator2.fit(X_1, y_1)
            est1_score = scorer(estimator1, X_2, y_2)
            est2_score = scorer(estimator2, X_2, y_2)
            score_diff = est1_score - est2_score
            return score_diff

        for i in range(5):

            X_1, X_2, y_1, y_2 = train_test_split(x_train, y_train, test_size=0.5, random_state=random_state)

            score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
            score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
            score_mean = (score_diff_1 + score_diff_2) / 2.
            score_var = ((score_diff_1 - score_mean)**2 + (score_diff_2 - score_mean)**2)

            differences.extend([score_diff_1**2, score_diff_2**2])
            variances.append(score_var)

        numerator = sum(differences)
        denominator = 2*(sum(variances))
        f_stat = numerator / denominator

        p_value = scipy.stats.f.sf(f_stat, 10, 5)

        return float(f_stat), float(p_value)
    
    def f_test(estimators_list, x_train, y_train):
    
        estimators = []
        for i in range(len(self.base_models)):
            estimators.append(estimators_list[i][2])
        estimators = list(combinations(estimators,2))
        
        p_values = []

        for i in range(len(estimators)): 

            estimator1 = eval(estimators[i][0])
            estimator2 = eval(estimators[i][1])
            
            f_stat, p_value = combined_ftest_5x2cv(estimator1, estimator2, x_train, y_train, roc_auc_score, random_state)
            
            p_values.append(p_value)
            
        f_p_values = pd.DataFrame(columns = ['algorithm1', 'algorithm2', 'score'])    
            
        for j in range(len(estimators)): 
            f_p_values.loc[j,'algorithm1'] = estimators[j][0]
            f_p_values.loc[j,'algorithm2'] = estimators[j][1]
            f_p_values.loc[j,'score'] = p_values[j]
        
        return(f_p_values)

In [639]:
def yeoj_graph(x_train, lbd_list, feature=''):
    
    plt.figure(figsize=(8,6))

    for i in range(len(lbd_list)):
        n_lines = len(lbd_list)
        c = np.arange(1, n_lines + 1)
        norm = mpl.colors.Normalize(vmin=c.min(), vmax=c.max())
        cmap = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
        cmap.set_array([])
        a = x_train[feature].values.ravel()
        a = np.sort(a)
        b = stats.yeojohnson(x_train[feature], lmbda=lbd_list[i])
        b = np.sort(b)
        plt.plot(a,b, c=cmap.to_rgba(i + 1), label='λ = '+str(lbd_list[i]))
    plt.legend(loc=0)
    plt.ylabel("ψ(λ,x)", fontsize=15)
    plt.xlabel("x", fontsize=15)
    plt.savefig('yeo-johnson.png', dpi=1200)
    
    return plt.show()

## 3. Random Forest Classification
### A. Hyperparameters Tuning

In [None]:
# Random Forest All

In [None]:
# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(200, 1000, 5)]
rf_n_estimators.append(1500)
rf_n_estimators.append(2000)

In [None]:
# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]

In [None]:
# Add the default as a possible value
rf_max_depth.append(None)

In [None]:
# Number of features to consider at every split
rf_max_features = ['auto', 'sqrt', 'log2']

In [None]:
rf_min_samples_leaf = [int(x) for x in np.linspace(1, 55, 11)]

In [None]:
# Criterion to split on
rf_criterion = ['gini', 'entropy']

In [None]:
# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 40, 20)]

In [None]:
# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1, 0.2]

In [None]:
# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

In [None]:
# Weights associated with classes
rf_class = ['balanced_subsample', 'balanced', None]

In [None]:
# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'min_samples_leaf':rf_min_samples_leaf,
               'bootstrap': rf_bootstrap,
               'class_weight': rf_class}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = random_state)

rdf = RandomForestClassifier(random_state = random_state) 
scoring = {'Recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)}

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'min_samples_leaf':rf_min_samples_leaf,
               'bootstrap': rf_bootstrap,
               'class_weight': rf_class}

In [None]:
#pipeline dobrzez zrobiony

In [None]:
rf_grid = {'randomforestclassifier__n_estimators': rf_n_estimators,
           'randomforestclassifier__max_depth': rf_max_depth,
           'randomforestclassifier__max_features': rf_max_features,
           'randomforestclassifier__criterion': rf_criterion,
           'randomforestclassifier__min_samples_split': rf_min_samples_split,
           'randomforestclassifier__min_impurity_decrease': rf_min_impurity_decrease,
           'randomforestclassifier__min_samples_leaf':rf_min_samples_leaf,
           'randomforestclassifier__bootstrap': rf_bootstrap,
           'randomforestclassifier__class_weight': rf_class
          }

In [None]:
pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
                        RFECV(estimator = RandomForestClassifier(random_state = random_state), step = 1, cv=cv, scoring = 'roc_auc'),
                        RandomForestClassifier(random_state = random_state))

In [None]:
search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = rf_grid, cv = cv, n_jobs=-1, verbose=True, scoring = scoring, refit = 'roc_auc')
search.fit(xtrain, ytrain)

In [None]:
search.best_estimator_

In [None]:
#wywołanie z pipeline

In [None]:
xtrain.columns

In [None]:
search.best_estimator_.named_steps["rfecv"].support_

In [None]:
search.best_estimator_._final_estimator.feature_importances_

In [None]:
# inne wywołanie z pipeline

In [None]:
search.best_estimator_.named_steps["rfecv"].estimator_

In [None]:
search.best_estimator_.named_steps["rfecv"].grid_scores_

In [None]:
# test set
predict_rdf = search.predict_proba(xtest)[:,1]

In [None]:
roc_auc_score(ytest,predict_rdf)

In [None]:
grid_clf = RandomizedSearchCV(estimator = rdf, param_distributions = rf_grid, cv = cv, n_jobs=-1, verbose=True, scoring = scoring, refit = 'roc_auc', n_iter = 150)
grid_clf.fit(xtrain_pd, ytrain)

In [None]:
print(search.best_estimator_)
print(search.best_params_)
print(search.best_score_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

rdf = RandomForestClassifier(bootstrap=True, 
                             class_weight='balanced', 
                             criterion='gini',
                             max_depth=35, 
                             max_features='log2', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.1, 
#                              min_impurity_split=None,
                             min_samples_leaf=11, 
                             min_samples_split=12,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=1000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [None]:
# Remember about changing when changing models(ALL,NOPD,PD)!
rdf.fit(xtrain,ytrain)

In [None]:
# training set
predict_rdf = rdf.predict(xtrain_pd)

In [None]:
roc_auc_score(ytrain,predict_rdf)

In [None]:
accuracy_score(ytrain,predict_rdf)

In [None]:
recall_score(ytrain,predict_rdf)

In [None]:
precision_score(ytrain,predict_rdf)

In [None]:
confusion_matrix(ytrain,predict_rdf)

In [None]:
# test set
predict_rdf = rdf.predict(xtest_pd)

In [None]:
roc_auc_score(ytest,predict_rdf)

In [None]:
accuracy_score(ytest,predict_rdf)

In [None]:
recall_score(ytest,predict_rdf)

In [None]:
precision_score(ytest,predict_rdf)

In [None]:
confusion_matrix(ytest,predict_rdf)

In [None]:
importances_rdf = search['randomforestclassifier'].feature_importances_

#create a feature list from the original dataset (list of columns)
# What are this numbers? Let's get back to the columns of the original dataset
feature_list = list(xtrain.columns)

#create a list of tuples
feature_importance_rdf= sorted(zip(importances_rdf, feature_list), reverse=True)

#create two lists from the previous list of tuples
df_rdf = pd.DataFrame(feature_importance_rdf, columns=['importance', 'feature'])
importance_rdf= list(df_rdf['importance'])
feature= list(df_rdf['feature'])

#see df
print(df_rdf)

In [None]:
import eli5

In [None]:
numeric_features_list = list(xtrain.columns)

In [None]:
eli5.explain_weights(pipe_lr['randomforestclassifier'], feature_names=numeric_features_list)

#### B. Predict All - Bootstraping

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

rdf = RandomForestClassifier(bootstrap=False, 
                             class_weight='balanced_subsample', 
                             criterion='entropy',
                             max_depth=5, 
                             max_features='log2', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.0, 
#                              min_impurity_split=None,
                             min_samples_leaf=33, 
                             min_samples_split=2,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=1000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [None]:
# Remember about changing when changing models(ALL,NOPD,PD)!
rdf.fit(xtrain,ytrain)

In [None]:
# training set
predict_rdf_4 = rdf.predict(xtest)

#### C. Predict No PD - Bootstraping

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

rdf = RandomForestClassifier(bootstrap=True, 
                             class_weight='balanced', 
                             criterion='gini',
                             max_depth=None, 
                             max_features='sqrt', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.1, 
#                              min_impurity_split=None,
                             min_samples_leaf=33, 
                             min_samples_split=28,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=1000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [None]:
# Remember about changing when changing models(ALL,NOPD,PD)!
rdf.fit(xtrain_nopd,ytrain)

In [None]:
# training set
predict_rdf_5 = rdf.predict(xtest_nopd)

#### D. Predict PD - Bootstraping

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

rdf = RandomForestClassifier(bootstrap=True, 
                             class_weight='balanced', 
                             criterion='gini',
                             max_depth=35, 
                             max_features='log2', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.1, 
#                              min_impurity_split=None,
                             min_samples_leaf=11, 
                             min_samples_split=12,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=1000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [None]:
# Remember about changing when changing models(ALL,NOPD,PD)!
rdf.fit(xtrain_pd,ytrain)

In [None]:
# training set
predict_rdf_6 = rdf.predict(xtest_pd)

#### E. Others

In [None]:
base_models = [rdf]
n_splits = 5
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)        

In [None]:
train_pred, test_pred, recall_scores, f1_scores, roc_auc_scores = lgb_stack.predict(xtrain, ytrain, xtest)

In [None]:
for i in list(range(test_pred.shape[1])):  
    print('1. The F-1 score of the model {}\n'.format(f1_score(ytest, test_pred[:,i], average='macro')))
    print('2. The roc_auc score of the model {}\n'.format(roc_auc_score(ytest, test_pred[:,i], average='macro')))
    print('3. Classification report \n {} \n'.format(classification_report(ytest, test_pred[:,i])))
    print('4. Confusion matrix \n {} \n'.format(confusion_matrix(ytest, test_pred[:,i])))

In [None]:
tpred_rf = pd.DataFrame(test_pred)
final_tpred = tpred_rf.mode(axis=1)

In [None]:
np.unique(final_tpred)

In [None]:
# final_tpred.to_csv('predicted_labels_1.csv', index=False, header= False)

In [None]:
# XGBoost Model

## 4. XGBoost Classification
### A. Hyperparameters Tuning

In [None]:
from xgboost import XGBClassifier

In [None]:
# Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

In [None]:
# Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

In [None]:
# Minimum number of instaces needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

In [None]:
# Tree construction algorithm used in XGBoost
xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']

In [None]:
# Learning rate
xgb_eta = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1]

In [None]:
xgb_eta

In [None]:
# Minimum loss reduction required to make further partition
xgb_gamma = [x for x in np.linspace(0, 0.5, 6)]

In [None]:
xgb_gamma

In [None]:
# Learning objective used
xgb_objective = ['binary:logistic']

In [None]:
xgb_lambda = [10,20,50,100]

In [None]:
# Balancing of positive and negative weights
xgb_weight = [119.85522788203754, None]

In [None]:
44706/373

In [None]:
ytrain['y'].value_counts()

In [None]:
# pipeline try

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFECV

In [None]:
pipe_lr = make_pipeline(RFECV(estimator = xgb, step = 1, cv=cv, scoring = 'roc_auc'),
                        RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, cv = cv, n_jobs=-1, verbose=True, scoring = scoring, refit = 'roc_auc', n_iter = 100))

In [None]:
pipe_lr.fit(xtrain, ytrain)

In [None]:
score = pipe_lr['randomizedsearchcv'].best_params_

In [None]:
ypredict = pipe_lr.predict(xtest)

In [None]:
pipe_lr['rfecv'].ranking_

In [None]:
roc_auc_score(ytest,ypredict)

In [None]:
# extract the numerical values of feature importance from the grid search
importances = pipe_lr['randomizedsearchcv'].feature_importances_

#create a feature list from the original dataset (list of columns)
# What are this numbers? Let's get back to the columns of the original dataset
feature_list = list(xtrain.columns)

#create a list of tuples
feature_importance= sorted(zip(importances, feature_list), reverse=True)

#create two lists from the previous list of tuples
df = pd.DataFrame(feature_importance, columns=['importance', 'feature'])
importance= list(df['importance'])
feature= list(df['feature'])

#see df
print(df)

In [None]:
# pipeline end

In [None]:
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = random_state)

xgb = XGBClassifier(random_state = random_state) 
scoring = {'Recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)}

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'tree_method': xgb_tree_method,
            'learning_rate': xgb_eta,
            'gamma': xgb_gamma,
            'objective': xgb_objective,
            'reg_lambda':xgb_lambda,
            'scale_pos_weight': xgb_weight}

In [None]:
grid_clf = RandomizedSearchCV(estimator = xgb, param_distributions = xgb_grid, cv = cv, n_jobs=-1, verbose=True, scoring = scoring, refit = 'roc_auc', n_iter = 100)
grid_clf.fit(xtrain, ytrain)

In [None]:
print(grid_clf.best_estimator_)
print(grid_clf.best_params_)
print(grid_clf.best_score_)

In [None]:
from xgboost import XGBClassifier
xgb =  XGBClassifier(tree_method = 'exact',
                     objective = 'binary:logistic',
                     n_estimators = 1200,
                     min_child_weight = 8,
                     max_depth = 2,
                     gamma = 0.3,
                     reg_lambda = 90,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb.fit(xtrain,ytrain)

In [None]:
# training set
predict_xgb = xgb.predict(xtrain)

In [None]:
roc_auc_score(ytrain,predict_xgb)

In [None]:
accuracy_score(ytrain,predict_xgb)

In [None]:
recall_score(ytrain,predict_xgb)

In [None]:
precision_score(ytrain,predict_xgb)

In [None]:
confusion_matrix(ytrain,predict_xgb)

In [None]:
# test set
predict_xgb = xgb.predict(xtest)

In [None]:
roc_auc_score(ytest,predict_xgb)

In [None]:
accuracy_score(ytest,predict_xgb)

In [None]:
recall_score(ytest,predict_xgb)

In [None]:
precision_score(ytest,predict_xgb)

In [None]:
confusion_matrix(ytest,predict_xgb)

In [None]:
# extract the numerical values of feature importance from the grid search
importances = xgb.feature_importances_

#create a feature list from the original dataset (list of columns)
# What are this numbers? Let's get back to the columns of the original dataset
feature_list = list(xtrain.columns)

#create a list of tuples
feature_importance= sorted(zip(importances, feature_list), reverse=True)

#create two lists from the previous list of tuples
df = pd.DataFrame(feature_importance, columns=['importance', 'feature'])
importance= list(df['importance'])
feature= list(df['feature'])

#see df
print(df)

In [None]:
xtrain

#### B. Predict All - Bootstraping

In [None]:
xgb =  XGBClassifier(tree_method = 'exact',
                     objective = 'binary:logistic',
                     n_estimators = 1200,
                     min_child_weight = 8,
                     max_depth = 2,
                     gamma = 0.3,
                     reg_lambda = 90,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb.fit(xtrain,ytrain)

In [None]:
# training set
predict_xgb_4 = xgb.predict(xtest)

#### C. Predict No PD - Bootstraping

In [None]:
xgb =  XGBClassifier(tree_method = 'auto',
                     objective = 'binary:logistic',
                     n_estimators = 1000,
                     min_child_weight = 4,
                     max_depth = 2,
                     gamma = 0.5,
                     reg_lambda = 50,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb.fit(xtrain_nopd,ytrain)

In [None]:
# training set
predict_xgb_5 = xgb.predict(xtest_nopd)

#### D. Predict PD - Bootstraping

In [None]:
xgb =  XGBClassifier(tree_method = 'hist',
                     objective = 'binary:logistic',
                     n_estimators = 200,
                     min_child_weight = 3,
                     max_depth = 2,
                     gamma = 0.0,
                     reg_lambda = 50,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb.fit(xtrain_pd,ytrain)

In [None]:
# training set
predict_xgb_6 = xgb.predict(xtest_pd)

#### E. Others

In [None]:
base_models = [xgb]
n_splits = 5
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)        

In [None]:
train_pred, test_pred, recall_scores, f1_scores, roc_auc_scores = lgb_stack.predict(xtrain, ytrain, xtest)

In [None]:
for i in list(range(test_pred.shape[1])):  
    print('1. The F-1 score of the model {}\n'.format(f1_score(ytest, test_pred[:,i], average='macro')))
    print('2. The roc_auc score of the model {}\n'.format(roc_auc_score(ytest, test_pred[:,i], average='macro')))
    print('3. Classification report \n {} \n'.format(classification_report(ytest, test_pred[:,i])))
    print('4. Confusion matrix \n {} \n'.format(confusion_matrix(ytest, test_pred[:,i])))

In [None]:
tpred_xgb = pd.DataFrame(test_pred)
final_tpred = tpred.mode(axis=1)LogisticRegression.ipynb

In [None]:
np.unique(final_tpred)

## 5. Logistic Regression Classificaton
### A. Hyperparameters Tuning

In [None]:
# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# the norm used in the penalization
logreg_penalty = ['l1', 'l2', 'elasticnet', None]

In [None]:
# Inverse of regularization strength
logreg_c = np.logspace(-4, 4, 20)

In [None]:
# Algorithm to use in the optimization problem
logreg_solver = ['newton-cg','liblinear', 'saga', 'lbfgs']

In [None]:
logreg_weight = ['balanced', None]

In [None]:
logreg_grid = {'penalty' : logreg_penalty,
                'C' : logreg_c,
                'solver' : logreg_solver,
                'class_weight': logreg_weight}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = random_state)

logreg = LogisticRegression(random_state = random_state) 
scoring = {'Recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)}

# Create the grid
logreg_grid = {'penalty' : logreg_penalty,
                'C' : logreg_c,
                'solver' : logreg_solver,
                'class_weight': logreg_weight}

grid_clf = RandomizedSearchCV(estimator = logreg, param_distributions = logreg_grid, cv = cv, n_jobs=-1, verbose=True, scoring = scoring, refit = 'roc_auc', n_iter = 10)
grid_clf.fit(xtrain, ytrain)

In [None]:
print(grid_clf.best_estimator_)
print(grid_clf.best_params_)
print(grid_clf.best_score_)

In [None]:
log =  LogisticRegression(penalty = 'l2',
                         solver = 'liblinear',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.0001)

In [None]:
log.fit(xtrain_pd,ytrain)

In [None]:
# training set
pred_log = log.predict(xtrain_pd)

In [None]:
roc_auc_score(ytrain,pred_log)

In [None]:
accuracy_score(ytrain,pred_log)

In [None]:
recall_score(ytrain,pred_log)

In [None]:
precision_score(ytrain,pred_log)

In [None]:
confusion_matrix(ytrain,pred_log)

In [None]:
# test set
pred_log = log.predict(xtest_pd)

In [None]:
roc_auc_score(ytest,pred_log)

In [None]:
accuracy_score(ytest,pred_log)

In [None]:
recall_score(ytest,pred_log)

In [None]:
precision_score(ytest,pred_log)

In [None]:
confusion_matrix(ytest,pred_log)

In [None]:
# method - solver
import statsmodels.api as sm
logit_model=sm.Logit(ytrain,xtrain)

In [None]:
result = logit_model.fit(method = 'lbfgs',maxiter = 1000)

In [None]:
print(result.summary2())

#### B. Predict All - Bootstraping

In [None]:
log =  LogisticRegression(penalty = 'l2',
                         solver = 'newton-cg',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.03359818286283781)

In [None]:
log.fit(xtrain,ytrain)

In [None]:
# training set
predict_log_4 = log.predict(xtest)

#### C. Predict No PD - Bootstraping

In [None]:
log =  LogisticRegression(penalty = 'l2',
                         solver = 'newton-cg',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.012742749857031334)

In [None]:
log.fit(xtrain_nopd,ytrain)

In [None]:
# training set
predict_log_5 = log.predict(xtest_nopd)

#### D. Predict PD - Bootstraping

In [None]:
log =  LogisticRegression(penalty = 'l2',
                         solver = 'liblinear',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.0001)

In [None]:
log.fit(xtrain_pd,ytrain)

In [None]:
# training set
predict_log_6 = log.predict(xtest_pd)

#### E. Others

In [None]:
base_models = [log]
n_splits = 5
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)        

In [None]:
train_pred, test_pred, recall_scores, f1_scores, roc_auc_scores = lgb_stack.predict(xtrain, ytrain, xtest)

In [None]:
for i in list(range(test_pred.shape[1])):  
    print('1. The F-1 score of the model {}\n'.format(f1_score(ytest, test_pred[:,i], average='macro')))
    print('2. The roc_auc score of the model {}\n'.format(roc_auc_score(ytest, test_pred[:,i], average='macro')))
    print('3. Classification report \n {} \n'.format(classification_report(ytest, test_pred[:,i])))
    print('4. Confusion matrix \n {} \n'.format(confusion_matrix(ytest, test_pred[:,i])))

In [None]:
tpred_logreg = pd.DataFrame(test_pred)
final_tpred = tpred.mode(axis=1)

In [None]:
np.unique(final_tpred)

The datasets used in problem-1 are highly unbalance. Therefore, all the evaluation metrics shows the expected random performance. Due to the unbalance data, the probabilities for minor classes (class-2 and 3) are inaccurate. But we can still get good predictions by choosing a more appropriate probability cutoff. In the problem-3 section, I will imporve the model performance by choosing a cutoff by ovserving the minor class probability distribution and ROC curve and by setting unequal importance of the class.

## 6. Modified Ensemble

In this problem the labels have unequal importance, in the sense that we want to penalize the model most if it misclassified label 3, a little less for 2 and the least for label 1. Additionally, in case of a misclassification, it is preferable to over-predict a label than under-predict (i.e. misclassifying label 3 as 2 is worse than misclassifying label 2 as 3). To implement the above constraints I would rebuild problem-1 model in following steps:

<li> <b>Step 1: </b>Predict probabilities instead of actual prediction.
<li> <b>Step 2: </b>Set the class weight.
<li> <b>Step 3: </b>Get probability distribution of minor class.
<li> <b>Step 4: </b>From the ROC curve and probability distribution obtain probability thresholds for classes.
<li> <b>Step 5: </b>Finally use the threshold to over-predict a label than under-predict.

In [None]:
class Create_ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y):
        X = np.array(X)
        y = np.array(y)
#         T = np.array(T)
        no_class = len(np.unique(y))

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, 
                                     random_state = random_state).split(X, y))

        train_proba = np.zeros((X.shape[0], no_class))
#         test_proba = np.zeros((T.shape[0], no_class))
        
        train_pred = np.zeros((X.shape[0], len(self.base_models)))
#         test_pred = np.zeros((T.shape[0], len(self.base_models)* self.n_splits))
        f1_scores = np.zeros((len(self.base_models), self.n_splits))
        recall_scores = np.zeros((len(self.base_models), self.n_splits))
        roc_auc_scores = np.zeros((len(self.base_models), self.n_splits))
        
        test_col = 0
        
        for i, clf in enumerate(self.base_models):
            
            for j, (train_idx, valid_idx) in enumerate(folds):
                
                X_train = X[train_idx]
                Y_train = y[train_idx]
                
                clf.fit(X_train, Y_train)
                
                ## Probabilities
                valid_proba = clf.predict_proba(X_train)
                train_proba[train_idx, :] = valid_proba
                
#                 recall  = recall_score(Y_train, valid_proba, average='macro')
#                 f1 = f1_score(Y_train, valid_proba, average='macro')
                roc_auc = roc_auc_score(Y_train, valid_proba[:,1], average='macro')
                
#                 recall_scores[i][j] = recall
#                 f1_scores[i][j] = f1
                roc_auc_scores[i][j] = roc_auc
                
#                 train_pred[valid_idx, i] = valid_pred
#                 test_pred[:, test_col] = clf.predict(T)
#                 test_col += 1
                
#                 print( "Model- {} and CV- {} recall: {}, f1_score: {}, roc_auc_score: {}".format(i, j, recall, f1, roc_auc))
                
#             test_proba /= self.n_splits
            
        return train_proba, roc_auc_scores

In [None]:
no_class = len(np.unique(y))
train_proba = np.zeros((X.shape[0], no_class))

In [None]:
train_proba

### A. Random Forest Classification

#### 1. All

In [361]:
from sklearn.ensemble import RandomForestClassifier

rdf_1 = RandomForestClassifier(bootstrap=False, 
                             class_weight='balanced_subsample', 
                             criterion='gini',
                             max_depth=5, 
                             max_features='log2', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.0, 
#                              min_impurity_split=None,
#                              min_samples_leaf=8, 
                             min_samples_split=33,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=600, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [362]:
rdf_1.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                       max_depth=5, max_features='log2', min_samples_split=33,
                       n_estimators=600, n_jobs=-1, random_state=2020)

In [363]:
predict_rdf_1 = rdf_1.predict_proba(xtest)

In [364]:
predict_rdf_1 = predict_rdf_1[:,1]

In [365]:
roc_auc_score(ytest, predict_rdf_1)

0.9412822938413361

#### 2. PD Omitted

In [477]:
from sklearn.ensemble import RandomForestClassifier

rdf_2 = RandomForestClassifier(bootstrap=True, 
                             class_weight='balanced', 
                             criterion='entropy',
                             max_depth=None, 
                             max_features='auto', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.05, 
#                              min_impurity_split=None,
#                              min_samples_leaf=8, 
                             min_samples_split=11,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=2000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [478]:
rdf_2.fit(xtrain_nopd,ytrain)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       min_impurity_decrease=0.05, min_samples_split=11,
                       n_estimators=2000, n_jobs=-1, random_state=2020)

In [479]:
predict_rdf_2 = rdf_2.predict_proba(xtest_nopd)

In [480]:
predict_rdf_2 = predict_rdf_2[:,1]

In [481]:
roc_auc_score(ytest,predict_rdf_2)

0.9267118997912317

#### 3. PD

In [None]:
from sklearn.ensemble import RandomForestClassifier

rdf_3 = RandomForestClassifier(bootstrap=True, 
                             class_weight='balanced', 
                             criterion='gini',
                             max_depth=25, 
                             max_features='sqrt', 
#                              max_leaf_nodes=None,
                             min_impurity_decrease=0.05, 
#                              min_impurity_split=None,
#                              min_samples_leaf=8, 
                             min_samples_split=38,
#                              min_weight_fraction_leaf=0.0,
                             n_estimators=1000, 
                             n_jobs=-1,
#                              oob_score=False,
                             random_state=random_state,
                             verbose=0, 
#                              warm_start=False
                            )

In [None]:
rdf_3.fit(xtrain_pd,ytrain)

In [None]:
predict_rdf_3 = rdf_3.predict_proba(xtest_pd)

In [None]:
predict_rdf_3 = predict_rdf_3[:,1]

In [None]:
roc_auc_score(ytest, predict_rdf_3)

#### 4. Bootstraping and DeLong Test

In [None]:
roc_auc_scores, feat_selected, feat_importance

In [73]:
test_pred = pd.DataFrame(np.zeros((xtest.shape[0], len(base_models))), columns=[str(i) for i in base_models])
test_pred.columns = pd.MultiIndex.from_product([[chosen_set], test_pred.columns])

In [75]:
test_pred1 = pd.DataFrame(np.zeros((xtest.shape[0], len(base_models))), columns=[str(i) for i in base_models])
test_pred1.columns = pd.MultiIndex.from_product([['NOPD'], test_pred1.columns])

In [76]:
predict_df = [test_pred, test_pred1]

In [113]:
predict_df_all = pd.concat(predict_df,axis=1)

In [116]:
predict_df_all['ALL'].loc[0,'LogisticRegression()']

0.0

In [None]:
roc_auc = [logreg,rdf]
predict_df = [logreg_grid]
lgb_stack = Scoring(roc_auc = roc_auc, predict_df = predict_df)        
roc_auc_scores, feat_selected, feat_importance = lgb_stack.predict(xtrain, ytrain, xtest, ytest)

In [109]:
Test_df = pd.DataFrame(np.zeros((2, len(base_models))), index=['ALL/PDE','PDE/PD'], columns=[str(i) for i in base_models])

In [111]:
Test_df.columns = pd.MultiIndex.from_product([['DeLong Test'], Test_df.columns])

In [120]:
predict_df_all

Unnamed: 0_level_0,ALL,ALL,NOPD,NOPD
Unnamed: 0_level_1,LogisticRegression(),RandomForestClassifier(),LogisticRegression(),RandomForestClassifier()
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
19315,0.0,0.0,0.0,0.0
19316,0.0,0.0,0.0,0.0
19317,0.0,0.0,0.0,0.0
19318,0.0,0.0,0.0,0.0


In [119]:
Test_df['DeLong Test'].loc['ALL/PDE','LogisticRegression()']

0.0

In [177]:
Test_df

Unnamed: 0_level_0,DeLong Test,DeLong Test
Unnamed: 0_level_1,LogisticRegression(),RandomForestClassifier()
ALL/PDE,0.0,0.0
PDE/PD,0.0,0.0


In [178]:
predict_df_all['ALL']

Unnamed: 0,LogisticRegression(),RandomForestClassifier()
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
19315,0.0,0.0
19316,0.0,0.0
19317,0.0,0.0
19318,0.0,0.0


In [179]:
for i in predict_df_all:
    print(i)

('ALL', 'LogisticRegression()')
('ALL', 'RandomForestClassifier()')
('NOPD', 'LogisticRegression()')
('NOPD', 'RandomForestClassifier()')


In [182]:
base_models = [LogisticRegression(), RandomForestClassifier()]

In [183]:
Test_df_all = pd.DataFrame(np.zeros((1, len(base_models))), index=['ALL'], columns=[str(i) for i in predict_df_all['ALL']])

In [184]:
Test_df_all

Unnamed: 0,LogisticRegression(),RandomForestClassifier()
ALL,0.0,0.0


In [186]:
predict_df_all['ALL']

Unnamed: 0,LogisticRegression(),RandomForestClassifier()
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
19315,0.0,0.0
19316,0.0,0.0
19317,0.0,0.0
19318,0.0,0.0


In [909]:
class Scoring(object):
    def __init__(self, roc_auc, predict_df, base_models, chosen_set):
        self.roc_auc = roc_auc
        self.predict_df = predict_df
        self.base_models = base_models

    def joined_scores(self):
        roc_auc_all = pd.concat(self.roc_auc)
        predict_df_all = pd.concat(self.predict_df, axis = 1)
        return roc_auc_all, predict_df_all

    def delong_test(predict_df_all, labels):
        
        Test_df_sets = pd.DataFrame(np.zeros((2, len(base_models))), index=['ALL/PDE','ALL/PD'], columns=[str(i) for i in base_models])
        Test_df_sets.columns = pd.MultiIndex.from_product([['DeLong Test'], Test_df_sets.columns])
        
        Test_df_all = pd.DataFrame(list(combinations(Test_df['DeLong Test'].columns,2)),columns = ['1st Algorithm', '2nd Algorithm', 'score'])
        Test_df_all['score'] = 0
        Test_df_all.columns = pd.MultiIndex.from_product([['DeLong Test'], Test_df_all.columns])    
            
        for i, clf in enumerate(self.base_models):
        
            Test_df['DeLong Test'].loc['ALL/PDE',str(clf)] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PDE'][str(clf)])
            Test_df['DeLong Test'].loc['ALL/PD',str(clf)] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PD'][str(clf)])
            
        for j in len(range(Test_df_all.shape[0])):
            Test_df_all['DeLong Test'].loc[i,'score'] = delong_roc_test(labels.values.ravel(), predict_df_all['ALL'][Test_df_all['DeLong Test'].loc[i,'1st Algorithm']], predict_df_all['ALL'][Test_df_all['DeLong Test'].loc[i,'2nd Algorithm']])
    
        return Test_df_sets, Test_df_all
    
    def bootstrap_test(predict_df_all, labels):
    
        Test_df_sets = pd.DataFrame(np.zeros((2, len(base_models))), index=['ALL/PDE','ALL/PD'], columns=[str(i) for i in base_models])
        Test_df_sets.columns = pd.MultiIndex.from_product([['Bootstrap Test'], Test_df_sets.columns])
        
        Test_df_all = pd.DataFrame(list(combinations(Test_df['Bootstrap Test'].columns,2)),columns = ['1st Algorithm', '2nd Algorithm', 'score'])
        Test_df_all['score'] = 0
        Test_df_all.columns = pd.MultiIndex.from_product([['Bootstrap Test'], Test_df_all.columns])
            
        for i, clf in enumerate(self.base_models):
        
            Test_df['Bootstrap Test'].loc['ALL/PDE',str(clf)] = pvalue(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PDE'][str(clf)], score_fun=roc_auc_score)
            Test_df['Bootstrap Test'].loc['ALL/PD',str(clf)] = pvalue(labels.values.ravel(), predict_df_all['ALL'][str(clf)], predict_df_all['PD'][str(clf)], score_fun=roc_auc_score)
            
        for j in len(range(Test_df_all.shape[0])):
            Test_df_all['Bootstrap Test'].loc[i,'score'] = pvalue(labels.values.ravel(), predict_df_all['ALL'][Test_df_all['Bootstrap Test'].loc[i,'1st Algorithm']], predict_df_all['ALL'][Test_df_all['Bootstrap Test'].loc[i,'2nd Algorithm']], score_fun=roc_auc_score)
    
        return Test_df_sets, Test_df_all
    
    def likelihood_RT(predict_df_all, labels):
    
        Test_df_sets = pd.DataFrame((np.zeros((2, 1))), index=['ALL/PDE','ALL/PD'], columns=['LogisticRegression()'])
        Test_df_sets.columns = pd.MultiIndex.from_product([['LRT'], Test_df_sets.columns])

        alt_log_likelihood = -log_loss(labels,
                                       predict_df_all['ALL']['LogisticRegression()'],
                                       normalize=False)
        null_log_likelihood = -log_loss(ytest,
                                        predict_df_all['PDE']['LogisticRegression()'],
                                        normalize=False)
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        p_log_l = chi2.sf(G, x_train.shape[1])
        
        alt_log_likelihood = -log_loss(labels,
                                       predict_df_all['ALL']['LogisticRegression()'],
                                       normalize=False)
        null_log_likelihood = -log_loss(ytest,
                                        predict_df_all['PD']['LogisticRegression()'],
                                        normalize=False)
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        
        p_log_2 = chi2.sf(G, x_train.shape[1])
        
        Test_df_sets['LRT'].loc['ALL/PDE','LogisticRegression()'] = p_log_l
        Test_df_sets['LRT'].loc['ALL/PD','LogisticRegression()'] = p_log_2

        return Test_df_sets
    
    def combined_ftest_5x2cv(estimator1, estimator2, x_train, y_train, scoring, random_seed):

        if isinstance(scoring, str):
            scorer = get_scorer(scoring)
        else:
            scorer = scoring

        variances = []
        differences = []

        def score_diff(X_1, X_2, y_1, y_2):

            estimator1.fit(X_1, y_1)
            estimator2.fit(X_1, y_1)
            est1_score = scorer(estimator1, X_2, y_2)
            est2_score = scorer(estimator2, X_2, y_2)
            score_diff = est1_score - est2_score
            return score_diff

        for i in range(5):

            X_1, X_2, y_1, y_2 = train_test_split(x_train, y_train, test_size=0.5, random_state=random_state)

            score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
            score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
            score_mean = (score_diff_1 + score_diff_2) / 2.
            score_var = ((score_diff_1 - score_mean)**2 + (score_diff_2 - score_mean)**2)

            differences.extend([score_diff_1**2, score_diff_2**2])
            variances.append(score_var)

        numerator = sum(differences)
        denominator = 2*(sum(variances))
        f_stat = numerator / denominator

        p_value = scipy.stats.f.sf(f_stat, 10, 5)

        return float(f_stat), float(p_value)
    
    def f_test(estimators_list, x_train, y_train):
    
        estimators = []
        for i in range(len(self.base_models)):
            estimators.append(estimators_list[i][2])
        estimators = list(combinations(estimators,2))
        
        p_values = []

        for i in range(len(estimators)): 

            estimator1 = eval(estimators[i][0])
            estimator2 = eval(estimators[i][1])
            
            f_stat, p_value = combined_ftest_5x2cv(estimator1, estimator2, x_train, y_train, roc_auc_score, random_state)
            
            p_values.append(p_value)
            
        f_p_values = pd.DataFrame(columns = ['algorithm1', 'algorithm2', 'score'])    
            
        for j in range(len(estimators)): 
            a.loc[j,'algorithm1'] = estimators[j][0]
            a.loc[j,'algorithm2'] = estimators[j][1]
            a.loc[j,'score'] = p_values[j]
        
        return(f_p_values)

In [427]:
def yeoj_graph(x_train, lbd_list, feature=''):
    
    plt.figure(figsize=(8,6))

    for i in range(len(lbd_list)):
        n_lines = len(lbd_list)
        c = np.arange(1, n_lines + 1)
        norm = mpl.colors.Normalize(vmin=c.min(), vmax=c.max())
        cmap = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
        cmap.set_array([])
        a = x_train[feature].values.ravel()
        a = np.sort(a)
        b = stats.yeojohnson(x_train[feature], lmbda=lbd_list[i])
        b = np.sort(b)
        plt.plot(a,b, c=cmap.to_rgba(i + 1), label='λ = '+str(lbd_list[i]))
    plt.legend(loc=0)
    plt.ylabel("ψ(λ,x)", fontsize=15)
    plt.xlabel("x", fontsize=15)
    plt.savefig('yeo-johnson.png', dpi=1200)
    
    return plt.show()

In [None]:
# All vs without PD

In [None]:
# DeLong Test

In [None]:
p_rdf_1 = delong_roc_test(ytest.values.ravel(), predict_rdf_1, predict_rdf_2)

In [None]:
math.exp(p_rdf_1)

In [None]:
# Bootstraping - it is possible that not probabilities but {0,1} has to be passed; then code needs to be adjusted

In [None]:
p_rdf_2, z = pvalue(ytest.values.ravel(), predict_rdf_1, predict_rdf_2, score_fun=roc_auc_score)

In [None]:
p_rdf_2

In [None]:
# All vs only PD

In [None]:
# DeLong Test

In [None]:
p_rdf_3 = delong_roc_test(ytest.values.ravel(), predict_rdf_1, predict_rdf_3)

In [None]:
predict_rdf_1

In [None]:
math.exp(p_rdf_3)

In [None]:
# Bootstraping

In [None]:
p_rdf_4, z = pvalue(ytest.values.ravel(), predict_rdf_1, predict_rdf_3, score_fun=roc_auc_score)

In [None]:
p_rdf_4

In [None]:
# Model CV
scores = cross_val_score(rdf_1, xtest.values, ytest.values.ravel(), scoring='roc_auc')
roc_auc = np.mean(scores)
print('ROC AUC: %.2f%%' % (100*roc_auc))

# Confidence interval
lower = np.percentile(scores, 2.5)
upper = np.percentile(scores, 97.5)
print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))

### B. XGBoost Classification

#### 1. All

In [None]:
from xgboost import XGBClassifier
xgb_1 =  XGBClassifier(tree_method = 'exact',
                     objective = 'binary:logistic',
                     n_estimators = 1200,
                     min_child_weight = 8,
                     max_depth = 2,
                     gamma = 0.3,
                     reg_lambda = 90,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb_1.fit(xtrain,ytrain)

In [None]:
predict_xgb_1 = xgb_1.predict_proba(xtest)

In [None]:
predict_xgb_1 = predict_xgb_1[:,1]

In [None]:
roc_auc_score(ytest, predict_xgb_1)

#### 2. PD Ommitted

In [None]:
xgb_2 = XGBClassifier(tree_method = 'auto',
                     objective = 'binary:logistic',
                     n_estimators = 1000,
                     min_child_weight = 4,
                     max_depth = 2,
                     gamma = 0.5,
                     reg_lambda = 50,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb_2.fit(xtrain_nopd,ytrain)

In [None]:
predict_xgb_2 = xgb_2.predict_proba(xtest_nopd)

In [None]:
predict_xgb_2 = predict_xgb_2[:,1]

In [None]:
roc_auc_score(ytest, predict_xgb_2)

#### 3. Only PD

In [None]:
xgb_3 =  XGBClassifier(tree_method = 'hist',
                     objective = 'binary:logistic',
                     n_estimators = 200,
                     min_child_weight = 3,
                     max_depth = 2,
                     gamma = 0.0,
                     reg_lambda = 50,
                     learning_rate = 0.01,
                     scale_pos_weight = 119.85522788203754,
                     random_state = random_state)

In [None]:
xgb_3.fit(xtrain_pd,ytrain)

In [None]:
predict_xgb_3 = xgb_3.predict_proba(xtest_pd)

In [None]:
predict_xgb_3 = predict_xgb_3[:,1]

In [None]:
roc_auc_score(ytest, predict_xgb_3)

#### 4. Bootstraping and DeLong Test

In [None]:
# All vs without PD

In [None]:
# DeLong Test

In [None]:
p_xgb_1 = delong_roc_test(ytest.values.ravel(), predict_xgb_1, predict_xgb_2)

In [None]:
math.exp(p_xgb_1)

In [None]:
# Bootstraping

In [None]:
p_xgb_2, z = pvalue(ytest.values.ravel(), predict_xgb_1, predict_xgb_2, score_fun=roc_auc_score)

In [None]:
p_xgb_2

In [None]:
# All vs only PD

In [None]:
# DeLong Test

In [None]:
p_xgb_3 = delong_roc_test(ytest.values.ravel(), predict_xgb_1, predict_xgb_3)

In [None]:
math.exp(p_xgb_3)

In [None]:
# Bootstraping

In [None]:
p_xgb_4, z = pvalue(ytest.values.ravel(), predict_xgb_1, predict_xgb_3, score_fun=roc_auc_score)

In [None]:
p_xgb_4

### C. Logistic Regression Classification

#### 1. All

In [381]:
from sklearn.linear_model import LogisticRegression
log_1 =  LogisticRegression(penalty = 'l2',
                         solver = 'lbfgs',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.03359818286283781)

In [382]:
log_1.fit(xtrain,ytrain)

LogisticRegression(C=0.03359818286283781, class_weight='balanced',
                   random_state=2020)

In [383]:
predict_log_1 = log_1.predict_proba(xtest)

In [384]:
predict_log_1 = predict_log_1[:,1]

In [385]:
roc_auc_score(ytest, predict_log_1)

0.9225326200417537

#### 2. PD Ommitted

In [386]:
log_2 =  LogisticRegression(penalty = 'l2',
                         solver = 'newton-cg',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.03359818286283781)

In [470]:
xtrain_nopd = xtrain.loc[:,xtrain.columns != 'pd']
xtest_nopd = xtest.loc[:,xtest.columns != 'pd']

In [471]:
log_2.fit(xtrain_nopd,ytrain)

LogisticRegression(C=0.03359818286283781, class_weight='balanced',
                   random_state=2020, solver='newton-cg')

In [472]:
predict_log_2 = log_2.predict_proba(xtest_nopd)

In [473]:
predict_log_2 = predict_log_2[:,1]

In [474]:
roc_auc_score(ytest, predict_log_2)

0.9135914665970772

In [475]:
xtest_nopd

Unnamed: 0,atch,empch,salech,roech,ptbch,dlcpdlttdebit,nwcdta,redat,ebitdat,mvaluedtd,...,dtdat,actdlct,quickratio,bvdmv,nidseq,actdnat,ebitdxint,redsale,nidsale,ebitdsale
34821,0.074614,0.395604,0.001913,-0.090583,1.198695,3.837877,0.786032,0.219158,0.012346,0.027980,...,0.047381,5.279364,0.672710,0.467244,0.011865,1.225588,0.226686,0.281262,0.012048,0.015844
43997,0.066593,0.523044,0.653347,4.288486,21.293688,-5.435913,-0.005221,-0.318094,-0.133187,0.353727,...,0.723993,0.950281,0.753601,0.042694,-2.321567,1.142058,-0.515496,-0.931641,-0.572588,-0.390082
37849,0.151686,0.069767,0.113960,-0.000376,-1.813216,1.146984,0.015560,0.384759,0.099188,0.103472,...,0.113766,1.040633,0.287743,0.455054,0.122928,0.796451,0.036940,0.115344,0.018058,0.029735
61278,0.233387,0.200000,0.244911,0.076141,-0.380435,11.589041,0.524310,-1.226347,0.023982,1.947670,...,0.277924,2.198198,0.792042,3.469431,-0.257465,1.942933,4.577400,-1.849851,-0.192270,0.036174
3616,0.079972,0.029520,0.138705,0.011085,0.504975,4.711203,0.013980,0.165614,0.062117,0.399008,...,0.292645,1.230389,0.951531,0.420929,0.089412,0.241834,0.318678,0.494824,0.082474,0.185593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20751,0.113819,0.112600,0.138292,-0.025657,0.581703,1.278457,0.394583,0.523434,0.148059,0.133484,...,0.189286,3.457386,2.257829,0.386293,0.173398,1.013454,0.083159,0.443786,0.077263,0.125529
32829,0.241142,0.134888,0.192826,0.024265,1.480461,0.000000,0.435059,0.396876,0.182044,0.000000,...,0.000000,1.977005,1.808575,0.141071,0.245877,1.587080,0.000000,0.355831,0.122283,0.163217
28330,0.030897,0.146051,0.143349,0.110031,-0.842300,1.157527,0.290178,0.328776,0.235045,0.128600,...,0.272071,1.860646,1.082056,0.181488,0.450480,1.633864,0.045712,0.303109,0.159116,0.216696
44680,0.129805,0.162963,0.094389,-0.059211,0.060807,18.607639,-0.226996,0.045959,0.032461,3.800677,...,0.604019,0.382717,0.268883,1.494953,0.011704,0.592372,0.864308,0.113165,0.006491,0.079928


#### 3. Only PD

In [None]:
log_3 =  LogisticRegression(penalty = 'l1',
                         solver = 'liblinear',
                         class_weight = 'balanced',
                         random_state = random_state,
                         C = 0.0006951927961775605)

In [None]:
log_3.fit(xtrain_pd,ytrain)

In [None]:
predict_log_3 = log_3.predict_proba(xtest_pd)

In [None]:
predict_log_3 = predict_log_3[:,1]

In [None]:
roc_auc_score(ytest, predict_log_3)

#### 4. Bootstraping and DeLong Test

In [None]:
# All vs without PD

In [None]:
# DeLong Test

In [None]:
p_log_1 = delong_roc_test(ytest.values.ravel(), predict_log_1, predict_log_2)

In [None]:
math.exp(p_log_1)

In [None]:
# Bootstraping

In [None]:
p_log_2, z = pvalue(ytest.values.ravel(), predict_log_1, predict_log_2, score_fun=roc_auc_score)

In [None]:
p_log_2

In [None]:
# All vs only PD

In [None]:
# DeLong Test

In [None]:
p_log_3 = delong_roc_test(ytest.values.ravel(), predict_log_1, predict_log_3)

In [None]:
math.exp(p_log_3)

In [None]:
# Bootstraping

In [None]:
p_log_4, z = pvalue(ytest.values.ravel(), predict_log_1, predict_log_3,score_fun=roc_auc_score)

In [None]:
p_log_4

In [None]:
# Macimum Likelihood Tests

In [None]:
# All/without PD

In [None]:
from sklearn.metrics import log_loss
from scipy.stats import chi2

In [392]:
predict_df_all

Unnamed: 0_level_0,ALL,ALL,NOPD,NOPD
Unnamed: 0_level_1,LogisticRegression(),RandomForestClassifier(),LogisticRegression(),RandomForestClassifier()
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
19315,0.0,0.0,0.0,0.0
19316,0.0,0.0,0.0,0.0
19317,0.0,0.0,0.0,0.0
19318,0.0,0.0,0.0,0.0


In [None]:
def likelihood_RT():
    
    Test_df_sets = pd.DataFrame(np.zeros((2, len('LogisticRegression()'))), index=['ALL/PDE','ALL/PD'], columns=['LogisticRegression()'])
    Test_df_sets.columns = pd.MultiIndex.from_product([['LRT'], Test_df_sets.columns])
    
    alt_log_likelihood = -log_loss(labels,
                                   predict_log_1,
                                   normalize=False)
    null_log_likelihood = -log_loss(ytest,
                                    predict_log_2,
                                    normalize=False)
    G = 2 * (alt_log_likelihood - null_log_likelihood)
    p_log_l = chi2.sf(G, xtrain.shape[1])
    
    return Test_df_sets

In [None]:
alt_log_likelihood = -log_loss(ytest,
                               predict_log_1,
                               normalize=False)
null_log_likelihood = -log_loss(ytest,
                                predict_log_2,
                                normalize=False)

In [None]:
G = 2 * (alt_log_likelihood - null_log_likelihood)
p_log_l = chi2.sf(G, xtrain.shape[1])

In [None]:
p_log_l

In [None]:
# All/Only PD

In [None]:
alt_log_likelihood = -log_loss(ytest,
                               predict_log_1,
                               normalize=False)
null_log_likelihood = -log_loss(ytest,
                                predict_log_3,
                                normalize=False)

In [None]:
G = 2 * (alt_log_likelihood - null_log_likelihood)
p_log_2 = chi2.sf(G, xtrain.shape[1])

In [None]:
p_log_2

### D. Models ROC Curves Tests

In [None]:
# RF vs XGBoost

In [None]:
# DeLong Test

In [None]:
p_rf_xgb = delong_roc_test(ytest.values.ravel(), predict_rdf_1, predict_xgb_1)

In [None]:
math.exp(p_rf_xgb)

In [None]:
# Bootstraping

In [None]:
p_rf_xgb2, z = pvalue(ytest.values.ravel(), predict_xgb_1, predict_rdf_1, score_fun=roc_auc_score)

In [None]:
p_rf_xgb2

In [None]:
# RF vs LogReg

In [None]:
# DeLong Test

In [None]:
p_rf_logreg = delong_roc_test(ytest.values.ravel(), predict_rdf_1, predict_log_1)

In [None]:
math.exp(p_rf_logreg)

In [None]:
# Bootstraping

In [None]:
p_rf_logreg2, z = pvalue(ytest.values.ravel(), predict_rdf_1, predict_log_1, score_fun=roc_auc_score)

In [None]:
p_rf_logreg2

In [None]:
from statistics import mean 

In [None]:
# XGBoost vs LogReg

In [None]:
# DeLong Test

In [None]:
p_xgb_logreg = delong_roc_test(ytest.values.ravel(), predict_xgb_1, predict_log_1)

In [None]:
math.exp(p_xgb_logreg)

In [None]:
# Bootstraping

In [None]:
p_xgb_logreg2, z = pvalue(ytest.values.ravel(), predict_xgb_1, predict_log_1, score_fun=roc_auc_score)

In [None]:
p_xgb_logreg2

In [None]:
import scipy.stats

In [None]:
sum(z)

### E. Bootstraping Boxplots

#### 1. Random Forest

In [368]:
from sklearn.utils import resample

In [369]:
from scipy.stats import norm

In [None]:
# def predict(clf, X_train, y_train, X_test, y_test): 
#     xtest_1 = np.array(X_test)
#     ytest_1 = np.array(y_test)
    
#     folds = list(StratifiedKFold(n_splits=5, shuffle=True, 
#                              random_state = random_state).split(xtest_1, ytest_1))
    
#     x_list = np.zeros((len(folds[0][0]), len(folds)))
#     y_list = np.zeros((len(folds[0][0]), len(folds)))

#     for j, (train_idx, valid_idx) in enumerate(folds):

#         X_valid = xtest_1[train_idx]
#         Y_valid = ytest_1[train_idx]
        
#         clf.fit(X_train, y_train)

#         valid_pred = clf.predict_proba(X_valid)[:,1]

#         y_list[:,j] = Y_valid[:,0]
#         x_list[:,j] = valid_pred

#     return y_list, x_list

In [None]:
# rdf_list_y, rdf_list_x = predict(rdf_1, xtrain, ytrain, xtest, ytest)

In [None]:
# bootstrapped_rdf_1 = []

In [None]:
# for i in range(0,5): 
#     bootstrapped_rdf = bootstrap_error_estimate(rdf_list_x[:,i], rdf_list_y[:,i],method)
#     lower = bootstrapped_rdf_1.append(bootstrapped_rdf[0])
#     upper = bootstrapped_rdf_1.append(bootstrapped_rdf[1])

In [None]:
class Create_classifier(object):
    def __init__(self, n_splits, base_models, grids):
        self.n_splits = n_splits
        self.base_models = base_models
        self.grids = grids

    def predict(self, x_train, y_train, x_test, y_test, chosen_set = ''):
        
#         x_train = x_train
#         x_test =x_test
        
#         x_train_nopd = x_train.loc[:,x_train.columns != Merton_label]
#         x_test_nopd = x_test.loc[:,x_test.columns != Merton_label]
        
#         x_train_pd = x_train.loc[:,x_train.columns == Merton_label]
#         x_test_pd = x_test.loc[:,x_test.columns == Merton_label]
        
#         chosen_train_set = [x_train, x_train_nopd, x_train_pd]
#         chosen_test_set = [x_test, x_test_nopd, x_test_pd]
        
        
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state = random_state)
                  
        roc_auc_scores = pd.DataFrame(columns = [str(i) for i in self.base_models])
        test_pred = pd.DataFrame(np.zeros((x_test.shape[0], len(self.base_models))), columns=[str(i) for i in self.base_models])
        test_pred.columns = pd.MultiIndex.from_product([[chosen_set], test_pred.columns])
        feat_selected = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
        feat_importance = pd.DataFrame(np.zeros((len(x_train.columns), len(self.base_models))), index=x_train.columns, columns=[str(i) for i in self.base_models])
                  
        for i, clf in enumerate(self.base_models):
            for train_index, test_index in cv.split(x_train):
        
            pipe_lr = make_pipeline(PowerTransformer(method='yeo-johnson',standardize = True),
                                    RFECV(estimator = clf, step = 1, cv=cv, scoring = 'roc_auc'),
                                    clf)
                  
            search = RandomizedSearchCV(estimator=pipe_lr, param_distributions = self.grids[i], cv = cv, n_jobs=-1, verbose=True, scoring = 'roc_auc')
            search.fit(x_train, y_train)        
            
            predict_rdf = search.predict_proba(x_test)[:,1]
            test_pred[chosen_set][str(clf)] = predict_rdf
                  
            roc_auc_scores.loc[0,str(clf)] = roc_auc_score(y_test, predict_rdf)
                  
            for j in x_train.columns:
                feat_est = dict(zip(x_train.columns, search.best_estimator_.named_steps["rfecv"].ranking_))
                feat_selected.loc[str(j), str(clf)] = feat_est[str(j)]
                
                try:
                    importances = dict(zip(x_train.columns, search.best_estimator_.named_steps[str(clf).split('(')[0].lower()].feature_importances_))
                    feat_importance.loc[str(j), str(clf)] = importances[str(j)]
                except Exception:
                    pass
            
            estimator1.fit(X_train, y_train)
            est1_score = scorer(estimator1, X_test, y_test)
            score_diff.append(est1_score - est2_score)
            
            estimator2.fit(X_train, y_train)
                
        return roc_auc_scores, feat_selected, feat_importance, test_pred

In [440]:
def method(x,y):
    return roc_auc_score(x,y)

In [441]:
def bootstrap_error_estimate(pred, truth, method, method_name="", alpha=0.95, sample_frac=0.5, iterations=2000):
    """
    Generate a bootstrapped estimate of confidence intervals
    :param pred: list of predicted values
    :param truth: list of experimental values
    :param method: method to evaluate performance, e.g. matthews_corrcoef
    :param method_name: name of the method for the progress bar
    :param alpha: confidence limit (e.g. 0.95 for 95% confidence interval)
    :param sample_frac: fraction to resample for bootstrap confidence interval
    :param iterations: number of iterations for resampling
    :return: lower and upper bounds for confidence intervals
    """
    index_list = range(0, len(pred))
    num_samples = int(len(index_list) * sample_frac)
    stats = []
    for _ in range(0, iterations):
        sample_idx = resample(index_list, n_samples=num_samples)
        pred_sample = [pred[x] for x in sample_idx]
        truth_sample = [truth[x] for x in sample_idx]
        stats.append(method(truth_sample, pred_sample))
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha+((1.0-alpha)/2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    return lower, upper

In [None]:
for i in enumerate(self.base_models):
    predict_df_all[]

In [571]:
def interval_graph(predict_df_all, ):

    columns = []
    delong = []
    bootstrap = []

    for i in predict_df_all.columns:
        columns.append(i)
        
    for j in enumerate(columns):
        delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[j]], alpha=0.95))
        bootstrap.append(bootstrap_error_estimate(ytest.values.ravel(), predict_df_all[columns[j]], roc_auc_score))

In [917]:
columns = []
delong = []
bootstrap = []

for i in predict_df_all.columns:
    columns.append(i)
    for j in enumerate(columns):
        delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[j[0]]], alpha=0.95))
#         bootstrap.append(bootstrap_error_estimate(ytest.values.ravel(), predict_df_all[columns[j]], roc_auc_score))

In [102]:
delong

NameError: name 'delong' is not defined

In [913]:
predict_df_all[columns[0]]

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
19315    0.0
19316    0.0
19317    0.0
19318    0.0
19319    0.0
Name: (ALL, LogisticRegression()), Length: 19320, dtype: float64

In [103]:
delong[0][0]

NameError: name 'delong' is not defined

In [591]:
columns = []
for i in predict_df_all.columns:
    columns.append(i)

In [594]:
columns[0][0]

'ALL'

In [587]:
delong = []

for j in enumerate(columns):
    delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[j[0]]], alpha=0.95))

In [589]:
for j in enumerate(columns):
    print(columns[j[0]])

('ALL', 'LogisticRegression()')
('ALL', 'RandomForestClassifier()')
('NOPD', 'LogisticRegression()')
('NOPD', 'RandomForestClassifier()')


In [942]:
delong

[array([nan, nan]), array([nan, nan]), array([nan, nan]), array([nan, nan])]

In [930]:
columns = []

for i in predict_df_all.columns:
    columns.append(i)

In [936]:
columns

[('ALL', 'LogisticRegression()'),
 ('ALL', 'RandomForestClassifier()'),
 ('NOPD', 'LogisticRegression()'),
 ('NOPD', 'RandomForestClassifier()')]

In [939]:
delong = []
for j,k in enumerate(columns):
    delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[j]], alpha=0.95))

In [940]:
delong

[array([nan, nan]), array([nan, nan]), array([nan, nan]), array([nan, nan])]

In [922]:
predict_df_all[columns[0]]

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
19315    0.0
19316    0.0
19317    0.0
19318    0.0
19319    0.0
Name: (ALL, LogisticRegression()), Length: 19320, dtype: float64

In [552]:
predict_df_all.columns

MultiIndex([( 'ALL',     'LogisticRegression()'),
            ( 'ALL', 'RandomForestClassifier()'),
            ('NOPD',     'LogisticRegression()'),
            ('NOPD', 'RandomForestClassifier()')],
           )

In [556]:
for i in base_models:
    print(i)

LogisticRegression()
RandomForestClassifier()


In [None]:
predict_df_all.loc[:, predict_df_all.columns.get_level_values(1).isin([str(i) for i in self.base_models])] 

In [564]:
delong = []
for j in predict_df_all.loc[:, predict_df_all.columns.get_level_values(1).isin(['LogisticRegression()'])]:
    delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[j[1]]], alpha=0.95))

TypeError: list indices must be integers or slices, not str

In [602]:
for j in list(range(3)):
    print(j)

0
1
2


In [604]:
delong = []
for i in base_models:
    for j in list(range(3)):
        delong.append(calc_auc_ci(ytest.values.ravel(), predict_df_all.loc[:, predict_df_all.columns.get_level_values(1).isin([str(i) for i in base_models])].values[:,j]))

In [608]:
delong[1][1]

nan

In [None]:
bootstrapped_rdf_1

In [None]:
bootstrapped_rdf_2 = bootstrap_error_estimate(a, ytest.values.ravel(),method)

In [None]:
bootstrapped_rdf_3 = bootstrap_error_estimate(predict_rdf_3, ytest.values.ravel(),method)

In [513]:
delong_rdf_1 = calc_auc_ci(ytest.values.ravel(), a.values.ravel(), alpha=0.95)

In [None]:
delong_rdf_2 = calc_auc_ci(ytest.values.ravel(), predict_rdf_2.values, alpha=0.95)

In [None]:
delong_rdf_3 = calc_auc_ci(ytest.values.ravel(), predict_rdf_3, alpha=0.95)

In [None]:
# n_bootstraps = 2000
# rng_seed = 42  # control reproducibility

In [None]:
# # rdf_1
# bootstrapped_rdf_1 = []
# indices = rng.randint(0, len(predict_rdf_1), len(predict_rdf_1))

In [None]:
# # rdf_2
# bootstrapped_rdf_2 = []
# indices = rng.randint(0, len(predict_rdf_2), len(predict_rdf_2))

In [None]:
# # rdf_3
# bootstrapped_rdf_3 = []
# indices = rng.randint(0, len(predict_rdf_3), len(predict_rdf_3))

In [None]:
# rng = np.random.RandomState(rng_seed)
# for i in range(n_bootstraps):
#     # bootstrap by sampling with replacement on the prediction indices
#     indices = rng.randint(0, len(predict_rdf_3), len(predict_rdf_3))
#     if len(np.unique(ytest)) < 2:
#         # We need at least one positive and one negative sample for ROC AUC
#         # to be defined: reject the sample
#         continue

#     score = roc_auc_score(ytest.values.ravel()[indices], predict_rdf_3[indices])
#     bootstrapped_rdf_3.append(score)

In [None]:
# bootstrapped_rdf_1 = np.array(bootstrapped_rdf_1)
# bootstrapped_rdf_1.sort()

# # Computing the lower and upper bound of the 90% confidence interval
# # You can change the bounds percentiles to 0.025 and 0.975 to get
# # a 95% confidence interval instead.
# confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
# confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]
# print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
#     confidence_lower, confidence_upper))

In [None]:
# intervals = []
# sample_means = []

# for sample in data_to_plot_rdf:
#     sample_mean = np.mean(sample)
#     sample_means.append(sample_mean)

#     sorted_scores = np.array(sample)
#     sorted_scores.sort()

#     # Computing the lower and upper bound of the 90% confidence interval
#     # You can change the bounds percentiles to 0.025 and 0.975 to get
#     # a 95% confidence interval instead.
#     confidence_lower = sorted_scores[int(0.0025 * len(sorted_scores))]
#     confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]

#     confidence_interval = (confidence_lower,
#                            confidence_upper)  
    
#     intervals.append(confidence_interval)

In [539]:
delong[0][1]

nan

In [None]:
bootstrap[0][0]

In [None]:
for i,j in 

In [None]:
intervals_rdf = [bootstrapped_rdf_1, bootstrapped_rdf_2, bootstrapped_rdf_3, delong_rdf_1, delong_rdf_2, delong_rdf_3]

In [941]:
def interval_graph(predict_df_all, ):

    columns = []
    delong = []
    bootstrap = []

    for i in predict_df_all.columns:
        columns.append(i)
        
    for j in enumerate(columns):
        delong.append([columns[j], calc_auc_ci(ytest.values.ravel(), predict_df_all[columns[0]], alpha=0.95)])
        bootstrap.append([columns[j], bootstrap_error_estimate(ytest.values.ravel(), predict_df_all[columns[0]], roc_auc_score)])
    
    delong_df = pd.DataFrame(columns = ['Set', 'Algorithm', 'Score'])
    b = pd.DataFrame(columns = ['Set', 'Algorithm', 'Score'])
    
    for j in range(len(delong)): 
        a.loc[j,'set'] = delong[j][0]
        a.loc[j,'algorithm'] = delong[j][1]
        a.loc[j,'score'] = delong[j][0]
        b.loc[j,'set'] = bootstrap[j][0]
        b.loc[j,'algorithm'] = bootstrap[j][1]
        b.loc[j,'score'] = bootstrap[j][0]
        
    a.groupby(['algorithm'])[['Set', 'Score']].min()

In [109]:
len(base_models)

2

In [115]:
predict_df

Unnamed: 0_level_0,ALL,ALL,NOPD,NOPD
Unnamed: 0_level_1,LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020)
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
19315,0.0,0.0,0.0,0.0
19316,0.0,0.0,0.0,0.0
19317,0.0,0.0,0.0,0.0
19318,0.0,0.0,0.0,0.0


In [314]:
pred_df3 = pd.DataFrame(np.zeros((xtest.shape[0], len(base_models))), columns=[str(i) for i in base_models])
pred_df3.columns = pd.MultiIndex.from_product([['PD'], pred_df3.columns])

In [315]:
pred_df_all = [pred_df,pred_df2,pred_df3]

In [316]:
pred_df = pd.concat(pred_df_all, axis = 1)

In [325]:
pred_df.iloc[0][2]

0.0

In [326]:
for i in range(pred_df.values.shape[0]):
    pred_df.iloc[i][0] = random()
    pred_df.iloc[i][1] = random()
    pred_df.iloc[i][2] = random()
    pred_df.iloc[i][3] = random()
    pred_df.iloc[i][4] = random()
    pred_df.iloc[i][5] = random()

In [334]:
predict_df = pred_df.copy(deep=False)

In [None]:
random()

In [280]:
random()

0.3241478056110484

In [None]:
columns = []
delong = []
bootstrap = []

for i in predict_df.columns:
    columns.append(i)

for j in enumerate(columns):
    delong.append([columns[j], calc_auc_ci(ytest.values.ravel(), predict_df[columns[0]], alpha=0.95)])
    bootstrap.append([columns[j], bootstrap_error_estimate(ytest.values.ravel(), predict_df[columns[0]], roc_auc_score)])

In [329]:
base_models

[LogisticRegression(random_state=2020),
 RandomForestClassifier(random_state=2020)]

In [345]:
columns[0]

('ALL', 'LogisticRegression(random_state=2020)')

In [351]:
delong = []
for i,j in enumerate(columns):
    delong.append([columns[i], calc_auc_ci(ytest.values.ravel(), predict_df[columns[i]], alpha=0.95)])

In [352]:
delong

[[('ALL', 'LogisticRegression(random_state=2020)'),
  array([0.470688  , 0.56133249])],
 [('ALL', 'RandomForestClassifier(random_state=2020)'),
  array([0.49467452, 0.58076716])],
 [('NOPD', 'LogisticRegression(random_state=2020)'),
  array([0.46338199, 0.55223387])],
 [('NOPD', 'RandomForestClassifier(random_state=2020)'),
  array([0.47753506, 0.5634207 ])],
 [('PD', 'LogisticRegression(random_state=2020)'),
  array([0.4610133, 0.5505779])],
 [('PD', 'RandomForestClassifier(random_state=2020)'),
  array([0.48170062, 0.57266068])]]

In [346]:
delong = []
delong.append([columns[0], calc_auc_ci(ytest.values.ravel(), predict_df[columns[0]], alpha=0.95)])

In [347]:
delong

[[('ALL', 'LogisticRegression(random_state=2020)'),
  array([0.470688  , 0.56133249])]]

In [355]:
predict_df[columns[0]].values

array([0.15885549, 0.97321207, 0.04793046, ..., 0.53369612, 0.54307258,
       0.78709885])

In [357]:
bootstrap.append([columns[0], bootstrap_error_estimate(predict_df[columns[0]], ytest.values.ravel(), roc_auc_score)])

In [340]:
for i,j in enumerate(columns):
    delong.append([columns[i], calc_auc_ci(ytest.values.ravel(), predict_df[columns[i]], alpha=0.95)])
    bootstrap.append([columns[i], bootstrap_error_estimate(ytest.values.ravel(), predict_df[columns[i]], roc_auc_score)])

ValueError: continuous format is not supported

In [339]:
columns = []
delong = []
bootstrap = []

for i in predict_df.columns:
    columns.append(i)

In [391]:
def graph_ci(predict_df, base_models):
    
    columns = []
    delong = []
    bootstrap = []

    for i in predict_df.columns:
        columns.append(i)

    for i,j in enumerate(columns):
        delong.append([columns[i], calc_auc_ci(ytest.values.ravel(), predict_df[columns[i]], alpha=0.95)])
        bootstrap.append([columns[i], bootstrap_error_estimate(predict_df[columns[i]], ytest.values.ravel(), roc_auc_score)])

    delong_df = pd.DataFrame(delong).rename(columns = {0:'Set', 1:'Score'})
    delong_df[['Set', 'Algorithm']] = pd.DataFrame(delong_df['Set'].tolist(), index=delong_df.index)
    delong_df[['Lower', 'Upper']] = pd.DataFrame(delong_df['Score'].tolist(), index=delong_df.index)
    bootstrap_df = pd.DataFrame(bootstrap).rename(columns = {0:'Set', 1:'Score'})
    bootstrap_df[['Set', 'Algorithm']] = pd.DataFrame(bootstrap_df['Set'].tolist(), index=bootstrap_df.index)
    bootstrap_df[['Lower', 'Upper']] = pd.DataFrame(bootstrap_df['Score'].tolist(), index=bootstrap_df.index)   

    for i,j in enumerate(base_models):
        delong_ci = delong_df.groupby(['Algorithm'])[['Set', 'Lower', 'Upper']].get_group(str(j)).reset_index()
        bootstrap_ci = bootstrap_df.groupby(['Algorithm'])[['Set', 'Lower', 'Upper']].get_group(str(j)).reset_index()

        plt.figure(figsize=(8,6))

        SMALL_SIZE = 10
        MEDIUM_SIZE = 12
        BIGGER_SIZE = 14

        plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
        plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
        plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

        x_ticks = ('Wszystkie Zmienne', 'Bez PD', 'Tylko PD')
        
        for k in range(len(x_ticks)):

            x_1 = k+1
            x_2 = x_1 + 0.1

            
            eb_1 = plt.errorbar(x=x_1, 
                             y=(bootstrap_ci['Upper'][k]+bootstrap_ci['Lower'][k])/2, 
                             yerr=[(bootstrap_ci['Upper'][k]-bootstrap_ci['Lower'][k])/2],
                             fmt='og',
                             capsize = 10,
                             ecolor = 'seagreen')

            eb_2 = plt.errorbar(x=x_2, 
                             y=(delong_ci['Upper'][k]+delong_ci['Lower'][k])/2,
                             yerr=[(delong_ci['Upper'][k]-delong_ci['Lower'][k])/2],
                             fmt='og',
                             capsize = 10,
                             ecolor = 'seagreen')
            eb_2[-1][0].set_linestyle('--')

            plt.xticks([1.05,2.05,3.05], x_ticks, rotation=90)
            plt.tight_layout()


            plt.ylabel("ROC AUC Przedział Ufności", fontsize=15)
            plt.tight_layout()

        plt.savefig('plot'+str(j)+'.png', dpi=1200)

        plt.close()

In [368]:
bootstrap

[[('ALL', 'LogisticRegression(random_state=2020)'),
  array([0.45306965, 0.57859698])]]

In [369]:
bootstrap_ci = pd.DataFrame(bootstrap).rename(columns = {0:'Set', 1:'Score'})

In [380]:
bootstrap_ci

Unnamed: 0,Set,Score,Algorithm,Lower,Upper
0,ALL,"[0.4530696528522594, 0.5785969821163989]",LogisticRegression(random_state=2020),0.45307,0.578597


In [371]:
bootstrap_ci[['Set', 'Algorithm']] = pd.DataFrame(bootstrap_ci['Set'].tolist(), index=bootstrap_ci.index)

In [372]:
bootstrap_ci[['Lower', 'Upper']] = pd.DataFrame(bootstrap_ci['Score'].tolist(), index=bootstrap_ci.index)   

In [373]:
bootstrap_ci

Unnamed: 0,Set,Score,Algorithm,Lower,Upper
0,ALL,"[0.4530696528522594, 0.5785969821163989]",LogisticRegression(random_state=2020),0.45307,0.578597


In [379]:
for i,j in enumerate(base_models):
    print(j)

LogisticRegression(random_state=2020)
RandomForestClassifier(random_state=2020)


In [381]:
bootstrap_ci = bootstrap_ci.groupby(['Algorithm'])[['Set', 'Lower', 'Upper']].get_group(str(j)).reset_index()

In [382]:
bootstrap_ci

Unnamed: 0,index,Set,Lower,Upper
0,0,ALL,0.45307,0.578597


In [392]:
graph_ci(predict_df, base_models)

In [277]:
from random import random

In [278]:
random()

0.49828526784465177

In [243]:
a = pd.DataFrame(delong).rename(columns = {0:'Set', 1:'Score'})

In [245]:
a[['Set', 'Algorithm']] = pd.DataFrame(a['Set'].tolist(), index=a.index)     

In [247]:
a[['Lower', 'Upper']] = pd.DataFrame(a['Score'].tolist(), index=a.index)     

In [194]:
delong_df = pd.DataFrame(columns = ['Set', 'Algorithm', 'Score'])

In [266]:
b = a.groupby(['Algorithm'])[['Algorithm','Set', 'Lower', 'Upper']].get_group('LogisticRegression(random_state=2020)').reset_index()

In [267]:
b

Unnamed: 0,index,Algorithm,Set,Lower,Upper
0,0,LogisticRegression(random_state=2020),ALL,,
1,2,LogisticRegression(random_state=2020),NOPD,,


In [270]:
str(b['Algorithm'].unique())

"['LogisticRegression(random_state=2020)']"

In [None]:
plt.figure(figsize=(8,6))

SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

x_ticks = ('Wszystkie Zmienne', 'Bez PD', 'Tylko PD')

x_1 = i+1
x_2 = x_1 + 0.1

eb_1 = plt.errorbar(x=x_1, 
                 y=(bootstrap_ci['upper'][i]+bootstrap_ci['lower'][i])/2, 
                 yerr=[(bootstrap_ci['upper'][i]-bootstrap_ci['lower'][i])/2],
                 fmt='og',
                 capsize = 10,
                 ecolor = 'seagreen')

eb_2 = plt.errorbar(x=x_2, 
                 y=(delong_ci['upper'][i]+delong_ci['lower'][i])/2,
                 yerr=[(delong_ci['upper'][i]-delong_ci['lower'][i])/2],
                 fmt='og',
                 capsize = 10,
                 ecolor = 'seagreen')
eb_2[-1][0].set_linestyle('--')

plt.xticks([1,2,3], x_ticks, rotation=90)
plt.tight_layout()


plt.ylabel("ROC AUC Przedział Ufności", fontsize=15)
plt.tight_layout()

plt.savefig('INT_RDF.png', dpi=1200)

plt.close('plot'+str(base_models[i])+'.png')

In [609]:
for i in base_models:
    print(i)

LogisticRegression()
RandomForestClassifier()


#### 2. XGBoost

In [None]:
bootstrapped_xgb_1 = bootstrap_error_estimate(predict_xgb_1, ytest.values.ravel(),method)

In [None]:
bootstrapped_xgb_2 = bootstrap_error_estimate(predict_xgb_2, ytest.values.ravel(),method)

In [None]:
bootstrapped_xgb_3 = bootstrap_error_estimate(predict_xgb_3, ytest.values.ravel(),method)

In [None]:
delong_xgb_1 = calc_auc_ci(ytest.values.ravel(), predict_xgb_1, alpha=0.95)

In [None]:
delong_xgb_2 = calc_auc_ci(ytest.values.ravel(), predict_xgb_2, alpha=0.95)

In [None]:
delong_xgb_3 = calc_auc_ci(ytest.values.ravel(), predict_xgb_3, alpha=0.95)

In [None]:
intervals_xgb = [bootstrapped_xgb_1, bootstrapped_xgb_2, bootstrapped_xgb_3, delong_xgb_1, delong_xgb_2, delong_xgb_3]

In [None]:
plt.figure(figsize=(8,6))

SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

x_ticks = ('All Features', 'PD Excluded', 'Only PD')

x_1 = 1
x_2 = 2
x_3 = 3
x_4 = x_1 + 0.1
x_5 = x_2 + 0.1
x_6 = x_3 + 0.1

eb_1 = plt.errorbar(x=x_1, 
             y=(intervals_xgb[0][1]+intervals_xgb[0][0])/2, 
             yerr=[(intervals_xgb[0][1]-intervals_xgb[0][0])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'royalblue')

eb_2 = plt.errorbar(x=x_2, 
             y=(intervals_xgb[1][1]+intervals_xgb[1][0])/2, 
             yerr=[(intervals_xgb[1][1]-intervals_xgb[1][0])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'lightsteelblue')

eb_3 = plt.errorbar(x=x_3, 
             y=(intervals_xgb[2][1]+intervals_xgb[2][0])/2, 
             yerr=[(intervals_xgb[2][1]-intervals_xgb[2][0])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'lavender')

eb_4 = plt.errorbar(x=x_4, 
             y=intervals_xgb[3][0], 
             yerr=[(intervals_xgb[3][1][0]-intervals_xgb[3][1][1])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'royalblue')
eb_4[-1][0].set_linestyle('--')

eb_5 = plt.errorbar(x=x_5, 
             y=intervals_xgb[4][0], 
             yerr=[(intervals_xgb[4][1][0]-intervals_xgb[4][1][1])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'lightsteelblue')
eb_5[-1][0].set_linestyle('--')

eb_6 = plt.errorbar(x=x_6, 
             y=intervals_xgb[5][0], 
             yerr=[(intervals_xgb[5][1][0]-intervals_xgb[5][1][1])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'lavender')
eb_6[-1][0].set_linestyle('--')

plt.xticks([x_1,x_2,x_3], x_ticks, rotation=90)
plt.tight_layout()


plt.ylabel("ROC AUC Score Confidence Interval", fontsize=15)
plt.tight_layout()

plt.savefig('INT_XGB.png', dpi=1200)

#### 3. Logistic Regression

In [None]:
bootstrapped_log_1 = bootstrap_error_estimate(predict_log_1, ytest.values.ravel(),method)

In [None]:
bootstrapped_log_2 = bootstrap_error_estimate(predict_log_2, ytest.values.ravel(),method)

In [None]:
bootstrapped_log_3 = bootstrap_error_estimate(predict_log_3, ytest.values.ravel(),method)

In [None]:
delong_log_1 = calc_auc_ci(ytest.values.ravel(), predict_log_1, alpha=0.95)

In [None]:
delong_log_2 = calc_auc_ci(ytest.values.ravel(), predict_log_2, alpha=0.95)

In [None]:
delong_log_3 = calc_auc_ci(ytest.values.ravel(), predict_log_3, alpha=0.95)

In [None]:
intervals_log = [bootstrapped_log_1, bootstrapped_log_2, bootstrapped_log_3, delong_log_1, delong_log_2, delong_log_3]

In [None]:
plt.figure(figsize=(8,6))

SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

x_ticks = ('All Features', 'PD Excluded', 'Only PD')

x_1 = 1
x_2 = 2
x_3 = 3
x_4 = x_1 + 0.1
x_5 = x_2 + 0.1
x_6 = x_3 + 0.1

eb_1 = plt.errorbar(x=x_1, 
             y=(intervals_log[0][1]+intervals_log[0][0])/2, 
             yerr=[(intervals_log[0][1]-intervals_log[0][0])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'chocolate')

eb_2 = plt.errorbar(x=x_2, 
             y=(intervals_log[1][1]+intervals_log[1][0])/2, 
             yerr=[(intervals_log[1][1]-intervals_log[1][0])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'burlywood')

eb_3 = plt.errorbar(x=x_3, 
             y=(intervals_log[2][1]+intervals_log[2][0])/2, 
             yerr=[(intervals_log[2][1]-intervals_log[2][0])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'navajowhite')

eb_4 = plt.errorbar(x=x_4, 
             y=intervals_log[3][0], 
             yerr=[(intervals_log[3][1][0]-intervals_log[3][1][1])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'chocolate')
eb_4[-1][0].set_linestyle('--')

eb_5 = plt.errorbar(x=x_5, 
             y=intervals_log[4][0], 
             yerr=[(intervals_log[4][1][0]-intervals_log[4][1][1])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'burlywood')
eb_5[-1][0].set_linestyle('--')

eb_6 = plt.errorbar(x=x_6, 
             y=intervals_log[5][0], 
             yerr=[(intervals_log[5][1][0]-intervals_log[5][1][1])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'navajowhite')
eb_6[-1][0].set_linestyle('--')

plt.xticks([x_1,x_2,x_3], x_ticks, rotation=90)
plt.tight_layout()


plt.ylabel("ROC AUC Score Confidence Interval", fontsize=15)
plt.tight_layout()

plt.savefig('INT_LOG.png', dpi=1200)

#### 4. All

In [None]:
bootstrapped_all_2 = bootstrap_error_estimate(predict_rdf_1, ytest.values.ravel(),method)

In [None]:
bootstrapped_all_1 = bootstrap_error_estimate(predict_xgb_1, ytest.values.ravel(),method)

In [None]:
bootstrapped_all_3 = bootstrap_error_estimate(predict_log_1, ytest.values.ravel(),method)

In [None]:
delong_all_2 = calc_auc_ci(ytest.values.ravel(), predict_rdf_1, alpha=0.95)

In [None]:
delong_all_1 = calc_auc_ci(ytest.values.ravel(), predict_xgb_1, alpha=0.95)

In [None]:
delong_all_3 = calc_auc_ci(ytest.values.ravel(), predict_log_1, alpha=0.95)

In [None]:
intervals_all = [bootstrapped_all_1, bootstrapped_all_2, bootstrapped_all_3, delong_all_1, delong_all_2, delong_all_3]

In [None]:
intervals_all

In [None]:
plt.figure(figsize=(8,6))

SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

x_ticks = ('XGBoost', 'Random Forest', 'Logistic Regression')

x_1 = 1
x_2 = 2
x_3 = 3
x_4 = x_1 + 0.1
x_5 = x_2 + 0.1
x_6 = x_3 + 0.1

eb_1 = plt.errorbar(x=x_1, 
             y=(intervals_all[0][1]+intervals_all[0][0])/2, 
             yerr=[(intervals_all[0][1]-intervals_all[0][0])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'royalblue')

eb_2 = plt.errorbar(x=x_2, 
             y=(intervals_all[1][1]+intervals_all[1][0])/2, 
             yerr=[(intervals_all[1][1]-intervals_all[1][0])/2],
             fmt='og',
             capsize = 10,
             ecolor = 'seagreen')

eb_3 = plt.errorbar(x=x_3, 
             y=(intervals_all[2][1]+intervals_all[2][0])/2, 
             yerr=[(intervals_all[2][1]-intervals_all[2][0])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'chocolate')

eb_4 = plt.errorbar(x=x_4, 
             y=intervals_all[3][0], 
             yerr=[(intervals_all[3][1][0]-intervals_all[3][1][1])/2],
             fmt='ob',
             capsize = 10,
             ecolor = 'royalblue')
eb_4[-1][0].set_linestyle('--')

eb_5 = plt.errorbar(x=x_5, 
             y=intervals_all[4][0], 
             yerr=[(intervals_all[4][1][0]-intervals_all[4][1][1])/2],
             fmt='og',
             capsize = 10,
             ecolor = 'seagreen')
eb_5[-1][0].set_linestyle('--')

eb_6 = plt.errorbar(x=x_6, 
             y=intervals_all[5][0], 
             yerr=[(intervals_all[5][1][0]-intervals_all[5][1][1])/2],
             fmt='or',
             capsize = 10,
             ecolor = 'chocolate')
eb_6[-1][0].set_linestyle('--')

plt.xticks([x_1,x_2,x_3], x_ticks, rotation=90)
plt.tight_layout()


plt.ylabel("ROC AUC Score Confidence Interval", fontsize=15)
plt.tight_layout()

plt.savefig('INT_ALL.png', dpi=1200)

### F. K-Fold Ttest

#### 4. All

In [None]:
# RDF/XGBoost

In [None]:
import mlxtend

In [None]:
from mlxtend.evaluate import paired_ttest_5x2cv


t, p = paired_ttest_5x2cv(estimator1=rdf_1,
                          estimator2=xgb_1,
                          X=x_test, y=y_test,
                          random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

In [None]:
# RDF/LOG

In [None]:
import mlxtend

In [None]:
from mlxtend.evaluate import paired_ttest_5x2cv


t, p = paired_ttest_5x2cv(estimator1=rdf_1,
                          estimator2=log_1,
                          X=xtest, y=ytest,
                          random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

In [None]:
# XGBoost/LOG

In [None]:
import mlxtend

In [None]:
from mlxtend.evaluate import paired_ttest_5x2cv


t, p = paired_ttest_5x2cv(estimator1=xgb_1,
                          estimator2=log_1,
                          X=xtest, y=ytest,
                          random_seed=1)

print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

### G. McNemar Test

In [None]:
from mlxtend.evaluate import mcnemar_table

#### 1. Random Forest

In [None]:
#All/PD Excluded

In [None]:
tb_rdf_1 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_rdf_4, 
                   y_model2=predict_rdf_5)

In [None]:
tb_rdf_1

In [None]:
from mlxtend.evaluate import mcnemar

chi2_rdf_1, p_rdf_1 = mcnemar(ary=tb_rdf_1, corrected=True)
print('chi-squared:', chi2_rdf_1)
print('p-value:', p_rdf_1)

In [None]:
#All/Only PD

In [None]:
tb_rdf_2 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_rdf_4, 
                   y_model2=predict_rdf_6)

In [None]:
tb_rdf_2

In [None]:
from mlxtend.evaluate import mcnemar

chi2_rdf_2, p_rdf_2 = mcnemar(ary=tb_rdf_2, corrected=True)
print('chi-squared:', chi2_rdf_2)
print('p-value:', p_rdf_2)

#### 2. XGBoost

In [None]:
#All/PD Excluded

In [None]:
tb_xgb_1 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_xgb_4, 
                   y_model2=predict_xgb_5)

In [None]:
tb_xgb_1

In [None]:
from mlxtend.evaluate import mcnemar

chi2_xgb_1, p_xgb_1 = mcnemar(ary=tb_xgb_1, corrected=True)
print('chi-squared:', chi2_xgb_1)
print('p-value:', p_xgb_1)

In [None]:
#All/Only PD

In [None]:
tb_xgb_2 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_xgb_4, 
                   y_model2=predict_xgb_6)

In [None]:
tb_xgb_2

In [None]:
from mlxtend.evaluate import mcnemar

chi2_xgb_2, p_xgb_2 = mcnemar(ary=tb_xgb_2, corrected=True)
print('chi-squared:', chi2_xgb_2)
print('p-value:', p_xgb_2)

#### 3. Logistic Regression

In [None]:
#All/PD Excluded

In [None]:
tb_log_1 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_log_4, 
                   y_model2=predict_log_5)

In [None]:
tb_log_1

In [None]:
from mlxtend.evaluate import mcnemar

chi2_log_1, p_log_1 = mcnemar(ary=tb_log_1, corrected=True)
print('chi-squared:', chi2_log_1)
print('p-value:', p_log_1)

In [None]:
#All/Only PD

In [None]:
tb_log_2 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_log_4, 
                   y_model2=predict_log_6)

In [None]:
tb_log_2

In [None]:
from mlxtend.evaluate import mcnemar

chi2_log_2, p_log_2 = mcnemar(ary=tb_log_2, corrected=True)
print('chi-squared:', chi2_log_2)
print('p-value:', p_log_2)

#### 4. All

In [None]:
# RDF/XGBoost

In [None]:
tb_all_1 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_rdf_4, 
                   y_model2=predict_xgb_4)

In [None]:
tb_all_1

In [None]:
from mlxtend.evaluate import mcnemar

chi2_all_1, p_all_1 = mcnemar(ary=tb_all_1, corrected=True)
print('chi-squared:', chi2_all_1)
print('p-value:', p_all_1)

In [None]:
# RDF/LOG

In [None]:
tb_all_2 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_rdf_4, 
                   y_model2=predict_log_4)

In [None]:
tb_all_2

In [None]:
from mlxtend.evaluate import mcnemar

chi2_all_2, p_all_2 = mcnemar(ary=tb_all_2, corrected=True)
print('chi-squared:', chi2_all_2)
print('p-value:', p_all_2)

In [None]:
# XGBoost/LOG

In [None]:
tb_all_3 = mcnemar_table(y_target=ytest.values.ravel(), 
                   y_model1=predict_log_4, 
                   y_model2=predict_xgb_4)

In [None]:
tb_all_3

In [None]:
from mlxtend.evaluate import mcnemar

chi2_all_3, p_all_3 = mcnemar(ary=tb_all_3, corrected=True)
print('chi-squared:', chi2_all_3)
print('p-value:', p_all_3)

In [None]:
# bootstrap

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Model CV
scores = cross_val_score(rdf_1, xtest.values, ytest.values.ravel(), scoring='roc_auc')
roc_auc = np.mean(scores)
print('ROC AUC: %.2f%%' % (100*roc_auc))

# Confidence interval
lower = np.percentile(scores, 2.5)
upper = np.percentile(scores, 97.5)
print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))

In [None]:
scores

### H. ROC Curve All

In [None]:
# Instantiate the classfiers and make a list
classifiers = [predict_xgb_1,
               predict_rdf_1,  
               predict_log_1]

# Define a result table as a DataFrame
result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

In [None]:
# Train the models and record the results
for cls in classifiers:
    yproba = cls
    
    fpr, tpr, _ = roc_curve(ytest.values.ravel(),  yproba)
    auc = roc_auc_score(ytest.values.ravel(), yproba)
    
    result_table = result_table.append({'classifiers':cls.__class__.__name__,
                                        'fpr':fpr, 
                                        'tpr':tpr, 
                                        'auc':auc}, ignore_index=True)

# Set name of the classifiers as index labels
result_table.set_index('classifiers', inplace=True)

In [None]:
new_index = ['XGBoost', 'Random Forest', 'Logistic Regression']

In [None]:
new_index = ['All features', 'PD Excluded', 'Only PD']

In [None]:
result_table.index = new_index

In [None]:
result_table

In [445]:
predict_df

Unnamed: 0_level_0,ALL,ALL,NOPD,NOPD,PD,PD
Unnamed: 0_level_1,LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020)
0,0.158855,0.524334,0.963207,0.343301,0.990515,0.409867
1,0.973212,0.456156,0.436855,0.394809,0.210037,0.569305
2,0.047930,0.294047,0.726307,0.157301,0.860371,0.892455
3,0.765273,0.372387,0.689347,0.296049,0.889302,0.396982
4,0.196238,0.072257,0.485313,0.781989,0.493927,0.630451
...,...,...,...,...,...,...
19315,0.082049,0.484743,0.279380,0.756635,0.651031,0.287432
19316,0.148862,0.677764,0.544325,0.385573,0.703964,0.779929
19317,0.533696,0.597796,0.248720,0.660196,0.656771,0.899105
19318,0.543073,0.938828,0.547139,0.035523,0.008473,0.715117


In [457]:
for i,(j, clf) in enumerate(predict_df):
    print(predict_df[j][clf]);

SyntaxError: invalid syntax (<ipython-input-457-604d346906c9>, line 1)

In [459]:
for i in base_models[:2]:
    print(i)

LogisticRegression(random_state=2020)
RandomForestClassifier(random_state=2020)


In [None]:
base_models2 = 

In [465]:
predict_df

Unnamed: 0_level_0,ALL,ALL,NOPD,NOPD,PD,PD
Unnamed: 0_level_1,LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020),LogisticRegression(random_state=2020),RandomForestClassifier(random_state=2020)
0,0.158855,0.524334,0.963207,0.343301,0.990515,0.409867
1,0.973212,0.456156,0.436855,0.394809,0.210037,0.569305
2,0.047930,0.294047,0.726307,0.157301,0.860371,0.892455
3,0.765273,0.372387,0.689347,0.296049,0.889302,0.396982
4,0.196238,0.072257,0.485313,0.781989,0.493927,0.630451
...,...,...,...,...,...,...
19315,0.082049,0.484743,0.279380,0.756635,0.651031,0.287432
19316,0.148862,0.677764,0.544325,0.385573,0.703964,0.779929
19317,0.533696,0.597796,0.248720,0.660196,0.656771,0.899105
19318,0.543073,0.938828,0.547139,0.035523,0.008473,0.715117


In [497]:
for i,(j,clf) in enumerate(predict_df):
    predict_df.query(str(j)=='ALL')

ValueError: expr must be a string to be evaluated, <class 'bool'> given

In [498]:
result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])
for i, (j, clf) in enumerate(predict_df):
    yproba = predict_df[j][clf]
    
    fpr, tpr, _ = roc_curve(ytest.values.ravel(),  yproba)
    auc = roc_auc_score(ytest.values.ravel(), yproba)
    
    result_table = result_table.append({'classifiers': [j,clf],
                                        'fpr':fpr, 
                                        'tpr':tpr, 
                                        'auc':auc}, ignore_index=True)
    
    result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifiers'].tolist(), index=result_table.index)

    # Set name of the classifiers as index labels
#     result_table.set_index('classifiers', inplace=True)

In [504]:
result_table.loc[result_table.set == 'ALL']['fpr']

0    [0.0, 5.2192066805845514e-05, 0.01064718162839...
1    [0.0, 5.2192066805845514e-05, 0.00120041753653...
Name: fpr, dtype: object

In [None]:
[str(clf).split('(')[0].lower()]

In [510]:
len(base_models[:2])

2

In [621]:
def roc_comparison_all(predict_df, y_test):
    
    # Plot the figure
    # Train the models and record the results
    result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

    for i, (j, clf) in enumerate(predict_df):
        yproba = predict_df[j][clf]

        fpr, tpr, _ = roc_curve(y_test.values.ravel(),  yproba)
        auc = roc_auc_score(y_test.values.ravel(), yproba)

        result_table = result_table.append({'classifiers': [j,clf],
                                            'fpr':fpr, 
                                            'tpr':tpr, 
                                            'auc':auc}, ignore_index=True)

        result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifiers'].tolist(), index=result_table.index)

    fig = plt.figure(figsize=(8,6))

    for k,m in enumerate(result_table.classifier.unique()):

    #     n_lines = len(base_models[:2])
    #     c = np.arange(1, n_lines + 1)
    #     norm = mpl.colors.Normalize(vmin=c.min(), vmax=c.max())
    #     cmap = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
    #     cmap.set_array([])

        plt.plot(result_table.loc[result_table.set == 'ALL']['fpr'][k], 
                 result_table.loc[result_table.set == 'ALL']['tpr'][k],
    #              c=cmap.to_rgba(k + 2),
                 label="{}, AUC={:.3f}".format(str(m).split('(')[0].lower(), result_table.loc[result_table.set == 'ALL']['auc'][k]))

    plt.plot([0,1], [0,1], color='gray', linestyle='--')

    plt.xticks(np.arange(0.0, 1.1, step=0.1))
    plt.xlabel("False Positive Rate", fontsize=15)

    plt.yticks(np.arange(0.0, 1.1, step=0.1))
    plt.ylabel("True Positive Rate", fontsize=15)

    # plt.title('ROC Curve Logistic Regression Analysis', fontweight='bold', fontsize=15)
    plt.legend(prop={'size':13}, loc='lower right')
    plt.savefig('ALL.png',  dpi=1200)
    plt.close()

In [628]:
def roc_comparison_sets(predict_df, y_test):
    
    # Plot the figure
    # Train the models and record the results
    result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])
    for i, (j, clf) in enumerate(predict_df):
        yproba = predict_df[j][clf]

        fpr, tpr, _ = roc_curve(y_test.values.ravel(),  yproba)
        auc = roc_auc_score(y_test.values.ravel(), yproba)

        result_table = result_table.append({'classifiers': [j,clf],
                                            'fpr':fpr, 
                                            'tpr':tpr, 
                                            'auc':auc}, ignore_index=True)

        result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifiers'].tolist(), index=result_table.index)

        fig = plt.figure(figsize=(8,6))

        for k,m in enumerate(result_table.set.unique()):

            plt.plot(result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['fpr'].values[0], 
                     result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['tpr'].values[0],
                     label="{}, AUC={:.3f}".format(str(m), result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['auc'].values[0]))

            plt.plot([0,1], [0,1], color='gray', linestyle='--')

            plt.xticks(np.arange(0.0, 1.1, step=0.1))
            plt.xlabel("False Positive Rate", fontsize=15)

            plt.yticks(np.arange(0.0, 1.1, step=0.1))
            plt.ylabel("True Positive Rate", fontsize=15)

            plt.legend(prop={'size':13}, loc='lower right')
            
        plt.savefig(str(clf)+str(m)+'.png',  dpi=1200)
        plt.close()

In [734]:
def graph_ci(predict_df, base_models):
    
        
    result_table = pd.DataFrame(columns=['classifiers', 'delong','bootstrap'])
    for i, (j, clf) in enumerate(predict_df):
        yproba = predict_df[j][clf]

        delong = calc_auc_ci(ytest.values.ravel(),  yproba, alpha=0.95) 
        bootstrap = bootstrap_error_estimate(yproba, ytest.values.ravel(), roc_auc_score)

        result_table = result_table.append({'classifiers': [j,clf],
                                            'delong':delong, 
                                            'bootstrap':bootstrap}, ignore_index=True)

        result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifiers'].tolist(), index=result_table.index)
        
        plt.figure(figsize=(8,6))

        for k,(m,n) in enumerate(result_table.classifiers):

            SMALL_SIZE = 10
            MEDIUM_SIZE = 12
            BIGGER_SIZE = 14

            plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
            plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
            plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
            plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
            plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
            plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
            plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

            x_ticks = ('Wszystkie Zmienne', 'Bez PD', 'Tylko PD')
            
            def m_value(x):
                if x == 'ALL':
                    x_1 = 1
                    x_2 = x_1 + 0.1
                elif x == 'NOPD':
                    x_1 = 2
                    x_2 = x_1 + 0.1
                elif x == 'PD':
                    x_1 = 3
                    x_2 = x_1 + 0.1
                return list([x_1,x_2])

            eb_1 = plt.errorbar(x=m_value(str(m))[0], 
                             y=(result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['bootstrap'].values[0][1] + result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['bootstrap'].values[0][0])/2, 
                             yerr=[(result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['bootstrap'].values[0][1] - result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['bootstrap'].values[0][0])/2],
                             fmt='o',
                             capsize = 10)

            eb_2 = plt.errorbar(x=m_value(str(m))[1], 
                             y=(result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['delong'].values[0][1] + result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['delong'].values[0][0])/2, 
                             yerr=[(result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['delong'].values[0][1] - result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['delong'].values[0][0])/2],
                             fmt='o',
                             capsize = 10)
            eb_2[-1][0].set_linestyle('--')

            plt.xticks([1.05,2.05,3.05], x_ticks, rotation=90)
            plt.tight_layout()


            plt.ylabel("ROC AUC Przedział Ufności", fontsize=15)
            plt.tight_layout()

        plt.savefig('plot'+str(clf)+'ci.png', dpi=1200)
        plt.close()

In [728]:
def m_value(x):
    if x == 'ALL':
        x_1 = 1
        x_2 = x_1 + 0.1
    elif x == 'NOPD':
        x_1 = 2
        x_2 = x_1 + 0.1
    elif x == 'PD':
        x_1 = 3
        x_2 = x_1 + 0.1
    return list([x_1,x_2])

In [733]:
for k,(m,n) in enumerate(result_table.classifiers):
    print(k,m,n)

0 ALL LogisticRegression(random_state=2020)
1 ALL RandomForestClassifier(random_state=2020)
2 NOPD LogisticRegression(random_state=2020)
3 NOPD RandomForestClassifier(random_state=2020)
4 PD LogisticRegression(random_state=2020)
5 PD RandomForestClassifier(random_state=2020)


In [735]:
graph_ci(predict_df,base_models[:2])

In [676]:
result_table = pd.DataFrame(columns=['classifiers', 'delong','bootstrap'])
for i, (j, clf) in enumerate(predict_df):
    yproba = predict_df[j][clf]

    delong = calc_auc_ci(ytest.values.ravel(),  yproba, alpha=0.95) 
    bootstrap = bootstrap_error_estimate(yproba, ytest.values.ravel(), roc_auc_score)

    result_table = result_table.append({'classifiers': [j,clf],
                                        'delong':delong, 
                                        'bootstrap':bootstrap}, ignore_index=True)
    
    result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifiers'].tolist(), index=result_table.index)

# result_table[['set', 'classifier']] = pd.DataFrame(result_table['classifier'].tolist(), index=result_table.index)

# for i,j in enumerate(result_table.classifier.unique()):
    
#     plt.plot(result_table.loc[(result_table.classifier == str(j)) & (result_table.set == str(m))]['fpr'].values[0], 
#                      result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['tpr'].values[0],
#                      label="{}, AUC={:.3f}".format(str(m), result_table.loc[(result_table.classifier == str(clf)) & (result_table.set == str(m))]['auc'].values[0]))

#             plt.plot([0,1], [0,1], color='gray', linestyle='--')

#             plt.xticks(np.arange(0.0, 1.1, step=0.1))
#             plt.xlabel("False Positive Rate", fontsize=15)

#             plt.yticks(np.arange(0.0, 1.1, step=0.1))
#             plt.ylabel("True Positive Rate", fontsize=15)

#             plt.legend(prop={'size':13}, loc='lower right')
            
#         plt.savefig(str(clf)+str(m)+'.png',  dpi=1200)
#         plt.close()
    

In [692]:
for k,(m,n) in enumerate(result_table.classifiers):
    print(k,m,n)

0 ALL LogisticRegression(random_state=2020)
1 ALL RandomForestClassifier(random_state=2020)
2 NOPD LogisticRegression(random_state=2020)
3 NOPD RandomForestClassifier(random_state=2020)
4 PD LogisticRegression(random_state=2020)
5 PD RandomForestClassifier(random_state=2020)


In [697]:
result_table.loc[(result_table.classifier == 'RandomForestClassifier(random_state=2020)') & (result_table.set == 'ALL')]['delong'].values[0][0]

0.4946745167682817

In [678]:
y

Series([], Name: delong, dtype: object)

In [672]:
for k,(m,n) in enumerate(result_table.classifier):
    print(k,m,n)

0 ALL LogisticRegression(random_state=2020)
1 ALL RandomForestClassifier(random_state=2020)
2 NOPD LogisticRegression(random_state=2020)
3 NOPD RandomForestClassifier(random_state=2020)
4 PD LogisticRegression(random_state=2020)
5 PD RandomForestClassifier(random_state=2020)


In [653]:
# def graph_ci(predict_df, base_models):
    
#     columns = []
#     delong = []
#     bootstrap = []

#     for i in predict_df.columns:
#         columns.append(i)

#     for i,j in enumerate(columns):
#         delong.append([columns[i], calc_auc_ci(ytest.values.ravel(), predict_df[columns[i]], alpha=0.95)])
#         bootstrap.append([columns[i], bootstrap_error_estimate(predict_df[columns[i]], ytest.values.ravel(), roc_auc_score)])

#     delong_df = pd.DataFrame(delong).rename(columns = {0:'Set', 1:'Score'})
#     delong_df[['Set', 'Algorithm']] = pd.DataFrame(delong_df['Set'].tolist(), index=delong_df.index)
#     delong_df[['Lower', 'Upper']] = pd.DataFrame(delong_df['Score'].tolist(), index=delong_df.index)
#     bootstrap_df = pd.DataFrame(bootstrap).rename(columns = {0:'Set', 1:'Score'})
#     bootstrap_df[['Set', 'Algorithm']] = pd.DataFrame(bootstrap_df['Set'].tolist(), index=bootstrap_df.index)
#     bootstrap_df[['Lower', 'Upper']] = pd.DataFrame(bootstrap_df['Score'].tolist(), index=bootstrap_df.index)   

#     for i,j in enumerate(base_models[:2]):
#         delong_ci = delong_df.groupby(['Algorithm'])[['Set', 'Lower', 'Upper']].get_group(str(j)).reset_index()
#         bootstrap_ci = bootstrap_df.groupby(['Algorithm'])[['Set', 'Lower', 'Upper']].get_group(str(j)).reset_index()

#         plt.figure(figsize=(8,6))

#         SMALL_SIZE = 10
#         MEDIUM_SIZE = 12
#         BIGGER_SIZE = 14

#         plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
#         plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
#         plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
#         plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
#         plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
#         plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
#         plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

#         x_ticks = ('Wszystkie Zmienne', 'Bez PD', 'Tylko PD')
        
#         for k in range(len(x_ticks)):

#             x_1 = k+1
#             x_2 = x_1 + 0.1

            
#             eb_1 = plt.errorbar(x=x_1, 
#                              y=(bootstrap_ci['Upper'][k]+bootstrap_ci['Lower'][k])/2, 
#                              yerr=[(bootstrap_ci['Upper'][k]-bootstrap_ci['Lower'][k])/2],
#                              fmt='og',
#                              capsize = 10)

#             eb_2 = plt.errorbar(x=x_2, 
#                              y=(delong_ci['Upper'][k]+delong_ci['Lower'][k])/2,
#                              yerr=[(delong_ci['Upper'][k]-delong_ci['Lower'][k])/2],
#                              fmt='og',
#                              capsize = 10)
#             eb_2[-1][0].set_linestyle('--')

#             plt.xticks([1.05,2.05,3.05], x_ticks, rotation=90)
#             plt.tight_layout()


#             plt.ylabel("ROC AUC Przedział Ufności", fontsize=15)
#             plt.tight_layout()

#         plt.savefig('plot'+str(j)+'.png', dpi=1200)

#         plt.close()
    
#     return delong, bootstrap

In [664]:
result_table

Unnamed: 0,classifier,delong,bootstrap,set
0,LogisticRegression(random_state=2020),"[0.4706879995996858, 0.5613324857865355]","[0.45963064905739076, 0.5693807497078069]",ALL
1,RandomForestClassifier(random_state=2020),"[0.4946745167682817, 0.5807671585970627]","[0.46985884961199, 0.5948487029918954]",ALL
2,LogisticRegression(random_state=2020),"[0.4633819916711999, 0.5522338747171092]","[0.461787920343777, 0.5783646269699532]",NOPD
3,RandomForestClassifier(random_state=2020),"[0.4775350630235856, 0.5634207041997964]","[0.45206475083733855, 0.5732989906387373]",NOPD
4,LogisticRegression(random_state=2020),"[0.46101330408158914, 0.5505779015551542]","[0.4402038441820304, 0.5645910949661206]",PD
5,RandomForestClassifier(random_state=2020),"[0.4817006239345414, 0.572660675647922]","[0.44345131886259637, 0.5928249305764615]",PD


In [654]:
a, b = graph_ci(predict_df, base_models)

In [655]:
a

[[('ALL', 'LogisticRegression(random_state=2020)'),
  array([0.470688  , 0.56133249])],
 [('ALL', 'RandomForestClassifier(random_state=2020)'),
  array([0.49467452, 0.58076716])],
 [('NOPD', 'LogisticRegression(random_state=2020)'),
  array([0.46338199, 0.55223387])],
 [('NOPD', 'RandomForestClassifier(random_state=2020)'),
  array([0.47753506, 0.5634207 ])],
 [('PD', 'LogisticRegression(random_state=2020)'),
  array([0.4610133, 0.5505779])],
 [('PD', 'RandomForestClassifier(random_state=2020)'),
  array([0.48170062, 0.57266068])]]

In [656]:
b

[[('ALL', 'LogisticRegression(random_state=2020)'),
  array([0.46412033, 0.58517592])],
 [('ALL', 'RandomForestClassifier(random_state=2020)'),
  array([0.48594683, 0.5919749 ])],
 [('NOPD', 'LogisticRegression(random_state=2020)'),
  array([0.45803984, 0.57427771])],
 [('NOPD', 'RandomForestClassifier(random_state=2020)'),
  array([0.46958834, 0.57768336])],
 [('PD', 'LogisticRegression(random_state=2020)'),
  array([0.44704029, 0.55731509])],
 [('PD', 'RandomForestClassifier(random_state=2020)'),
  array([0.46083828, 0.58935916])]]

In [651]:
columns = []
delong = []
bootstrap = []

for i in predict_df.columns:
    columns.append(i)

for i,j in enumerate(columns):
    delong.append([columns[i], calc_auc_ci(ytest.values.ravel(), predict_df[columns[i]], alpha=0.95)])
    bootstrap.append([columns[i], bootstrap_error_estimate(predict_df[columns[i]], ytest.values.ravel(), roc_auc_score)])

In [629]:
roc_comparison_sets(predict_df, ytest)

In [616]:
result_table.loc[(result_table.classifier == 'LogisticRegression(random_state=2020)') & (result_table.set == 'ALL')]['fpr'].values[0]

array([0.00000000e+00, 5.21920668e-05, 1.06471816e-02, 1.06471816e-02,
       1.10647182e-02, 1.10647182e-02, 1.45615866e-02, 1.45615866e-02,
       1.96242171e-02, 1.96242171e-02, 2.22860125e-02, 2.22860125e-02,
       2.73486430e-02, 2.73486430e-02, 3.27766180e-02, 3.27766180e-02,
       3.47077244e-02, 3.47077244e-02, 3.93006263e-02, 3.93006263e-02,
       5.29227557e-02, 5.29227557e-02, 5.43841336e-02, 5.43841336e-02,
       5.46972860e-02, 5.46972860e-02, 6.28392484e-02, 6.28392484e-02,
       6.90501044e-02, 6.90501044e-02, 6.98851775e-02, 6.98851775e-02,
       7.11899791e-02, 7.11899791e-02, 7.44780793e-02, 7.44780793e-02,
       7.57828810e-02, 7.57828810e-02, 9.53549061e-02, 9.53549061e-02,
       9.93215031e-02, 9.93215031e-02, 1.03810021e-01, 1.03810021e-01,
       1.14509395e-01, 1.14509395e-01, 1.17484342e-01, 1.17484342e-01,
       1.24634656e-01, 1.24634656e-01, 1.25469729e-01, 1.25469729e-01,
       1.31002088e-01, 1.31002088e-01, 1.32828810e-01, 1.32828810e-01,
      

In [613]:
result_table.loc[(result_table['classifier'] == 'LogisticRegression(random_state=2020)') & (result_table['set'] == 'ALL')]['auc'].values[0]

0.5160102426931106

In [610]:
result_table.loc[result_table.set == 'ALL']['auc'][0]

0.5160102426931106

In [600]:
result_table.loc[(result_table.classifier == 'LogisticRegression(random_state=2020)') & (result_table.set == 'ALL')]['fpr']

0    [0.0, 5.2192066805845514e-05, 0.01064718162839...
Name: fpr, dtype: object

In [597]:
for k,m in enumerate(result_table.set.unique()):
    print(k,m)

0 ALL
1 NOPD
2 PD


In [None]:
# Plot the figure
fig = plt.figure(figsize=(8,6))

for i in (self.base_models):

    plt.plot(result_table.loc['XGBoost']['fpr'], 
             result_table.loc['XGBoost']['tpr'],
             color = 'royalblue',
             label="{}, AUC={:.3f}".format('XGBoost', result_table.loc['XGBoost']['auc']))
    plt.plot(result_table.loc['Random Forest']['fpr'], 
             result_table.loc['Random Forest']['tpr'],
             color = 'seagreen',
             label="{}, AUC={:.3f}".format('Random Forest', result_table.loc['Random Forest']['auc']))
    plt.plot(result_table.loc['Logistic Regression']['fpr'], 
             result_table.loc['Logistic Regression']['tpr'],
             color = 'chocolate',
             label="{}, AUC={:.3f}".format('Logistic Regression', result_table.loc['Logistic Regression']['auc']))
    
plt.plot([0,1], [0,1], color='gray', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.legend(prop={'size':13}, loc='lower right')
plt.savefig('ROC_ALL.png',  dpi=1200)
plt.show()

In [587]:
for i,j in enumerate(result_table.classifier.unique()):
    print(i,j)

0 LogisticRegression(random_state=2020)
1 RandomForestClassifier(random_state=2020)


In [583]:
result_table.classifier.unique()[0]

'LogisticRegression(random_state=2020)'

In [None]:
# # Plot the figure
# fig = plt.figure(figsize=(8,6))

# for i in result_table.index:
#     plt.plot(result_table.loc[i]['fpr'], 
#              result_table.loc[i]['tpr'], 
#              label="{}, AUC={:.3f}".format(i, result_table.loc[i]['auc']))
    
# plt.plot([0,1], [0,1], color='gray', linestyle='--')

# plt.xticks(np.arange(0.0, 1.1, step=0.1))
# plt.xlabel("Flase Positive Rate", fontsize=15)

# plt.yticks(np.arange(0.0, 1.1, step=0.1))
# plt.ylabel("True Positive Rate", fontsize=15)

# plt.title('ROC Curve Logistic Regression Analysis', fontweight='bold', fontsize=15)
# plt.legend(prop={'size':13}, loc='lower right')
# plt.savefig('ROC_LOG.png',  dpi=1200)
# plt.show()

## 7. Probability distribution for all classes

In [None]:
# histogram of predicted probabilities
plt.figure(figsize=(12, 4))
nclasses = 2
for i in range(nclasses):
    
    plt.subplot(1, 2, i+1)
    plt.hist(train_proba[:, i], bins=10, histtype='bar', rwidth=0.95)
    plt.xlim(0,1)
    plt.title('Predicted class-{} probabilities'.format(i+1))
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

### C. Over-Predict a Label than Under-Predict:

In [None]:
np.argmax(data)

In [None]:
def re_predict(data, threshods):

    argmax = np.argmax(data)

    ## If the argmax is 2 (class-3) then ovbiously return this highest label
    if argmax == 2: 
        return (argmax +1)

    # If argmax is 1 (class-2) there is a chnace that, label is class-2 if
    # the probability of the class is greater than the threshold otherwise obviously
    # return this highest label (class-3)
    elif argmax == 1:
        if data[argmax] >= threshods[argmax] : 
            return (argmax +1)
        else:
            return (argmax +2)

    # If the argmax is 0 (class-1) then there are chances that label is class-1 if
    # the probability of the class is greater than the threshold otherwise label can be
    # either next two highest labels (class-2 or class-3). To determine the exact class
    # class, we have to consider four cases.
    # case A : if class_2_prob >= threshold and class_3_prob < threshold then pick class-2
    # case B : if class_3_prob >= threshold and class_2_prob < threshold then pick class-3
    # case C : if class_2_prob < threshold and class_3_prob < threshold then pick class-1
    # case D : if class_2_prob > threshold and class_3_prob > threshold then pick class-3

    elif argmax == 0:

        if data[argmax] >= threshods[argmax] : 
            return (argmax +1)
        else:
            # case A : if class_2_prob >= threshold and class_3_prob < threshold then pick class-2
            if data[argmax + 1] >= threshods[argmax + 1] and data[argmax + 2] < threshods[argmax + 2]:
                return (argmax + 2)

            # case B : if class_3_prob >= threshold and class_2_prob < threshold then pick class-3
            if data[argmax + 2] >= threshods[argmax + 2] and data[argmax + 1] < threshods[argmax + 1]:
                return (argmax + 3)

            # case C : if class_2_prob < threshold and class_3_prob < threshold then pick class-1
            if data[argmax + 1] < threshods[argmax + 1] and data[argmax + 2] < threshods[argmax + 2]:
                return (argmax + 1)

            # case D : if class_2_prob > threshold and class_3_prob > threshold then pick class-3
            if data[argmax + 1] > threshods[argmax + 1] and data[argmax + 2] > threshods[argmax + 2]:
                return (argmax + 3)

    

### Finding threshold probability of classes

In [None]:
# y = label_binarize(ytrain, classes=[0, 1])
th1 = roc_curve(ytrain['y'], train_proba[:, 0])
th2 = roc_curve(ytrain['y'], train_proba[:, 1])

In [None]:
from sklearn.preprocessing import label_binarize

In [None]:
print(np.median(th1))
print(np.median(th2))

In [None]:
np.argmax(train_proba[1
                      , :])

In [None]:
train_proba[0, :]

In [None]:
threshold = [0.575, 0.425]
new_pred = []
for i in range(train_pred.shape[0]):
    new_pred.append(re_predict(train_proba[i, :], threshold))

In [None]:
print('1. The F-1 score of the model {}\n'.format(f1_score(ytrain, new_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(ytrain, new_pred, average='macro')))
print('3. The roc score of the model {}\n'.format(roc_auc_score(ytrain, new_pred, average='macro')))
print('4. Classification report \n {} \n'.format(classification_report(ytrain, new_pred)))
print('5. Confusion matrix \n {} \n'.format(confusion_matrix(ytrain, new_pred)))

In [None]:
final_tpred_prob3 = []
for i in range(test_proba.shape[0]):
    final_tpred_prob3.append(re_predict(test_proba[i, :], threshold))

In [None]:
# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'min_samples_split':rf_min_samples_leaf,
               'bootstrap': rf_bootstrap,
               'class_weight': rf_class}from xgboost import XGBClassifier
tpred_prob3 = pd.DataFrame(final_tpred_prob3)
tpred_prob3.to_csv('final.csv', index=False, header=False)