In [27]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np
import simpy
#from SimPy.Statistics import SummaryStat
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE, SelectFromModel

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, log_loss, precision_score, accuracy_score, f1_score, roc_curve, auc, matthews_corrcoef, mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import CategoricalNB
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
# Read the dataset from "CIP_data_encode_prev.csv" (path is resolved relative to the working directory).
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")
# NOTE(review): this is not the last expression of the cell, so the preview is computed but never displayed.
CIP_data.head()
# Print the column names of the loaded table.
print(CIP_data.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP',
       'Susceptible', 'MSMW', 'MSW', 'Oth/Unk/Missing', 'REGION', 'Northeast',
       'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC'],
      dtype='object')


In [30]:
def get_features_based_on_importance_rank(features, importance_col_name, importance_value, n_features_wanted):
    """ rank features by importance and keep the names of the top ones
    :param features: list of feature names
    :param importance_col_name: label given to the importance column of the intermediate ranking table
    :param importance_value: the importance value for each feature (same order as features)
    :param n_features_wanted: number of features you want to select
    :return a list with the names of the first n_features_wanted features after sorting by descending importance
    """
    # one row per feature: its name and its importance
    ranking = pd.DataFrame({'Features': features, importance_col_name: importance_value})
    # most important feature first
    ranking = ranking.sort_values(by=[importance_col_name], ascending=False)
    # positional slice: first n rows of the sorted table
    top_names = ranking['Features'].iloc[0: n_features_wanted]
    return top_names.tolist()


def get_features_based_on_TF_result(features, indicator_list):
    """ get features with "True" indicators and discard features with "False" indicator
    :param features: all feature names; may be a plain list/tuple, a numpy array or a pandas Index
    :param indicator_list: list of true and false indicating whether corresponding feature should be selected
    :return a list of selected features (in their original order)
    """
    # Index/ndarray objects expose tolist(); plain sequences are copied as-is.
    # (The previous fancy-indexing `features[[...]]` raised TypeError for a plain list.)
    all_names = features.tolist() if hasattr(features, 'tolist') else list(features)
    # keep every name whose indicator is truthy
    return [name for name, keep in zip(all_names, indicator_list) if keep]


def get_x_y_features(df, y_name):
    """ split a DataFrame into predictor matrix, outcome vector and predictor names
    :param df: DataFrame (not modified)
    :param y_name: name of outcome variable
    :return array of X, array of y (cast to int), column labels of the predictors
    """
    # outcome column, cast to integer before conversion to an array
    outcome = np.asarray(df[y_name].astype("int"))
    # everything except the outcome column is a predictor
    predictors = df.drop(columns=y_name)
    return np.asarray(predictors), outcome, predictors.columns


def rfe(classifier, num_features, df=None, y_name=None, X=None, y=None, features=None,
        rf_ntree=100, rf_min_leaf=5):
    """ use recursive feature elimination algorithm to select significant features, df and X cannot both be None
    :param classifier: 'rf' uses a random forest as the ranking estimator; any other value uses logistic regression
    :param num_features: number of features to keep
    :param df: DataFrame holding predictors and outcome (used only when X is None)
    :param y_name: name of the outcome column in df
    :param X: predictor array, used together with y and features instead of df
    :param y: outcome array
    :param features: feature names matching the columns of X
    :param rf_ntree: number of trees of the random forest
    :param rf_min_leaf: minimum number of samples per leaf of the random forest
    :return list of selected feature names
    """
    if X is None:
        # derive Xs, y, and feature names from the DataFrame
        X, y, features = get_x_y_features(df, y_name)

    # reference for logistic regression:
    # https://towardsdatascience.com/a-look-into-feature-importance-in-logistic-regression-models-a4aa970f9b0f
    if classifier == 'rf':
        ranking_model = RandomForestClassifier(min_samples_leaf=rf_min_leaf, n_estimators=rf_ntree)
    else:
        ranking_model = LogisticRegression()

    # backward elimination: drop one (least important) feature per step
    fitted_selector = RFE(ranking_model, n_features_to_select=num_features, step=1).fit(X, y)

    # support_ holds one True/False flag per feature
    return get_features_based_on_TF_result(features=features, indicator_list=fitted_selector.support_)


def lasso(df=None, y_name=None, penalty=0.1, X=None, y=None, features=None):
    """ use L1 regularization algorithms to select significant features, df and X cannot both be None
    :param df: DataFrame holding predictors and outcome (used only when X is None)
    :param y_name: name of the outcome column in df
    :param penalty: value forwarded as the `C` argument of LogisticRegression
                    (in scikit-learn, C is the inverse of the regularization strength)
    :param X: predictor array, used together with y and features instead of df
    :param y: outcome array
    :param features: feature names matching the columns of X
    :return list of selected feature names
    """
    if X is None:
        # derive Xs, y, and feature names from the DataFrame
        X, y, features = get_x_y_features(df, y_name)

    # L1-penalised logistic regression wrapped in a model-based selector
    l1_model = LogisticRegression(penalty="l1", C=penalty, solver='liblinear')
    selector = SelectFromModel(l1_model)
    selector.fit(X, y)

    # one True/False selection decision per feature
    keep_flags = selector.get_support()
    return get_features_based_on_TF_result(features=features, indicator_list=keep_flags)


def pi(classifier, num_features, df=None, y_name=None, X=None, y=None, features=None,
       nn_solver='lbfgs', nn_alpha=2, nn_activation='tanh',
       rf_ntree=100, rf_min_leaf=5
       ):
    """ use permutation importance algorithm to select significant features, df and X cannot both be None
    :param classifier: 'rf' (random forest), 'neural' (MLP); any other value uses logistic regression
    :param num_features: number of top-ranked features to return
    :param df: DataFrame holding predictors and outcome (used only when X is None)
    :param y_name: name of the outcome column in df
    :param X: predictor array, used together with y and features instead of df
    :param y: outcome array
    :param features: feature names matching the columns of X
    :param nn_solver: solver of the MLP (adam works well for large dataset, lbfgs is better for small dataset)
    :param nn_alpha: alpha of the MLP
    :param nn_activation: activation function of the MLP
    :param rf_ntree: number of trees of the random forest
    :param rf_min_leaf: minimum number of samples per leaf of the random forest
    :return list of the num_features feature names with the largest mean permutation importance
    """
    if X is None:
        # derive Xs, y, and feature names from the DataFrame
        X, y, features = get_x_y_features(df, y_name)

    # choose the model whose permutation importance is measured
    if classifier == 'rf':
        model = RandomForestClassifier(min_samples_leaf=rf_min_leaf, n_estimators=rf_ntree)
    elif classifier == 'neural':
        model = MLPClassifier(solver=nn_solver,
                              alpha=nn_alpha,
                              random_state=1,
                              activation=nn_activation)
    else:
        model = LogisticRegression()
    model.fit(X, y)

    # importance is measured on the same data the model was fitted on
    mean_importance = permutation_importance(model, X, y, n_repeats=10, random_state=0).importances_mean

    # keep the n best-ranked features
    return get_features_based_on_importance_rank(features=features,
                                                 importance_col_name='Importance',
                                                 importance_value=mean_importance,
                                                 n_features_wanted=num_features)



def get_mean_min_max(string):
    """ parse a summary string of the form 'mean (min,max)' into three floats
    :param string: (string) text such as '0.5 (0.1,0.9)'
    :return (mean, min, max) as floats
    :raises ValueError, IndexError: if the text does not contain three numeric parts separated by '(' and ','
    """
    # local import: `re` is not part of the notebook's import cell, so a fresh kernel would raise NameError
    import re

    # splitting on '(', ',' and ')' yields: mean, min, max, and a trailing empty piece
    split_list = re.split('[(,)]', string)
    mean = float(split_list[0])        # float() tolerates the blank before '('
    min_value = float(split_list[1])
    max_value = float(split_list[2])
    return mean, min_value, max_value


def subtract_two_lists(list1, list2):
    """ element-wise difference list1[i] - list2[i]; stops at the end of the shorter input
    :param list1: minuends
    :param list2: subtrahends
    :return list of differences
    """
    return [left - right for left, right in zip(list1, list2)]


def get_pfm_with_same_p(list_of_list):
    """ get list of optimisms of fpr/tpr under the same classification threshold (p)
    :param list_of_list: list of equally long lists
    :return the transposed nested list (entry [j][i] of the result is entry [i][j] of the input)
    """
    # go through numpy to transpose, then back to plain nested lists
    return np.transpose(np.array(list_of_list)).tolist()


def get_performance_helper(app_pfm, opti_distr):
    """ subtract an optimism distribution from the apparent performance
    :param app_pfm: (float) a constant number, apparent performance
    :param opti_distr: (string) mean (min, max), a distribution of optimism
    :return (string) 'pfm_mean (pfm_min,pfm_max)', each value rounded to 2 decimals
    """
    opti_mean, opti_low, opti_high = get_mean_min_max(opti_distr)
    # the largest optimism gives the lower bound of the corrected performance, the smallest gives the upper bound
    corrected = [round((app_pfm - shift), 2) for shift in (opti_mean, opti_high, opti_low)]
    return '{} ({},{})'.format(*corrected)


def get_utility_helper(component1_distr, component2_distr, tradeoff):
    """
    calculate utility and its interval based on tradeoff threshold
    :param component1_distr: (string) mean (min, max) of sensitivity * flq_prev
    :param component2_distr: (string) mean (min, max) of (1 - specificity) * (1 - flq_prev)
    :param tradeoff: policy makers' trade-off threshold
    :return: (string) 'mean (min,max)' of utility = tradeoff * component1 + component2, rounded to 2 decimals
    """
    mean1, min1, max1 = get_mean_min_max(component1_distr)
    mean2, min2, max2 = get_mean_min_max(component2_distr)
    performance_mean = round(tradeoff * mean1 + mean2, 2)
    # NOTE(review): the bounds pair the min of component 1 with the max of component 2 (and vice versa),
    # exactly as before; confirm that this pairing is the intended one.
    performance_min = round(tradeoff * min1 + max2, 2)
    performance_max = round(tradeoff * max1 + min2, 2)
    # report the computed bounds (the mean used to be printed three times, hiding the interval)
    performance_output = '{} ({},{})'.format(performance_mean, performance_min, performance_max)
    return performance_output


class BootstrapModel:
    """ bootstrap estimation of optimism-corrected performance for one classifier / feature-selection combination """

    def __init__(self, df, y_name, classifier, feature_selection=None,
                 nn_activation='tanh', nn_solver='lbfgs', nn_alpha=2,  # NN combination of best performance
                 rf_ntree=100, rf_min_leaf=5,                          # RF combination of best performance
                 num_selected_features=10, lasso_penalty=0.1, threshold=0.5, tradeoff=None, tradeoff_list=None, flq_prev=None):
        """
        calculate the optimism-corrected performance based on specified feature selection and classifier
        :param df: DataFrame used for model construction
        :param y_name: outcome of interests
        :param classifier: "logistic": logistic regression
                           "neural": neural network
                           "rf": random forest
        :param feature_selection: "PI": permutation importance,
                                  "RFE": recursive feature selection,
                                  "LASSO": L1 regularization,
                                  None: use every column except the outcome
        :param nn_activation: activation passed to the neural network (feature selection and model run)
        :param nn_solver: solver passed to the neural network
        :param nn_alpha: alpha passed to the neural network
        :param rf_ntree: number of trees passed to the random forest
        :param rf_min_leaf: minimum samples per leaf passed to the random forest
        :param num_selected_features: specify number of features wanted, cannot coexist with penalty
        :param lasso_penalty: hyper-parameter for LASSO feature selection method, cannot coexist with num_selected_features
        :param threshold: classification threshold
        :param tradeoff: policy makers' utility towards two different scenarios proposed in the paper
        :param tradeoff_list: list of trade-off thresholds, forwarded to the optimism calculations
        :param flq_prev: prevalence of resistance to FLQs in the whole dataset
        """
        self.df = df                                        # the whole dataset
        self.y_name = y_name                                # outcome of interest
        self.threshold = threshold                          # classification threshold
        self.tradeoff = tradeoff
        self.tradeoff_list = tradeoff_list
        self.flq_prev = flq_prev
        self.classifier = classifier
        self.feature_selection = feature_selection
        self.num_selected_features = num_selected_features
        self.penalty = lasso_penalty                        # if LASSO is specified, use this than num_selected_features
        self.original_apparent_pfm = None                   # apparent performance using the whole dataset
        self.corrected_pfm = None                           # optimism-corrected performance
        self.predictor_counts = dict()                      # recording frequency of features identified as significant
        # For NN
        self.nn_activation = nn_activation
        self.nn_solver = nn_solver
        self.nn_alpha = nn_alpha
        # For RF
        self.rf_ntree = rf_ntree
        self.rf_min_leaf = rf_min_leaf

    def select_significant_features(self, df):
        """ select features based on the given dataset, the feature selection method and the classifier
        :param df: DataFrame on which the selection is performed
        :return list of selected feature names
        :raises ValueError: if the classifier is unknown or the selection method is not applicable to it
        """
        if self.feature_selection is None:
            # no selection: every column but the outcome
            significant_features = df.columns.tolist()
            significant_features.remove(self.y_name)
        else:
            # logistic regression, feature selection method include PI, RFE, and LASSO
            if self.classifier == 'logistic':
                if self.feature_selection == 'LASSO':
                    significant_features = lasso(df=df, y_name=self.y_name, penalty=self.penalty)
                elif self.feature_selection == 'PI':
                    significant_features = pi(df=df, y_name=self.y_name, classifier='logistic',
                                              num_features=self.num_selected_features)
                elif self.feature_selection == 'RFE':
                    significant_features = rfe(df=df, y_name=self.y_name, classifier='logistic',
                                               num_features=self.num_selected_features)
                else:
                    raise ValueError('invalid feature selection method for logistic regression model')
            # neural network model, only PI is applicable
            elif self.classifier == 'neural':
                if self.feature_selection == 'PI':
                    significant_features = pi(df=df, y_name=self.y_name, classifier='neural',
                                              num_features=self.num_selected_features,
                                              nn_alpha=self.nn_alpha, nn_solver=self.nn_solver,
                                              nn_activation=self.nn_activation)
                else:
                    raise ValueError('invalid feature selection method for neural network model')
            # random forest, applicable feature selection methods include: PI and RFE
            elif self.classifier == 'rf':
                if self.feature_selection == 'PI':
                    significant_features = pi(df=df, y_name=self.y_name, classifier='rf',
                                              rf_ntree=self.rf_ntree, rf_min_leaf=self.rf_min_leaf,
                                              num_features=self.num_selected_features)
                elif self.feature_selection == 'RFE':
                    significant_features = rfe(df=df, y_name=self.y_name, classifier='rf',
                                               rf_ntree=self.rf_ntree, rf_min_leaf=self.rf_min_leaf,
                                               num_features=self.num_selected_features)
                else:
                    raise ValueError('invalid feature selection method for random forest model')
            else:
                raise ValueError('invalid classifier')
        return significant_features

    def _bootstrap_sample(self, rng):
        """ draw a sample of the same size as the whole dataset, with replacement; rng is used as random_state """
        return self.df.sample(frac=1, replace=True, random_state=rng)

    def _fit_and_evaluate(self, df_train, df_test, significant_features, display_roc_curve=False):
        """ train the configured classifier on df_train and return its performance on df_test
        :param df_train: DataFrame used for fitting
        :param df_test: DataFrame used for evaluation
        :param significant_features: names of the predictors to use
        :param display_roc_curve: forwarded to the model's run method
        :return the model's performanceTest object
        """
        if self.classifier == 'logistic':
            model = LogRegression(features=significant_features, y_name=self.y_name)
            model.run(df_train=df_train, df_test=df_test, display_roc_curve=display_roc_curve,
                      threshold=self.threshold, flq_prev=self.flq_prev)
        elif self.classifier == 'neural':
            model = NeuralNet(features=significant_features, y_name=self.y_name)
            model.run(df_train=df_train, df_test=df_test, display_roc_curve=display_roc_curve,
                      activation=self.nn_activation, solver=self.nn_solver, alpha=self.nn_alpha,
                      threshold=self.threshold, flq_prev=self.flq_prev)
        else:
            # any other classifier value is treated as random forest (unchanged fallback)
            model = RandomForest(features=significant_features, y_name=self.y_name)
            model.run(df_train=df_train, df_test=df_test, display_roc_curve=display_roc_curve,
                      min_samples_leaf=self.rf_min_leaf, n_trees=self.rf_ntree,
                      threshold=self.threshold, flq_prev=self.flq_prev)
        return model.performanceTest

    def _get_origin_apparent_pfm(self, display_roc_curve=False):
        """ use the whole dataset to train and evaluate the obtained model; stores self.original_apparent_pfm """
        original_df = self.df
        # feature selection
        significant_features = self.select_significant_features(df=original_df)
        print('features selected for whole dataset:', significant_features)
        # train and test using the whole dataset, collect apparent performance
        self.original_apparent_pfm = self._fit_and_evaluate(df_train=original_df, df_test=original_df,
                                                            significant_features=significant_features,
                                                            display_roc_curve=display_roc_curve)

    def _get_bootstrap_pfm(self, rng, display_roc_curve=False):
        """ use the same bootstrap sample to train and evaluate the obtained model
        :return (bootstrap apparent performance, list of selected features)
        """
        resampled_df = self._bootstrap_sample(rng)
        significant_features = self.select_significant_features(df=resampled_df)
        bootstrap_apparent_performance = self._fit_and_evaluate(df_train=resampled_df, df_test=resampled_df,
                                                                significant_features=significant_features,
                                                                display_roc_curve=display_roc_curve)
        return bootstrap_apparent_performance, significant_features

    def _get_test_pfm(self, rng, display_roc_curve=False):
        """ use bootstrap sample to train, and use the whole dataset to evaluate the obtained model
        :return test performance
        """
        # same random_state -> same sample set as _get_bootstrap_pfm
        resampled_df = self._bootstrap_sample(rng)
        significant_features = self.select_significant_features(df=resampled_df)
        return self._fit_and_evaluate(df_train=resampled_df, df_test=self.df,
                                      significant_features=significant_features,
                                      display_roc_curve=display_roc_curve)

    def _calculate_optimism(self, rng, roc_curve):
        """ calculate optimism for one bootstrap sample
        :param rng: random_state of the bootstrap sample
        :param roc_curve: forwarded to BootstrapOptimism
        :return (BootstrapOptimism object, list of features selected on the bootstrap sample)
        """
        # Resample and select features ONCE, so the apparent and the test evaluation are guaranteed to use
        # the same predictors (selection with an unseeded random forest could otherwise differ between the
        # two evaluations) and the selection cost is not paid twice.
        resampled_df = self._bootstrap_sample(rng)
        significant_features = self.select_significant_features(df=resampled_df)
        bootstrap_pfm = self._fit_and_evaluate(df_train=resampled_df, df_test=resampled_df,
                                               significant_features=significant_features)
        test_pfm = self._fit_and_evaluate(df_train=resampled_df, df_test=self.df,
                                          significant_features=significant_features)
        optimism = BootstrapOptimism(bootstrap_performance=bootstrap_pfm,
                                     test_performance=test_pfm,
                                     roc_curve=roc_curve,
                                     tradeoff=self.tradeoff,
                                     tradeoff_list=self.tradeoff_list,
                                     flq_prev=self.flq_prev)
        return optimism, significant_features

    def get_optimism_corrected_pfm(self, num_bootstrap, predictor_names, plot_optimism_graph=False, roc_curve=False):
        """ calculate optimism-corrected performance (AUC-ROC), plot optimism graph
        :param num_bootstrap: number of bootstrap iterations
        :param predictor_names: list of names of predictors (their selection counters are reset to 0)
        :param plot_optimism_graph: whether we want to plot optimism graph
        :param roc_curve: whether we want to calculate stats used for plotting ROC curve
        """
        optimism_list = []
        for feature in predictor_names:
            self.predictor_counts[feature] = 0
        self._get_origin_apparent_pfm()     # update self.original_apparent_pfm
        # collect the optimism of every bootstrap sample; bootstrap i uses random_state i
        for i in range(0, num_bootstrap):
            optimism, significant_features = self._calculate_optimism(rng=i, roc_curve=roc_curve)
            optimism_list.append(optimism)
            for feature in significant_features:
                # .get avoids a KeyError for a selected feature that is missing from predictor_names
                self.predictor_counts[feature] = self.predictor_counts.get(feature, 0) + 1
        self.corrected_pfm = BootstrapOptimismCorrectedPfm(optimism_list=optimism_list,
                                                           app_pfm=self.original_apparent_pfm,
                                                           tradeoff=self.tradeoff,
                                                           flq_prev=self.flq_prev,
                                                           tradeoff_list=self.tradeoff_list,
                                                           roc_curve=roc_curve)
        # calculate optimism-corrected performance
        self.corrected_pfm._calculate_optimism_corrected_pfm()
        # plot graph (add area under the curve texts)
        if roc_curve:
            fig = plt.gcf()
            fig.set_size_inches(6, 6)
            plt.legend(loc="lower right")
            plt.text(0.9, 0.1,
                     "Area:{}".format(self.corrected_pfm.auc),
                     ha="right", va="bottom")
            plt.show()
        if plot_optimism_graph:
            self.corrected_pfm.plot_auc_optimism()


class BootstrapOptimism:
    """ optimism (bootstrap apparent performance minus test performance) per metric, for one bootstrap sample """
    def __init__(self, bootstrap_performance, test_performance, roc_curve, flq_prev=None,
                 tradeoff=None, tradeoff_list=None):
        """
        calculate performance optimisms for different metrics
        :param bootstrap_performance: bootstrap apparent performance
        :param test_performance: bootstrap test performance
        :param roc_curve: (boolean) whether want calculate optimism for ROC curve
        :param flq_prev: when not None, optimisms of the regimen / utility-component metrics are also stored
        :param tradeoff: (int) trade-off threshold (should not coexist with tradeoff_list)
        :param tradeoff_list: (list) a list of trade-off thresholds of interests (should not coexist with tradeoff)
        """
        boot, test = bootstrap_performance, test_performance

        def store_gaps(pairs):
            # attribute on self <- same-named metric of bootstrap minus test
            for own_name, metric_name in pairs:
                setattr(self, own_name, getattr(boot, metric_name) - getattr(test, metric_name))

        def utility_gap(weight):
            # utility = weight * uComponent1 + uComponent2, bootstrap minus test
            boot_utility = weight * boot.uComponent1 + boot.uComponent2
            test_utility = weight * test.uComponent1 + test.uComponent2
            return boot_utility - test_utility

        # metrics that are always available
        store_gaps((('auc', 'roc_auc'),
                    ('sen', 'sensitivity'),
                    ('spe', 'specificity'),
                    ('F1', 'F1'),
                    ('mcc', 'mcc')))
        # metrics recorded only when a prevalence was supplied
        if flq_prev is not None:
            store_gaps((('effective', 'receive_effective_regimen'),
                        ('dlm', 'receive_unnecessary_DML'),
                        ('uComponent1', 'uComponent1'),
                        ('uComponent2', 'uComponent2')))
        # optimism for DeltaUtility under a single trade-off threshold
        if tradeoff is not None:
            self.utility = utility_gap(tradeoff)
        # list of optimisms for DeltaUtility, one per trade-off threshold
        if tradeoff_list is not None:
            self.utility_list = [utility_gap(weight) for weight in tradeoff_list]
        # optimism for fpr and tpr (one value per classification threshold)
        if roc_curve:
            self.fpr_list = subtract_two_lists(list1=boot.fpr, list2=test.fpr)
            self.tpr_list = subtract_two_lists(list1=boot.tpr, list2=test.tpr)




def performance_table_by_diff_features(df, y_name, classifier, feature_selection, predictor_names,
                                       num_bootstrap=200, filename=None,
                                       max_num_feature=None, min_num_features=None, max_penalty=None, min_penalty=None):
    """ explore relationship between number of features, feature selection method, and model performance
    :param df: DataFrame used for model construction
    :param y_name: name of the outcome column
    :param classifier: classifier name passed to BootstrapModel (ignored for 'LASSO', which always uses 'logistic')
    :param feature_selection: 'LASSO', 'PI' or 'RFE'
    :param predictor_names: list of predictor names, forwarded to get_optimism_corrected_pfm
    :param num_bootstrap: number of bootstrap iterations per setting
    :param filename: if given, the table is written to ../data/feature_performance_table/<filename>.csv
    :param max_num_feature: largest number of features to try (PI / RFE)
    :param min_num_features: smallest number of features to try (PI / RFE)
    :param max_penalty: largest penalty value to try (LASSO)
    :param min_penalty: smallest penalty value to try (LASSO)
    :raises ValueError: for an unknown feature selection method, or when the bounds required by the
                        chosen method are missing
    """
    optimism_corrected_auc_list = []
    apparent_performance_auc_list = []
    num_features = []
    penalty_list = []
    if feature_selection == 'LASSO':
        if min_penalty is None or max_penalty is None:
            raise ValueError('min_penalty and max_penalty are required for LASSO feature selection')
        # penalty grid from min_penalty to max_penalty in steps of 0.01 (both ends included)
        num = round((max_penalty - min_penalty) / 0.01)
        for i in np.linspace(min_penalty, max_penalty, num + 1):
            print('penalty value:', i)
            bootstrap_model = BootstrapModel(df=df, y_name=y_name, classifier='logistic',
                                             feature_selection='LASSO', lasso_penalty=i)
            bootstrap_model.get_optimism_corrected_pfm(num_bootstrap=num_bootstrap, predictor_names=predictor_names)
            optimism_corrected_auc_list.append(bootstrap_model.corrected_pfm.auc)
            apparent_performance_auc_list.append(round(bootstrap_model.original_apparent_pfm.roc_auc, 3))
            # number of features LASSO keeps on the whole dataset for this penalty
            num_features.append(len(bootstrap_model.select_significant_features(df=df)))
            penalty_list.append(i)
    # membership test: the former `== 'PI' or 'RFE'` was always true and made the else branch unreachable
    elif feature_selection in ('PI', 'RFE'):
        if min_num_features is None or max_num_feature is None:
            raise ValueError('min_num_features and max_num_feature are required for PI / RFE feature selection')
        for i in range(min_num_features, max_num_feature + 1):
            print('number of features selected:', i)
            bootstrap_model = BootstrapModel(df=df, y_name=y_name, classifier=classifier,
                                             feature_selection=feature_selection, num_selected_features=i)
            bootstrap_model.get_optimism_corrected_pfm(num_bootstrap=num_bootstrap, predictor_names=predictor_names)
            optimism_corrected_auc_list.append(bootstrap_model.corrected_pfm.auc)
            apparent_performance_auc_list.append(round(bootstrap_model.original_apparent_pfm.roc_auc, 3))
            num_features.append(i)
            penalty_list.append('not applicable')
    else:
        raise ValueError('feature selection method not included in this study')
    performance_dict = {'optimism-corrected performance': optimism_corrected_auc_list,
                        'apparent performance': apparent_performance_auc_list,
                        'number of features': num_features,
                        'penalty strength': penalty_list}
    df_performance = pd.DataFrame(performance_dict)
    print(df_performance)
    if filename is not None:
        df_performance.to_csv("../data/feature_performance_table/{}.csv".format(filename))

In [31]:
def get_prediction_values(x_test, x_train, model, threshold=None):
    """ predict labels and probabilities for the test and the train set
    :param x_test: predictors of the test set
    :param x_train: predictors of the train set
    :param model: fitted model exposing predict and predict_proba
    :param threshold: if given, labels are 1 where the probability in column 1 exceeds it, else 0;
                      if None, the labels returned by model.predict are used
    :return test labels, test probabilities, train labels, train probabilities
    """
    # the model is queried in this order: test labels, test probabilities, train labels, train probabilities
    test_labels = model.predict(x_test)
    test_probs = model.predict_proba(x_test)
    train_labels = model.predict(x_train)
    train_probs = model.predict_proba(x_train)
    if threshold is None:
        return test_labels, test_probs, train_labels, train_probs
    # custom cut-off applied to the probability of the second class
    test_labels = np.where(test_probs[:, 1] > threshold, 1, 0)
    train_labels = np.where(train_probs[:, 1] > threshold, 1, 0)
    return test_labels, test_probs, train_labels, train_probs


def split_Xs_and_y(df, y_name, features):
    """ extract predictors and outcome from a DataFrame as arrays
    :param df: DataFrame holding predictors and outcome
    :param y_name: name of the outcome column
    :param features: list of predictor column names (column order of the result follows this list)
    :return array of Xs, array of y
    """
    return np.asarray(df[features]), np.asarray(df[y_name])


class Classifier:
    """ base class of a single classifier model: stores feature names, outcome name and performance summaries """

    def __init__(self, features, y_name):
        """
        run a single classifier model
        :param features: list of names of independent variables
        :param y_name: name of the dependent variable
        """
        # what to predict from, and what to predict
        self.features = features
        self.y_name = y_name
        # performance measures, filled by _update_performance_plot_roc_curve
        self.performanceTrain = None
        self.performanceTest = None

    def run(self, df_train, df_test, test_size):
        """
        placeholder to be overridden by concrete classifiers
        :param df_train: training set
        :param df_test: test set
        :param test_size: proportion of dataset that is divided into test set
        :raises NotImplementedError: always
        """
        raise NotImplementedError

    def _update_performance_plot_roc_curve(self, y_train, y_train_hat, y_train_hat_prob,
                                           y_test, y_test_hat, y_test_hat_prob,
                                           display_roc_curve=True, flq_prev=0):
        """
        update performance measures for train and test set
        :param y_train: list of actual binary y in the train set
        :param y_train_hat: list of predicted binary y values for train set
        :param y_train_hat_prob: list of predicted probability outcomes (0~1) for train set
        :param y_test: list of actual binary y in the test set
        :param y_test_hat: list of predicted binary y values for test set
        :param y_test_hat_prob: list of predicted probability outcomes (0~1) for test set
        :param display_roc_curve: if True, plot roc curve of the test set performance
        :param flq_prev: the prevalence of FLQ-resistance in the whole dataset
        :return:
        """
        shared = dict(flq_prev=flq_prev)
        # test set performance first, then train set performance
        self.performanceTest = PerformanceSummary(y_test=y_test, y_hat=y_test_hat,
                                                  y_hat_prob=y_test_hat_prob, **shared)
        self.performanceTrain = PerformanceSummary(y_test=y_train, y_hat=y_train_hat,
                                                   y_hat_prob=y_train_hat_prob, **shared)
        if not display_roc_curve:
            return
        self.performanceTest.plot_roc_curve()


class LogRegression(Classifier):
    """Logistic regression classifier (fitted with class_weight='balanced')."""

    def __init__(self, features, y_name):
        super().__init__(features, y_name)

    def run(self, df_train=None, df_test=None, threshold=0.5, display_roc_curve=True, flq_prev=0,
            x_train=None, y_train=None, x_test=None, y_test=None):
        """
        Fit a logistic regression on the training data and record train/test performance.

        Data is supplied either as data frames (df_train, df_test) or as already
        separated arrays (x_train, y_train, x_test, y_test). The data frames are only
        read when x_train is None.
        :param df_train: training set
        :param df_test: test set
        :param threshold: the logistic classification threshold
        :param display_roc_curve: whether to plot the roc curve
        :param flq_prev: the prevalence of FLQ resistance among the whole population
        :param x_train: training set of Xs, cannot coexist with df_train & df_test
        :param y_train: training set of ys, cannot coexist with df_train & df_test
        :param x_test: testing set of Xs, cannot coexist with df_train & df_test
        :param y_test: testing set of ys, cannot coexist with df_train & df_test
        """
        frames_given = x_train is None
        if frames_given:
            # derive Xs and y from the data frames
            x_train, y_train = split_Xs_and_y(df=df_train, features=self.features, y_name=self.y_name)
            x_test, y_test = split_Xs_and_y(df=df_test, features=self.features, y_name=self.y_name)

        # model fitting
        log_reg = LogisticRegression(class_weight='balanced')
        log_reg.fit(X=x_train, y=y_train)

        # binary predictions and predicted probabilities for both sets
        predictions = get_prediction_values(x_test=x_test, x_train=x_train,
                                            model=log_reg, threshold=threshold)
        y_test_hat, y_test_hat_prob, y_train_hat, y_train_hat_prob = predictions

        # store the performance measures (and plot the roc curve if requested)
        self._update_performance_plot_roc_curve(
            y_train=y_train, y_train_hat=y_train_hat, y_train_hat_prob=y_train_hat_prob,
            y_test=y_test, y_test_hat=y_test_hat, y_test_hat_prob=y_test_hat_prob,
            display_roc_curve=display_roc_curve, flq_prev=flq_prev)


class NeuralNet(Classifier):
    """Single-hidden-layer neural network classifier built on MLPClassifier."""

    def __init__(self, features, y_name, len_neurons=None):
        """
        :param features: list of feature names
        :param y_name: name of the outcome column
        :param len_neurons: number of units in the hidden layer;
            if None, len(features) + 2 is used
        """
        super().__init__(features, y_name)

        # Always define the attribute: previously an explicit len_neurons was
        # silently discarded, and run() then failed on the missing attribute.
        if len_neurons is None:
            self.len_neurons = len(features) + 2
        else:
            self.len_neurons = len_neurons

    def run(self, flq_prev=0, df_train=None, df_test=None,
            display_roc_curve=True, threshold=0.5, class_weight=None,
            x_train=None, y_train=None, x_test=None, y_test=None,
            alpha=2, activation='logistic', solver='lbfgs'):
        """
        Fit the network on the training data and record train/test performance.

        :param flq_prev: prevalence of resistance to FLQs in the whole dataset
        :param df_train: training set (only read when x_train is None)
        :param df_test: test set (only read when x_train is None)
        :param display_roc_curve: whether to plot the roc curve
        :param threshold: classification threshold
        :param class_weight: (default) None. If 'balanced', the training set is randomly
            oversampled (RandomOverSampler) before fitting
        :param x_train: training set of Xs, cannot coexist with df_train & df_test
        :param y_train: training set of ys, cannot coexist with df_train & df_test
        :param x_test: testing set of Xs, cannot coexist with df_train & df_test
        :param y_test: testing set of ys, cannot coexist with df_train & df_test
        :param alpha: L2 penalty term passed to MLPClassifier (default here: 2)
        :param activation: {'identity', 'logistic', 'tanh', 'relu'}, default='logistic'
        :param solver: 'adam' works well for large datasets, 'lbfgs' is better for small datasets
        """

        if x_train is None:
            # split x and y
            x_train, y_train = split_Xs_and_y(df=df_train, y_name=self.y_name, features=self.features)
            x_test, y_test = split_Xs_and_y(df=df_test, y_name=self.y_name, features=self.features)

        if class_weight == 'balanced':
            # NOTE: x_train / y_train are replaced by the oversampled data, so the
            # train-set performance recorded below is measured on the resampled set.
            over_sampler = RandomOverSampler(random_state=42)
            x_train, y_train = over_sampler.fit_resample(x_train, y_train)

        # fit the model
        clf = MLPClassifier(solver=solver,
                            alpha=alpha,
                            # one hidden layer; tuple is (# units for 1st layer, # for 2nd layer, etc)
                            hidden_layer_sizes=(self.len_neurons,),
                            activation=activation,
                            random_state=1)
        clf.fit(X=x_train, y=y_train)

        # prediction
        y_test_hat, y_test_hat_prob, y_train_hat, y_train_hat_prob = get_prediction_values(x_test=x_test,
                                                                                           x_train=x_train,
                                                                                           model=clf,
                                                                                           threshold=threshold)

        # update performance measures
        self._update_performance_plot_roc_curve(y_train=y_train,
                                                y_train_hat=y_train_hat,
                                                y_train_hat_prob=y_train_hat_prob,
                                                y_test=y_test,
                                                flq_prev=flq_prev,
                                                y_test_hat=y_test_hat,
                                                y_test_hat_prob=y_test_hat_prob,
                                                display_roc_curve=display_roc_curve)


class RandomForest(Classifier):
    # decision trees frequently perform well on imbalanced datasets, see
    # https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18

    def __init__(self, features, y_name):
        super().__init__(features, y_name)

    def run(self, df_train=None, df_test=None, display_roc_curve=True, threshold=0.5, flq_prev=0, class_weight=None,
            x_train=None, y_train=None, x_test=None, y_test=None, min_samples_leaf=5, n_trees=100):
        """
        Fit a random forest on the training data and record train/test performance.

        :param df_train: training set (only read when x_train is None)
        :param df_test: test set (only read when x_train is None)
        :param display_roc_curve: whether to plot the roc curve
        :param threshold: classification threshold
        :param flq_prev: prevalence of resistance to FLQs in the whole dataset
        :param class_weight: default is None. If 'balanced', the training set is randomly
            oversampled with RandomOverSampler before fitting
        :param x_train: training set of Xs, cannot coexist with df_train & df_test
        :param y_train: training set of ys, cannot coexist with df_train & df_test
        :param x_test: testing set of Xs, cannot coexist with df_train & df_test
        :param y_test: testing set of ys, cannot coexist with df_train & df_test
        :param min_samples_leaf: forwarded to RandomForestClassifier(min_samples_leaf=...)
        :param n_trees: num of trees, forwarded as n_estimators
        """
        frames_given = x_train is None
        if frames_given:
            # derive Xs and y from the data frames
            x_train, y_train = split_Xs_and_y(df=df_train, features=self.features, y_name=self.y_name)
            x_test, y_test = split_Xs_and_y(df=df_test, features=self.features, y_name=self.y_name)

        if class_weight == 'balanced':
            # replace the training data by its randomly oversampled version
            x_train, y_train = RandomOverSampler(random_state=42).fit_resample(x_train, y_train)

        # model fitting
        forest = RandomForestClassifier(n_estimators=n_trees,
                                        min_samples_leaf=min_samples_leaf,
                                        random_state=0)
        forest.fit(X=x_train, y=y_train)

        # binary predictions and predicted probabilities for both sets
        predictions = get_prediction_values(x_test=x_test, x_train=x_train,
                                            model=forest, threshold=threshold)
        y_test_hat, y_test_hat_prob, y_train_hat, y_train_hat_prob = predictions

        # store the performance measures (and plot the roc curve if requested)
        self._update_performance_plot_roc_curve(
            y_train=y_train, y_train_hat=y_train_hat, y_train_hat_prob=y_train_hat_prob,
            y_test=y_test, y_test_hat=y_test_hat, y_test_hat_prob=y_test_hat_prob,
            display_roc_curve=display_roc_curve, flq_prev=flq_prev)


class PerformanceSummary:
    """Performance measures of a single model evaluated on one labelled set."""

    def __init__(self, y_test, y_hat, y_hat_prob, flq_prev=0):
        """
        summary of the performance of a single model
        :param y_test: list of true ys for model validation
        :param y_hat: list of predicted ys (binary)
        :param y_hat_prob: predicted ys as probabilities (not binary); indexed as
            y_hat_prob[:, 1], so it must be a 2-D array whose column 1 holds the
            score used for the ROC curve
        :param flq_prev: the prevalence of FLQ-resistance in the whole dataset;
            if None, the prevalence-dependent measures are set to None
        """

        self.y_test = y_test
        self.flq_prev = flq_prev
        self.y_hat_prob = y_hat_prob

        self.J = jaccard_score(y_test, y_hat)
        self.precision = precision_score(y_test, y_hat)
        self.accuracy = accuracy_score(y_test, y_hat)
        self.F1 = f1_score(y_test, y_hat)
        self.mcc = matthews_corrcoef(y_true=y_test, y_pred=y_hat)

        # the 4-way unpacking requires a 2x2 confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_hat).ravel()
        self.sensitivity = tp / (tp + fn)
        self.specificity = tn / (tn + fp)

        # full ROC curve (no intermediate points dropped) and the area under it
        self.fpr, self.tpr, _ = roc_curve(y_test, y_hat_prob[:, 1], drop_intermediate=False)
        self.roc_auc = auc(self.fpr, self.tpr)

        self._set_treatment_metrics()

    def _set_treatment_metrics(self):
        """
        Derive the prevalence-dependent measures from sensitivity, specificity and flq_prev.

        The attributes are always created, so print() cannot fail on a missing
        attribute; they are None when flq_prev is None.
        """
        if self.flq_prev is None:
            self.receive_effective_regimen = None
            self.receive_DML = None
            self.receive_unnecessary_DML = None
            self.uComponent1 = None
            self.uComponent2 = None
            return

        prev = self.flq_prev
        false_positive_rate = 1 - self.specificity

        self.receive_effective_regimen = self.sensitivity * prev + (1 - prev)
        self.receive_DML = prev * self.sensitivity + (1 - prev) * false_positive_rate
        self.receive_unnecessary_DML = (1 - prev) * false_positive_rate
        # component 1 and component 2 used for utility calculation
        self.uComponent1 = self.sensitivity * prev
        self.uComponent2 = - false_positive_rate * (1 - prev)

    def print(self):
        """Print the stored performance measures, one per line."""
        print("Sensitivity:", self.sensitivity)
        print("Specificity:", self.specificity)
        print("F1 score:", self.F1)
        print("MCC:", self.mcc)
        print("Accuracy:", self.accuracy)
        print("Precision:", self.precision)
        print("Jaccard similarity score:", self.J)
        print("AUC:", self.roc_auc)
        print("% receive effective regimen", self.receive_effective_regimen)
        print('% receive DML', self.receive_DML)

    def plot_roc_curve(self, title='Logistic Regression + Permutation Importance'):
        """
        Draw the ROC curve of this summary and the diagonal reference line.

        Draws on the current pyplot figure and does not call plt.show().
        :param title: plot title; the default keeps the previously hard-coded text
        """
        fpr, tpr, _ = roc_curve(self.y_test, self.y_hat_prob[:, 1])
        plt.plot(fpr, tpr, color='lightblue', lw=0.5, alpha=0.4)
        plt.plot([0, 1], [0, 1], color='blue', lw=0.8, alpha=0.6, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title)



In [33]:
# run settings
THRESHOLD = 0.5
TRADEOFF = 0
NUM_BOOTSTRAP = 2

# predictors handed to the bootstrap model
predictor_names = ['MSMW', 'MSW', 'Oth/Unk/Missing', 'Northeast', 'Southeast', 'Southwest', 'West',
                   'PREV_REGION', 'PREV_CLINIC']

# outcome column followed by the predictor columns
CIP_data_abbreviated = CIP_data[['Susceptible'] + predictor_names]

# mean of the 'Susceptible' column over all rows
susceptible = CIP_data_abbreviated['Susceptible']
flq_prev = sum(susceptible) / len(susceptible)

# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'BootstrapOptimismCorrectedPfm' is not defined" -- that name has to be
# defined in an earlier cell for this cell to run on a fresh kernel.
bootstrap_model = BootstrapModel(
    df=CIP_data_abbreviated, y_name='Susceptible',
    classifier='neural', feature_selection='PI',
    nn_activation='identity', nn_solver='lbfgs', nn_alpha=2,  # NN hyper-parameters
    # RF hyper-parameters (unused here): rf_ntree=300, rf_min_leaf=6
    num_selected_features=9,
    lasso_penalty=0.2,  # LASSO penalty
    threshold=THRESHOLD, tradeoff=TRADEOFF, flq_prev=flq_prev)
bootstrap_model.get_optimism_corrected_pfm(num_bootstrap=NUM_BOOTSTRAP,
                                           predictor_names=predictor_names,
                                           plot_optimism_graph=False,
                                           roc_curve=False)

features selected for whole dataset: ['West', 'MSW', 'Southeast', 'Oth/Unk/Missing', 'Southwest', 'Northeast', 'MSMW', 'PREV_REGION', 'PREV_CLINIC']


NameError: name 'BootstrapOptimismCorrectedPfm' is not defined