In [5]:
#%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
## read data 
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")
CIP_data.head()
print(CIP_data.columns)
len(CIP_data)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP',
       'Susceptible', 'MSMW', 'MSW', 'Oth/Unk/Missing', 'REGION', 'Northeast',
       'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC'],
      dtype='object')


112487

In [89]:
### Step 1: create model and calculate apparent performance metric of interest (P)
CIP_data.columns
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']
#print(X["PREV_CLINIC"].isnull().values.any())
model = LogisticRegression(class_weight = 'balanced', max_iter=1000)
model_fit = model.fit(X, y)

#print(model_fit.coef_)
print(model_fit.score(X,y)) # 0.72996879639425


y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict) #0.679905016859595
print(ROC_AUC_logistic) # 0.7201325178334779
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## ACCURACY OF THE MODEL:  0.72996879639425


## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)

## add in confusion matrix 
tn, fp, fn, tp = confusion_matrix(y, y_predict).ravel()
specificity = tn / (tn+fp)
sensitivity = tp / (tp + fn)
print(specificity) #0.706282979506688

print(sensitivity )#0.7339820561602678



0.72996879639425
0.7201325178334779
ACCURACY OF THE MODEL:  0.72996879639425
0.706282979506688
0.7339820561602678


In [None]:
#

In [None]:
## Step 2: Bootstrapping validation 
n_iterations = 10
bootstrapped_stats = pd.DataFrame()
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

train = resample(CIP_data, replace=True, n_samples=len(CIP_data))

train.head()

X_train = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_train = CIP_data['Susceptible']

model_train = LogisticRegression(class_weight = 'balanced', max_iter = 500)
model_train = model_train.fit(X_train, y_train)

#print(model.coef_)
#print(model.score(X,y)) # 0.56

y_predict = model_train.predict(X_train)

ROC_AUC_logistic_train = metrics.roc_auc_score(y_train, y_predict)

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(class_weight = 'balanced', max_iter = 1000, solver = "lbfgs") #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)
       tn_sample, fp_sample, fn_sample, tp_sample = confusion_matrix(y_sample, y_predict_sample).ravel()
       specificity_sample = tn_sample / (tn_sample+fp_sample)
       sensitivity_sample = tp_sample / (tp_sample + fn_sample)


       y_test = model_sample.predict(X) #see how model trained on sample data performns on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test) 
       tn_test, fp_test, fn_test, tp_test = confusion_matrix(y, y_test).ravel() ##confusion matrix between predicted data from original data and the actual original data
       specificity_test = tn_test / (tn_test+fp_test)
       sensitivity_test = tp_test / (tp_test + fn_test)


       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation
       optomisation_specificity = specificity_sample - specificity_test #optimisation
       optomisation_sensitivity = sensitivity_sample - sensitivity_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation,
            'Sensitivity_sample': sensitivity,
            'Specificity_sample':specificity, 
            'Optimisation_sensitivity': optomisation_sensitivity,
            'Optimisation_specificity': optomisation_specificity
        }
       )


In [None]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

#0.49942646634516624
#0.6772986683407078 0.3152392553329397

## sensitivity and specificity 
average_optimised_sensitivity = bootstrapped_stats["Optimisation_sensitivity"].mean()  #0.6417965394526935
average_optimised_specificity = bootstrapped_stats["Optimisation_specificity"].mean()   #0.7180134942664962 

optimization_corrected_performance_sensitivity = sensitivity - average_optimised_sensitivity ## 0.7178314223295834

optimization_corrected_performance_specificity = specificity - average_optimised_specificity ## 0.640672271758783

print(optimization_corrected_performance_sensitivity, optimization_corrected_performance_specificity)

#so both are low...

In [None]:
## try code from  https://github.com/yaesoubilab/PredictMDRTB/blob/d8450cc2c158d07baf19eb01c46bbb2d4bae6803/source/ClassifierClasses.py#L15

from sklearn.metrics import roc_curve, auc 
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

y = CIP_data['Susceptible']
y = 1-CIP_data['Susceptible']

## what happens if you flip susceptible 
model = LogisticRegression(C=0.01, class_weight = 'balanced')
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)
predict_hat_prob = model.predict_proba(X) ##
ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic)
print(predict_hat_prob[:,1]) ## so this is the probability it'll be 1 (i.e. cipro sus)


bootstrapped_stats = []
n_iterations = 100
#threshold = 0.9
for i in range(n_iterations):
       threshold = 0.5

       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
 
       y_sample = sample['Susceptible']
       ####
       model = LogisticRegression(C=1, penalty = "l2", class_weight = 'balanced', solver = "lbfgs") #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ## could try and set threshold - from "get prediction value" https://github.com/yaesoubilab/PredictMDRTB/blob/d8450cc2c158d07baf19eb01c46bbb2d4bae6803/source/ClassifierClasses.py#L15
       
       y_train_hat = model.predict(X) ## original data
       y_train_hat_prob = model.predict_proba(X) ##
       y_test_hat = model.predict(X_sample)  # predict class label
       y_test_hat_prob = model.predict_proba(X_sample)  # predict probability
       
       if threshold is not None:
          y_test_hat = np.where(y_test_hat_prob[:, 1] > threshold, 1, 0)
          y_train_hat = np.where(y_train_hat_prob[:, 1] > threshold, 1, 0)
       tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_test_hat).ravel()
       sensitivity = tp / (tp + fn)
       specificity = tn / (tn + fp)
       fpr, tpr, threshold = roc_curve(y_test, y_test_hat_prob[:, 1], drop_intermediate=False)
       fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_train_hat_prob[:, 1], drop_intermediate=False)


       ROC_AUC_logistic_test = roc_auc = auc(fpr, tpr)
       ROC_AUC_logistic_train = roc_auc = auc(fpr_train, tpr_train)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation... Equivalent to bootstrap apparent performance - bootstrap test performance

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation,
            'FPR test': fpr,
            'FPR train': fpr_train,
            'TPR test': tpr,
            'TPR train': tpr_train,

        }
       )

bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

performance_mean = (ROC_AUC_logistic - bootstrapped_stats["FPR train"].mean())
performance_max = (ROC_AUC_logistic - bootstrapped_stats["FPR train"].all().max())
performance_min = (ROC_AUC_logistic - bootstrapped_stats["FPR train"].all().min())

#


In [None]:
## Random search https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

space = dict()
#space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l1', 'l2', 'elasticnet']
space['C'] = np.arange(0, 1, 0.05)#loguniform(1e-5, 100)

model = LogisticRegression(class_weight = 'balanced', max_iter = 1000, solver = 'lbfgs')
model_fit = model.fit(X, y)

search = RandomizedSearchCV(model, space, n_iter=60, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
#Best Score: 0.7078653885719192
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.05}
#Best Score: 0.733893787482822
#Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.9500000000000001}

In [None]:
## Grid search from above
from sklearn.model_selection import GridSearchCV
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
model = LogisticRegression(class_weight = 'balanced')

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

result = search.fit(X, y)
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [3]:
####### repeat steps 1 - 4 with new hyperparameters
#Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.9500000000000001}

#1. Create model using all data and get ROC_AUC ("ROC_AUC_logistic")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']
model = LogisticRegression(penalty = 'l2', C= 0.05, solver ='lbfgs', class_weight = 'balanced', max_iter = 500)
model_fit = model.fit(X, y)

y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)


## Step 2: Bootstrapping validation 
n_iterations = 200
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

for i in range(n_iterations):
       #2. (A) Sample all individuals w/replacement

       sample = CIP_data.sample(frac = 1, replace=True)  ##(a) sample n individuals with replacement
       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
       y_sample = sample['Susceptible']

        # (B) Develop predictive model and find apparent performance
       model_fit = model.fit(X_sample, y_sample)
       y_sample_predict = model_fit.predict(X_sample)
       ROC_AUC_logistic_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 
        # (C) Performance of predictive model on original sample (i.e. original population, X)
       y_test_predict = model_fit.predict(X)
       ROC_AUC_logistic_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) 
        # (D) Calculate optimisation by getting (B) - (D) 
       optimism = ROC_AUC_logistic_bootstrap_sample_performance - ROC_AUC_logistic_bootstrap_test_performance

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_bootstrap_sample_performance,
            'Test ROC': ROC_AUC_logistic_bootstrap_test_performance,
            'Optimisation': optimism
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##
print(optimization_corrected_performance)
## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)
#0.5001378786356426
#0.6745810201109699 0.31370439675223116

0.719897737558955
0.7159843083393953 0.7156096371756776


In [None]:
#### now try bootstrapping with new hyperparameters
#bootstrap data
n_iterations = 100
bootstrapped_stats = []

#1. Create model using all data and get ROC_AUC ("ROC_AUC_random_forest")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']

model = LogisticRegression(penalty = 'l2', C= 0.05, solver ='lbfgs', class_weight = 'balanced', max_iter = 500)
model = LogisticRegression(class_weight = 'balanced', max_iter = 500)

model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_logistic_apparent = metrics.roc_auc_score(y, y_predict)

for i in range(n_iterations):
       #2. (A) Sample all individuals w/replacement
        sample = CIP_data.sample(frac = 1, replace=True, random_state = 1) ##(a) sample n individuals with replacement
        X_sample = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
        y_sample = sample['Susceptible']

       #  (B) Develop predictive model and find apparent performance
        model_fit = model.fit(X_sample, y_sample)
        y_sample_predict = model_fit.predict(X_sample)
        ROC_AUC_logistic_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 

       #  (C) Performance of predictive model on original sample (i.e. original population, X)
        y_test_predict = model_fit.predict(X)
        ROC_AUC_logistic_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) 
      ### (D) Calculate optimisation by getting (B) - (D) 
        optimism = ROC_AUC_logistic_bootstrap_sample_performance - ROC_AUC_logistic_bootstrap_test_performance



        bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_bootstrap_sample_performance,
            'Test ROC': ROC_AUC_logistic_bootstrap_test_performance,
            'Optimisation': optimism
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic_apparent - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

print(n_iterations)

In [4]:
#### now try bootstrapping with new hyperparameters
#bootstrap data
n_iterations = 1
bootstrapped_stats = []

#1. Create model using all data and get ROC_AUC ("ROC_AUC_random_forest")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_CLINIC', 'PREV_REGION']]
y = CIP_data['Susceptible']

model = LogisticRegression(solver ='lbfgs', class_weight = 'balanced')
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_logistic_apparent = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic_apparent)
#2. (A) Sample all individuals w/replacement
sample = CIP_data.sample(frac = 1, replace=True) ##(a) sample n individuals with replacement
X_sample = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_CLINIC','PREV_REGION']]
y_sample = sample['Susceptible']
print(sample.head())

       #  (B) Develop predictive model and find apparent performance
model_fit = model.fit(X_sample, y_sample)
y_sample_predict = model_fit.predict(X_sample)
ROC_AUC_logistic_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 
print(ROC_AUC_logistic_bootstrap_sample_performance)

       #  (C) Performance of predictive model on original sample (i.e. original population, X)
y_test_predict = model_fit.predict(X)
ROC_AUC_logistic_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) 
print(ROC_AUC_logistic_bootstrap_test_performance)

      ### (D) Calculate optimisation by getting (B) - (D) 
optimism = ROC_AUC_logistic_bootstrap_sample_performance - ROC_AUC_logistic_bootstrap_test_performance
print(optimism)





0.7201325178334779
        Unnamed: 0.1  Unnamed: 0 CLINIC  YEAR GENDERSP  Susceptible  MSMW  \
70329          70329        1220    NOR  2003      MSW            1   0.0   
16973          16973         329    CAM  2019      MSW            0   0.0   
37286          37286         626    DEN  2003      MSW            1   0.0   
111082        111082        1950    STL  2000      MSW            1   0.0   
103043        103043        1813    SEA  2006      MSM            0   0.0   

        MSW  Oth/Unk/Missing     REGION  Northeast  Southeast  Southwest  \
70329   1.0              0.0  Southeast        0.0        1.0        0.0   
16973   1.0              0.0  Northeast        1.0        0.0        0.0   
37286   1.0              0.0       West        0.0        0.0        0.0   
111082  1.0              0.0    Midwest        0.0        0.0        0.0   
103043  0.0              0.0       West        0.0        0.0        0.0   

        West  PREV_REGION  PREV_CLINIC  
70329    0.0     0.0