In [25]:
%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
## read data 
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")
CIP_data.head()
print(CIP_data.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP',
       'Susceptible', 'MSMW', 'MSW', 'Oth/Unk/Missing', 'REGION', 'Northeast',
       'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC'],
      dtype='object')


In [3]:
### Step 1: create model and calculate apparent performance metric of interest (P)
CIP_data.columns
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']
#print(X["PREV_CLINIC"].isnull().values.any())
model = LogisticRegression(class_weight = 'balanced', max_iter=1000)
model_fit = model.fit(X, y)

#print(model_fit.coef_)
print(model_fit.score(X,y)) # 0.72996879639425


y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict) #0.679905016859595
print(ROC_AUC_logistic) # 0.7201325178334779
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## ACCURACY OF THE MODEL:  0.72996879639425


## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)

## add in confusion matrix 
tn, fp, fn, tp = confusion_matrix(y, y_predict).ravel()
specificity = tn / (tn+fp)
sensitivity = tp / (tp + fn)
print(specificity) #0.706282979506688

print(sensitivity )#0.7339820561602678



0.72996879639425
0.7201325178334779
ACCURACY OF THE MODEL:  0.72996879639425
0.706282979506688
0.7339820561602678


In [None]:
#

In [17]:
## Step 2: Bootstrapping validation 
n_iterations = 10
bootstrapped_stats = pd.DataFrame()
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

train = resample(CIP_data, replace=True, n_samples=len(CIP_data))

train.head()

X_train = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_train = CIP_data['Susceptible']

model_train = LogisticRegression(class_weight = 'balanced', max_iter = 500)
model_train = model_train.fit(X_train, y_train)

#print(model.coef_)
#print(model.score(X,y)) # 0.56

y_predict = model_train.predict(X_train)

ROC_AUC_logistic_train = metrics.roc_auc_score(y_train, y_predict)

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(class_weight = 'balanced', max_iter = 1000, solver = "lbfgs") #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)
       tn_sample, fp_sample, fn_sample, tp_sample = confusion_matrix(y_sample, y_predict_sample).ravel()
       specificity_sample = tn_sample / (tn_sample+fp_sample)
       sensitivity_sample = tp_sample / (tp_sample + fn_sample)


       y_test = model_sample.predict(X) #see how model trained on sample data performns on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test) 
       tn_test, fp_test, fn_test, tp_test = confusion_matrix(y, y_test).ravel() ##confusion matrix between predicted data from original data and the actual original data
       specificity_test = tn_test / (tn_test+fp_test)
       sensitivity_test = tp_test / (tp_test + fn_test)


       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation
       optomisation_specificity = specificity_sample - specificity_test #optimisation
       optomisation_sensitivity = sensitivity_sample - sensitivity_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation,
            'Sensitivity_sample': sensitivity,
            'Specificity_sample':specificity, 
            'Optimisation_sensitivity': optomisation_sensitivity,
            'Optimisation_specificity': optomisation_specificity
        }
       )


In [18]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

#0.49942646634516624
#0.6772986683407078 0.3152392553329397

## sensitivity and specificity 
average_optimised_sensitivity = bootstrapped_stats["Optimisation_sensitivity"].mean()  #0.6417965394526935
average_optimised_specificity = bootstrapped_stats["Optimisation_specificity"].mean()   #0.7180134942664962 

optimization_corrected_performance_sensitivity = sensitivity - average_optimised_sensitivity ## 0.7178314223295834

optimization_corrected_performance_specificity = specificity - average_optimised_specificity ## 0.640672271758783

print(optimization_corrected_performance_sensitivity, optimization_corrected_performance_specificity)

#so both are low...

0.4971436829353752
0.7143344405423121 0.2708816318195387
0.6753622428649423 0.3241001110245461


In [24]:
## try code from  https://github.com/yaesoubilab/PredictMDRTB/blob/d8450cc2c158d07baf19eb01c46bbb2d4bae6803/source/ClassifierClasses.py#L15

from sklearn.metrics import roc_curve, auc 
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

y = CIP_data['Susceptible']


model = LogisticRegression(C=0.01, class_weight = 'balanced')
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic)


bootstrapped_stats = []
n_iterations = 100
threshold = 0.5
for i in range(n_iterations):
       threshold = 0.3

       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

       y_sample = sample['Susceptible']
       ####
       model = LogisticRegression(C=0.05, class_weight = 'balanced', solver = "lbfgs") #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ## could try and set threshold - from "get prediction value" https://github.com/yaesoubilab/PredictMDRTB/blob/d8450cc2c158d07baf19eb01c46bbb2d4bae6803/source/ClassifierClasses.py#L15
       
       y_train_hat = model.predict(X) ## original data
       y_train_hat_prob = model.predict_proba(X) ##
       y_test_hat = model.predict(X_sample)  # predict class label
       y_test_hat_prob = model.predict_proba(X_sample)  # predict probability
       
       if threshold is not None:
          y_test_hat = np.where(y_test_hat_prob[:, 1] > threshold, 1, 0)
          y_train_hat = np.where(y_train_hat_prob[:, 1] > threshold, 1, 0)
       tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_test_hat).ravel()
       sensitivity = tp / (tp + fn)
       specificity = tn / (tn + fp)
       fpr, tpr, threshold = roc_curve(y_test, y_test_hat_prob[:, 1], drop_intermediate=False)
       fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_train_hat_prob[:, 1], drop_intermediate=False)


       ROC_AUC_logistic_test = roc_auc = auc(fpr, tpr)
       ROC_AUC_logistic_train = roc_auc = auc(fpr, tpr)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation
        }
       )

bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)



#


0.7187539596917403
0.4983119089202026
0.7153835618253448 0.27482112951112625


In [9]:
## Random search https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

space = dict()
#space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l1', 'l2', 'elasticnet']
space['C'] = np.arange(0, 1, 0.05)#loguniform(1e-5, 100)

model = LogisticRegression(class_weight = 'balanced', max_iter = 1000, solver = 'lbfgs')
model_fit = model.fit(X, y)

search = RandomizedSearchCV(model, space, n_iter=60, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
#Best Score: 0.7078653885719192
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.05}
#Best Score: 0.733893787482822
#Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.9500000000000001}

4100 fits failed out of a total of 6000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/base.py", line 570, in _validate_params
    validate_parameter_constraints(
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-p

Best Score: 0.7967922687746966
Best Hyperparameters: {'penalty': 'l2', 'C': 0.05}


In [None]:
## Grid search from above
from sklearn.model_selection import GridSearchCV
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
model = LogisticRegression(class_weight = 'balanced')

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

result = search.fit(X, y)
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [92]:
####### repeat steps 1 - 4 with new hyperparameters
#Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.9500000000000001}
### Step 1: create model and calculate apparent performance metric of interest (P)
X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

y_sample = sample['Susceptible']

model = LogisticRegression(penalty = 'l2', C= 0.95, solver ='lbfgs', class_weight = 'balanced', max_iter = 500)
model_fit = model.fit(X, y)

#print(model_fit.coef_)
#print(model_fit.score(X,y)) # 0.56

y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic) #0.7275951887773698
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.65511978853664

## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)


## Step 2: Bootstrapping validation 
n_iterations = 100
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(penalty = 'l2', C= 0.95, solver ='lbfgs', class_weight = 'balanced', max_iter = 500) #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

#0.5001378786356426
#0.6745810201109699 0.31370439675223116

0.679905016859595
ACCURACY OF THE MODEL:  0.7069705832673998
    Sample ROC  Test ROC  Optimisation
0     0.676928  0.502300      0.174628
1     0.681349  0.499890      0.181459
2     0.682233  0.499565      0.182668
3     0.679573  0.499934      0.179639
4     0.678026  0.499540      0.178486
..         ...       ...           ...
95    0.681790  0.494846      0.186944
96    0.680793  0.499707      0.181086
97    0.681081  0.498316      0.182765
98    0.682734  0.500297      0.182438
99    0.679882  0.500716      0.179166

[100 rows x 3 columns]
0.5001378786356426
0.6745810201109699 0.31370439675223116
