In [79]:
%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
## read data 
CIP_data = pd.read_csv("CIP_data_encoded.csv")
CIP_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CLINIC,YEAR,GENDERSP,Susceptible,ANC,ATL,BAL,BHM,...,RIC,SDG,SEA,SFO,SLC,STL,WDC,MSMW,MSW,Oth/Unk/Missing
0,0,0,ALB,2000,MSW,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,ALB,2000,MSW,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,0,ALB,2000,MSW,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,0,ALB,2000,MSW,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,0,ALB,2000,MSW,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [82]:
### Step 1: create model and calculate apparent performance metric of interest (P)

X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

model = LogisticRegression(class_weight = 'balanced', max_iter=1000)
model_fit = model.fit(X, y)

#print(model_fit.coef_)
print(model_fit.score(X,y)) # 0.7069705832673998

y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict) #0.679905016859595
print(ROC_AUC_logistic) # 0.635
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.7069705832673998

## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)

## add in confusion matrix 
tn, fp, fn, tp = confusion_matrix(y, y_predict).ravel()
specificity = tn / (tn+fp)
sensitivity = tp / (tp + fn)
print(specificity) #0.6417965394526935
print(sensitivity )#0.7180134942664962



0.7069705832673998
0.679905016859595
ACCURACY OF THE MODEL:  0.7069705832673998
0.6417965394526935
0.7180134942664962


In [None]:
#

In [85]:
## Step 2: Bootstrapping validation 
n_iterations = 10
bootstrapped_stats = pd.DataFrame()
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

train = resample(CIP_data, replace=True, n_samples=len(CIP_data))

train.head()

X_train  = train[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y_train = train['Susceptible']

model_train = LogisticRegression(class_weight = 'balanced', max_iter = 500)
model_train = model_train.fit(X_train, y_train)

#print(model.coef_)
#print(model.score(X,y)) # 0.56

y_predict = model_train.predict(X_train)

ROC_AUC_logistic_train = metrics.roc_auc_score(y_train, y_predict)

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(class_weight = 'balanced', max_iter = 500) #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)
       tn_sample, fp_sample, fn_sample, tp_sample = confusion_matrix(y_sample, y_predict_sample).ravel()
       specificity_sample = tn_sample / (tn_sample+fp_sample)
       sensitivity_sample = tp_sample / (tp_sample + fn_sample)


       y_test = model_sample.predict(X) #see how model trained on sample data performns on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test) 
       tn_test, fp_test, fn_test, tp_test = confusion_matrix(y, y_test).ravel() ##confusion matrix between predicted data from original data and the actual original data
       specificity_test = tn_test / (tn_test+fp_test)
       sensitivity_test = tp_sample / (tp_test + fn_test)


       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation
       optomisation_specificity = specificity_sample - specificity_test #optimisation
       optomisation_sensitivity = sensitivity_sample - sensitivity_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation,
            'Sensitivity_sample': sensitivity,
            'Specificity_sample':specificity, 
            'Optimisation_sensitivity': optomisation_sensitivity,
            'Optimisation_specificity': optomisation_specificity
        }
       )


In [84]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

#0.49942646634516624
#0.6772986683407078 0.3152392553329397

## sensitivity and specificity 
average_sensitivity = bootstrapped_stats["Sensitivity_sample"].mean()  #0.6417965394526935
average_specificity = bootstrapped_stats["Specificity_sample"].mean()   #0.7180134942664962 
print(average_sensitivity,average_specificity)



   Sample ROC  Test ROC  Optimisation  Sensitivity_sample  Specificity_sample
0    0.678889  0.499965      0.178924            0.718013            0.641797
1    0.680879  0.501048      0.179832            0.718013            0.641797
2    0.680547  0.495366      0.185181            0.718013            0.641797
3    0.678436  0.500487      0.177949            0.718013            0.641797
4    0.679098  0.501030      0.178069            0.718013            0.641797
5    0.678492  0.501786      0.176706            0.718013            0.641797
6    0.677673  0.502471      0.175202            0.718013            0.641797
7    0.679537  0.497034      0.182503            0.718013            0.641797
8    0.681570  0.501275      0.180294            0.718013            0.641797
9    0.681490  0.501582      0.179908            0.718013            0.641797
0.5004483708837584
0.6759886524575897 0.31587019657772764
0.7180134942664962 0.6417965394526935


In [None]:
## try L1 regularization 
from sklearn.linear_model import Lasso
X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

#lasso = Lasso(alpha = 0.1)
#model = lasso.fit(X, y)
#print(model_fit.coef_)
#print(model_fit.score(X,y)) # 0.56

#y_predict = lasso.predict(X)

#ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
#print(ROC_AUC_logistic) # 0.635
#print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.5620116102305155


model = LogisticRegression(C=0.01, class_weight = 'balanced')
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic)


bootstrapped_stats = []
n_iterations = 20

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(C=0.001, class_weight = 'balanced') #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation
        }
       )

bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
#print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)



#


In [None]:
## Random search https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = np.arange(0, 1, 0.05)#loguniform(1e-5, 100)

model = LogisticRegression(class_weight = 'balanced')
model_fit = model.fit(X, y)

search = RandomizedSearchCV(model, space, n_iter=100, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
#Best Score: 0.7078653885719192
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.05}

In [None]:
## Grid search from above
from sklearn.model_selection import GridSearchCV
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
model = LogisticRegression(class_weight = 'balanced')

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

result = search.fit(X, y)
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
print('Best Hyperparameters: %s' % result.best_params_) #Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}


In [65]:
####### repeat steps 1 - 4 with new hyperparameters
#Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}

### Step 1: create model and calculate apparent performance metric of interest (P)

X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

model = LogisticRegression(penalty = 'l2', C= 0.0001, solver ='liblinear', class_weight = 'balanced')
model_fit = model.fit(X, y)

#print(model_fit.coef_)
#print(model_fit.score(X,y)) # 0.56

y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic) #0.7275951887773698
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.65511978853664

## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)


## Step 2: Bootstrapping validation 
n_iterations = 100
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model = LogisticRegression(penalty = 'l2', C= 0.0001, solver ='liblinear', class_weight = 'balanced') #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

#Bootstrap_CI = bootstrapped_stats["Optimisation"].quantile(q = 0.975)
conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

#0.499720075640974

0.6524357256773645
ACCURACY OF THE MODEL:  0.7219589819268004
    Sample ROC  Test ROC  Optimisation
0     0.650248  0.500314      0.149934
1     0.650061  0.497547      0.152514
2     0.654783  0.502013      0.152769
3     0.652975  0.500570      0.152405
4     0.652138  0.498791      0.153348
..         ...       ...           ...
95    0.651455  0.502237      0.149217
96    0.654212  0.500867      0.153345
97    0.651245  0.499644      0.151601
98    0.651724  0.501940      0.149783
99    0.653685  0.501460      0.152225

[100 rows x 3 columns]
0.5004077884754784
0.6476899298839871 0.34311121136063194
