In [None]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample

## read data 
CIP_data = pd.read_csv("CIP_data_encoded.csv")
CIP_data.columns

In [9]:
### Step 1: create model and calculate apparent performance metric of interest (P)

X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

model = LogisticRegression(class_weight = 'balanced')
model_fit = model.fit(X, y)

print(model_fit.coef_)
print(model_fit.score(X,y)) # 0.56

y_predict = model_fit.predict(X)

ROC_AUC_logistic = metrics.roc_auc_score(y, y_predict)
print(ROC_AUC_logistic) # 0.635
## this is "P" from S4 https://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000059 (step 1)

[[-0.13983754 -0.46610686 -0.00496913  0.41097443 -1.03919044 -0.5522501
   0.08719309  0.09945759  0.34317818  0.10792936 -1.47788579 -0.00205572
   0.89130444  0.79733214 -0.00588079  0.06355565 -0.89394985 -1.57024596
  -0.78945784 -0.4951679  -0.55945962  0.41341553 -0.12538835 -0.25564837
  -0.60209837  0.88889494 -0.34760487 -1.51563216  0.7122274  -0.39153648
   0.05722431 -0.58014035 -1.02565718 -0.06949486  0.26928946 -0.10262754
  -0.17520735 -0.40540676  0.41938533  0.87670761  0.08040403  0.43944065
   1.10322775  1.11086625]]
0.5620116102305155
0.6353175352991693


In [None]:
#

In [11]:
## Step 2: Bootstrapping validation 
n_iterations = 100
bootstrapped_stats = pd.DataFrame()
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

train = resample(CIP_data, replace=True, n_samples=len(CIP_data))

train.head()

X_train  = train[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y_train = train['Susceptible']

model_train = LogisticRegression(class_weight = 'balanced')
model_train = model_train.fit(X_train, y_train)

#print(model.coef_)
#print(model.score(X,y)) # 0.56

y_predict = model_train.predict(X_train)

ROC_AUC_logistic_train = metrics.roc_auc_score(y_train, y_predict)

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model_sample = LogisticRegression(class_weight = 'balanced') #calculate APPARENT performance - ROC
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_logistic_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_logistic_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_logistic_sample - ROC_AUC_logistic_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_logistic_sample,
            'Test ROC': ROC_AUC_logistic_test,
            'Optimisation': optomisation
        }
       )


TypeError: list indices must be integers or slices, not str

In [20]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_logistic - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

Bootstrap_CI = (1 - 0.25)*bootstrapped_stats["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance + Bootstrap_CI
Lower_bootstrap_CI = optimization_corrected_performance - Bootstrap_CI

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

0.49859150835149
0.6085131891844167 0.3886698275185634
