In [43]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

## read data 
CIP_data = pd.read_csv("CIP_data_encoded.csv")

In [48]:
### Step 1: create model and calculate apparent performance metric of interest (P)
X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30) ## even doing trian/test split doesn't work...

neural_network_model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 10000 ,hidden_layer_sizes= (4,4), random_state=1 )

neural_network_fit = neural_network_model.fit(X_train,y_train)
y_predict = neural_network_fit.predict(X_test)

ROC_AUC_neural_network = metrics.roc_auc_score(y_test, y_predict)

print(ROC_AUC_neural_network)

print(y_predict)

0.5
[0 0 0 ... 0 0 0]


In [49]:
## Step 2: Bootstrapping validation 
n_iterations = 10
bootstrapped_stats = []
## the test and train data for the bootstrapping will be the same, as above

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model_sample = neural_network_model.fit(X_sample,y_sample) #calculate APPARENT performance - ROC
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_neural_network_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_neural_network_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_neural_network_sample - ROC_AUC_neural_network_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_neural_network_sample,
            'Test ROC': ROC_AUC_neural_network_test,
            'Optimisation': optomisation
        }
       )
bootstrapped_stats_nn = pd.DataFrame(bootstrapped_stats)



In [50]:
## Step 3: Get average optimization

average_optimisation_nn = bootstrapped_stats_nn["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance_nn = ROC_AUC_neural_network - average_optimisation_nn ##

print(optimization_corrected_performance_nn)

## get CI 

Bootstrap_CI_nn = (1 - 0.25)*bootstrapped_stats_nn["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance_nn + Bootstrap_CI_nn
Lower_bootstrap_CI = optimization_corrected_performance_nn - Bootstrap_CI_nn

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

0.5
0.5 0.5
