In [3]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

## read data 
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")

In [5]:
### Step 1: create model and calculate apparent performance metric of interest (P)
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_CLINIC', 'PREV_REGION']]
y = CIP_data['Susceptible']
print(y.sum()/len(y))


#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30) ## even doing trian/test split doesn't work...

neural_network_model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 1000 ,hidden_layer_sizes= 44, random_state=10)

neural_network_fit = neural_network_model.fit(X,y)
y_predict_nn = neural_network_fit.predict(X)

ROC_AUC_neural_network = metrics.roc_auc_score(y, y_predict_nn)

print(ROC_AUC_neural_network)

print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict_nn)) ## but get model accuracy of 0.9470427693866846... so why is the ROC so low?  
## any why is this getting the same score as the random forest model?


0.8551121462924605
0.5086758099866742
ACCURACY OF THE MODEL:  0.8553077244481585


In [13]:

#### now try bootstrapping
#bootstrap data
n_iterations = 10
bootstrapped_stats = []

#1. Create model using all data and get ROC_AUC ("ROC_AUC_neural_network")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']

model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 1000 ,hidden_layer_sizes= 4, random_state=10)
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_neural_network_apparent = metrics.roc_auc_score(y, y_predict)

for i in range(n_iterations):
       #2. (A) Sample all individuals w/replacement
        sample = CIP_data.sample(frac = 1, replace=True) ##(a) sample n individuals with replacement
        X_sample = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
        y_sample = sample['Susceptible']


       #  (B) Develop predictive model and find apparent performance
        model_fit = model.fit(X_sample, y_sample)
        y_sample_predict = model_fit.predict(X_sample)
        ROC_AUC_neural_network_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 

       #  (C) Performance of predictive model on original sample (i.e. original population, X)
        y_test_predict = model_fit.predict(X)
        ROC_AUC_neural_network_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) ## 0.756384214489288
      ### (D) Calculate optimisation by getting (B) - (D) 
        optimism = ROC_AUC_neural_network_bootstrap_sample_performance - ROC_AUC_neural_network_bootstrap_test_performance



        bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_neural_network_bootstrap_sample_performance,
            'Test ROC': ROC_AUC_neural_network_bootstrap_test_performance,
            'Optimisation': optimism
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_neural_network_apparent - average_optimisation ##

print(optimization_corrected_performance)


In [14]:
## Step 3: Get average optimization

average_optimisation_nn = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance_nn = ROC_AUC_neural_network - average_optimisation_nn ##

print(optimization_corrected_performance_nn)

## get CI 

Bootstrap_CI_nn = (1 - 0.25)*bootstrapped_stats["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance_nn + Bootstrap_CI_nn
Lower_bootstrap_CI = optimization_corrected_performance_nn - Bootstrap_CI_nn

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

0.5
0.5 0.5


In [10]:
## Try oversampling (randomly duplicates examples in the minority class). Very imbalanced dataset, as CIPRO + is a strong minority 
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy = 0.5)

X_over, y_over = oversample.fit_resample(X,y)
print(oversample)
neural_network_model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 1000 ,hidden_layer_sizes= 4, random_state=10)

neural_network_fit = neural_network_model.fit(X_over,y_over)
y_predict_nn = neural_network_fit.predict(X)

ROC_AUC_neural_network = metrics.roc_auc_score(y, y_predict_nn)

print(ROC_AUC_neural_network)

RandomOverSampler(sampling_strategy=0.5)
0.6969951393820488


In [22]:
### Try hyperparameter tuning with random oversampling 
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
space = dict()
space['activation'] = ['tanh', 'relu']
space['solver'] = ['sdg', 'adam', 'lbfgs']
space['alpha'] = np.logspace(-1, 1, 10)
space['learning_rate'] = ['constant','adaptive']
space['hidden_layer_sizes'] = [(4), (8), (12,)]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)


model = MLPClassifier(random_state=10, max_iter = 1000)

X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_CLINIC', 'PREV_REGION']]
y = CIP_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X, y = oversample.fit_resample(X,y)
model_fit = model.fit(X, y)

search = RandomizedSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

500 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 740, in fit
    self._validate_params()
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/python3.10/site-packages/sklearn/base.py", line 570, in _validate_params
    validate_parameter_constraints(
  File "/Users/rem76/miniconda3/envs/GISP_init/lib/pytho

Best Score: 0.800876990556059
Best Hyperparameters: {'solver': 'lbfgs', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (12,), 'alpha': 5.994842503189409, 'activation': 'tanh'}


In [24]:
#### Try bootstrapping with oversampling and hyperparameters
#bootstrap data
n_iterations = 100
bootstrapped_stats = []

#1. Create model using all data and get ROC_AUC ("ROC_AUC_neural_network")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']
X, y = oversample.fit_resample(X,y)
model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 1000 ,hidden_layer_sizes= 12, random_state=10, learning_rate = 'adaptive' )
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_neural_network_apparent = metrics.roc_auc_score(y, y_predict)

for i in range(n_iterations):
       #2. (A) Sample all individuals w/replacement
        sample = CIP_data.sample(frac = 1, replace=True) ##(a) sample n individuals with replacement
        X_sample = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
        y_sample = sample['Susceptible']
        X_sample, y_sample = oversample.fit_resample(X_sample,y_sample)


       #  (B) Develop predictive model and find apparent performance
        model_fit = model.fit(X_sample, y_sample)
        y_sample_predict = model_fit.predict(X_sample)
        ROC_AUC_neural_network_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 

       #  (C) Performance of predictive model on original sample (i.e. original population, X)
        y_test_predict = model_fit.predict(X)
        ROC_AUC_neural_network_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) ## 0.756384214489288
      ### (D) Calculate optimisation by getting (B) - (D) 
        optimism = ROC_AUC_neural_network_bootstrap_sample_performance - ROC_AUC_neural_network_bootstrap_test_performance



        bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_neural_network_bootstrap_sample_performance,
            'Test ROC': ROC_AUC_neural_network_bootstrap_test_performance,
            'Optimisation': optimism
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_neural_network_apparent - average_optimisation ##

print(optimization_corrected_performance)


Bootstrap_CI_nn = (1 - 0.025)*bootstrapped_stats["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance + Bootstrap_CI_nn
Lower_bootstrap_CI = optimization_corrected_performance - Bootstrap_CI_nn


print(Upper_bootstrap_CI, Lower_bootstrap_CI)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [None]:
#### Try bootstrapping with oversampling and hyperparameters and threshold 
from sklearn.metrics import roc_curve, auc 
from sklearn.metrics import confusion_matrix

#bootstrap data
n_iterations = 100
bootstrapped_stats = []

#1. Create model using all data and get ROC_AUC ("ROC_AUC_neural_network")
X = CIP_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y = CIP_data['Susceptible']
X, y = oversample.fit_resample(X,y)
model = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 1000 ,hidden_layer_sizes= 12, random_state=10, learning_rate = 'adaptive' )
model_fit = model.fit(X, y)
y_predict = model_fit.predict(X)

ROC_AUC_neural_network_apparent = metrics.roc_auc_score(y, y_predict)

##threshold
threshold = 0.5
for i in range(n_iterations):
       #2. (A) Sample all individuals w/replacement
        sample = CIP_data.sample(frac = 1, replace=True) ##(a) sample n individuals with replacement
        X_sample = sample[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
        y_sample = sample['Susceptible']
        X_sample, y_sample = oversample.fit_resample(X_sample,y_sample)


       #  (B) Develop predictive model and find apparent performance
        model_fit = model.fit(X_sample, y_sample)
        y_sample_predict = model_fit.predict(X_sample)
        y_sample_predict_prob = model_fit.predict_proba(X_sample)

        ROC_AUC_neural_network_bootstrap_sample_performance = metrics.roc_auc_score(y_sample, y_sample_predict) 

       #  (C) Performance of predictive model on original sample (i.e. original population, X)
        y_test_predict = model_fit.predict(X)
        y_test_predict_prob = model_fit.predict_proba(X)

        ROC_AUC_neural_network_bootstrap_test_performance = metrics.roc_auc_score(y, y_test_predict) ## 0.756384214489288
      ### (D) Calculate optimisation by getting (B) - (D) 
        optimism = ROC_AUC_neural_network_bootstrap_sample_performance - ROC_AUC_neural_network_bootstrap_test_performance
     
      ### (i) Calculate sensitivity and specificity 
        if threshold is not None:
           y_sample_predict = np.where(y_sample_predict_prob[:, 1] > threshold, 1, 0)
           y_test_predict = np.where(y_test_predict_prob[:, 1] > threshold, 1, 0)

        tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_true=y, y_pred=y_test_predict).ravel()
        tn_sample, fp_sample, fn_sample, tp_sample = confusion_matrix(y_true=y_sample, y_pred=y_sample_predict).ravel()

        sensitivity_test = tp_test / (tp_test + fn_test)
        specificity_test = tn_test / (tn_test + fp_test)

        sensitivity_sample = tp_sample/ (tp_sample + fn_sample)
        specificity_sample = tn_sample / (tn_sample + fp_sample)

        fpr_test, tpr_test, threshold_test = roc_curve(y, y_test_predict_prob[:, 1], drop_intermediate=False)
        fpr_sample, tpr_sample, threshold_sample = roc_curve(y_sample, y_sample_predict_prob[:, 1], drop_intermediate=False)


        #ROC_AUC_nn_test = roc_auc = auc(fpr_test, tpr_test)
        #ROC_AUC_nn_sample = roc_auc = auc(fpr_sample, tpr_sample)


        bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_neural_network_bootstrap_sample_performance,
            'Test ROC': ROC_AUC_neural_network_bootstrap_test_performance,
            'Optimisation': optimism,
            'Sample Sensitivity': sensitivity_sample,
            'Sample Specificity': specificity_sample,
            'Test Sensitivity': sensitivity_test,
            'Test Specificity': specificity_test
        }
       )


bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_neural_network_apparent - average_optimisation ##

print(optimization_corrected_performance)


Bootstrap_CI_nn = (1 - 0.025)*bootstrapped_stats["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance + Bootstrap_CI_nn
Lower_bootstrap_CI = optimization_corrected_performance - Bootstrap_CI_nn


print(Upper_bootstrap_CI, Lower_bootstrap_CI)