In [60]:
%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

## read data 
CIP_data = pd.read_csv("CIP_data_encoded.csv")

In [62]:
X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

model = RandomForestClassifier(n_estimators = 100, min_samples_leaf=5, class_weight='balanced')
model_fit = model.fit(X, y)

print(model_fit.score(X,y)) # 56

y_predict = model_fit.predict(X)

ROC_AUC_random_forest = metrics.roc_auc_score(y, y_predict) ## 0.64
print(ROC_AUC_random_forest)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.5620116102305155



In [57]:

n_iterations = 10
bootstrapped_stats = []

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model = RandomForestClassifier(n_estimators = 200, min_samples_leaf=5, class_weight='balanced') #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_random_forest_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_random_forest_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_random_forest_sample - ROC_AUC_random_forest_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_random_forest_sample,
            'Test ROC': ROC_AUC_random_forest_test,
            'Optimisation': optomisation
        }
       )



In [59]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_random_forest - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

Bootstrap_CI = (1 - 0.25)*bootstrapped_stats["Optimisation"].quantile(q = 1)
Upper_bootstrap_CI = optimization_corrected_performance + Bootstrap_CI
Lower_bootstrap_CI = optimization_corrected_performance - Bootstrap_CI

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

   Sample ROC  Test ROC  Optimisation
0    0.641429  0.503268      0.138161
1    0.649265  0.503870      0.145395
2    0.647191  0.503787      0.143404
3    0.644584  0.497558      0.147026
4    0.646933  0.498328      0.148605
0.49890380639822846
0.6103572997254488 0.3874503130710081
