In [67]:
%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

## read data 
CIP_data = pd.read_csv("CIP_data_encoded.csv")


In [73]:
X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

model = RandomForestClassifier(n_estimators = 100, min_samples_leaf=5, class_weight='balanced')
model_fit = model.fit(X, y)

print(model_fit.score(X,y)) # 0.6688061731577871

y_predict = model_fit.predict(X)

ROC_AUC_random_forest = metrics.roc_auc_score(y, y_predict) ## 0.684547956337969
print(ROC_AUC_random_forest)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y, y_predict)) ## but get model accuracy of 0.6688061731577871


0.6688061731577871
0.684547956337969
ACCURACY OF THE MODEL:  0.6688061731577871


In [71]:

n_iterations = 10
bootstrapped_stats = []

for i in range(n_iterations):
       sample = resample(CIP_data, replace=True, n_samples=len(CIP_data)) ##(a) sample n individuals with replacement

       X_sample  = sample[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

       y_sample = sample['Susceptible']

       model = RandomForestClassifier(n_estimators = 100, min_samples_leaf=5, class_weight='balanced') #calculate APPARENT performance - ROC
       model_sample = model.fit(X_sample, y_sample)
       y_predict_sample = model_sample.predict(X_sample) 
       ROC_AUC_random_forest_sample = metrics.roc_auc_score(y_sample, y_predict_sample)

       y_test = model_sample.predict(X) #performance on original data  
       ROC_AUC_random_forest_test = metrics.roc_auc_score(y_sample, y_test)

       optomisation = ROC_AUC_random_forest_sample - ROC_AUC_random_forest_test #optimisation

       bootstrapped_stats.append(
        {
            'Sample ROC': ROC_AUC_random_forest_sample,
            'Test ROC': ROC_AUC_random_forest_test,
            'Optimisation': optomisation
        }
       )



In [74]:
bootstrapped_stats = pd.DataFrame(bootstrapped_stats)
print(bootstrapped_stats.head())
## Step 3: Get average optimization

average_optimisation = bootstrapped_stats["Optimisation"].mean() 

## Step 4: Get optimization-corrected performance

optimization_corrected_performance = ROC_AUC_random_forest - average_optimisation ##

print(optimization_corrected_performance)

## get CI 

conf_interval = np.percentile(bootstrapped_stats["Optimisation"],[2.5,97.5])
Upper_bootstrap_CI = optimization_corrected_performance +conf_interval[0]
Lower_bootstrap_CI = optimization_corrected_performance - conf_interval[1]

print(Upper_bootstrap_CI, Lower_bootstrap_CI)

   Sample ROC  Test ROC  Optimisation
0    0.684539  0.502071      0.182468
1    0.683403  0.502421      0.180983
2    0.682533  0.501669      0.180864
3    0.688031  0.498222      0.189809
4    0.685248  0.500038      0.185210
0.5008006741094921
0.6810491395222305 0.3113728986187605


In [69]:
## Random search https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

X = CIP_data[['ANC', 'ATL', 'BAL', 'BHM', 'BOS', 'BUF', 'CAM', 'CHI',
       'CIN', 'CLE', 'COL', 'DAL', 'DEN', 'DTR', 'FBG', 'GRB', 'HON', 'IND',
       'JAC', 'KCY', 'LAX', 'LBC', 'LVG', 'MIA', 'MIL', 'MIN', 'NOR', 'NYC',
       'OKC', 'ORA', 'PHI', 'PHX', 'PON', 'POR', 'RIC', 'SDG', 'SEA', 'SFO',
       'SLC', 'STL', 'WDC', 'MSMW', 'MSW', 'Oth/Unk/Missing']]

y = CIP_data['Susceptible']

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

space = dict()
space['n_estimators'] = np.arange(0, 200, 10)
space['max_features'] = ['auto', 'sqrt']
#space['max_depth'] = np.arange(0, 200, 10)
space['min_samples_split'] = [2,5,10]
space['min_samples_leaf'] = [1,2,4]
model = RandomForestClassifier(class_weight = 'balanced')
model_fit = model.fit(X, y)

search = RandomizedSearchCV(model, space, n_iter=10, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
#Best Score: 0.7078653885719192
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.1}
#Best Hyperparameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.05}

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 